Commit 08bafc0341f2f7920e9045bc32c40299cac8c21b
Committed by: Jens Axboe
1 parent: 7c239517d9
block: Suppress Buffer I/O errors when SCSI REQ_QUIET flag set
Allow the SCSI request REQ_QUIET flag to be propagated to the buffer file system layer. The basic idea is to pass the flag from the SCSI request to the bio (block I/O) and then on to the buffer layer, which can then suppress needless printks.

This patch declutters the kernel log by removing the 40-50 (per LUN) buffer I/O error messages seen during boot in my multipath setup. Without this patch, there is a good chance any real errors would be missed in the "noise" in the logs. During boot I see blocks of messages like:

__ratelimit: 211 callbacks suppressed
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242847
Buffer I/O error on device sdm, logical block 1
Buffer I/O error on device sdm, logical block 5242878
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242872

in my logs.

My disk environment is multipath fibre channel using the SCSI_DH_RDAC code and multipathd. This topology includes an "active" and a "ghost" path for each LUN. I/Os to the "ghost" path will never complete, and the SCSI layer, via the SCSI device handler rdac code, quickly returns the I/Os on these paths and sets the REQ_QUIET SCSI flag to suppress the SCSI layer messages. I want to extend the QUIET behavior to the buffer file system layer so that it deals with these errors as well.

I have been running this patch for a while now on several boxes without issue. A few runs of bonnie++ show no noticeable difference in performance in my setup.

Thanks to John Stultz for the quiet_error finalization.

Submitted-by: Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
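The three-line addition visible at (new) lines 156-158 of the block/blk-core.c hunk below copies REQ_QUIET from the request onto the bio as BIO_QUIET. The buffer-layer consumer lives in the other changed files, which are not shown in this view; the following is only a rough sketch of what that side might look like, assuming a BH_Quiet buffer_head bit and the quiet_error() helper named in the message above (both illustrative, not a verbatim copy of the patch):

#include <linux/bio.h>
#include <linux/buffer_head.h>

/*
 * Illustrative sketch only: quiet_error comes from the commit message,
 * while BH_Quiet and the exact call sites are assumptions made for this
 * example, not the fs/buffer.c hunk itself.
 */

/* bio completion: carry the quiet hint from the bio onto the buffer_head */
static void end_bio_bh_io_sync(struct bio *bio, int err)
{
	struct buffer_head *bh = bio->bi_private;

	if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
	bio_put(bio);
}

/* nonzero when the "Buffer I/O error" printk should be skipped */
static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;

	return 1;
}

The split matters: req_bio_endio() in blk-core.c only tags the bio, while the decision to stay silent is made where the printk actually lives, in the buffer-layer completion handlers.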
Showing 4 changed files with 20 additions and 4 deletions
block/blk-core.c
1 | /* | 1 | /* |
2 | * Copyright (C) 1991, 1992 Linus Torvalds | 2 | * Copyright (C) 1991, 1992 Linus Torvalds |
3 | * Copyright (C) 1994, Karl Keyte: Added support for disk statistics | 3 | * Copyright (C) 1994, Karl Keyte: Added support for disk statistics |
4 | * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | 4 | * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
5 | * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> | 5 | * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> |
6 | * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> | 6 | * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> |
7 | * - July2000 | 7 | * - July2000 |
8 | * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 | 8 | * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 |
9 | */ | 9 | */ |
10 | 10 | ||
11 | /* | 11 | /* |
12 | * This handles all read/write requests to block devices | 12 | * This handles all read/write requests to block devices |
13 | */ | 13 | */ |
14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/bio.h> | 17 | #include <linux/bio.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/highmem.h> | 19 | #include <linux/highmem.h> |
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/kernel_stat.h> | 21 | #include <linux/kernel_stat.h> |
22 | #include <linux/string.h> | 22 | #include <linux/string.h> |
23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
24 | #include <linux/completion.h> | 24 | #include <linux/completion.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/swap.h> | 26 | #include <linux/swap.h> |
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/blktrace_api.h> | 29 | #include <linux/blktrace_api.h> |
30 | #include <linux/fault-inject.h> | 30 | #include <linux/fault-inject.h> |
31 | #include <trace/block.h> | 31 | #include <trace/block.h> |
32 | 32 | ||
33 | #include "blk.h" | 33 | #include "blk.h" |
34 | 34 | ||
35 | DEFINE_TRACE(block_plug); | 35 | DEFINE_TRACE(block_plug); |
36 | DEFINE_TRACE(block_unplug_io); | 36 | DEFINE_TRACE(block_unplug_io); |
37 | DEFINE_TRACE(block_unplug_timer); | 37 | DEFINE_TRACE(block_unplug_timer); |
38 | DEFINE_TRACE(block_getrq); | 38 | DEFINE_TRACE(block_getrq); |
39 | DEFINE_TRACE(block_sleeprq); | 39 | DEFINE_TRACE(block_sleeprq); |
40 | DEFINE_TRACE(block_rq_requeue); | 40 | DEFINE_TRACE(block_rq_requeue); |
41 | DEFINE_TRACE(block_bio_backmerge); | 41 | DEFINE_TRACE(block_bio_backmerge); |
42 | DEFINE_TRACE(block_bio_frontmerge); | 42 | DEFINE_TRACE(block_bio_frontmerge); |
43 | DEFINE_TRACE(block_bio_queue); | 43 | DEFINE_TRACE(block_bio_queue); |
44 | DEFINE_TRACE(block_rq_complete); | 44 | DEFINE_TRACE(block_rq_complete); |
45 | DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ | 45 | DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ |
46 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); | 46 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); |
47 | 47 | ||
48 | static int __make_request(struct request_queue *q, struct bio *bio); | 48 | static int __make_request(struct request_queue *q, struct bio *bio); |
49 | 49 | ||
50 | /* | 50 | /* |
51 | * For the allocated request tables | 51 | * For the allocated request tables |
52 | */ | 52 | */ |
53 | static struct kmem_cache *request_cachep; | 53 | static struct kmem_cache *request_cachep; |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * For queue allocation | 56 | * For queue allocation |
57 | */ | 57 | */ |
58 | struct kmem_cache *blk_requestq_cachep; | 58 | struct kmem_cache *blk_requestq_cachep; |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * Controlling structure to kblockd | 61 | * Controlling structure to kblockd |
62 | */ | 62 | */ |
63 | static struct workqueue_struct *kblockd_workqueue; | 63 | static struct workqueue_struct *kblockd_workqueue; |
64 | 64 | ||
65 | static void drive_stat_acct(struct request *rq, int new_io) | 65 | static void drive_stat_acct(struct request *rq, int new_io) |
66 | { | 66 | { |
67 | struct hd_struct *part; | 67 | struct hd_struct *part; |
68 | int rw = rq_data_dir(rq); | 68 | int rw = rq_data_dir(rq); |
69 | int cpu; | 69 | int cpu; |
70 | 70 | ||
71 | if (!blk_fs_request(rq) || !rq->rq_disk) | 71 | if (!blk_fs_request(rq) || !rq->rq_disk) |
72 | return; | 72 | return; |
73 | 73 | ||
74 | cpu = part_stat_lock(); | 74 | cpu = part_stat_lock(); |
75 | part = disk_map_sector_rcu(rq->rq_disk, rq->sector); | 75 | part = disk_map_sector_rcu(rq->rq_disk, rq->sector); |
76 | 76 | ||
77 | if (!new_io) | 77 | if (!new_io) |
78 | part_stat_inc(cpu, part, merges[rw]); | 78 | part_stat_inc(cpu, part, merges[rw]); |
79 | else { | 79 | else { |
80 | part_round_stats(cpu, part); | 80 | part_round_stats(cpu, part); |
81 | part_inc_in_flight(part); | 81 | part_inc_in_flight(part); |
82 | } | 82 | } |
83 | 83 | ||
84 | part_stat_unlock(); | 84 | part_stat_unlock(); |
85 | } | 85 | } |
86 | 86 | ||
87 | void blk_queue_congestion_threshold(struct request_queue *q) | 87 | void blk_queue_congestion_threshold(struct request_queue *q) |
88 | { | 88 | { |
89 | int nr; | 89 | int nr; |
90 | 90 | ||
91 | nr = q->nr_requests - (q->nr_requests / 8) + 1; | 91 | nr = q->nr_requests - (q->nr_requests / 8) + 1; |
92 | if (nr > q->nr_requests) | 92 | if (nr > q->nr_requests) |
93 | nr = q->nr_requests; | 93 | nr = q->nr_requests; |
94 | q->nr_congestion_on = nr; | 94 | q->nr_congestion_on = nr; |
95 | 95 | ||
96 | nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; | 96 | nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; |
97 | if (nr < 1) | 97 | if (nr < 1) |
98 | nr = 1; | 98 | nr = 1; |
99 | q->nr_congestion_off = nr; | 99 | q->nr_congestion_off = nr; |
100 | } | 100 | } |
101 | 101 | ||
102 | /** | 102 | /** |
103 | * blk_get_backing_dev_info - get the address of a queue's backing_dev_info | 103 | * blk_get_backing_dev_info - get the address of a queue's backing_dev_info |
104 | * @bdev: device | 104 | * @bdev: device |
105 | * | 105 | * |
106 | * Locates the passed device's request queue and returns the address of its | 106 | * Locates the passed device's request queue and returns the address of its |
107 | * backing_dev_info | 107 | * backing_dev_info |
108 | * | 108 | * |
109 | * Will return NULL if the request queue cannot be located. | 109 | * Will return NULL if the request queue cannot be located. |
110 | */ | 110 | */ |
111 | struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) | 111 | struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) |
112 | { | 112 | { |
113 | struct backing_dev_info *ret = NULL; | 113 | struct backing_dev_info *ret = NULL; |
114 | struct request_queue *q = bdev_get_queue(bdev); | 114 | struct request_queue *q = bdev_get_queue(bdev); |
115 | 115 | ||
116 | if (q) | 116 | if (q) |
117 | ret = &q->backing_dev_info; | 117 | ret = &q->backing_dev_info; |
118 | return ret; | 118 | return ret; |
119 | } | 119 | } |
120 | EXPORT_SYMBOL(blk_get_backing_dev_info); | 120 | EXPORT_SYMBOL(blk_get_backing_dev_info); |
121 | 121 | ||
122 | void blk_rq_init(struct request_queue *q, struct request *rq) | 122 | void blk_rq_init(struct request_queue *q, struct request *rq) |
123 | { | 123 | { |
124 | memset(rq, 0, sizeof(*rq)); | 124 | memset(rq, 0, sizeof(*rq)); |
125 | 125 | ||
126 | INIT_LIST_HEAD(&rq->queuelist); | 126 | INIT_LIST_HEAD(&rq->queuelist); |
127 | INIT_LIST_HEAD(&rq->timeout_list); | 127 | INIT_LIST_HEAD(&rq->timeout_list); |
128 | rq->cpu = -1; | 128 | rq->cpu = -1; |
129 | rq->q = q; | 129 | rq->q = q; |
130 | rq->sector = rq->hard_sector = (sector_t) -1; | 130 | rq->sector = rq->hard_sector = (sector_t) -1; |
131 | INIT_HLIST_NODE(&rq->hash); | 131 | INIT_HLIST_NODE(&rq->hash); |
132 | RB_CLEAR_NODE(&rq->rb_node); | 132 | RB_CLEAR_NODE(&rq->rb_node); |
133 | rq->cmd = rq->__cmd; | 133 | rq->cmd = rq->__cmd; |
134 | rq->tag = -1; | 134 | rq->tag = -1; |
135 | rq->ref_count = 1; | 135 | rq->ref_count = 1; |
136 | } | 136 | } |
137 | EXPORT_SYMBOL(blk_rq_init); | 137 | EXPORT_SYMBOL(blk_rq_init); |
138 | 138 | ||
139 | static void req_bio_endio(struct request *rq, struct bio *bio, | 139 | static void req_bio_endio(struct request *rq, struct bio *bio, |
140 | unsigned int nbytes, int error) | 140 | unsigned int nbytes, int error) |
141 | { | 141 | { |
142 | struct request_queue *q = rq->q; | 142 | struct request_queue *q = rq->q; |
143 | 143 | ||
144 | if (&q->bar_rq != rq) { | 144 | if (&q->bar_rq != rq) { |
145 | if (error) | 145 | if (error) |
146 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 146 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
147 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | 147 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
148 | error = -EIO; | 148 | error = -EIO; |
149 | 149 | ||
150 | if (unlikely(nbytes > bio->bi_size)) { | 150 | if (unlikely(nbytes > bio->bi_size)) { |
151 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", | 151 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", |
152 | __func__, nbytes, bio->bi_size); | 152 | __func__, nbytes, bio->bi_size); |
153 | nbytes = bio->bi_size; | 153 | nbytes = bio->bi_size; |
154 | } | 154 | } |
155 | 155 | ||
| | 156 | if (unlikely(rq->cmd_flags & REQ_QUIET)) |
| | 157 | set_bit(BIO_QUIET, &bio->bi_flags); |
| | 158 | |
156 | bio->bi_size -= nbytes; | 159 | bio->bi_size -= nbytes; |
157 | bio->bi_sector += (nbytes >> 9); | 160 | bio->bi_sector += (nbytes >> 9); |
158 | 161 | ||
159 | if (bio_integrity(bio)) | 162 | if (bio_integrity(bio)) |
160 | bio_integrity_advance(bio, nbytes); | 163 | bio_integrity_advance(bio, nbytes); |
161 | 164 | ||
162 | if (bio->bi_size == 0) | 165 | if (bio->bi_size == 0) |
163 | bio_endio(bio, error); | 166 | bio_endio(bio, error); |
164 | } else { | 167 | } else { |
165 | 168 | ||
166 | /* | 169 | /* |
167 | * Okay, this is the barrier request in progress, just | 170 | * Okay, this is the barrier request in progress, just |
168 | * record the error; | 171 | * record the error; |
169 | */ | 172 | */ |
170 | if (error && !q->orderr) | 173 | if (error && !q->orderr) |
171 | q->orderr = error; | 174 | q->orderr = error; |
172 | } | 175 | } |
173 | } | 176 | } |
174 | 177 | ||
175 | void blk_dump_rq_flags(struct request *rq, char *msg) | 178 | void blk_dump_rq_flags(struct request *rq, char *msg) |
176 | { | 179 | { |
177 | int bit; | 180 | int bit; |
178 | 181 | ||
179 | printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, | 182 | printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, |
180 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, | 183 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, |
181 | rq->cmd_flags); | 184 | rq->cmd_flags); |
182 | 185 | ||
183 | printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", | 186 | printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", |
184 | (unsigned long long)rq->sector, | 187 | (unsigned long long)rq->sector, |
185 | rq->nr_sectors, | 188 | rq->nr_sectors, |
186 | rq->current_nr_sectors); | 189 | rq->current_nr_sectors); |
187 | printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", | 190 | printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", |
188 | rq->bio, rq->biotail, | 191 | rq->bio, rq->biotail, |
189 | rq->buffer, rq->data, | 192 | rq->buffer, rq->data, |
190 | rq->data_len); | 193 | rq->data_len); |
191 | 194 | ||
192 | if (blk_pc_request(rq)) { | 195 | if (blk_pc_request(rq)) { |
193 | printk(KERN_INFO " cdb: "); | 196 | printk(KERN_INFO " cdb: "); |
194 | for (bit = 0; bit < BLK_MAX_CDB; bit++) | 197 | for (bit = 0; bit < BLK_MAX_CDB; bit++) |
195 | printk("%02x ", rq->cmd[bit]); | 198 | printk("%02x ", rq->cmd[bit]); |
196 | printk("\n"); | 199 | printk("\n"); |
197 | } | 200 | } |
198 | } | 201 | } |
199 | EXPORT_SYMBOL(blk_dump_rq_flags); | 202 | EXPORT_SYMBOL(blk_dump_rq_flags); |
200 | 203 | ||
201 | /* | 204 | /* |
202 | * "plug" the device if there are no outstanding requests: this will | 205 | * "plug" the device if there are no outstanding requests: this will |
203 | * force the transfer to start only after we have put all the requests | 206 | * force the transfer to start only after we have put all the requests |
204 | * on the list. | 207 | * on the list. |
205 | * | 208 | * |
206 | * This is called with interrupts off and no requests on the queue and | 209 | * This is called with interrupts off and no requests on the queue and |
207 | * with the queue lock held. | 210 | * with the queue lock held. |
208 | */ | 211 | */ |
209 | void blk_plug_device(struct request_queue *q) | 212 | void blk_plug_device(struct request_queue *q) |
210 | { | 213 | { |
211 | WARN_ON(!irqs_disabled()); | 214 | WARN_ON(!irqs_disabled()); |
212 | 215 | ||
213 | /* | 216 | /* |
214 | * don't plug a stopped queue, it must be paired with blk_start_queue() | 217 | * don't plug a stopped queue, it must be paired with blk_start_queue() |
215 | * which will restart the queueing | 218 | * which will restart the queueing |
216 | */ | 219 | */ |
217 | if (blk_queue_stopped(q)) | 220 | if (blk_queue_stopped(q)) |
218 | return; | 221 | return; |
219 | 222 | ||
220 | if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { | 223 | if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { |
221 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); | 224 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); |
222 | trace_block_plug(q); | 225 | trace_block_plug(q); |
223 | } | 226 | } |
224 | } | 227 | } |
225 | EXPORT_SYMBOL(blk_plug_device); | 228 | EXPORT_SYMBOL(blk_plug_device); |
226 | 229 | ||
227 | /** | 230 | /** |
228 | * blk_plug_device_unlocked - plug a device without queue lock held | 231 | * blk_plug_device_unlocked - plug a device without queue lock held |
229 | * @q: The &struct request_queue to plug | 232 | * @q: The &struct request_queue to plug |
230 | * | 233 | * |
231 | * Description: | 234 | * Description: |
232 | * Like @blk_plug_device(), but grabs the queue lock and disables | 235 | * Like @blk_plug_device(), but grabs the queue lock and disables |
233 | * interrupts. | 236 | * interrupts. |
234 | **/ | 237 | **/ |
235 | void blk_plug_device_unlocked(struct request_queue *q) | 238 | void blk_plug_device_unlocked(struct request_queue *q) |
236 | { | 239 | { |
237 | unsigned long flags; | 240 | unsigned long flags; |
238 | 241 | ||
239 | spin_lock_irqsave(q->queue_lock, flags); | 242 | spin_lock_irqsave(q->queue_lock, flags); |
240 | blk_plug_device(q); | 243 | blk_plug_device(q); |
241 | spin_unlock_irqrestore(q->queue_lock, flags); | 244 | spin_unlock_irqrestore(q->queue_lock, flags); |
242 | } | 245 | } |
243 | EXPORT_SYMBOL(blk_plug_device_unlocked); | 246 | EXPORT_SYMBOL(blk_plug_device_unlocked); |
244 | 247 | ||
245 | /* | 248 | /* |
246 | * remove the queue from the plugged list, if present. called with | 249 | * remove the queue from the plugged list, if present. called with |
247 | * queue lock held and interrupts disabled. | 250 | * queue lock held and interrupts disabled. |
248 | */ | 251 | */ |
249 | int blk_remove_plug(struct request_queue *q) | 252 | int blk_remove_plug(struct request_queue *q) |
250 | { | 253 | { |
251 | WARN_ON(!irqs_disabled()); | 254 | WARN_ON(!irqs_disabled()); |
252 | 255 | ||
253 | if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) | 256 | if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) |
254 | return 0; | 257 | return 0; |
255 | 258 | ||
256 | del_timer(&q->unplug_timer); | 259 | del_timer(&q->unplug_timer); |
257 | return 1; | 260 | return 1; |
258 | } | 261 | } |
259 | EXPORT_SYMBOL(blk_remove_plug); | 262 | EXPORT_SYMBOL(blk_remove_plug); |
260 | 263 | ||
261 | /* | 264 | /* |
262 | * remove the plug and let it rip.. | 265 | * remove the plug and let it rip.. |
263 | */ | 266 | */ |
264 | void __generic_unplug_device(struct request_queue *q) | 267 | void __generic_unplug_device(struct request_queue *q) |
265 | { | 268 | { |
266 | if (unlikely(blk_queue_stopped(q))) | 269 | if (unlikely(blk_queue_stopped(q))) |
267 | return; | 270 | return; |
268 | 271 | ||
269 | if (!blk_remove_plug(q)) | 272 | if (!blk_remove_plug(q)) |
270 | return; | 273 | return; |
271 | 274 | ||
272 | q->request_fn(q); | 275 | q->request_fn(q); |
273 | } | 276 | } |
274 | 277 | ||
275 | /** | 278 | /** |
276 | * generic_unplug_device - fire a request queue | 279 | * generic_unplug_device - fire a request queue |
277 | * @q: The &struct request_queue in question | 280 | * @q: The &struct request_queue in question |
278 | * | 281 | * |
279 | * Description: | 282 | * Description: |
280 | * Linux uses plugging to build bigger requests queues before letting | 283 | * Linux uses plugging to build bigger requests queues before letting |
281 | * the device have at them. If a queue is plugged, the I/O scheduler | 284 | * the device have at them. If a queue is plugged, the I/O scheduler |
282 | * is still adding and merging requests on the queue. Once the queue | 285 | * is still adding and merging requests on the queue. Once the queue |
283 | * gets unplugged, the request_fn defined for the queue is invoked and | 286 | * gets unplugged, the request_fn defined for the queue is invoked and |
284 | * transfers started. | 287 | * transfers started. |
285 | **/ | 288 | **/ |
286 | void generic_unplug_device(struct request_queue *q) | 289 | void generic_unplug_device(struct request_queue *q) |
287 | { | 290 | { |
288 | if (blk_queue_plugged(q)) { | 291 | if (blk_queue_plugged(q)) { |
289 | spin_lock_irq(q->queue_lock); | 292 | spin_lock_irq(q->queue_lock); |
290 | __generic_unplug_device(q); | 293 | __generic_unplug_device(q); |
291 | spin_unlock_irq(q->queue_lock); | 294 | spin_unlock_irq(q->queue_lock); |
292 | } | 295 | } |
293 | } | 296 | } |
294 | EXPORT_SYMBOL(generic_unplug_device); | 297 | EXPORT_SYMBOL(generic_unplug_device); |
295 | 298 | ||
296 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, | 299 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, |
297 | struct page *page) | 300 | struct page *page) |
298 | { | 301 | { |
299 | struct request_queue *q = bdi->unplug_io_data; | 302 | struct request_queue *q = bdi->unplug_io_data; |
300 | 303 | ||
301 | blk_unplug(q); | 304 | blk_unplug(q); |
302 | } | 305 | } |
303 | 306 | ||
304 | void blk_unplug_work(struct work_struct *work) | 307 | void blk_unplug_work(struct work_struct *work) |
305 | { | 308 | { |
306 | struct request_queue *q = | 309 | struct request_queue *q = |
307 | container_of(work, struct request_queue, unplug_work); | 310 | container_of(work, struct request_queue, unplug_work); |
308 | 311 | ||
309 | trace_block_unplug_io(q); | 312 | trace_block_unplug_io(q); |
310 | q->unplug_fn(q); | 313 | q->unplug_fn(q); |
311 | } | 314 | } |
312 | 315 | ||
313 | void blk_unplug_timeout(unsigned long data) | 316 | void blk_unplug_timeout(unsigned long data) |
314 | { | 317 | { |
315 | struct request_queue *q = (struct request_queue *)data; | 318 | struct request_queue *q = (struct request_queue *)data; |
316 | 319 | ||
317 | trace_block_unplug_timer(q); | 320 | trace_block_unplug_timer(q); |
318 | kblockd_schedule_work(q, &q->unplug_work); | 321 | kblockd_schedule_work(q, &q->unplug_work); |
319 | } | 322 | } |
320 | 323 | ||
321 | void blk_unplug(struct request_queue *q) | 324 | void blk_unplug(struct request_queue *q) |
322 | { | 325 | { |
323 | /* | 326 | /* |
324 | * devices don't necessarily have an ->unplug_fn defined | 327 | * devices don't necessarily have an ->unplug_fn defined |
325 | */ | 328 | */ |
326 | if (q->unplug_fn) { | 329 | if (q->unplug_fn) { |
327 | trace_block_unplug_io(q); | 330 | trace_block_unplug_io(q); |
328 | q->unplug_fn(q); | 331 | q->unplug_fn(q); |
329 | } | 332 | } |
330 | } | 333 | } |
331 | EXPORT_SYMBOL(blk_unplug); | 334 | EXPORT_SYMBOL(blk_unplug); |
332 | 335 | ||
333 | static void blk_invoke_request_fn(struct request_queue *q) | 336 | static void blk_invoke_request_fn(struct request_queue *q) |
334 | { | 337 | { |
335 | if (unlikely(blk_queue_stopped(q))) | 338 | if (unlikely(blk_queue_stopped(q))) |
336 | return; | 339 | return; |
337 | 340 | ||
338 | /* | 341 | /* |
339 | * one level of recursion is ok and is much faster than kicking | 342 | * one level of recursion is ok and is much faster than kicking |
340 | * the unplug handling | 343 | * the unplug handling |
341 | */ | 344 | */ |
342 | if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { | 345 | if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { |
343 | q->request_fn(q); | 346 | q->request_fn(q); |
344 | queue_flag_clear(QUEUE_FLAG_REENTER, q); | 347 | queue_flag_clear(QUEUE_FLAG_REENTER, q); |
345 | } else { | 348 | } else { |
346 | queue_flag_set(QUEUE_FLAG_PLUGGED, q); | 349 | queue_flag_set(QUEUE_FLAG_PLUGGED, q); |
347 | kblockd_schedule_work(q, &q->unplug_work); | 350 | kblockd_schedule_work(q, &q->unplug_work); |
348 | } | 351 | } |
349 | } | 352 | } |
350 | 353 | ||
351 | /** | 354 | /** |
352 | * blk_start_queue - restart a previously stopped queue | 355 | * blk_start_queue - restart a previously stopped queue |
353 | * @q: The &struct request_queue in question | 356 | * @q: The &struct request_queue in question |
354 | * | 357 | * |
355 | * Description: | 358 | * Description: |
356 | * blk_start_queue() will clear the stop flag on the queue, and call | 359 | * blk_start_queue() will clear the stop flag on the queue, and call |
357 | * the request_fn for the queue if it was in a stopped state when | 360 | * the request_fn for the queue if it was in a stopped state when |
358 | * entered. Also see blk_stop_queue(). Queue lock must be held. | 361 | * entered. Also see blk_stop_queue(). Queue lock must be held. |
359 | **/ | 362 | **/ |
360 | void blk_start_queue(struct request_queue *q) | 363 | void blk_start_queue(struct request_queue *q) |
361 | { | 364 | { |
362 | WARN_ON(!irqs_disabled()); | 365 | WARN_ON(!irqs_disabled()); |
363 | 366 | ||
364 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); | 367 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); |
365 | blk_invoke_request_fn(q); | 368 | blk_invoke_request_fn(q); |
366 | } | 369 | } |
367 | EXPORT_SYMBOL(blk_start_queue); | 370 | EXPORT_SYMBOL(blk_start_queue); |
368 | 371 | ||
369 | /** | 372 | /** |
370 | * blk_stop_queue - stop a queue | 373 | * blk_stop_queue - stop a queue |
371 | * @q: The &struct request_queue in question | 374 | * @q: The &struct request_queue in question |
372 | * | 375 | * |
373 | * Description: | 376 | * Description: |
374 | * The Linux block layer assumes that a block driver will consume all | 377 | * The Linux block layer assumes that a block driver will consume all |
375 | * entries on the request queue when the request_fn strategy is called. | 378 | * entries on the request queue when the request_fn strategy is called. |
376 | * Often this will not happen, because of hardware limitations (queue | 379 | * Often this will not happen, because of hardware limitations (queue |
377 | * depth settings). If a device driver gets a 'queue full' response, | 380 | * depth settings). If a device driver gets a 'queue full' response, |
378 | * or if it simply chooses not to queue more I/O at one point, it can | 381 | * or if it simply chooses not to queue more I/O at one point, it can |
379 | * call this function to prevent the request_fn from being called until | 382 | * call this function to prevent the request_fn from being called until |
380 | * the driver has signalled it's ready to go again. This happens by calling | 383 | * the driver has signalled it's ready to go again. This happens by calling |
381 | * blk_start_queue() to restart queue operations. Queue lock must be held. | 384 | * blk_start_queue() to restart queue operations. Queue lock must be held. |
382 | **/ | 385 | **/ |
383 | void blk_stop_queue(struct request_queue *q) | 386 | void blk_stop_queue(struct request_queue *q) |
384 | { | 387 | { |
385 | blk_remove_plug(q); | 388 | blk_remove_plug(q); |
386 | queue_flag_set(QUEUE_FLAG_STOPPED, q); | 389 | queue_flag_set(QUEUE_FLAG_STOPPED, q); |
387 | } | 390 | } |
388 | EXPORT_SYMBOL(blk_stop_queue); | 391 | EXPORT_SYMBOL(blk_stop_queue); |
389 | 392 | ||
390 | /** | 393 | /** |
391 | * blk_sync_queue - cancel any pending callbacks on a queue | 394 | * blk_sync_queue - cancel any pending callbacks on a queue |
392 | * @q: the queue | 395 | * @q: the queue |
393 | * | 396 | * |
394 | * Description: | 397 | * Description: |
395 | * The block layer may perform asynchronous callback activity | 398 | * The block layer may perform asynchronous callback activity |
396 | * on a queue, such as calling the unplug function after a timeout. | 399 | * on a queue, such as calling the unplug function after a timeout. |
397 | * A block device may call blk_sync_queue to ensure that any | 400 | * A block device may call blk_sync_queue to ensure that any |
398 | * such activity is cancelled, thus allowing it to release resources | 401 | * such activity is cancelled, thus allowing it to release resources |
399 | * that the callbacks might use. The caller must already have made sure | 402 | * that the callbacks might use. The caller must already have made sure |
400 | * that its ->make_request_fn will not re-add plugging prior to calling | 403 | * that its ->make_request_fn will not re-add plugging prior to calling |
401 | * this function. | 404 | * this function. |
402 | * | 405 | * |
403 | */ | 406 | */ |
404 | void blk_sync_queue(struct request_queue *q) | 407 | void blk_sync_queue(struct request_queue *q) |
405 | { | 408 | { |
406 | del_timer_sync(&q->unplug_timer); | 409 | del_timer_sync(&q->unplug_timer); |
407 | del_timer_sync(&q->timeout); | 410 | del_timer_sync(&q->timeout); |
408 | kblockd_flush_work(&q->unplug_work); | 411 | kblockd_flush_work(&q->unplug_work); |
409 | } | 412 | } |
410 | EXPORT_SYMBOL(blk_sync_queue); | 413 | EXPORT_SYMBOL(blk_sync_queue); |
411 | 414 | ||
412 | /** | 415 | /** |
413 | * __blk_run_queue - run a single device queue | 416 | * __blk_run_queue - run a single device queue |
414 | * @q: The queue to run | 417 | * @q: The queue to run |
415 | * | 418 | * |
416 | * Description: | 419 | * Description: |
417 | * See @blk_run_queue. This variant must be called with the queue lock | 420 | * See @blk_run_queue. This variant must be called with the queue lock |
418 | * held and interrupts disabled. | 421 | * held and interrupts disabled. |
419 | * | 422 | * |
420 | */ | 423 | */ |
421 | void __blk_run_queue(struct request_queue *q) | 424 | void __blk_run_queue(struct request_queue *q) |
422 | { | 425 | { |
423 | blk_remove_plug(q); | 426 | blk_remove_plug(q); |
424 | 427 | ||
425 | /* | 428 | /* |
426 | * Only recurse once to avoid overrunning the stack, let the unplug | 429 | * Only recurse once to avoid overrunning the stack, let the unplug |
427 | * handling reinvoke the handler shortly if we already got there. | 430 | * handling reinvoke the handler shortly if we already got there. |
428 | */ | 431 | */ |
429 | if (!elv_queue_empty(q)) | 432 | if (!elv_queue_empty(q)) |
430 | blk_invoke_request_fn(q); | 433 | blk_invoke_request_fn(q); |
431 | } | 434 | } |
432 | EXPORT_SYMBOL(__blk_run_queue); | 435 | EXPORT_SYMBOL(__blk_run_queue); |
433 | 436 | ||
434 | /** | 437 | /** |
435 | * blk_run_queue - run a single device queue | 438 | * blk_run_queue - run a single device queue |
436 | * @q: The queue to run | 439 | * @q: The queue to run |
437 | * | 440 | * |
438 | * Description: | 441 | * Description: |
439 | * Invoke request handling on this queue, if it has pending work to do. | 442 | * Invoke request handling on this queue, if it has pending work to do. |
440 | * May be used to restart queueing when a request has completed. Also | 443 | * May be used to restart queueing when a request has completed. Also |
441 | * See @blk_start_queueing. | 444 | * See @blk_start_queueing. |
442 | * | 445 | * |
443 | */ | 446 | */ |
444 | void blk_run_queue(struct request_queue *q) | 447 | void blk_run_queue(struct request_queue *q) |
445 | { | 448 | { |
446 | unsigned long flags; | 449 | unsigned long flags; |
447 | 450 | ||
448 | spin_lock_irqsave(q->queue_lock, flags); | 451 | spin_lock_irqsave(q->queue_lock, flags); |
449 | __blk_run_queue(q); | 452 | __blk_run_queue(q); |
450 | spin_unlock_irqrestore(q->queue_lock, flags); | 453 | spin_unlock_irqrestore(q->queue_lock, flags); |
451 | } | 454 | } |
452 | EXPORT_SYMBOL(blk_run_queue); | 455 | EXPORT_SYMBOL(blk_run_queue); |
453 | 456 | ||
454 | void blk_put_queue(struct request_queue *q) | 457 | void blk_put_queue(struct request_queue *q) |
455 | { | 458 | { |
456 | kobject_put(&q->kobj); | 459 | kobject_put(&q->kobj); |
457 | } | 460 | } |
458 | 461 | ||
459 | void blk_cleanup_queue(struct request_queue *q) | 462 | void blk_cleanup_queue(struct request_queue *q) |
460 | { | 463 | { |
461 | /* | 464 | /* |
462 | * We know we have process context here, so we can be a little | 465 | * We know we have process context here, so we can be a little |
463 | * cautious and ensure that pending block actions on this device | 466 | * cautious and ensure that pending block actions on this device |
464 | * are done before moving on. Going into this function, we should | 467 | * are done before moving on. Going into this function, we should |
465 | * not have processes doing IO to this device. | 468 | * not have processes doing IO to this device. |
466 | */ | 469 | */ |
467 | blk_sync_queue(q); | 470 | blk_sync_queue(q); |
468 | 471 | ||
469 | mutex_lock(&q->sysfs_lock); | 472 | mutex_lock(&q->sysfs_lock); |
470 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); | 473 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); |
471 | mutex_unlock(&q->sysfs_lock); | 474 | mutex_unlock(&q->sysfs_lock); |
472 | 475 | ||
473 | if (q->elevator) | 476 | if (q->elevator) |
474 | elevator_exit(q->elevator); | 477 | elevator_exit(q->elevator); |
475 | 478 | ||
476 | blk_put_queue(q); | 479 | blk_put_queue(q); |
477 | } | 480 | } |
478 | EXPORT_SYMBOL(blk_cleanup_queue); | 481 | EXPORT_SYMBOL(blk_cleanup_queue); |
479 | 482 | ||
480 | static int blk_init_free_list(struct request_queue *q) | 483 | static int blk_init_free_list(struct request_queue *q) |
481 | { | 484 | { |
482 | struct request_list *rl = &q->rq; | 485 | struct request_list *rl = &q->rq; |
483 | 486 | ||
484 | rl->count[READ] = rl->count[WRITE] = 0; | 487 | rl->count[READ] = rl->count[WRITE] = 0; |
485 | rl->starved[READ] = rl->starved[WRITE] = 0; | 488 | rl->starved[READ] = rl->starved[WRITE] = 0; |
486 | rl->elvpriv = 0; | 489 | rl->elvpriv = 0; |
487 | init_waitqueue_head(&rl->wait[READ]); | 490 | init_waitqueue_head(&rl->wait[READ]); |
488 | init_waitqueue_head(&rl->wait[WRITE]); | 491 | init_waitqueue_head(&rl->wait[WRITE]); |
489 | 492 | ||
490 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, | 493 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, |
491 | mempool_free_slab, request_cachep, q->node); | 494 | mempool_free_slab, request_cachep, q->node); |
492 | 495 | ||
493 | if (!rl->rq_pool) | 496 | if (!rl->rq_pool) |
494 | return -ENOMEM; | 497 | return -ENOMEM; |
495 | 498 | ||
496 | return 0; | 499 | return 0; |
497 | } | 500 | } |
498 | 501 | ||
499 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) | 502 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) |
500 | { | 503 | { |
501 | return blk_alloc_queue_node(gfp_mask, -1); | 504 | return blk_alloc_queue_node(gfp_mask, -1); |
502 | } | 505 | } |
503 | EXPORT_SYMBOL(blk_alloc_queue); | 506 | EXPORT_SYMBOL(blk_alloc_queue); |
504 | 507 | ||
505 | struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | 508 | struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) |
506 | { | 509 | { |
507 | struct request_queue *q; | 510 | struct request_queue *q; |
508 | int err; | 511 | int err; |
509 | 512 | ||
510 | q = kmem_cache_alloc_node(blk_requestq_cachep, | 513 | q = kmem_cache_alloc_node(blk_requestq_cachep, |
511 | gfp_mask | __GFP_ZERO, node_id); | 514 | gfp_mask | __GFP_ZERO, node_id); |
512 | if (!q) | 515 | if (!q) |
513 | return NULL; | 516 | return NULL; |
514 | 517 | ||
515 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; | 518 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; |
516 | q->backing_dev_info.unplug_io_data = q; | 519 | q->backing_dev_info.unplug_io_data = q; |
517 | err = bdi_init(&q->backing_dev_info); | 520 | err = bdi_init(&q->backing_dev_info); |
518 | if (err) { | 521 | if (err) { |
519 | kmem_cache_free(blk_requestq_cachep, q); | 522 | kmem_cache_free(blk_requestq_cachep, q); |
520 | return NULL; | 523 | return NULL; |
521 | } | 524 | } |
522 | 525 | ||
523 | init_timer(&q->unplug_timer); | 526 | init_timer(&q->unplug_timer); |
524 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 527 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
525 | INIT_LIST_HEAD(&q->timeout_list); | 528 | INIT_LIST_HEAD(&q->timeout_list); |
526 | INIT_WORK(&q->unplug_work, blk_unplug_work); | 529 | INIT_WORK(&q->unplug_work, blk_unplug_work); |
527 | 530 | ||
528 | kobject_init(&q->kobj, &blk_queue_ktype); | 531 | kobject_init(&q->kobj, &blk_queue_ktype); |
529 | 532 | ||
530 | mutex_init(&q->sysfs_lock); | 533 | mutex_init(&q->sysfs_lock); |
531 | spin_lock_init(&q->__queue_lock); | 534 | spin_lock_init(&q->__queue_lock); |
532 | 535 | ||
533 | return q; | 536 | return q; |
534 | } | 537 | } |
535 | EXPORT_SYMBOL(blk_alloc_queue_node); | 538 | EXPORT_SYMBOL(blk_alloc_queue_node); |
536 | 539 | ||
537 | /** | 540 | /** |
538 | * blk_init_queue - prepare a request queue for use with a block device | 541 | * blk_init_queue - prepare a request queue for use with a block device |
539 | * @rfn: The function to be called to process requests that have been | 542 | * @rfn: The function to be called to process requests that have been |
540 | * placed on the queue. | 543 | * placed on the queue. |
541 | * @lock: Request queue spin lock | 544 | * @lock: Request queue spin lock |
542 | * | 545 | * |
543 | * Description: | 546 | * Description: |
544 | * If a block device wishes to use the standard request handling procedures, | 547 | * If a block device wishes to use the standard request handling procedures, |
545 | * which sorts requests and coalesces adjacent requests, then it must | 548 | * which sorts requests and coalesces adjacent requests, then it must |
546 | * call blk_init_queue(). The function @rfn will be called when there | 549 | * call blk_init_queue(). The function @rfn will be called when there |
547 | * are requests on the queue that need to be processed. If the device | 550 | * are requests on the queue that need to be processed. If the device |
548 | * supports plugging, then @rfn may not be called immediately when requests | 551 | * supports plugging, then @rfn may not be called immediately when requests |
549 | * are available on the queue, but may be called at some time later instead. | 552 | * are available on the queue, but may be called at some time later instead. |
550 | * Plugged queues are generally unplugged when a buffer belonging to one | 553 | * Plugged queues are generally unplugged when a buffer belonging to one |
551 | * of the requests on the queue is needed, or due to memory pressure. | 554 | * of the requests on the queue is needed, or due to memory pressure. |
552 | * | 555 | * |
553 | * @rfn is not required, or even expected, to remove all requests off the | 556 | * @rfn is not required, or even expected, to remove all requests off the |
554 | * queue, but only as many as it can handle at a time. If it does leave | 557 | * queue, but only as many as it can handle at a time. If it does leave |
555 | * requests on the queue, it is responsible for arranging that the requests | 558 | * requests on the queue, it is responsible for arranging that the requests |
556 | * get dealt with eventually. | 559 | * get dealt with eventually. |
557 | * | 560 | * |
558 | * The queue spin lock must be held while manipulating the requests on the | 561 | * The queue spin lock must be held while manipulating the requests on the |
559 | * request queue; this lock will be taken also from interrupt context, so irq | 562 | * request queue; this lock will be taken also from interrupt context, so irq |
560 | * disabling is needed for it. | 563 | * disabling is needed for it. |
561 | * | 564 | * |
562 | * Function returns a pointer to the initialized request queue, or %NULL if | 565 | * Function returns a pointer to the initialized request queue, or %NULL if |
563 | * it didn't succeed. | 566 | * it didn't succeed. |
564 | * | 567 | * |
565 | * Note: | 568 | * Note: |
566 | * blk_init_queue() must be paired with a blk_cleanup_queue() call | 569 | * blk_init_queue() must be paired with a blk_cleanup_queue() call |
567 | * when the block device is deactivated (such as at module unload). | 570 | * when the block device is deactivated (such as at module unload). |
568 | **/ | 571 | **/ |
569 | 572 | ||
570 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) | 573 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) |
571 | { | 574 | { |
572 | return blk_init_queue_node(rfn, lock, -1); | 575 | return blk_init_queue_node(rfn, lock, -1); |
573 | } | 576 | } |
574 | EXPORT_SYMBOL(blk_init_queue); | 577 | EXPORT_SYMBOL(blk_init_queue); |
575 | 578 | ||
576 | struct request_queue * | 579 | struct request_queue * |
577 | blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) | 580 | blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) |
578 | { | 581 | { |
579 | struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); | 582 | struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); |
580 | 583 | ||
581 | if (!q) | 584 | if (!q) |
582 | return NULL; | 585 | return NULL; |
583 | 586 | ||
584 | q->node = node_id; | 587 | q->node = node_id; |
585 | if (blk_init_free_list(q)) { | 588 | if (blk_init_free_list(q)) { |
586 | kmem_cache_free(blk_requestq_cachep, q); | 589 | kmem_cache_free(blk_requestq_cachep, q); |
587 | return NULL; | 590 | return NULL; |
588 | } | 591 | } |
589 | 592 | ||
590 | /* | 593 | /* |
591 | * if caller didn't supply a lock, they get per-queue locking with | 594 | * if caller didn't supply a lock, they get per-queue locking with |
592 | * our embedded lock | 595 | * our embedded lock |
593 | */ | 596 | */ |
594 | if (!lock) | 597 | if (!lock) |
595 | lock = &q->__queue_lock; | 598 | lock = &q->__queue_lock; |
596 | 599 | ||
597 | q->request_fn = rfn; | 600 | q->request_fn = rfn; |
598 | q->prep_rq_fn = NULL; | 601 | q->prep_rq_fn = NULL; |
599 | q->unplug_fn = generic_unplug_device; | 602 | q->unplug_fn = generic_unplug_device; |
600 | q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | | 603 | q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | |
601 | 1 << QUEUE_FLAG_STACKABLE); | 604 | 1 << QUEUE_FLAG_STACKABLE); |
602 | q->queue_lock = lock; | 605 | q->queue_lock = lock; |
603 | 606 | ||
604 | blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); | 607 | blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); |
605 | 608 | ||
606 | blk_queue_make_request(q, __make_request); | 609 | blk_queue_make_request(q, __make_request); |
607 | blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); | 610 | blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); |
608 | 611 | ||
609 | blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); | 612 | blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); |
610 | blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); | 613 | blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); |
611 | 614 | ||
612 | q->sg_reserved_size = INT_MAX; | 615 | q->sg_reserved_size = INT_MAX; |
613 | 616 | ||
614 | blk_set_cmd_filter_defaults(&q->cmd_filter); | 617 | blk_set_cmd_filter_defaults(&q->cmd_filter); |
615 | 618 | ||
616 | /* | 619 | /* |
617 | * all done | 620 | * all done |
618 | */ | 621 | */ |
619 | if (!elevator_init(q, NULL)) { | 622 | if (!elevator_init(q, NULL)) { |
620 | blk_queue_congestion_threshold(q); | 623 | blk_queue_congestion_threshold(q); |
621 | return q; | 624 | return q; |
622 | } | 625 | } |
623 | 626 | ||
624 | blk_put_queue(q); | 627 | blk_put_queue(q); |
625 | return NULL; | 628 | return NULL; |
626 | } | 629 | } |
627 | EXPORT_SYMBOL(blk_init_queue_node); | 630 | EXPORT_SYMBOL(blk_init_queue_node); |
628 | 631 | ||
629 | int blk_get_queue(struct request_queue *q) | 632 | int blk_get_queue(struct request_queue *q) |
630 | { | 633 | { |
631 | if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | 634 | if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { |
632 | kobject_get(&q->kobj); | 635 | kobject_get(&q->kobj); |
633 | return 0; | 636 | return 0; |
634 | } | 637 | } |
635 | 638 | ||
636 | return 1; | 639 | return 1; |
637 | } | 640 | } |
638 | 641 | ||
639 | static inline void blk_free_request(struct request_queue *q, struct request *rq) | 642 | static inline void blk_free_request(struct request_queue *q, struct request *rq) |
640 | { | 643 | { |
641 | if (rq->cmd_flags & REQ_ELVPRIV) | 644 | if (rq->cmd_flags & REQ_ELVPRIV) |
642 | elv_put_request(q, rq); | 645 | elv_put_request(q, rq); |
643 | mempool_free(rq, q->rq.rq_pool); | 646 | mempool_free(rq, q->rq.rq_pool); |
644 | } | 647 | } |
645 | 648 | ||
646 | static struct request * | 649 | static struct request * |
647 | blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) | 650 | blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) |
648 | { | 651 | { |
649 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); | 652 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); |
650 | 653 | ||
651 | if (!rq) | 654 | if (!rq) |
652 | return NULL; | 655 | return NULL; |
653 | 656 | ||
654 | blk_rq_init(q, rq); | 657 | blk_rq_init(q, rq); |
655 | 658 | ||
656 | rq->cmd_flags = rw | REQ_ALLOCED; | 659 | rq->cmd_flags = rw | REQ_ALLOCED; |
657 | 660 | ||
658 | if (priv) { | 661 | if (priv) { |
659 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { | 662 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { |
660 | mempool_free(rq, q->rq.rq_pool); | 663 | mempool_free(rq, q->rq.rq_pool); |
661 | return NULL; | 664 | return NULL; |
662 | } | 665 | } |
663 | rq->cmd_flags |= REQ_ELVPRIV; | 666 | rq->cmd_flags |= REQ_ELVPRIV; |
664 | } | 667 | } |
665 | 668 | ||
666 | return rq; | 669 | return rq; |
667 | } | 670 | } |
668 | 671 | ||
669 | /* | 672 | /* |
670 | * ioc_batching returns true if the ioc is a valid batching request and | 673 | * ioc_batching returns true if the ioc is a valid batching request and |
671 | * should be given priority access to a request. | 674 | * should be given priority access to a request. |
672 | */ | 675 | */ |
673 | static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) | 676 | static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) |
674 | { | 677 | { |
675 | if (!ioc) | 678 | if (!ioc) |
676 | return 0; | 679 | return 0; |
677 | 680 | ||
678 | /* | 681 | /* |
679 | * Make sure the process is able to allocate at least 1 request | 682 | * Make sure the process is able to allocate at least 1 request |
680 | * even if the batch times out, otherwise we could theoretically | 683 | * even if the batch times out, otherwise we could theoretically |
681 | * lose wakeups. | 684 | * lose wakeups. |
682 | */ | 685 | */ |
683 | return ioc->nr_batch_requests == q->nr_batching || | 686 | return ioc->nr_batch_requests == q->nr_batching || |
684 | (ioc->nr_batch_requests > 0 | 687 | (ioc->nr_batch_requests > 0 |
685 | && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); | 688 | && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); |
686 | } | 689 | } |
687 | 690 | ||
688 | /* | 691 | /* |
689 | * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This | 692 | * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This |
690 | * will cause the process to be a "batcher" on all queues in the system. This | 693 | * will cause the process to be a "batcher" on all queues in the system. This |
691 | * is the behaviour we want though - once it gets a wakeup it should be given | 694 | * is the behaviour we want though - once it gets a wakeup it should be given |
692 | * a nice run. | 695 | * a nice run. |
693 | */ | 696 | */ |
694 | static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) | 697 | static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) |
695 | { | 698 | { |
696 | if (!ioc || ioc_batching(q, ioc)) | 699 | if (!ioc || ioc_batching(q, ioc)) |
697 | return; | 700 | return; |
698 | 701 | ||
699 | ioc->nr_batch_requests = q->nr_batching; | 702 | ioc->nr_batch_requests = q->nr_batching; |
700 | ioc->last_waited = jiffies; | 703 | ioc->last_waited = jiffies; |
701 | } | 704 | } |
702 | 705 | ||
703 | static void __freed_request(struct request_queue *q, int rw) | 706 | static void __freed_request(struct request_queue *q, int rw) |
704 | { | 707 | { |
705 | struct request_list *rl = &q->rq; | 708 | struct request_list *rl = &q->rq; |
706 | 709 | ||
707 | if (rl->count[rw] < queue_congestion_off_threshold(q)) | 710 | if (rl->count[rw] < queue_congestion_off_threshold(q)) |
708 | blk_clear_queue_congested(q, rw); | 711 | blk_clear_queue_congested(q, rw); |
709 | 712 | ||
710 | if (rl->count[rw] + 1 <= q->nr_requests) { | 713 | if (rl->count[rw] + 1 <= q->nr_requests) { |
711 | if (waitqueue_active(&rl->wait[rw])) | 714 | if (waitqueue_active(&rl->wait[rw])) |
712 | wake_up(&rl->wait[rw]); | 715 | wake_up(&rl->wait[rw]); |
713 | 716 | ||
714 | blk_clear_queue_full(q, rw); | 717 | blk_clear_queue_full(q, rw); |
715 | } | 718 | } |
716 | } | 719 | } |
717 | 720 | ||
718 | /* | 721 | /* |
719 | * A request has just been released. Account for it, update the full and | 722 | * A request has just been released. Account for it, update the full and |
720 | * congestion status, wake up any waiters. Called under q->queue_lock. | 723 | * congestion status, wake up any waiters. Called under q->queue_lock. |
721 | */ | 724 | */ |
722 | static void freed_request(struct request_queue *q, int rw, int priv) | 725 | static void freed_request(struct request_queue *q, int rw, int priv) |
723 | { | 726 | { |
724 | struct request_list *rl = &q->rq; | 727 | struct request_list *rl = &q->rq; |
725 | 728 | ||
726 | rl->count[rw]--; | 729 | rl->count[rw]--; |
727 | if (priv) | 730 | if (priv) |
728 | rl->elvpriv--; | 731 | rl->elvpriv--; |
729 | 732 | ||
730 | __freed_request(q, rw); | 733 | __freed_request(q, rw); |
731 | 734 | ||
732 | if (unlikely(rl->starved[rw ^ 1])) | 735 | if (unlikely(rl->starved[rw ^ 1])) |
733 | __freed_request(q, rw ^ 1); | 736 | __freed_request(q, rw ^ 1); |
734 | } | 737 | } |
735 | 738 | ||
736 | #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) | 739 | #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) |
737 | /* | 740 | /* |
738 | * Get a free request, queue_lock must be held. | 741 | * Get a free request, queue_lock must be held. |
739 | * Returns NULL on failure, with queue_lock held. | 742 | * Returns NULL on failure, with queue_lock held. |
740 | * Returns !NULL on success, with queue_lock *not held*. | 743 | * Returns !NULL on success, with queue_lock *not held*. |
741 | */ | 744 | */ |
742 | static struct request *get_request(struct request_queue *q, int rw_flags, | 745 | static struct request *get_request(struct request_queue *q, int rw_flags, |
743 | struct bio *bio, gfp_t gfp_mask) | 746 | struct bio *bio, gfp_t gfp_mask) |
744 | { | 747 | { |
745 | struct request *rq = NULL; | 748 | struct request *rq = NULL; |
746 | struct request_list *rl = &q->rq; | 749 | struct request_list *rl = &q->rq; |
747 | struct io_context *ioc = NULL; | 750 | struct io_context *ioc = NULL; |
748 | const int rw = rw_flags & 0x01; | 751 | const int rw = rw_flags & 0x01; |
749 | int may_queue, priv; | 752 | int may_queue, priv; |
750 | 753 | ||
751 | may_queue = elv_may_queue(q, rw_flags); | 754 | may_queue = elv_may_queue(q, rw_flags); |
752 | if (may_queue == ELV_MQUEUE_NO) | 755 | if (may_queue == ELV_MQUEUE_NO) |
753 | goto rq_starved; | 756 | goto rq_starved; |
754 | 757 | ||
755 | if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { | 758 | if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { |
756 | if (rl->count[rw]+1 >= q->nr_requests) { | 759 | if (rl->count[rw]+1 >= q->nr_requests) { |
757 | ioc = current_io_context(GFP_ATOMIC, q->node); | 760 | ioc = current_io_context(GFP_ATOMIC, q->node); |
758 | /* | 761 | /* |
759 | * The queue will fill after this allocation, so set | 762 | * The queue will fill after this allocation, so set |
760 | * it as full, and mark this process as "batching". | 763 | * it as full, and mark this process as "batching". |
761 | * This process will be allowed to complete a batch of | 764 | * This process will be allowed to complete a batch of |
762 | * requests, others will be blocked. | 765 | * requests, others will be blocked. |
763 | */ | 766 | */ |
764 | if (!blk_queue_full(q, rw)) { | 767 | if (!blk_queue_full(q, rw)) { |
765 | ioc_set_batching(q, ioc); | 768 | ioc_set_batching(q, ioc); |
766 | blk_set_queue_full(q, rw); | 769 | blk_set_queue_full(q, rw); |
767 | } else { | 770 | } else { |
768 | if (may_queue != ELV_MQUEUE_MUST | 771 | if (may_queue != ELV_MQUEUE_MUST |
769 | && !ioc_batching(q, ioc)) { | 772 | && !ioc_batching(q, ioc)) { |
770 | /* | 773 | /* |
771 | * The queue is full and the allocating | 774 | * The queue is full and the allocating |
772 | * process is not a "batcher", and not | 775 | * process is not a "batcher", and not |
773 | * exempted by the IO scheduler | 776 | * exempted by the IO scheduler |
774 | */ | 777 | */ |
775 | goto out; | 778 | goto out; |
776 | } | 779 | } |
777 | } | 780 | } |
778 | } | 781 | } |
779 | blk_set_queue_congested(q, rw); | 782 | blk_set_queue_congested(q, rw); |
780 | } | 783 | } |
781 | 784 | ||
782 | /* | 785 | /* |
783 | * Only allow batching queuers to allocate up to 50% over the defined | 786 | * Only allow batching queuers to allocate up to 50% over the defined |
784 | * limit of requests, otherwise we could have thousands of requests | 787 | * limit of requests, otherwise we could have thousands of requests |
785 | * allocated with any setting of ->nr_requests | 788 | * allocated with any setting of ->nr_requests |
786 | */ | 789 | */ |
787 | if (rl->count[rw] >= (3 * q->nr_requests / 2)) | 790 | if (rl->count[rw] >= (3 * q->nr_requests / 2)) |
788 | goto out; | 791 | goto out; |
789 | 792 | ||
790 | rl->count[rw]++; | 793 | rl->count[rw]++; |
791 | rl->starved[rw] = 0; | 794 | rl->starved[rw] = 0; |
792 | 795 | ||
793 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); | 796 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
794 | if (priv) | 797 | if (priv) |
795 | rl->elvpriv++; | 798 | rl->elvpriv++; |
796 | 799 | ||
797 | spin_unlock_irq(q->queue_lock); | 800 | spin_unlock_irq(q->queue_lock); |
798 | 801 | ||
799 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); | 802 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); |
800 | if (unlikely(!rq)) { | 803 | if (unlikely(!rq)) { |
801 | /* | 804 | /* |
802 | * Allocation failed presumably due to memory. Undo anything | 805 | * Allocation failed presumably due to memory. Undo anything |
803 | * we might have messed up. | 806 | * we might have messed up. |
804 | * | 807 | * |
805 | * Allocating task should really be put onto the front of the | 808 | * Allocating task should really be put onto the front of the |
806 | * wait queue, but this is pretty rare. | 809 | * wait queue, but this is pretty rare. |
807 | */ | 810 | */ |
808 | spin_lock_irq(q->queue_lock); | 811 | spin_lock_irq(q->queue_lock); |
809 | freed_request(q, rw, priv); | 812 | freed_request(q, rw, priv); |
810 | 813 | ||
811 | /* | 814 | /* |
812 | * in the very unlikely event that allocation failed and no | 815 | * in the very unlikely event that allocation failed and no |
813 | * requests for this direction was pending, mark us starved | 816 | * requests for this direction was pending, mark us starved |
814 | * so that freeing of a request in the other direction will | 817 | * so that freeing of a request in the other direction will |
815 | * notice us. another possible fix would be to split the | 818 | * notice us. another possible fix would be to split the |
816 | * rq mempool into READ and WRITE | 819 | * rq mempool into READ and WRITE |
817 | */ | 820 | */ |
818 | rq_starved: | 821 | rq_starved: |
819 | if (unlikely(rl->count[rw] == 0)) | 822 | if (unlikely(rl->count[rw] == 0)) |
820 | rl->starved[rw] = 1; | 823 | rl->starved[rw] = 1; |
821 | 824 | ||
822 | goto out; | 825 | goto out; |
823 | } | 826 | } |
824 | 827 | ||
825 | /* | 828 | /* |
826 | * ioc may be NULL here, and ioc_batching will be false. That's | 829 | * ioc may be NULL here, and ioc_batching will be false. That's |
827 | * OK, if the queue is under the request limit then requests need | 830 | * OK, if the queue is under the request limit then requests need |
828 | * not count toward the nr_batch_requests limit. There will always | 831 | * not count toward the nr_batch_requests limit. There will always |
829 | * be some limit enforced by BLK_BATCH_TIME. | 832 | * be some limit enforced by BLK_BATCH_TIME. |
830 | */ | 833 | */ |
831 | if (ioc_batching(q, ioc)) | 834 | if (ioc_batching(q, ioc)) |
832 | ioc->nr_batch_requests--; | 835 | ioc->nr_batch_requests--; |
833 | 836 | ||
834 | trace_block_getrq(q, bio, rw); | 837 | trace_block_getrq(q, bio, rw); |
835 | out: | 838 | out: |
836 | return rq; | 839 | return rq; |
837 | } | 840 | } |
838 | 841 | ||
839 | /* | 842 | /* |
840 | * No available requests for this queue, unplug the device and wait for some | 843 | * No available requests for this queue, unplug the device and wait for some |
841 | * requests to become available. | 844 | * requests to become available. |
842 | * | 845 | * |
843 | * Called with q->queue_lock held, and returns with it unlocked. | 846 | * Called with q->queue_lock held, and returns with it unlocked. |
844 | */ | 847 | */ |
845 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, | 848 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, |
846 | struct bio *bio) | 849 | struct bio *bio) |
847 | { | 850 | { |
848 | const int rw = rw_flags & 0x01; | 851 | const int rw = rw_flags & 0x01; |
849 | struct request *rq; | 852 | struct request *rq; |
850 | 853 | ||
851 | rq = get_request(q, rw_flags, bio, GFP_NOIO); | 854 | rq = get_request(q, rw_flags, bio, GFP_NOIO); |
852 | while (!rq) { | 855 | while (!rq) { |
853 | DEFINE_WAIT(wait); | 856 | DEFINE_WAIT(wait); |
854 | struct io_context *ioc; | 857 | struct io_context *ioc; |
855 | struct request_list *rl = &q->rq; | 858 | struct request_list *rl = &q->rq; |
856 | 859 | ||
857 | prepare_to_wait_exclusive(&rl->wait[rw], &wait, | 860 | prepare_to_wait_exclusive(&rl->wait[rw], &wait, |
858 | TASK_UNINTERRUPTIBLE); | 861 | TASK_UNINTERRUPTIBLE); |
859 | 862 | ||
860 | trace_block_sleeprq(q, bio, rw); | 863 | trace_block_sleeprq(q, bio, rw); |
861 | 864 | ||
862 | __generic_unplug_device(q); | 865 | __generic_unplug_device(q); |
863 | spin_unlock_irq(q->queue_lock); | 866 | spin_unlock_irq(q->queue_lock); |
864 | io_schedule(); | 867 | io_schedule(); |
865 | 868 | ||
866 | /* | 869 | /* |
867 | * After sleeping, we become a "batching" process and | 870 | * After sleeping, we become a "batching" process and |
868 | * will be able to allocate at least one request, and | 871 | * will be able to allocate at least one request, and |
869 | * up to a big batch of them for a small period of time. | 872 | * up to a big batch of them for a small period of time. |
870 | * See ioc_batching, ioc_set_batching | 873 | * See ioc_batching, ioc_set_batching |
871 | */ | 874 | */ |
872 | ioc = current_io_context(GFP_NOIO, q->node); | 875 | ioc = current_io_context(GFP_NOIO, q->node); |
873 | ioc_set_batching(q, ioc); | 876 | ioc_set_batching(q, ioc); |
874 | 877 | ||
875 | spin_lock_irq(q->queue_lock); | 878 | spin_lock_irq(q->queue_lock); |
876 | finish_wait(&rl->wait[rw], &wait); | 879 | finish_wait(&rl->wait[rw], &wait); |
877 | 880 | ||
878 | rq = get_request(q, rw_flags, bio, GFP_NOIO); | 881 | rq = get_request(q, rw_flags, bio, GFP_NOIO); |
879 | } | 882 | } |
880 | 883 | ||
881 | return rq; | 884 | return rq; |
882 | } | 885 | } |
883 | 886 | ||
884 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | 887 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) |
885 | { | 888 | { |
886 | struct request *rq; | 889 | struct request *rq; |
887 | 890 | ||
888 | BUG_ON(rw != READ && rw != WRITE); | 891 | BUG_ON(rw != READ && rw != WRITE); |
889 | 892 | ||
890 | spin_lock_irq(q->queue_lock); | 893 | spin_lock_irq(q->queue_lock); |
891 | if (gfp_mask & __GFP_WAIT) { | 894 | if (gfp_mask & __GFP_WAIT) { |
892 | rq = get_request_wait(q, rw, NULL); | 895 | rq = get_request_wait(q, rw, NULL); |
893 | } else { | 896 | } else { |
894 | rq = get_request(q, rw, NULL, gfp_mask); | 897 | rq = get_request(q, rw, NULL, gfp_mask); |
895 | if (!rq) | 898 | if (!rq) |
896 | spin_unlock_irq(q->queue_lock); | 899 | spin_unlock_irq(q->queue_lock); |
897 | } | 900 | } |
898 | /* q->queue_lock is unlocked at this point */ | 901 | /* q->queue_lock is unlocked at this point */ |
899 | 902 | ||
900 | return rq; | 903 | return rq; |
901 | } | 904 | } |
902 | EXPORT_SYMBOL(blk_get_request); | 905 | EXPORT_SYMBOL(blk_get_request); |
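blk_get_request() above is the exported wrapper around get_request()/get_request_wait(): with __GFP_WAIT in the mask it may sleep until a request frees up, otherwise it can return NULL. A minimal sketch of the allocate/use/release pattern a driver might follow (the function name is hypothetical; blk_put_request(), shown further down, returns the request to the pool):

/*
 * Sketch only, not part of this patch: allocate a request, mark it as a
 * driver-special command, then release it again.  Assumes the caller
 * already owns a live request_queue.
 */
static int example_alloc_and_free(struct request_queue *q)
{
        struct request *rq;

        rq = blk_get_request(q, READ, GFP_KERNEL);      /* __GFP_WAIT set: may sleep */
        if (!rq)
                return -ENOMEM;

        rq->cmd_type = REQ_TYPE_SPECIAL;
        /* ... fill in driver-private command data here ... */

        blk_put_request(rq);    /* drops the reference; frees via freed_request() */
        return 0;
}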
903 | 906 | ||
904 | /** | 907 | /** |
905 | * blk_start_queueing - initiate dispatch of requests to device | 908 | * blk_start_queueing - initiate dispatch of requests to device |
906 | * @q: request queue to kick into gear | 909 | * @q: request queue to kick into gear |
907 | * | 910 | * |
908 | * This is basically a helper to remove the need to know whether a queue | 911 | * This is basically a helper to remove the need to know whether a queue |
909 | * is plugged or not if someone just wants to initiate dispatch of requests | 912 | * is plugged or not if someone just wants to initiate dispatch of requests |
910 | * for this queue. Should be used to start queueing on a device outside | 913 | * for this queue. Should be used to start queueing on a device outside |
911 | * of ->request_fn() context. Also see @blk_run_queue. | 914 | * of ->request_fn() context. Also see @blk_run_queue. |
912 | * | 915 | * |
913 | * The queue lock must be held with interrupts disabled. | 916 | * The queue lock must be held with interrupts disabled. |
914 | */ | 917 | */ |
915 | void blk_start_queueing(struct request_queue *q) | 918 | void blk_start_queueing(struct request_queue *q) |
916 | { | 919 | { |
917 | if (!blk_queue_plugged(q)) { | 920 | if (!blk_queue_plugged(q)) { |
918 | if (unlikely(blk_queue_stopped(q))) | 921 | if (unlikely(blk_queue_stopped(q))) |
919 | return; | 922 | return; |
920 | q->request_fn(q); | 923 | q->request_fn(q); |
921 | } else | 924 | } else |
922 | __generic_unplug_device(q); | 925 | __generic_unplug_device(q); |
923 | } | 926 | } |
924 | EXPORT_SYMBOL(blk_start_queueing); | 927 | EXPORT_SYMBOL(blk_start_queueing); |
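The kernel-doc above requires the queue lock to be held with interrupts disabled; a small hedged sketch of kicking dispatch from outside ->request_fn() context (the wrapper name is made up for illustration):

static void example_kick_dispatch(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_start_queueing(q);          /* runs ->request_fn() or unplugs the queue */
        spin_unlock_irqrestore(q->queue_lock, flags);
}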
925 | 928 | ||
926 | /** | 929 | /** |
927 | * blk_requeue_request - put a request back on queue | 930 | * blk_requeue_request - put a request back on queue |
928 | * @q: request queue where request should be inserted | 931 | * @q: request queue where request should be inserted |
929 | * @rq: request to be inserted | 932 | * @rq: request to be inserted |
930 | * | 933 | * |
931 | * Description: | 934 | * Description: |
932 | * Drivers often keep queueing requests until the hardware cannot accept | 935 | * Drivers often keep queueing requests until the hardware cannot accept |
933 | * more; when that condition happens we need to put the request back | 936 | * more; when that condition happens we need to put the request back |
934 | * on the queue. Must be called with queue lock held. | 937 | * on the queue. Must be called with queue lock held. |
935 | */ | 938 | */ |
936 | void blk_requeue_request(struct request_queue *q, struct request *rq) | 939 | void blk_requeue_request(struct request_queue *q, struct request *rq) |
937 | { | 940 | { |
938 | blk_delete_timer(rq); | 941 | blk_delete_timer(rq); |
939 | blk_clear_rq_complete(rq); | 942 | blk_clear_rq_complete(rq); |
940 | trace_block_rq_requeue(q, rq); | 943 | trace_block_rq_requeue(q, rq); |
941 | 944 | ||
942 | if (blk_rq_tagged(rq)) | 945 | if (blk_rq_tagged(rq)) |
943 | blk_queue_end_tag(q, rq); | 946 | blk_queue_end_tag(q, rq); |
944 | 947 | ||
945 | elv_requeue_request(q, rq); | 948 | elv_requeue_request(q, rq); |
946 | } | 949 | } |
947 | EXPORT_SYMBOL(blk_requeue_request); | 950 | EXPORT_SYMBOL(blk_requeue_request); |
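blk_requeue_request() is the back-off path the comment describes: when the hardware refuses more work, the driver puts the request back and usually stops the queue until a completion restarts it. A rough sketch of that pattern inside a driver's request function; hw_queue_rq() is a hypothetical hardware hook, while elv_next_request() and blkdev_dequeue_request() (documented later in this file) are the era's fetch and hand-off helpers:

/* Called with q->queue_lock held, as ->request_fn() always is. */
static void example_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                blkdev_dequeue_request(rq);     /* take it off the queue, arm timeout */

                if (hw_queue_rq(rq) == -EBUSY) {
                        /* Device full: put it back and wait for a completion. */
                        blk_requeue_request(q, rq);
                        blk_stop_queue(q);
                        break;
                }
        }
}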
948 | 951 | ||
949 | /** | 952 | /** |
950 | * blk_insert_request - insert a special request into a request queue | 953 | * blk_insert_request - insert a special request into a request queue |
951 | * @q: request queue where request should be inserted | 954 | * @q: request queue where request should be inserted |
952 | * @rq: request to be inserted | 955 | * @rq: request to be inserted |
953 | * @at_head: insert request at head or tail of queue | 956 | * @at_head: insert request at head or tail of queue |
954 | * @data: private data | 957 | * @data: private data |
955 | * | 958 | * |
956 | * Description: | 959 | * Description: |
957 | * Many block devices need to execute commands asynchronously, so they don't | 960 | * Many block devices need to execute commands asynchronously, so they don't |
958 | * block the whole kernel from preemption during request execution. This is | 961 | * block the whole kernel from preemption during request execution. This is |
959 | * accomplished normally by inserting artificial requests tagged as | 962 | * accomplished normally by inserting artificial requests tagged as |
960 | * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them | 963 | * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them |
961 | * be scheduled for actual execution by the request queue. | 964 | * be scheduled for actual execution by the request queue. |
962 | * | 965 | * |
963 | * We have the option of inserting the head or the tail of the queue. | 966 | * We have the option of inserting the head or the tail of the queue. |
964 | * Typically we use the tail for new ioctls and so forth. We use the head | 967 | * Typically we use the tail for new ioctls and so forth. We use the head |
965 | * of the queue for things like a QUEUE_FULL message from a device, or a | 968 | * of the queue for things like a QUEUE_FULL message from a device, or a |
966 | * host that is unable to accept a particular command. | 969 | * host that is unable to accept a particular command. |
967 | */ | 970 | */ |
968 | void blk_insert_request(struct request_queue *q, struct request *rq, | 971 | void blk_insert_request(struct request_queue *q, struct request *rq, |
969 | int at_head, void *data) | 972 | int at_head, void *data) |
970 | { | 973 | { |
971 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | 974 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; |
972 | unsigned long flags; | 975 | unsigned long flags; |
973 | 976 | ||
974 | /* | 977 | /* |
975 | * tell I/O scheduler that this isn't a regular read/write (ie it | 978 | * tell I/O scheduler that this isn't a regular read/write (ie it |
976 | * must not attempt merges on this) and that it acts as a soft | 979 | * must not attempt merges on this) and that it acts as a soft |
977 | * barrier | 980 | * barrier |
978 | */ | 981 | */ |
979 | rq->cmd_type = REQ_TYPE_SPECIAL; | 982 | rq->cmd_type = REQ_TYPE_SPECIAL; |
980 | rq->cmd_flags |= REQ_SOFTBARRIER; | 983 | rq->cmd_flags |= REQ_SOFTBARRIER; |
981 | 984 | ||
982 | rq->special = data; | 985 | rq->special = data; |
983 | 986 | ||
984 | spin_lock_irqsave(q->queue_lock, flags); | 987 | spin_lock_irqsave(q->queue_lock, flags); |
985 | 988 | ||
986 | /* | 989 | /* |
987 | * If command is tagged, release the tag | 990 | * If command is tagged, release the tag |
988 | */ | 991 | */ |
989 | if (blk_rq_tagged(rq)) | 992 | if (blk_rq_tagged(rq)) |
990 | blk_queue_end_tag(q, rq); | 993 | blk_queue_end_tag(q, rq); |
991 | 994 | ||
992 | drive_stat_acct(rq, 1); | 995 | drive_stat_acct(rq, 1); |
993 | __elv_add_request(q, rq, where, 0); | 996 | __elv_add_request(q, rq, where, 0); |
994 | blk_start_queueing(q); | 997 | blk_start_queueing(q); |
995 | spin_unlock_irqrestore(q->queue_lock, flags); | 998 | spin_unlock_irqrestore(q->queue_lock, flags); |
996 | } | 999 | } |
997 | EXPORT_SYMBOL(blk_insert_request); | 1000 | EXPORT_SYMBOL(blk_insert_request); |
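As described, blk_insert_request() queues a pre-built special request either at the tail (ioctl-style commands) or at the head (e.g. after a QUEUE_FULL message). A hedged sketch of the tail-insert case; the private data pointer is a placeholder:

static void example_insert_special(struct request_queue *q, void *drv_data)
{
        struct request *rq;

        rq = blk_get_request(q, READ, GFP_KERNEL);
        if (!rq)
                return;

        /*
         * blk_insert_request() marks the request REQ_TYPE_SPECIAL and a soft
         * barrier itself; at_head = 0 appends, 1 would push it to the front.
         */
        blk_insert_request(q, rq, 0, drv_data);
}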
998 | 1001 | ||
999 | /* | 1002 | /* |
1000 | * add-request adds a request to the linked list. | 1003 | * add-request adds a request to the linked list. |
1001 | * queue lock is held and interrupts disabled, as we muck with the | 1004 | * queue lock is held and interrupts disabled, as we muck with the |
1002 | * request queue list. | 1005 | * request queue list. |
1003 | */ | 1006 | */ |
1004 | static inline void add_request(struct request_queue *q, struct request *req) | 1007 | static inline void add_request(struct request_queue *q, struct request *req) |
1005 | { | 1008 | { |
1006 | drive_stat_acct(req, 1); | 1009 | drive_stat_acct(req, 1); |
1007 | 1010 | ||
1008 | /* | 1011 | /* |
1009 | * elevator indicated where it wants this request to be | 1012 | * elevator indicated where it wants this request to be |
1010 | * inserted at elevator_merge time | 1013 | * inserted at elevator_merge time |
1011 | */ | 1014 | */ |
1012 | __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); | 1015 | __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); |
1013 | } | 1016 | } |
1014 | 1017 | ||
1015 | static void part_round_stats_single(int cpu, struct hd_struct *part, | 1018 | static void part_round_stats_single(int cpu, struct hd_struct *part, |
1016 | unsigned long now) | 1019 | unsigned long now) |
1017 | { | 1020 | { |
1018 | if (now == part->stamp) | 1021 | if (now == part->stamp) |
1019 | return; | 1022 | return; |
1020 | 1023 | ||
1021 | if (part->in_flight) { | 1024 | if (part->in_flight) { |
1022 | __part_stat_add(cpu, part, time_in_queue, | 1025 | __part_stat_add(cpu, part, time_in_queue, |
1023 | part->in_flight * (now - part->stamp)); | 1026 | part->in_flight * (now - part->stamp)); |
1024 | __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); | 1027 | __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); |
1025 | } | 1028 | } |
1026 | part->stamp = now; | 1029 | part->stamp = now; |
1027 | } | 1030 | } |
1028 | 1031 | ||
1029 | /** | 1032 | /** |
1030 | * part_round_stats() - Round off the performance stats on a struct disk_stats. | 1033 | * part_round_stats() - Round off the performance stats on a struct disk_stats. |
1031 | * @cpu: cpu number for stats access | 1034 | * @cpu: cpu number for stats access |
1032 | * @part: target partition | 1035 | * @part: target partition |
1033 | * | 1036 | * |
1034 | * The average IO queue length and utilisation statistics are maintained | 1037 | * The average IO queue length and utilisation statistics are maintained |
1035 | * by observing the current state of the queue length and the amount of | 1038 | * by observing the current state of the queue length and the amount of |
1036 | * time it has been in this state for. | 1039 | * time it has been in this state for. |
1037 | * | 1040 | * |
1038 | * Normally, that accounting is done on IO completion, but that can result | 1041 | * Normally, that accounting is done on IO completion, but that can result |
1039 | * in more than a second's worth of IO being accounted for within any one | 1042 | * in more than a second's worth of IO being accounted for within any one |
1040 | * second, leading to >100% utilisation. To deal with that, we call this | 1043 | * second, leading to >100% utilisation. To deal with that, we call this |
1041 | * function to do a round-off before returning the results when reading | 1044 | * function to do a round-off before returning the results when reading |
1042 | * /proc/diskstats. This accounts immediately for all queue usage up to | 1045 | * /proc/diskstats. This accounts immediately for all queue usage up to |
1043 | * the current jiffies and restarts the counters again. | 1046 | * the current jiffies and restarts the counters again. |
1044 | */ | 1047 | */ |
1045 | void part_round_stats(int cpu, struct hd_struct *part) | 1048 | void part_round_stats(int cpu, struct hd_struct *part) |
1046 | { | 1049 | { |
1047 | unsigned long now = jiffies; | 1050 | unsigned long now = jiffies; |
1048 | 1051 | ||
1049 | if (part->partno) | 1052 | if (part->partno) |
1050 | part_round_stats_single(cpu, &part_to_disk(part)->part0, now); | 1053 | part_round_stats_single(cpu, &part_to_disk(part)->part0, now); |
1051 | part_round_stats_single(cpu, part, now); | 1054 | part_round_stats_single(cpu, part, now); |
1052 | } | 1055 | } |
1053 | EXPORT_SYMBOL_GPL(part_round_stats); | 1056 | EXPORT_SYMBOL_GPL(part_round_stats); |
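The rounding described above is what keeps reported utilisation at or below 100% between samples; readers of the statistics call it just before copying the counters out. A sketch of that read-side pattern, loosely modelled on the /proc/diskstats path (part_stat_lock()/part_stat_unlock() are assumed to be available, as they are elsewhere in this file):

static void example_sample_part_stats(struct hd_struct *part)
{
        int cpu;

        cpu = part_stat_lock();         /* pins a CPU for the per-cpu counters */
        part_round_stats(cpu, part);    /* account in-flight time up to 'now' */
        part_stat_unlock();

        /* ... the caller can now read part_stat_read(part, ios[READ]) etc. ... */
}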
1054 | 1057 | ||
1055 | /* | 1058 | /* |
1056 | * queue lock must be held | 1059 | * queue lock must be held |
1057 | */ | 1060 | */ |
1058 | void __blk_put_request(struct request_queue *q, struct request *req) | 1061 | void __blk_put_request(struct request_queue *q, struct request *req) |
1059 | { | 1062 | { |
1060 | if (unlikely(!q)) | 1063 | if (unlikely(!q)) |
1061 | return; | 1064 | return; |
1062 | if (unlikely(--req->ref_count)) | 1065 | if (unlikely(--req->ref_count)) |
1063 | return; | 1066 | return; |
1064 | 1067 | ||
1065 | elv_completed_request(q, req); | 1068 | elv_completed_request(q, req); |
1066 | 1069 | ||
1067 | /* | 1070 | /* |
1068 | * Request may not have originated from ll_rw_blk. if not, | 1071 | * Request may not have originated from ll_rw_blk. if not, |
1069 | * it didn't come out of our reserved rq pools | 1072 | * it didn't come out of our reserved rq pools |
1070 | */ | 1073 | */ |
1071 | if (req->cmd_flags & REQ_ALLOCED) { | 1074 | if (req->cmd_flags & REQ_ALLOCED) { |
1072 | int rw = rq_data_dir(req); | 1075 | int rw = rq_data_dir(req); |
1073 | int priv = req->cmd_flags & REQ_ELVPRIV; | 1076 | int priv = req->cmd_flags & REQ_ELVPRIV; |
1074 | 1077 | ||
1075 | BUG_ON(!list_empty(&req->queuelist)); | 1078 | BUG_ON(!list_empty(&req->queuelist)); |
1076 | BUG_ON(!hlist_unhashed(&req->hash)); | 1079 | BUG_ON(!hlist_unhashed(&req->hash)); |
1077 | 1080 | ||
1078 | blk_free_request(q, req); | 1081 | blk_free_request(q, req); |
1079 | freed_request(q, rw, priv); | 1082 | freed_request(q, rw, priv); |
1080 | } | 1083 | } |
1081 | } | 1084 | } |
1082 | EXPORT_SYMBOL_GPL(__blk_put_request); | 1085 | EXPORT_SYMBOL_GPL(__blk_put_request); |
1083 | 1086 | ||
1084 | void blk_put_request(struct request *req) | 1087 | void blk_put_request(struct request *req) |
1085 | { | 1088 | { |
1086 | unsigned long flags; | 1089 | unsigned long flags; |
1087 | struct request_queue *q = req->q; | 1090 | struct request_queue *q = req->q; |
1088 | 1091 | ||
1089 | spin_lock_irqsave(q->queue_lock, flags); | 1092 | spin_lock_irqsave(q->queue_lock, flags); |
1090 | __blk_put_request(q, req); | 1093 | __blk_put_request(q, req); |
1091 | spin_unlock_irqrestore(q->queue_lock, flags); | 1094 | spin_unlock_irqrestore(q->queue_lock, flags); |
1092 | } | 1095 | } |
1093 | EXPORT_SYMBOL(blk_put_request); | 1096 | EXPORT_SYMBOL(blk_put_request); |
1094 | 1097 | ||
1095 | void init_request_from_bio(struct request *req, struct bio *bio) | 1098 | void init_request_from_bio(struct request *req, struct bio *bio) |
1096 | { | 1099 | { |
1097 | req->cpu = bio->bi_comp_cpu; | 1100 | req->cpu = bio->bi_comp_cpu; |
1098 | req->cmd_type = REQ_TYPE_FS; | 1101 | req->cmd_type = REQ_TYPE_FS; |
1099 | 1102 | ||
1100 | /* | 1103 | /* |
1101 | * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) | 1104 | * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) |
1102 | */ | 1105 | */ |
1103 | if (bio_rw_ahead(bio)) | 1106 | if (bio_rw_ahead(bio)) |
1104 | req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | | 1107 | req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | |
1105 | REQ_FAILFAST_DRIVER); | 1108 | REQ_FAILFAST_DRIVER); |
1106 | if (bio_failfast_dev(bio)) | 1109 | if (bio_failfast_dev(bio)) |
1107 | req->cmd_flags |= REQ_FAILFAST_DEV; | 1110 | req->cmd_flags |= REQ_FAILFAST_DEV; |
1108 | if (bio_failfast_transport(bio)) | 1111 | if (bio_failfast_transport(bio)) |
1109 | req->cmd_flags |= REQ_FAILFAST_TRANSPORT; | 1112 | req->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
1110 | if (bio_failfast_driver(bio)) | 1113 | if (bio_failfast_driver(bio)) |
1111 | req->cmd_flags |= REQ_FAILFAST_DRIVER; | 1114 | req->cmd_flags |= REQ_FAILFAST_DRIVER; |
1112 | 1115 | ||
1113 | /* | 1116 | /* |
1114 | * REQ_BARRIER implies no merging, but let's make it explicit | 1117 | * REQ_BARRIER implies no merging, but let's make it explicit |
1115 | */ | 1118 | */ |
1116 | if (unlikely(bio_discard(bio))) { | 1119 | if (unlikely(bio_discard(bio))) { |
1117 | req->cmd_flags |= REQ_DISCARD; | 1120 | req->cmd_flags |= REQ_DISCARD; |
1118 | if (bio_barrier(bio)) | 1121 | if (bio_barrier(bio)) |
1119 | req->cmd_flags |= REQ_SOFTBARRIER; | 1122 | req->cmd_flags |= REQ_SOFTBARRIER; |
1120 | req->q->prepare_discard_fn(req->q, req); | 1123 | req->q->prepare_discard_fn(req->q, req); |
1121 | } else if (unlikely(bio_barrier(bio))) | 1124 | } else if (unlikely(bio_barrier(bio))) |
1122 | req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); | 1125 | req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); |
1123 | 1126 | ||
1124 | if (bio_sync(bio)) | 1127 | if (bio_sync(bio)) |
1125 | req->cmd_flags |= REQ_RW_SYNC; | 1128 | req->cmd_flags |= REQ_RW_SYNC; |
1126 | if (bio_rw_meta(bio)) | 1129 | if (bio_rw_meta(bio)) |
1127 | req->cmd_flags |= REQ_RW_META; | 1130 | req->cmd_flags |= REQ_RW_META; |
1128 | 1131 | ||
1129 | req->errors = 0; | 1132 | req->errors = 0; |
1130 | req->hard_sector = req->sector = bio->bi_sector; | 1133 | req->hard_sector = req->sector = bio->bi_sector; |
1131 | req->ioprio = bio_prio(bio); | 1134 | req->ioprio = bio_prio(bio); |
1132 | req->start_time = jiffies; | 1135 | req->start_time = jiffies; |
1133 | blk_rq_bio_prep(req->q, req, bio); | 1136 | blk_rq_bio_prep(req->q, req, bio); |
1134 | } | 1137 | } |
1135 | 1138 | ||
1136 | static int __make_request(struct request_queue *q, struct bio *bio) | 1139 | static int __make_request(struct request_queue *q, struct bio *bio) |
1137 | { | 1140 | { |
1138 | struct request *req; | 1141 | struct request *req; |
1139 | int el_ret, nr_sectors, barrier, discard, err; | 1142 | int el_ret, nr_sectors, barrier, discard, err; |
1140 | const unsigned short prio = bio_prio(bio); | 1143 | const unsigned short prio = bio_prio(bio); |
1141 | const int sync = bio_sync(bio); | 1144 | const int sync = bio_sync(bio); |
1142 | int rw_flags; | 1145 | int rw_flags; |
1143 | 1146 | ||
1144 | nr_sectors = bio_sectors(bio); | 1147 | nr_sectors = bio_sectors(bio); |
1145 | 1148 | ||
1146 | /* | 1149 | /* |
1147 | * low level driver can indicate that it wants pages above a | 1150 | * low level driver can indicate that it wants pages above a |
1148 | * certain limit bounced to low memory (ie for highmem, or even | 1151 | * certain limit bounced to low memory (ie for highmem, or even |
1149 | * ISA dma in theory) | 1152 | * ISA dma in theory) |
1150 | */ | 1153 | */ |
1151 | blk_queue_bounce(q, &bio); | 1154 | blk_queue_bounce(q, &bio); |
1152 | 1155 | ||
1153 | barrier = bio_barrier(bio); | 1156 | barrier = bio_barrier(bio); |
1154 | if (unlikely(barrier) && bio_has_data(bio) && | 1157 | if (unlikely(barrier) && bio_has_data(bio) && |
1155 | (q->next_ordered == QUEUE_ORDERED_NONE)) { | 1158 | (q->next_ordered == QUEUE_ORDERED_NONE)) { |
1156 | err = -EOPNOTSUPP; | 1159 | err = -EOPNOTSUPP; |
1157 | goto end_io; | 1160 | goto end_io; |
1158 | } | 1161 | } |
1159 | 1162 | ||
1160 | discard = bio_discard(bio); | 1163 | discard = bio_discard(bio); |
1161 | if (unlikely(discard) && !q->prepare_discard_fn) { | 1164 | if (unlikely(discard) && !q->prepare_discard_fn) { |
1162 | err = -EOPNOTSUPP; | 1165 | err = -EOPNOTSUPP; |
1163 | goto end_io; | 1166 | goto end_io; |
1164 | } | 1167 | } |
1165 | 1168 | ||
1166 | spin_lock_irq(q->queue_lock); | 1169 | spin_lock_irq(q->queue_lock); |
1167 | 1170 | ||
1168 | if (unlikely(barrier) || elv_queue_empty(q)) | 1171 | if (unlikely(barrier) || elv_queue_empty(q)) |
1169 | goto get_rq; | 1172 | goto get_rq; |
1170 | 1173 | ||
1171 | el_ret = elv_merge(q, &req, bio); | 1174 | el_ret = elv_merge(q, &req, bio); |
1172 | switch (el_ret) { | 1175 | switch (el_ret) { |
1173 | case ELEVATOR_BACK_MERGE: | 1176 | case ELEVATOR_BACK_MERGE: |
1174 | BUG_ON(!rq_mergeable(req)); | 1177 | BUG_ON(!rq_mergeable(req)); |
1175 | 1178 | ||
1176 | if (!ll_back_merge_fn(q, req, bio)) | 1179 | if (!ll_back_merge_fn(q, req, bio)) |
1177 | break; | 1180 | break; |
1178 | 1181 | ||
1179 | trace_block_bio_backmerge(q, bio); | 1182 | trace_block_bio_backmerge(q, bio); |
1180 | 1183 | ||
1181 | req->biotail->bi_next = bio; | 1184 | req->biotail->bi_next = bio; |
1182 | req->biotail = bio; | 1185 | req->biotail = bio; |
1183 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; | 1186 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; |
1184 | req->ioprio = ioprio_best(req->ioprio, prio); | 1187 | req->ioprio = ioprio_best(req->ioprio, prio); |
1185 | if (!blk_rq_cpu_valid(req)) | 1188 | if (!blk_rq_cpu_valid(req)) |
1186 | req->cpu = bio->bi_comp_cpu; | 1189 | req->cpu = bio->bi_comp_cpu; |
1187 | drive_stat_acct(req, 0); | 1190 | drive_stat_acct(req, 0); |
1188 | if (!attempt_back_merge(q, req)) | 1191 | if (!attempt_back_merge(q, req)) |
1189 | elv_merged_request(q, req, el_ret); | 1192 | elv_merged_request(q, req, el_ret); |
1190 | goto out; | 1193 | goto out; |
1191 | 1194 | ||
1192 | case ELEVATOR_FRONT_MERGE: | 1195 | case ELEVATOR_FRONT_MERGE: |
1193 | BUG_ON(!rq_mergeable(req)); | 1196 | BUG_ON(!rq_mergeable(req)); |
1194 | 1197 | ||
1195 | if (!ll_front_merge_fn(q, req, bio)) | 1198 | if (!ll_front_merge_fn(q, req, bio)) |
1196 | break; | 1199 | break; |
1197 | 1200 | ||
1198 | trace_block_bio_frontmerge(q, bio); | 1201 | trace_block_bio_frontmerge(q, bio); |
1199 | 1202 | ||
1200 | bio->bi_next = req->bio; | 1203 | bio->bi_next = req->bio; |
1201 | req->bio = bio; | 1204 | req->bio = bio; |
1202 | 1205 | ||
1203 | /* | 1206 | /* |
1204 | * may not be valid. if the low level driver said | 1207 | * may not be valid. if the low level driver said |
1205 | * it didn't need a bounce buffer then it better | 1208 | * it didn't need a bounce buffer then it better |
1206 | * not touch req->buffer either... | 1209 | * not touch req->buffer either... |
1207 | */ | 1210 | */ |
1208 | req->buffer = bio_data(bio); | 1211 | req->buffer = bio_data(bio); |
1209 | req->current_nr_sectors = bio_cur_sectors(bio); | 1212 | req->current_nr_sectors = bio_cur_sectors(bio); |
1210 | req->hard_cur_sectors = req->current_nr_sectors; | 1213 | req->hard_cur_sectors = req->current_nr_sectors; |
1211 | req->sector = req->hard_sector = bio->bi_sector; | 1214 | req->sector = req->hard_sector = bio->bi_sector; |
1212 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; | 1215 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; |
1213 | req->ioprio = ioprio_best(req->ioprio, prio); | 1216 | req->ioprio = ioprio_best(req->ioprio, prio); |
1214 | if (!blk_rq_cpu_valid(req)) | 1217 | if (!blk_rq_cpu_valid(req)) |
1215 | req->cpu = bio->bi_comp_cpu; | 1218 | req->cpu = bio->bi_comp_cpu; |
1216 | drive_stat_acct(req, 0); | 1219 | drive_stat_acct(req, 0); |
1217 | if (!attempt_front_merge(q, req)) | 1220 | if (!attempt_front_merge(q, req)) |
1218 | elv_merged_request(q, req, el_ret); | 1221 | elv_merged_request(q, req, el_ret); |
1219 | goto out; | 1222 | goto out; |
1220 | 1223 | ||
1221 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ | 1224 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ |
1222 | default: | 1225 | default: |
1223 | ; | 1226 | ; |
1224 | } | 1227 | } |
1225 | 1228 | ||
1226 | get_rq: | 1229 | get_rq: |
1227 | /* | 1230 | /* |
1228 | * This sync check and mask will be re-done in init_request_from_bio(), | 1231 | * This sync check and mask will be re-done in init_request_from_bio(), |
1229 | * but we need to set it earlier to expose the sync flag to the | 1232 | * but we need to set it earlier to expose the sync flag to the |
1230 | * rq allocator and io schedulers. | 1233 | * rq allocator and io schedulers. |
1231 | */ | 1234 | */ |
1232 | rw_flags = bio_data_dir(bio); | 1235 | rw_flags = bio_data_dir(bio); |
1233 | if (sync) | 1236 | if (sync) |
1234 | rw_flags |= REQ_RW_SYNC; | 1237 | rw_flags |= REQ_RW_SYNC; |
1235 | 1238 | ||
1236 | /* | 1239 | /* |
1237 | * Grab a free request. This might sleep but cannot fail. | 1240 | * Grab a free request. This might sleep but cannot fail. |
1238 | * Returns with the queue unlocked. | 1241 | * Returns with the queue unlocked. |
1239 | */ | 1242 | */ |
1240 | req = get_request_wait(q, rw_flags, bio); | 1243 | req = get_request_wait(q, rw_flags, bio); |
1241 | 1244 | ||
1242 | /* | 1245 | /* |
1243 | * After dropping the lock and possibly sleeping here, our request | 1246 | * After dropping the lock and possibly sleeping here, our request |
1244 | * may now be mergeable after it had proven unmergeable (above). | 1247 | * may now be mergeable after it had proven unmergeable (above). |
1245 | * We don't worry about that case for efficiency. It won't happen | 1248 | * We don't worry about that case for efficiency. It won't happen |
1246 | * often, and the elevators are able to handle it. | 1249 | * often, and the elevators are able to handle it. |
1247 | */ | 1250 | */ |
1248 | init_request_from_bio(req, bio); | 1251 | init_request_from_bio(req, bio); |
1249 | 1252 | ||
1250 | spin_lock_irq(q->queue_lock); | 1253 | spin_lock_irq(q->queue_lock); |
1251 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || | 1254 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || |
1252 | bio_flagged(bio, BIO_CPU_AFFINE)) | 1255 | bio_flagged(bio, BIO_CPU_AFFINE)) |
1253 | req->cpu = blk_cpu_to_group(smp_processor_id()); | 1256 | req->cpu = blk_cpu_to_group(smp_processor_id()); |
1254 | if (elv_queue_empty(q)) | 1257 | if (elv_queue_empty(q)) |
1255 | blk_plug_device(q); | 1258 | blk_plug_device(q); |
1256 | add_request(q, req); | 1259 | add_request(q, req); |
1257 | out: | 1260 | out: |
1258 | if (sync) | 1261 | if (sync) |
1259 | __generic_unplug_device(q); | 1262 | __generic_unplug_device(q); |
1260 | spin_unlock_irq(q->queue_lock); | 1263 | spin_unlock_irq(q->queue_lock); |
1261 | return 0; | 1264 | return 0; |
1262 | 1265 | ||
1263 | end_io: | 1266 | end_io: |
1264 | bio_endio(bio, err); | 1267 | bio_endio(bio, err); |
1265 | return 0; | 1268 | return 0; |
1266 | } | 1269 | } |
1267 | 1270 | ||
1268 | /* | 1271 | /* |
1269 | * If bio->bi_dev is a partition, remap the location | 1272 | * If bio->bi_dev is a partition, remap the location |
1270 | */ | 1273 | */ |
1271 | static inline void blk_partition_remap(struct bio *bio) | 1274 | static inline void blk_partition_remap(struct bio *bio) |
1272 | { | 1275 | { |
1273 | struct block_device *bdev = bio->bi_bdev; | 1276 | struct block_device *bdev = bio->bi_bdev; |
1274 | 1277 | ||
1275 | if (bio_sectors(bio) && bdev != bdev->bd_contains) { | 1278 | if (bio_sectors(bio) && bdev != bdev->bd_contains) { |
1276 | struct hd_struct *p = bdev->bd_part; | 1279 | struct hd_struct *p = bdev->bd_part; |
1277 | 1280 | ||
1278 | bio->bi_sector += p->start_sect; | 1281 | bio->bi_sector += p->start_sect; |
1279 | bio->bi_bdev = bdev->bd_contains; | 1282 | bio->bi_bdev = bdev->bd_contains; |
1280 | 1283 | ||
1281 | trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, | 1284 | trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, |
1282 | bdev->bd_dev, bio->bi_sector, | 1285 | bdev->bd_dev, bio->bi_sector, |
1283 | bio->bi_sector - p->start_sect); | 1286 | bio->bi_sector - p->start_sect); |
1284 | } | 1287 | } |
1285 | } | 1288 | } |
1286 | 1289 | ||
1287 | static void handle_bad_sector(struct bio *bio) | 1290 | static void handle_bad_sector(struct bio *bio) |
1288 | { | 1291 | { |
1289 | char b[BDEVNAME_SIZE]; | 1292 | char b[BDEVNAME_SIZE]; |
1290 | 1293 | ||
1291 | printk(KERN_INFO "attempt to access beyond end of device\n"); | 1294 | printk(KERN_INFO "attempt to access beyond end of device\n"); |
1292 | printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", | 1295 | printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", |
1293 | bdevname(bio->bi_bdev, b), | 1296 | bdevname(bio->bi_bdev, b), |
1294 | bio->bi_rw, | 1297 | bio->bi_rw, |
1295 | (unsigned long long)bio->bi_sector + bio_sectors(bio), | 1298 | (unsigned long long)bio->bi_sector + bio_sectors(bio), |
1296 | (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); | 1299 | (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); |
1297 | 1300 | ||
1298 | set_bit(BIO_EOF, &bio->bi_flags); | 1301 | set_bit(BIO_EOF, &bio->bi_flags); |
1299 | } | 1302 | } |
1300 | 1303 | ||
1301 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 1304 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
1302 | 1305 | ||
1303 | static DECLARE_FAULT_ATTR(fail_make_request); | 1306 | static DECLARE_FAULT_ATTR(fail_make_request); |
1304 | 1307 | ||
1305 | static int __init setup_fail_make_request(char *str) | 1308 | static int __init setup_fail_make_request(char *str) |
1306 | { | 1309 | { |
1307 | return setup_fault_attr(&fail_make_request, str); | 1310 | return setup_fault_attr(&fail_make_request, str); |
1308 | } | 1311 | } |
1309 | __setup("fail_make_request=", setup_fail_make_request); | 1312 | __setup("fail_make_request=", setup_fail_make_request); |
1310 | 1313 | ||
1311 | static int should_fail_request(struct bio *bio) | 1314 | static int should_fail_request(struct bio *bio) |
1312 | { | 1315 | { |
1313 | struct hd_struct *part = bio->bi_bdev->bd_part; | 1316 | struct hd_struct *part = bio->bi_bdev->bd_part; |
1314 | 1317 | ||
1315 | if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) | 1318 | if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) |
1316 | return should_fail(&fail_make_request, bio->bi_size); | 1319 | return should_fail(&fail_make_request, bio->bi_size); |
1317 | 1320 | ||
1318 | return 0; | 1321 | return 0; |
1319 | } | 1322 | } |
1320 | 1323 | ||
1321 | static int __init fail_make_request_debugfs(void) | 1324 | static int __init fail_make_request_debugfs(void) |
1322 | { | 1325 | { |
1323 | return init_fault_attr_dentries(&fail_make_request, | 1326 | return init_fault_attr_dentries(&fail_make_request, |
1324 | "fail_make_request"); | 1327 | "fail_make_request"); |
1325 | } | 1328 | } |
1326 | 1329 | ||
1327 | late_initcall(fail_make_request_debugfs); | 1330 | late_initcall(fail_make_request_debugfs); |
1328 | 1331 | ||
1329 | #else /* CONFIG_FAIL_MAKE_REQUEST */ | 1332 | #else /* CONFIG_FAIL_MAKE_REQUEST */ |
1330 | 1333 | ||
1331 | static inline int should_fail_request(struct bio *bio) | 1334 | static inline int should_fail_request(struct bio *bio) |
1332 | { | 1335 | { |
1333 | return 0; | 1336 | return 0; |
1334 | } | 1337 | } |
1335 | 1338 | ||
1336 | #endif /* CONFIG_FAIL_MAKE_REQUEST */ | 1339 | #endif /* CONFIG_FAIL_MAKE_REQUEST */ |
1337 | 1340 | ||
1338 | /* | 1341 | /* |
1339 | * Check whether this bio extends beyond the end of the device. | 1342 | * Check whether this bio extends beyond the end of the device. |
1340 | */ | 1343 | */ |
1341 | static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) | 1344 | static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) |
1342 | { | 1345 | { |
1343 | sector_t maxsector; | 1346 | sector_t maxsector; |
1344 | 1347 | ||
1345 | if (!nr_sectors) | 1348 | if (!nr_sectors) |
1346 | return 0; | 1349 | return 0; |
1347 | 1350 | ||
1348 | /* Test device or partition size, when known. */ | 1351 | /* Test device or partition size, when known. */ |
1349 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; | 1352 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; |
1350 | if (maxsector) { | 1353 | if (maxsector) { |
1351 | sector_t sector = bio->bi_sector; | 1354 | sector_t sector = bio->bi_sector; |
1352 | 1355 | ||
1353 | if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { | 1356 | if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { |
1354 | /* | 1357 | /* |
1355 | * This may well happen - the kernel calls bread() | 1358 | * This may well happen - the kernel calls bread() |
1356 | * without checking the size of the device, e.g., when | 1359 | * without checking the size of the device, e.g., when |
1357 | * mounting a device. | 1360 | * mounting a device. |
1358 | */ | 1361 | */ |
1359 | handle_bad_sector(bio); | 1362 | handle_bad_sector(bio); |
1360 | return 1; | 1363 | return 1; |
1361 | } | 1364 | } |
1362 | } | 1365 | } |
1363 | 1366 | ||
1364 | return 0; | 1367 | return 0; |
1365 | } | 1368 | } |
1366 | 1369 | ||
1367 | /** | 1370 | /** |
1368 | * generic_make_request - hand a buffer to its device driver for I/O | 1371 | * generic_make_request - hand a buffer to its device driver for I/O |
1369 | * @bio: The bio describing the location in memory and on the device. | 1372 | * @bio: The bio describing the location in memory and on the device. |
1370 | * | 1373 | * |
1371 | * generic_make_request() is used to make I/O requests of block | 1374 | * generic_make_request() is used to make I/O requests of block |
1372 | * devices. It is passed a &struct bio, which describes the I/O that needs | 1375 | * devices. It is passed a &struct bio, which describes the I/O that needs |
1373 | * to be done. | 1376 | * to be done. |
1374 | * | 1377 | * |
1375 | * generic_make_request() does not return any status. The | 1378 | * generic_make_request() does not return any status. The |
1376 | * success/failure status of the request, along with notification of | 1379 | * success/failure status of the request, along with notification of |
1377 | * completion, is delivered asynchronously through the bio->bi_end_io | 1380 | * completion, is delivered asynchronously through the bio->bi_end_io |
1378 | * function described (one day) elsewhere. | 1381 | * function described (one day) elsewhere. |
1379 | * | 1382 | * |
1380 | * The caller of generic_make_request must make sure that bi_io_vec | 1383 | * The caller of generic_make_request must make sure that bi_io_vec |
1381 | * are set to describe the memory buffer, and that bi_bdev and bi_sector are | 1384 | * are set to describe the memory buffer, and that bi_bdev and bi_sector are |
1382 | * set to describe the device address, and the | 1385 | * set to describe the device address, and the |
1383 | * bi_end_io and optionally bi_private are set to describe how | 1386 | * bi_end_io and optionally bi_private are set to describe how |
1384 | * completion notification should be signaled. | 1387 | * completion notification should be signaled. |
1385 | * | 1388 | * |
1386 | * generic_make_request and the drivers it calls may use bi_next if this | 1389 | * generic_make_request and the drivers it calls may use bi_next if this |
1387 | * bio happens to be merged with someone else, and may change bi_dev and | 1390 | * bio happens to be merged with someone else, and may change bi_dev and |
1388 | * bi_sector for remaps as it sees fit. So the values of these fields | 1391 | * bi_sector for remaps as it sees fit. So the values of these fields |
1389 | * should NOT be depended on after the call to generic_make_request. | 1392 | * should NOT be depended on after the call to generic_make_request. |
1390 | */ | 1393 | */ |
1391 | static inline void __generic_make_request(struct bio *bio) | 1394 | static inline void __generic_make_request(struct bio *bio) |
1392 | { | 1395 | { |
1393 | struct request_queue *q; | 1396 | struct request_queue *q; |
1394 | sector_t old_sector; | 1397 | sector_t old_sector; |
1395 | int ret, nr_sectors = bio_sectors(bio); | 1398 | int ret, nr_sectors = bio_sectors(bio); |
1396 | dev_t old_dev; | 1399 | dev_t old_dev; |
1397 | int err = -EIO; | 1400 | int err = -EIO; |
1398 | 1401 | ||
1399 | might_sleep(); | 1402 | might_sleep(); |
1400 | 1403 | ||
1401 | if (bio_check_eod(bio, nr_sectors)) | 1404 | if (bio_check_eod(bio, nr_sectors)) |
1402 | goto end_io; | 1405 | goto end_io; |
1403 | 1406 | ||
1404 | /* | 1407 | /* |
1405 | * Resolve the mapping until finished. (drivers are | 1408 | * Resolve the mapping until finished. (drivers are |
1406 | * still free to implement/resolve their own stacking | 1409 | * still free to implement/resolve their own stacking |
1407 | * by explicitly returning 0) | 1410 | * by explicitly returning 0) |
1408 | * | 1411 | * |
1409 | * NOTE: we don't repeat the blk_size check for each new device. | 1412 | * NOTE: we don't repeat the blk_size check for each new device. |
1410 | * Stacking drivers are expected to know what they are doing. | 1413 | * Stacking drivers are expected to know what they are doing. |
1411 | */ | 1414 | */ |
1412 | old_sector = -1; | 1415 | old_sector = -1; |
1413 | old_dev = 0; | 1416 | old_dev = 0; |
1414 | do { | 1417 | do { |
1415 | char b[BDEVNAME_SIZE]; | 1418 | char b[BDEVNAME_SIZE]; |
1416 | 1419 | ||
1417 | q = bdev_get_queue(bio->bi_bdev); | 1420 | q = bdev_get_queue(bio->bi_bdev); |
1418 | if (!q) { | 1421 | if (!q) { |
1419 | printk(KERN_ERR | 1422 | printk(KERN_ERR |
1420 | "generic_make_request: Trying to access " | 1423 | "generic_make_request: Trying to access " |
1421 | "nonexistent block-device %s (%Lu)\n", | 1424 | "nonexistent block-device %s (%Lu)\n", |
1422 | bdevname(bio->bi_bdev, b), | 1425 | bdevname(bio->bi_bdev, b), |
1423 | (long long) bio->bi_sector); | 1426 | (long long) bio->bi_sector); |
1424 | end_io: | 1427 | end_io: |
1425 | bio_endio(bio, err); | 1428 | bio_endio(bio, err); |
1426 | break; | 1429 | break; |
1427 | } | 1430 | } |
1428 | 1431 | ||
1429 | if (unlikely(nr_sectors > q->max_hw_sectors)) { | 1432 | if (unlikely(nr_sectors > q->max_hw_sectors)) { |
1430 | printk(KERN_ERR "bio too big device %s (%u > %u)\n", | 1433 | printk(KERN_ERR "bio too big device %s (%u > %u)\n", |
1431 | bdevname(bio->bi_bdev, b), | 1434 | bdevname(bio->bi_bdev, b), |
1432 | bio_sectors(bio), | 1435 | bio_sectors(bio), |
1433 | q->max_hw_sectors); | 1436 | q->max_hw_sectors); |
1434 | goto end_io; | 1437 | goto end_io; |
1435 | } | 1438 | } |
1436 | 1439 | ||
1437 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | 1440 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) |
1438 | goto end_io; | 1441 | goto end_io; |
1439 | 1442 | ||
1440 | if (should_fail_request(bio)) | 1443 | if (should_fail_request(bio)) |
1441 | goto end_io; | 1444 | goto end_io; |
1442 | 1445 | ||
1443 | /* | 1446 | /* |
1444 | * If this device has partitions, remap block n | 1447 | * If this device has partitions, remap block n |
1445 | * of partition p to block n+start(p) of the disk. | 1448 | * of partition p to block n+start(p) of the disk. |
1446 | */ | 1449 | */ |
1447 | blk_partition_remap(bio); | 1450 | blk_partition_remap(bio); |
1448 | 1451 | ||
1449 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) | 1452 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) |
1450 | goto end_io; | 1453 | goto end_io; |
1451 | 1454 | ||
1452 | if (old_sector != -1) | 1455 | if (old_sector != -1) |
1453 | trace_block_remap(q, bio, old_dev, bio->bi_sector, | 1456 | trace_block_remap(q, bio, old_dev, bio->bi_sector, |
1454 | old_sector); | 1457 | old_sector); |
1455 | 1458 | ||
1456 | trace_block_bio_queue(q, bio); | 1459 | trace_block_bio_queue(q, bio); |
1457 | 1460 | ||
1458 | old_sector = bio->bi_sector; | 1461 | old_sector = bio->bi_sector; |
1459 | old_dev = bio->bi_bdev->bd_dev; | 1462 | old_dev = bio->bi_bdev->bd_dev; |
1460 | 1463 | ||
1461 | if (bio_check_eod(bio, nr_sectors)) | 1464 | if (bio_check_eod(bio, nr_sectors)) |
1462 | goto end_io; | 1465 | goto end_io; |
1463 | if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) || | 1466 | if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) || |
1464 | (bio_discard(bio) && !q->prepare_discard_fn)) { | 1467 | (bio_discard(bio) && !q->prepare_discard_fn)) { |
1465 | err = -EOPNOTSUPP; | 1468 | err = -EOPNOTSUPP; |
1466 | goto end_io; | 1469 | goto end_io; |
1467 | } | 1470 | } |
1468 | 1471 | ||
1469 | ret = q->make_request_fn(q, bio); | 1472 | ret = q->make_request_fn(q, bio); |
1470 | } while (ret); | 1473 | } while (ret); |
1471 | } | 1474 | } |
1472 | 1475 | ||
1473 | /* | 1476 | /* |
1474 | * We only want one ->make_request_fn to be active at a time, | 1477 | * We only want one ->make_request_fn to be active at a time, |
1475 | * else stack usage with stacked devices could be a problem. | 1478 | * else stack usage with stacked devices could be a problem. |
1476 | * So use current->bio_{list,tail} to keep a list of requests | 1479 | * So use current->bio_{list,tail} to keep a list of requests |
1477 | * submitted by a make_request_fn function. | 1480 | * submitted by a make_request_fn function. |
1478 | * current->bio_tail is also used as a flag to say if | 1481 | * current->bio_tail is also used as a flag to say if |
1479 | * generic_make_request is currently active in this task or not. | 1482 | * generic_make_request is currently active in this task or not. |
1480 | * If it is NULL, then no make_request is active. If it is non-NULL, | 1483 | * If it is NULL, then no make_request is active. If it is non-NULL, |
1481 | * then a make_request is active, and new requests should be added | 1484 | * then a make_request is active, and new requests should be added |
1482 | * at the tail | 1485 | * at the tail |
1483 | */ | 1486 | */ |
1484 | void generic_make_request(struct bio *bio) | 1487 | void generic_make_request(struct bio *bio) |
1485 | { | 1488 | { |
1486 | if (current->bio_tail) { | 1489 | if (current->bio_tail) { |
1487 | /* make_request is active */ | 1490 | /* make_request is active */ |
1488 | *(current->bio_tail) = bio; | 1491 | *(current->bio_tail) = bio; |
1489 | bio->bi_next = NULL; | 1492 | bio->bi_next = NULL; |
1490 | current->bio_tail = &bio->bi_next; | 1493 | current->bio_tail = &bio->bi_next; |
1491 | return; | 1494 | return; |
1492 | } | 1495 | } |
1493 | /* following loop may be a bit non-obvious, and so deserves some | 1496 | /* following loop may be a bit non-obvious, and so deserves some |
1494 | * explanation. | 1497 | * explanation. |
1495 | * Before entering the loop, bio->bi_next is NULL (as all callers | 1498 | * Before entering the loop, bio->bi_next is NULL (as all callers |
1496 | * ensure that) so we have a list with a single bio. | 1499 | * ensure that) so we have a list with a single bio. |
1497 | * We pretend that we have just taken it off a longer list, so | 1500 | * We pretend that we have just taken it off a longer list, so |
1498 | * we assign bio_list to the next (which is NULL) and bio_tail | 1501 | * we assign bio_list to the next (which is NULL) and bio_tail |
1499 | * to &bio_list, thus initialising the bio_list of new bios to be | 1502 | * to &bio_list, thus initialising the bio_list of new bios to be |
1500 | * added. __generic_make_request may indeed add some more bios | 1503 | * added. __generic_make_request may indeed add some more bios |
1501 | * through a recursive call to generic_make_request. If it | 1504 | * through a recursive call to generic_make_request. If it |
1502 | * did, we find a non-NULL value in bio_list and re-enter the loop | 1505 | * did, we find a non-NULL value in bio_list and re-enter the loop |
1503 | * from the top. In this case we really did just take the bio | 1506 | * from the top. In this case we really did just take the bio |
1504 | * of the top of the list (no pretending) and so fixup bio_list and | 1507 | * of the top of the list (no pretending) and so fixup bio_list and |
1505 | * bio_tail or bi_next, and call into __generic_make_request again. | 1508 | * bio_tail or bi_next, and call into __generic_make_request again. |
1506 | * | 1509 | * |
1507 | * The loop was structured like this to make only one call to | 1510 | * The loop was structured like this to make only one call to |
1508 | * __generic_make_request (which is important as it is large and | 1511 | * __generic_make_request (which is important as it is large and |
1509 | * inlined) and to keep the structure simple. | 1512 | * inlined) and to keep the structure simple. |
1510 | */ | 1513 | */ |
1511 | BUG_ON(bio->bi_next); | 1514 | BUG_ON(bio->bi_next); |
1512 | do { | 1515 | do { |
1513 | current->bio_list = bio->bi_next; | 1516 | current->bio_list = bio->bi_next; |
1514 | if (bio->bi_next == NULL) | 1517 | if (bio->bi_next == NULL) |
1515 | current->bio_tail = ¤t->bio_list; | 1518 | current->bio_tail = ¤t->bio_list; |
1516 | else | 1519 | else |
1517 | bio->bi_next = NULL; | 1520 | bio->bi_next = NULL; |
1518 | __generic_make_request(bio); | 1521 | __generic_make_request(bio); |
1519 | bio = current->bio_list; | 1522 | bio = current->bio_list; |
1520 | } while (bio); | 1523 | } while (bio); |
1521 | current->bio_tail = NULL; /* deactivate */ | 1524 | current->bio_tail = NULL; /* deactivate */ |
1522 | } | 1525 | } |
1523 | EXPORT_SYMBOL(generic_make_request); | 1526 | EXPORT_SYMBOL(generic_make_request); |
1524 | 1527 | ||
1525 | /** | 1528 | /** |
1526 | * submit_bio - submit a bio to the block device layer for I/O | 1529 | * submit_bio - submit a bio to the block device layer for I/O |
1527 | * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) | 1530 | * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) |
1528 | * @bio: The &struct bio which describes the I/O | 1531 | * @bio: The &struct bio which describes the I/O |
1529 | * | 1532 | * |
1530 | * submit_bio() is very similar in purpose to generic_make_request(), and | 1533 | * submit_bio() is very similar in purpose to generic_make_request(), and |
1531 | * uses that function to do most of the work. Both are fairly rough | 1534 | * uses that function to do most of the work. Both are fairly rough |
1532 | * interfaces; @bio must be presetup and ready for I/O. | 1535 | * interfaces; @bio must be presetup and ready for I/O. |
1533 | * | 1536 | * |
1534 | */ | 1537 | */ |
1535 | void submit_bio(int rw, struct bio *bio) | 1538 | void submit_bio(int rw, struct bio *bio) |
1536 | { | 1539 | { |
1537 | int count = bio_sectors(bio); | 1540 | int count = bio_sectors(bio); |
1538 | 1541 | ||
1539 | bio->bi_rw |= rw; | 1542 | bio->bi_rw |= rw; |
1540 | 1543 | ||
1541 | /* | 1544 | /* |
1542 | * If it's a regular read/write or a barrier with data attached, | 1545 | * If it's a regular read/write or a barrier with data attached, |
1543 | * go through the normal accounting stuff before submission. | 1546 | * go through the normal accounting stuff before submission. |
1544 | */ | 1547 | */ |
1545 | if (bio_has_data(bio)) { | 1548 | if (bio_has_data(bio)) { |
1546 | if (rw & WRITE) { | 1549 | if (rw & WRITE) { |
1547 | count_vm_events(PGPGOUT, count); | 1550 | count_vm_events(PGPGOUT, count); |
1548 | } else { | 1551 | } else { |
1549 | task_io_account_read(bio->bi_size); | 1552 | task_io_account_read(bio->bi_size); |
1550 | count_vm_events(PGPGIN, count); | 1553 | count_vm_events(PGPGIN, count); |
1551 | } | 1554 | } |
1552 | 1555 | ||
1553 | if (unlikely(block_dump)) { | 1556 | if (unlikely(block_dump)) { |
1554 | char b[BDEVNAME_SIZE]; | 1557 | char b[BDEVNAME_SIZE]; |
1555 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", | 1558 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", |
1556 | current->comm, task_pid_nr(current), | 1559 | current->comm, task_pid_nr(current), |
1557 | (rw & WRITE) ? "WRITE" : "READ", | 1560 | (rw & WRITE) ? "WRITE" : "READ", |
1558 | (unsigned long long)bio->bi_sector, | 1561 | (unsigned long long)bio->bi_sector, |
1559 | bdevname(bio->bi_bdev, b)); | 1562 | bdevname(bio->bi_bdev, b)); |
1560 | } | 1563 | } |
1561 | } | 1564 | } |
1562 | 1565 | ||
1563 | generic_make_request(bio); | 1566 | generic_make_request(bio); |
1564 | } | 1567 | } |
1565 | EXPORT_SYMBOL(submit_bio); | 1568 | EXPORT_SYMBOL(submit_bio); |
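submit_bio() layers the accounting above on top of generic_make_request(), whose kernel-doc lists the caller's obligations: bi_io_vec, the device/sector fields and bi_end_io must be filled in first. A hedged sketch of submitting a single-page read; the completion callback, page and names are illustrative only:

static void example_read_endio(struct bio *bio, int error)
{
        /* called from completion context; error is 0 on success */
        bio_put(bio);
}

static int example_submit_page_read(struct block_device *bdev,
                                    sector_t sector, struct page *page)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        if (!bio)
                return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio->bi_end_io = example_read_endio;
        if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
                bio_put(bio);
                return -EIO;
        }

        submit_bio(READ, bio);  /* status arrives asynchronously via bi_end_io */
        return 0;
}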
1566 | 1569 | ||
1567 | /** | 1570 | /** |
1568 | * blk_rq_check_limits - Helper function to check a request for the queue limit | 1571 | * blk_rq_check_limits - Helper function to check a request for the queue limit |
1569 | * @q: the queue | 1572 | * @q: the queue |
1570 | * @rq: the request being checked | 1573 | * @rq: the request being checked |
1571 | * | 1574 | * |
1572 | * Description: | 1575 | * Description: |
1573 | * @rq may have been made based on weaker limitations of upper-level queues | 1576 | * @rq may have been made based on weaker limitations of upper-level queues |
1574 | * in request stacking drivers, and it may violate the limitation of @q. | 1577 | * in request stacking drivers, and it may violate the limitation of @q. |
1575 | * Since the block layer and the underlying device driver trust @rq | 1578 | * Since the block layer and the underlying device driver trust @rq |
1576 | * after it is inserted to @q, it should be checked against @q before | 1579 | * after it is inserted to @q, it should be checked against @q before |
1577 | * the insertion using this generic function. | 1580 | * the insertion using this generic function. |
1578 | * | 1581 | * |
1579 | * This function should also be useful for request stacking drivers | 1582 | * This function should also be useful for request stacking drivers |
1580 | * in some cases below, so export this function. | 1583 | * in some cases below, so export this function. |
1581 | * Request stacking drivers like request-based dm may change the queue | 1584 | * Request stacking drivers like request-based dm may change the queue |
1582 | * limits while requests are in the queue (e.g. dm's table swapping). | 1585 | * limits while requests are in the queue (e.g. dm's table swapping). |
1583 | * Such request stacking drivers should check those requests against | 1586 | * Such request stacking drivers should check those requests against |
1584 | * the new queue limits again when they dispatch those requests, | 1587 | * the new queue limits again when they dispatch those requests, |
1585 | * although such checks are also done against the old queue limits | 1588 | * although such checks are also done against the old queue limits |
1586 | * when submitting requests. | 1589 | * when submitting requests. |
1587 | */ | 1590 | */ |
1588 | int blk_rq_check_limits(struct request_queue *q, struct request *rq) | 1591 | int blk_rq_check_limits(struct request_queue *q, struct request *rq) |
1589 | { | 1592 | { |
1590 | if (rq->nr_sectors > q->max_sectors || | 1593 | if (rq->nr_sectors > q->max_sectors || |
1591 | rq->data_len > q->max_hw_sectors << 9) { | 1594 | rq->data_len > q->max_hw_sectors << 9) { |
1592 | printk(KERN_ERR "%s: over max size limit.\n", __func__); | 1595 | printk(KERN_ERR "%s: over max size limit.\n", __func__); |
1593 | return -EIO; | 1596 | return -EIO; |
1594 | } | 1597 | } |
1595 | 1598 | ||
1596 | /* | 1599 | /* |
1597 | * queue's settings related to segment counting like q->bounce_pfn | 1600 | * queue's settings related to segment counting like q->bounce_pfn |
1598 | * may differ from that of other stacking queues. | 1601 | * may differ from that of other stacking queues. |
1599 | * Recalculate it to check the request correctly on this queue's | 1602 | * Recalculate it to check the request correctly on this queue's |
1600 | * limitation. | 1603 | * limitation. |
1601 | */ | 1604 | */ |
1602 | blk_recalc_rq_segments(rq); | 1605 | blk_recalc_rq_segments(rq); |
1603 | if (rq->nr_phys_segments > q->max_phys_segments || | 1606 | if (rq->nr_phys_segments > q->max_phys_segments || |
1604 | rq->nr_phys_segments > q->max_hw_segments) { | 1607 | rq->nr_phys_segments > q->max_hw_segments) { |
1605 | printk(KERN_ERR "%s: over max segments limit.\n", __func__); | 1608 | printk(KERN_ERR "%s: over max segments limit.\n", __func__); |
1606 | return -EIO; | 1609 | return -EIO; |
1607 | } | 1610 | } |
1608 | 1611 | ||
1609 | return 0; | 1612 | return 0; |
1610 | } | 1613 | } |
1611 | EXPORT_SYMBOL_GPL(blk_rq_check_limits); | 1614 | EXPORT_SYMBOL_GPL(blk_rq_check_limits); |
1612 | 1615 | ||
1613 | /** | 1616 | /** |
1614 | * blk_insert_cloned_request - Helper for stacking drivers to submit a request | 1617 | * blk_insert_cloned_request - Helper for stacking drivers to submit a request |
1615 | * @q: the queue to submit the request | 1618 | * @q: the queue to submit the request |
1616 | * @rq: the request being queued | 1619 | * @rq: the request being queued |
1617 | */ | 1620 | */ |
1618 | int blk_insert_cloned_request(struct request_queue *q, struct request *rq) | 1621 | int blk_insert_cloned_request(struct request_queue *q, struct request *rq) |
1619 | { | 1622 | { |
1620 | unsigned long flags; | 1623 | unsigned long flags; |
1621 | 1624 | ||
1622 | if (blk_rq_check_limits(q, rq)) | 1625 | if (blk_rq_check_limits(q, rq)) |
1623 | return -EIO; | 1626 | return -EIO; |
1624 | 1627 | ||
1625 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 1628 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
1626 | if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && | 1629 | if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && |
1627 | should_fail(&fail_make_request, blk_rq_bytes(rq))) | 1630 | should_fail(&fail_make_request, blk_rq_bytes(rq))) |
1628 | return -EIO; | 1631 | return -EIO; |
1629 | #endif | 1632 | #endif |
1630 | 1633 | ||
1631 | spin_lock_irqsave(q->queue_lock, flags); | 1634 | spin_lock_irqsave(q->queue_lock, flags); |
1632 | 1635 | ||
1633 | /* | 1636 | /* |
1634 | * Submitting request must be dequeued before calling this function | 1637 | * Submitting request must be dequeued before calling this function |
1635 | * because it will be linked to another request_queue | 1638 | * because it will be linked to another request_queue |
1636 | */ | 1639 | */ |
1637 | BUG_ON(blk_queued_rq(rq)); | 1640 | BUG_ON(blk_queued_rq(rq)); |
1638 | 1641 | ||
1639 | drive_stat_acct(rq, 1); | 1642 | drive_stat_acct(rq, 1); |
1640 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); | 1643 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); |
1641 | 1644 | ||
1642 | spin_unlock_irqrestore(q->queue_lock, flags); | 1645 | spin_unlock_irqrestore(q->queue_lock, flags); |
1643 | 1646 | ||
1644 | return 0; | 1647 | return 0; |
1645 | } | 1648 | } |
1646 | EXPORT_SYMBOL_GPL(blk_insert_cloned_request); | 1649 | EXPORT_SYMBOL_GPL(blk_insert_cloned_request); |
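blk_insert_cloned_request() is the dispatch half of the stacking-driver story: the clone was built against the upper queue's possibly weaker limits, so blk_rq_check_limits() re-validates it against the lower queue before it is linked in at the back of the elevator. A rough sketch of how a request-based stacking driver (request-based dm is the in-tree user) might hand a clone to an underlying device; clone_fn() stands in for whatever builds the clone:

static int example_dispatch_clone(struct request_queue *lower_q,
                                  struct request *orig)
{
        struct request *clone;
        int ret;

        clone = clone_fn(orig);         /* hypothetical clone builder */
        if (!clone)
                return -ENOMEM;

        ret = blk_insert_cloned_request(lower_q, clone);
        if (ret)
                return ret;             /* -EIO: clone violates lower_q's limits */

        return 0;
}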
1647 | 1650 | ||
1648 | /** | 1651 | /** |
1649 | * blkdev_dequeue_request - dequeue request and start timeout timer | 1652 | * blkdev_dequeue_request - dequeue request and start timeout timer |
1650 | * @req: request to dequeue | 1653 | * @req: request to dequeue |
1651 | * | 1654 | * |
1652 | * Dequeue @req and start timeout timer on it. This hands off the | 1655 | * Dequeue @req and start timeout timer on it. This hands off the |
1653 | * request to the driver. | 1656 | * request to the driver. |
1654 | * | 1657 | * |
1655 | * Block internal functions which don't want to start timer should | 1658 | * Block internal functions which don't want to start timer should |
1656 | * call elv_dequeue_request(). | 1659 | * call elv_dequeue_request(). |
1657 | */ | 1660 | */ |
1658 | void blkdev_dequeue_request(struct request *req) | 1661 | void blkdev_dequeue_request(struct request *req) |
1659 | { | 1662 | { |
1660 | elv_dequeue_request(req->q, req); | 1663 | elv_dequeue_request(req->q, req); |
1661 | 1664 | ||
1662 | /* | 1665 | /* |
1663 | * We are now handing the request to the hardware, add the | 1666 | * We are now handing the request to the hardware, add the |
1664 | * timeout handler. | 1667 | * timeout handler. |
1665 | */ | 1668 | */ |
1666 | blk_add_timer(req); | 1669 | blk_add_timer(req); |
1667 | } | 1670 | } |
1668 | EXPORT_SYMBOL(blkdev_dequeue_request); | 1671 | EXPORT_SYMBOL(blkdev_dequeue_request); |
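As a hedged sketch of the intended calling pattern (the mydev_* name and the hardware hand-off are hypothetical): a conventional ->request_fn() peeks at the queue with elv_next_request() and dequeues each request it accepts, which also arms the timeout timer.

	#include <linux/blkdev.h>

	static void mydev_request_fn(struct request_queue *q)
	{
		struct request *req;

		while ((req = elv_next_request(q)) != NULL) {
			blkdev_dequeue_request(req);	/* hand-off: timer starts */
			/* ... program the hardware with req ... */
		}
	}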
1669 | 1672 | ||
1670 | /** | 1673 | /** |
1671 | * __end_that_request_first - end I/O on a request | 1674 | * __end_that_request_first - end I/O on a request |
1672 | * @req: the request being processed | 1675 | * @req: the request being processed |
1673 | * @error: %0 for success, < %0 for error | 1676 | * @error: %0 for success, < %0 for error |
1674 | * @nr_bytes: number of bytes to complete | 1677 | * @nr_bytes: number of bytes to complete |
1675 | * | 1678 | * |
1676 | * Description: | 1679 | * Description: |
1677 | * Ends I/O on a number of bytes attached to @req, and sets it up | 1680 | * Ends I/O on a number of bytes attached to @req, and sets it up |
1678 | * for the next range of segments (if any) in the cluster. | 1681 | * for the next range of segments (if any) in the cluster. |
1679 | * | 1682 | * |
1680 | * Return: | 1683 | * Return: |
1681 | * %0 - we are done with this request, call end_that_request_last() | 1684 | * %0 - we are done with this request, call end_that_request_last() |
1682 | * %1 - still buffers pending for this request | 1685 | * %1 - still buffers pending for this request |
1683 | **/ | 1686 | **/ |
1684 | static int __end_that_request_first(struct request *req, int error, | 1687 | static int __end_that_request_first(struct request *req, int error, |
1685 | int nr_bytes) | 1688 | int nr_bytes) |
1686 | { | 1689 | { |
1687 | int total_bytes, bio_nbytes, next_idx = 0; | 1690 | int total_bytes, bio_nbytes, next_idx = 0; |
1688 | struct bio *bio; | 1691 | struct bio *bio; |
1689 | 1692 | ||
1690 | trace_block_rq_complete(req->q, req); | 1693 | trace_block_rq_complete(req->q, req); |
1691 | 1694 | ||
1692 | /* | 1695 | /* |
1693 | * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual | 1696 | * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual |
1694 | * sense key with us all the way through | 1697 | * sense key with us all the way through |
1695 | */ | 1698 | */ |
1696 | if (!blk_pc_request(req)) | 1699 | if (!blk_pc_request(req)) |
1697 | req->errors = 0; | 1700 | req->errors = 0; |
1698 | 1701 | ||
1699 | if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { | 1702 | if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { |
1700 | printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", | 1703 | printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", |
1701 | req->rq_disk ? req->rq_disk->disk_name : "?", | 1704 | req->rq_disk ? req->rq_disk->disk_name : "?", |
1702 | (unsigned long long)req->sector); | 1705 | (unsigned long long)req->sector); |
1703 | } | 1706 | } |
1704 | 1707 | ||
1705 | if (blk_fs_request(req) && req->rq_disk) { | 1708 | if (blk_fs_request(req) && req->rq_disk) { |
1706 | const int rw = rq_data_dir(req); | 1709 | const int rw = rq_data_dir(req); |
1707 | struct hd_struct *part; | 1710 | struct hd_struct *part; |
1708 | int cpu; | 1711 | int cpu; |
1709 | 1712 | ||
1710 | cpu = part_stat_lock(); | 1713 | cpu = part_stat_lock(); |
1711 | part = disk_map_sector_rcu(req->rq_disk, req->sector); | 1714 | part = disk_map_sector_rcu(req->rq_disk, req->sector); |
1712 | part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); | 1715 | part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); |
1713 | part_stat_unlock(); | 1716 | part_stat_unlock(); |
1714 | } | 1717 | } |
1715 | 1718 | ||
1716 | total_bytes = bio_nbytes = 0; | 1719 | total_bytes = bio_nbytes = 0; |
1717 | while ((bio = req->bio) != NULL) { | 1720 | while ((bio = req->bio) != NULL) { |
1718 | int nbytes; | 1721 | int nbytes; |
1719 | 1722 | ||
1720 | /* | 1723 | /* |
1721 | * For an empty barrier request, the low level driver must | 1724 | * For an empty barrier request, the low level driver must |
1722 | * store a potential error location in ->sector. We pass | 1725 | * store a potential error location in ->sector. We pass |
1723 | * that back up in ->bi_sector. | 1726 | * that back up in ->bi_sector. |
1724 | */ | 1727 | */ |
1725 | if (blk_empty_barrier(req)) | 1728 | if (blk_empty_barrier(req)) |
1726 | bio->bi_sector = req->sector; | 1729 | bio->bi_sector = req->sector; |
1727 | 1730 | ||
1728 | if (nr_bytes >= bio->bi_size) { | 1731 | if (nr_bytes >= bio->bi_size) { |
1729 | req->bio = bio->bi_next; | 1732 | req->bio = bio->bi_next; |
1730 | nbytes = bio->bi_size; | 1733 | nbytes = bio->bi_size; |
1731 | req_bio_endio(req, bio, nbytes, error); | 1734 | req_bio_endio(req, bio, nbytes, error); |
1732 | next_idx = 0; | 1735 | next_idx = 0; |
1733 | bio_nbytes = 0; | 1736 | bio_nbytes = 0; |
1734 | } else { | 1737 | } else { |
1735 | int idx = bio->bi_idx + next_idx; | 1738 | int idx = bio->bi_idx + next_idx; |
1736 | 1739 | ||
1737 | if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { | 1740 | if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { |
1738 | blk_dump_rq_flags(req, "__end_that"); | 1741 | blk_dump_rq_flags(req, "__end_that"); |
1739 | printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", | 1742 | printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", |
1740 | __func__, bio->bi_idx, bio->bi_vcnt); | 1743 | __func__, bio->bi_idx, bio->bi_vcnt); |
1741 | break; | 1744 | break; |
1742 | } | 1745 | } |
1743 | 1746 | ||
1744 | nbytes = bio_iovec_idx(bio, idx)->bv_len; | 1747 | nbytes = bio_iovec_idx(bio, idx)->bv_len; |
1745 | BIO_BUG_ON(nbytes > bio->bi_size); | 1748 | BIO_BUG_ON(nbytes > bio->bi_size); |
1746 | 1749 | ||
1747 | /* | 1750 | /* |
1748 | * not a complete bvec done | 1751 | * not a complete bvec done |
1749 | */ | 1752 | */ |
1750 | if (unlikely(nbytes > nr_bytes)) { | 1753 | if (unlikely(nbytes > nr_bytes)) { |
1751 | bio_nbytes += nr_bytes; | 1754 | bio_nbytes += nr_bytes; |
1752 | total_bytes += nr_bytes; | 1755 | total_bytes += nr_bytes; |
1753 | break; | 1756 | break; |
1754 | } | 1757 | } |
1755 | 1758 | ||
1756 | /* | 1759 | /* |
1757 | * advance to the next vector | 1760 | * advance to the next vector |
1758 | */ | 1761 | */ |
1759 | next_idx++; | 1762 | next_idx++; |
1760 | bio_nbytes += nbytes; | 1763 | bio_nbytes += nbytes; |
1761 | } | 1764 | } |
1762 | 1765 | ||
1763 | total_bytes += nbytes; | 1766 | total_bytes += nbytes; |
1764 | nr_bytes -= nbytes; | 1767 | nr_bytes -= nbytes; |
1765 | 1768 | ||
1766 | bio = req->bio; | 1769 | bio = req->bio; |
1767 | if (bio) { | 1770 | if (bio) { |
1768 | /* | 1771 | /* |
1769 | * end more in this run, or just return 'not-done' | 1772 | * end more in this run, or just return 'not-done' |
1770 | */ | 1773 | */ |
1771 | if (unlikely(nr_bytes <= 0)) | 1774 | if (unlikely(nr_bytes <= 0)) |
1772 | break; | 1775 | break; |
1773 | } | 1776 | } |
1774 | } | 1777 | } |
1775 | 1778 | ||
1776 | /* | 1779 | /* |
1777 | * completely done | 1780 | * completely done |
1778 | */ | 1781 | */ |
1779 | if (!req->bio) | 1782 | if (!req->bio) |
1780 | return 0; | 1783 | return 0; |
1781 | 1784 | ||
1782 | /* | 1785 | /* |
1783 | * if the request wasn't completed, update state | 1786 | * if the request wasn't completed, update state |
1784 | */ | 1787 | */ |
1785 | if (bio_nbytes) { | 1788 | if (bio_nbytes) { |
1786 | req_bio_endio(req, bio, bio_nbytes, error); | 1789 | req_bio_endio(req, bio, bio_nbytes, error); |
1787 | bio->bi_idx += next_idx; | 1790 | bio->bi_idx += next_idx; |
1788 | bio_iovec(bio)->bv_offset += nr_bytes; | 1791 | bio_iovec(bio)->bv_offset += nr_bytes; |
1789 | bio_iovec(bio)->bv_len -= nr_bytes; | 1792 | bio_iovec(bio)->bv_len -= nr_bytes; |
1790 | } | 1793 | } |
1791 | 1794 | ||
1792 | blk_recalc_rq_sectors(req, total_bytes >> 9); | 1795 | blk_recalc_rq_sectors(req, total_bytes >> 9); |
1793 | blk_recalc_rq_segments(req); | 1796 | blk_recalc_rq_segments(req); |
1794 | return 1; | 1797 | return 1; |
1795 | } | 1798 | } |
1796 | 1799 | ||
1797 | /* | 1800 | /* |
1798 | * queue lock must be held | 1801 | * queue lock must be held |
1799 | */ | 1802 | */ |
1800 | static void end_that_request_last(struct request *req, int error) | 1803 | static void end_that_request_last(struct request *req, int error) |
1801 | { | 1804 | { |
1802 | struct gendisk *disk = req->rq_disk; | 1805 | struct gendisk *disk = req->rq_disk; |
1803 | 1806 | ||
1804 | if (blk_rq_tagged(req)) | 1807 | if (blk_rq_tagged(req)) |
1805 | blk_queue_end_tag(req->q, req); | 1808 | blk_queue_end_tag(req->q, req); |
1806 | 1809 | ||
1807 | if (blk_queued_rq(req)) | 1810 | if (blk_queued_rq(req)) |
1808 | elv_dequeue_request(req->q, req); | 1811 | elv_dequeue_request(req->q, req); |
1809 | 1812 | ||
1810 | if (unlikely(laptop_mode) && blk_fs_request(req)) | 1813 | if (unlikely(laptop_mode) && blk_fs_request(req)) |
1811 | laptop_io_completion(); | 1814 | laptop_io_completion(); |
1812 | 1815 | ||
1813 | blk_delete_timer(req); | 1816 | blk_delete_timer(req); |
1814 | 1817 | ||
1815 | /* | 1818 | /* |
1816 | * Account IO completion. bar_rq isn't accounted as a normal | 1819 | * Account IO completion. bar_rq isn't accounted as a normal |
1817 | * IO on queueing nor completion. Accounting the containing | 1820 | * IO on queueing nor completion. Accounting the containing |
1818 | * request is enough. | 1821 | * request is enough. |
1819 | */ | 1822 | */ |
1820 | if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { | 1823 | if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { |
1821 | unsigned long duration = jiffies - req->start_time; | 1824 | unsigned long duration = jiffies - req->start_time; |
1822 | const int rw = rq_data_dir(req); | 1825 | const int rw = rq_data_dir(req); |
1823 | struct hd_struct *part; | 1826 | struct hd_struct *part; |
1824 | int cpu; | 1827 | int cpu; |
1825 | 1828 | ||
1826 | cpu = part_stat_lock(); | 1829 | cpu = part_stat_lock(); |
1827 | part = disk_map_sector_rcu(disk, req->sector); | 1830 | part = disk_map_sector_rcu(disk, req->sector); |
1828 | 1831 | ||
1829 | part_stat_inc(cpu, part, ios[rw]); | 1832 | part_stat_inc(cpu, part, ios[rw]); |
1830 | part_stat_add(cpu, part, ticks[rw], duration); | 1833 | part_stat_add(cpu, part, ticks[rw], duration); |
1831 | part_round_stats(cpu, part); | 1834 | part_round_stats(cpu, part); |
1832 | part_dec_in_flight(part); | 1835 | part_dec_in_flight(part); |
1833 | 1836 | ||
1834 | part_stat_unlock(); | 1837 | part_stat_unlock(); |
1835 | } | 1838 | } |
1836 | 1839 | ||
1837 | if (req->end_io) | 1840 | if (req->end_io) |
1838 | req->end_io(req, error); | 1841 | req->end_io(req, error); |
1839 | else { | 1842 | else { |
1840 | if (blk_bidi_rq(req)) | 1843 | if (blk_bidi_rq(req)) |
1841 | __blk_put_request(req->next_rq->q, req->next_rq); | 1844 | __blk_put_request(req->next_rq->q, req->next_rq); |
1842 | 1845 | ||
1843 | __blk_put_request(req->q, req); | 1846 | __blk_put_request(req->q, req); |
1844 | } | 1847 | } |
1845 | } | 1848 | } |
1846 | 1849 | ||
1847 | /** | 1850 | /** |
1848 | * blk_rq_bytes - Returns bytes left to complete in the entire request | 1851 | * blk_rq_bytes - Returns bytes left to complete in the entire request |
1849 | * @rq: the request being processed | 1852 | * @rq: the request being processed |
1850 | **/ | 1853 | **/ |
1851 | unsigned int blk_rq_bytes(struct request *rq) | 1854 | unsigned int blk_rq_bytes(struct request *rq) |
1852 | { | 1855 | { |
1853 | if (blk_fs_request(rq)) | 1856 | if (blk_fs_request(rq)) |
1854 | return rq->hard_nr_sectors << 9; | 1857 | return rq->hard_nr_sectors << 9; |
1855 | 1858 | ||
1856 | return rq->data_len; | 1859 | return rq->data_len; |
1857 | } | 1860 | } |
1858 | EXPORT_SYMBOL_GPL(blk_rq_bytes); | 1861 | EXPORT_SYMBOL_GPL(blk_rq_bytes); |
1859 | 1862 | ||
1860 | /** | 1863 | /** |
1861 | * blk_rq_cur_bytes - Returns bytes left to complete in the current segment | 1864 | * blk_rq_cur_bytes - Returns bytes left to complete in the current segment |
1862 | * @rq: the request being processed | 1865 | * @rq: the request being processed |
1863 | **/ | 1866 | **/ |
1864 | unsigned int blk_rq_cur_bytes(struct request *rq) | 1867 | unsigned int blk_rq_cur_bytes(struct request *rq) |
1865 | { | 1868 | { |
1866 | if (blk_fs_request(rq)) | 1869 | if (blk_fs_request(rq)) |
1867 | return rq->current_nr_sectors << 9; | 1870 | return rq->current_nr_sectors << 9; |
1868 | 1871 | ||
1869 | if (rq->bio) | 1872 | if (rq->bio) |
1870 | return rq->bio->bi_size; | 1873 | return rq->bio->bi_size; |
1871 | 1874 | ||
1872 | return rq->data_len; | 1875 | return rq->data_len; |
1873 | } | 1876 | } |
1874 | EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); | 1877 | EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); |
1875 | 1878 | ||
1876 | /** | 1879 | /** |
1877 | * end_request - end I/O on the current segment of the request | 1880 | * end_request - end I/O on the current segment of the request |
1878 | * @req: the request being processed | 1881 | * @req: the request being processed |
1879 | * @uptodate: error value or %0/%1 uptodate flag | 1882 | * @uptodate: error value or %0/%1 uptodate flag |
1880 | * | 1883 | * |
1881 | * Description: | 1884 | * Description: |
1882 | * Ends I/O on the current segment of a request. If that is the only | 1885 | * Ends I/O on the current segment of a request. If that is the only |
1883 | * remaining segment, the request is also completed and freed. | 1886 | * remaining segment, the request is also completed and freed. |
1884 | * | 1887 | * |
1885 | * This is a remnant of how older block drivers handled I/O completions. | 1888 | * This is a remnant of how older block drivers handled I/O completions. |
1886 | * Modern drivers typically end I/O on the full request in one go, unless | 1889 | * Modern drivers typically end I/O on the full request in one go, unless |
1887 | * they have a residual value to account for. For that case this function | 1890 | * they have a residual value to account for. For that case this function |
1888 | * isn't really useful, unless the residual just happens to be the | 1891 | * isn't really useful, unless the residual just happens to be the |
1889 | * full current segment. In other words, don't use this function in new | 1892 | * full current segment. In other words, don't use this function in new |
1890 | * code. Use blk_end_request() or __blk_end_request() to end a request. | 1893 | * code. Use blk_end_request() or __blk_end_request() to end a request. |
1891 | **/ | 1894 | **/ |
1892 | void end_request(struct request *req, int uptodate) | 1895 | void end_request(struct request *req, int uptodate) |
1893 | { | 1896 | { |
1894 | int error = 0; | 1897 | int error = 0; |
1895 | 1898 | ||
1896 | if (uptodate <= 0) | 1899 | if (uptodate <= 0) |
1897 | error = uptodate ? uptodate : -EIO; | 1900 | error = uptodate ? uptodate : -EIO; |
1898 | 1901 | ||
1899 | __blk_end_request(req, error, req->hard_cur_sectors << 9); | 1902 | __blk_end_request(req, error, req->hard_cur_sectors << 9); |
1900 | } | 1903 | } |
1901 | EXPORT_SYMBOL(end_request); | 1904 | EXPORT_SYMBOL(end_request); |
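Purely for contrast with the modern helpers, a one-line sketch of the legacy style the comment warns against (req and the error condition are assumed to exist in the caller):

	/* old-style: complete only the current segment of req */
	end_request(req, io_failed ? -EIO : 1);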
1902 | 1905 | ||
1903 | static int end_that_request_data(struct request *rq, int error, | 1906 | static int end_that_request_data(struct request *rq, int error, |
1904 | unsigned int nr_bytes, unsigned int bidi_bytes) | 1907 | unsigned int nr_bytes, unsigned int bidi_bytes) |
1905 | { | 1908 | { |
1906 | if (rq->bio) { | 1909 | if (rq->bio) { |
1907 | if (__end_that_request_first(rq, error, nr_bytes)) | 1910 | if (__end_that_request_first(rq, error, nr_bytes)) |
1908 | return 1; | 1911 | return 1; |
1909 | 1912 | ||
1910 | /* Bidi request must be completed as a whole */ | 1913 | /* Bidi request must be completed as a whole */ |
1911 | if (blk_bidi_rq(rq) && | 1914 | if (blk_bidi_rq(rq) && |
1912 | __end_that_request_first(rq->next_rq, error, bidi_bytes)) | 1915 | __end_that_request_first(rq->next_rq, error, bidi_bytes)) |
1913 | return 1; | 1916 | return 1; |
1914 | } | 1917 | } |
1915 | 1918 | ||
1916 | return 0; | 1919 | return 0; |
1917 | } | 1920 | } |
1918 | 1921 | ||
1919 | /** | 1922 | /** |
1920 | * blk_end_io - Generic end_io function to complete a request. | 1923 | * blk_end_io - Generic end_io function to complete a request. |
1921 | * @rq: the request being processed | 1924 | * @rq: the request being processed |
1922 | * @error: %0 for success, < %0 for error | 1925 | * @error: %0 for success, < %0 for error |
1923 | * @nr_bytes: number of bytes to complete @rq | 1926 | * @nr_bytes: number of bytes to complete @rq |
1924 | * @bidi_bytes: number of bytes to complete @rq->next_rq | 1927 | * @bidi_bytes: number of bytes to complete @rq->next_rq |
1925 | * @drv_callback: function called between completion of bios in the request | 1928 | * @drv_callback: function called between completion of bios in the request |
1926 | * and completion of the request. | 1929 | * and completion of the request. |
1927 | * If the callback returns non %0, this helper returns without | 1930 | * If the callback returns non %0, this helper returns without |
1928 | * completion of the request. | 1931 | * completion of the request. |
1929 | * | 1932 | * |
1930 | * Description: | 1933 | * Description: |
1931 | * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. | 1934 | * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. |
1932 | * If @rq has leftover, sets it up for the next range of segments. | 1935 | * If @rq has leftover, sets it up for the next range of segments. |
1933 | * | 1936 | * |
1934 | * Return: | 1937 | * Return: |
1935 | * %0 - we are done with this request | 1938 | * %0 - we are done with this request |
1936 | * %1 - this request is not freed yet, it still has pending buffers. | 1939 | * %1 - this request is not freed yet, it still has pending buffers. |
1937 | **/ | 1940 | **/ |
1938 | static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, | 1941 | static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, |
1939 | unsigned int bidi_bytes, | 1942 | unsigned int bidi_bytes, |
1940 | int (drv_callback)(struct request *)) | 1943 | int (drv_callback)(struct request *)) |
1941 | { | 1944 | { |
1942 | struct request_queue *q = rq->q; | 1945 | struct request_queue *q = rq->q; |
1943 | unsigned long flags = 0UL; | 1946 | unsigned long flags = 0UL; |
1944 | 1947 | ||
1945 | if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) | 1948 | if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) |
1946 | return 1; | 1949 | return 1; |
1947 | 1950 | ||
1948 | /* Special feature for tricky drivers */ | 1951 | /* Special feature for tricky drivers */ |
1949 | if (drv_callback && drv_callback(rq)) | 1952 | if (drv_callback && drv_callback(rq)) |
1950 | return 1; | 1953 | return 1; |
1951 | 1954 | ||
1952 | add_disk_randomness(rq->rq_disk); | 1955 | add_disk_randomness(rq->rq_disk); |
1953 | 1956 | ||
1954 | spin_lock_irqsave(q->queue_lock, flags); | 1957 | spin_lock_irqsave(q->queue_lock, flags); |
1955 | end_that_request_last(rq, error); | 1958 | end_that_request_last(rq, error); |
1956 | spin_unlock_irqrestore(q->queue_lock, flags); | 1959 | spin_unlock_irqrestore(q->queue_lock, flags); |
1957 | 1960 | ||
1958 | return 0; | 1961 | return 0; |
1959 | } | 1962 | } |
1960 | 1963 | ||
1961 | /** | 1964 | /** |
1962 | * blk_end_request - Helper function for drivers to complete the request. | 1965 | * blk_end_request - Helper function for drivers to complete the request. |
1963 | * @rq: the request being processed | 1966 | * @rq: the request being processed |
1964 | * @error: %0 for success, < %0 for error | 1967 | * @error: %0 for success, < %0 for error |
1965 | * @nr_bytes: number of bytes to complete | 1968 | * @nr_bytes: number of bytes to complete |
1966 | * | 1969 | * |
1967 | * Description: | 1970 | * Description: |
1968 | * Ends I/O on a number of bytes attached to @rq. | 1971 | * Ends I/O on a number of bytes attached to @rq. |
1969 | * If @rq has leftover, sets it up for the next range of segments. | 1972 | * If @rq has leftover, sets it up for the next range of segments. |
1970 | * | 1973 | * |
1971 | * Return: | 1974 | * Return: |
1972 | * %0 - we are done with this request | 1975 | * %0 - we are done with this request |
1973 | * %1 - still buffers pending for this request | 1976 | * %1 - still buffers pending for this request |
1974 | **/ | 1977 | **/ |
1975 | int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) | 1978 | int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) |
1976 | { | 1979 | { |
1977 | return blk_end_io(rq, error, nr_bytes, 0, NULL); | 1980 | return blk_end_io(rq, error, nr_bytes, 0, NULL); |
1978 | } | 1981 | } |
1979 | EXPORT_SYMBOL_GPL(blk_end_request); | 1982 | EXPORT_SYMBOL_GPL(blk_end_request); |
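By way of a hedged example (the completion hook and mydev naming are assumptions): a typical driver finishes the whole request in one call, using blk_rq_bytes() from above to cover every remaining byte.

	#include <linux/blkdev.h>

	/* Sketch: driver completion path, queue lock NOT held. */
	static void mydev_complete_request(struct request *req, int error)
	{
		if (blk_end_request(req, error, blk_rq_bytes(req)))
			return;	/* non-zero: buffers still pending, req not freed */
		/* request fully completed and freed at this point */
	}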
1980 | 1983 | ||
1981 | /** | 1984 | /** |
1982 | * __blk_end_request - Helper function for drivers to complete the request. | 1985 | * __blk_end_request - Helper function for drivers to complete the request. |
1983 | * @rq: the request being processed | 1986 | * @rq: the request being processed |
1984 | * @error: %0 for success, < %0 for error | 1987 | * @error: %0 for success, < %0 for error |
1985 | * @nr_bytes: number of bytes to complete | 1988 | * @nr_bytes: number of bytes to complete |
1986 | * | 1989 | * |
1987 | * Description: | 1990 | * Description: |
1988 | * Must be called with queue lock held unlike blk_end_request(). | 1991 | * Must be called with queue lock held unlike blk_end_request(). |
1989 | * | 1992 | * |
1990 | * Return: | 1993 | * Return: |
1991 | * %0 - we are done with this request | 1994 | * %0 - we are done with this request |
1992 | * %1 - still buffers pending for this request | 1995 | * %1 - still buffers pending for this request |
1993 | **/ | 1996 | **/ |
1994 | int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) | 1997 | int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) |
1995 | { | 1998 | { |
1996 | if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) | 1999 | if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) |
1997 | return 1; | 2000 | return 1; |
1998 | 2001 | ||
1999 | add_disk_randomness(rq->rq_disk); | 2002 | add_disk_randomness(rq->rq_disk); |
2000 | 2003 | ||
2001 | end_that_request_last(rq, error); | 2004 | end_that_request_last(rq, error); |
2002 | 2005 | ||
2003 | return 0; | 2006 | return 0; |
2004 | } | 2007 | } |
2005 | EXPORT_SYMBOL_GPL(__blk_end_request); | 2008 | EXPORT_SYMBOL_GPL(__blk_end_request); |
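A short sketch of the locked variant, assuming the caller is already inside a ->request_fn() or another context that holds q->queue_lock:

	/* queue lock held: fail the whole request in place */
	__blk_end_request(req, -EIO, blk_rq_bytes(req));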
2006 | 2009 | ||
2007 | /** | 2010 | /** |
2008 | * blk_end_bidi_request - Helper function for drivers to complete bidi request. | 2011 | * blk_end_bidi_request - Helper function for drivers to complete bidi request. |
2009 | * @rq: the bidi request being processed | 2012 | * @rq: the bidi request being processed |
2010 | * @error: %0 for success, < %0 for error | 2013 | * @error: %0 for success, < %0 for error |
2011 | * @nr_bytes: number of bytes to complete @rq | 2014 | * @nr_bytes: number of bytes to complete @rq |
2012 | * @bidi_bytes: number of bytes to complete @rq->next_rq | 2015 | * @bidi_bytes: number of bytes to complete @rq->next_rq |
2013 | * | 2016 | * |
2014 | * Description: | 2017 | * Description: |
2015 | * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. | 2018 | * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. |
2016 | * | 2019 | * |
2017 | * Return: | 2020 | * Return: |
2018 | * %0 - we are done with this request | 2021 | * %0 - we are done with this request |
2019 | * %1 - still buffers pending for this request | 2022 | * %1 - still buffers pending for this request |
2020 | **/ | 2023 | **/ |
2021 | int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, | 2024 | int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, |
2022 | unsigned int bidi_bytes) | 2025 | unsigned int bidi_bytes) |
2023 | { | 2026 | { |
2024 | return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); | 2027 | return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); |
2025 | } | 2028 | } |
2026 | EXPORT_SYMBOL_GPL(blk_end_bidi_request); | 2029 | EXPORT_SYMBOL_GPL(blk_end_bidi_request); |
2027 | 2030 | ||
2028 | /** | 2031 | /** |
2029 | * blk_update_request - Special helper function for request stacking drivers | 2032 | * blk_update_request - Special helper function for request stacking drivers |
2030 | * @rq: the request being processed | 2033 | * @rq: the request being processed |
2031 | * @error: %0 for success, < %0 for error | 2034 | * @error: %0 for success, < %0 for error |
2032 | * @nr_bytes: number of bytes to complete @rq | 2035 | * @nr_bytes: number of bytes to complete @rq |
2033 | * | 2036 | * |
2034 | * Description: | 2037 | * Description: |
2035 | * Ends I/O on a number of bytes attached to @rq, but doesn't complete | 2038 | * Ends I/O on a number of bytes attached to @rq, but doesn't complete |
2036 | * the request structure even if @rq doesn't have leftover. | 2039 | * the request structure even if @rq doesn't have leftover. |
2037 | * If @rq has leftover, sets it up for the next range of segments. | 2040 | * If @rq has leftover, sets it up for the next range of segments. |
2038 | * | 2041 | * |
2039 | * This special helper function is only for request stacking drivers | 2042 | * This special helper function is only for request stacking drivers |
2040 | * (e.g. request-based dm) so that they can handle partial completion. | 2043 | * (e.g. request-based dm) so that they can handle partial completion. |
2041 | * Actual device drivers should use blk_end_request instead. | 2044 | * Actual device drivers should use blk_end_request instead. |
2042 | */ | 2045 | */ |
2043 | void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) | 2046 | void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) |
2044 | { | 2047 | { |
2045 | if (!end_that_request_data(rq, error, nr_bytes, 0)) { | 2048 | if (!end_that_request_data(rq, error, nr_bytes, 0)) { |
2046 | /* | 2049 | /* |
2047 | * These members are not updated in end_that_request_data() | 2050 | * These members are not updated in end_that_request_data() |
2048 | * when all bios are completed. | 2051 | * when all bios are completed. |
2049 | * Update them so that the request stacking driver can find | 2052 | * Update them so that the request stacking driver can find |
2050 | * how many bytes remain in the request later. | 2053 | * how many bytes remain in the request later. |
2051 | */ | 2054 | */ |
2052 | rq->nr_sectors = rq->hard_nr_sectors = 0; | 2055 | rq->nr_sectors = rq->hard_nr_sectors = 0; |
2053 | rq->current_nr_sectors = rq->hard_cur_sectors = 0; | 2056 | rq->current_nr_sectors = rq->hard_cur_sectors = 0; |
2054 | } | 2057 | } |
2055 | } | 2058 | } |
2056 | EXPORT_SYMBOL_GPL(blk_update_request); | 2059 | EXPORT_SYMBOL_GPL(blk_update_request); |
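A hedged sketch of the partial-completion usage this helper exists for (the wrapper name and the later requeue decision are assumptions): the stacking driver accounts the bytes the lower device finished, and the request survives either way so it can be requeued or completed later.

	#include <linux/blkdev.h>

	/* Sketch: a request stacking driver noting partial progress on rq. */
	static void stacking_note_progress(struct request *rq,
					   unsigned int done, int error)
	{
		blk_update_request(rq, error, done);
		/* rq is never freed here, even if all bios completed; the
		 * stacking driver later requeues it or ends it, e.g. with
		 * blk_end_request(), once all clones have finished. */
	}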
2057 | 2060 | ||
2058 | /** | 2061 | /** |
2059 | * blk_end_request_callback - Special helper function for tricky drivers | 2062 | * blk_end_request_callback - Special helper function for tricky drivers |
2060 | * @rq: the request being processed | 2063 | * @rq: the request being processed |
2061 | * @error: %0 for success, < %0 for error | 2064 | * @error: %0 for success, < %0 for error |
2062 | * @nr_bytes: number of bytes to complete | 2065 | * @nr_bytes: number of bytes to complete |
2063 | * @drv_callback: function called between completion of bios in the request | 2066 | * @drv_callback: function called between completion of bios in the request |
2064 | * and completion of the request. | 2067 | * and completion of the request. |
2065 | * If the callback returns non %0, this helper returns without | 2068 | * If the callback returns non %0, this helper returns without |
2066 | * completion of the request. | 2069 | * completion of the request. |
2067 | * | 2070 | * |
2068 | * Description: | 2071 | * Description: |
2069 | * Ends I/O on a number of bytes attached to @rq. | 2072 | * Ends I/O on a number of bytes attached to @rq. |
2070 | * If @rq has leftover, sets it up for the next range of segments. | 2073 | * If @rq has leftover, sets it up for the next range of segments. |
2071 | * | 2074 | * |
2072 | * This special helper function is used only for existing tricky drivers. | 2075 | * This special helper function is used only for existing tricky drivers. |
2073 | * (e.g. cdrom_newpc_intr() of ide-cd) | 2076 | * (e.g. cdrom_newpc_intr() of ide-cd) |
2074 | * This interface will be removed when such drivers are rewritten. | 2077 | * This interface will be removed when such drivers are rewritten. |
2075 | * Don't use this interface in other places anymore. | 2078 | * Don't use this interface in other places anymore. |
2076 | * | 2079 | * |
2077 | * Return: | 2080 | * Return: |
2078 | * %0 - we are done with this request | 2081 | * %0 - we are done with this request |
2079 | * %1 - this request is not freed yet. | 2082 | * %1 - this request is not freed yet. |
2080 | * this request still has pending buffers or | 2083 | * this request still has pending buffers or |
2081 | * the driver doesn't want to finish this request yet. | 2084 | * the driver doesn't want to finish this request yet. |
2082 | **/ | 2085 | **/ |
2083 | int blk_end_request_callback(struct request *rq, int error, | 2086 | int blk_end_request_callback(struct request *rq, int error, |
2084 | unsigned int nr_bytes, | 2087 | unsigned int nr_bytes, |
2085 | int (drv_callback)(struct request *)) | 2088 | int (drv_callback)(struct request *)) |
2086 | { | 2089 | { |
2087 | return blk_end_io(rq, error, nr_bytes, 0, drv_callback); | 2090 | return blk_end_io(rq, error, nr_bytes, 0, drv_callback); |
2088 | } | 2091 | } |
2089 | EXPORT_SYMBOL_GPL(blk_end_request_callback); | 2092 | EXPORT_SYMBOL_GPL(blk_end_request_callback); |
2090 | 2093 | ||
2091 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 2094 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
2092 | struct bio *bio) | 2095 | struct bio *bio) |
2093 | { | 2096 | { |
2094 | /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and | 2097 | /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and |
2095 | we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ | 2098 | we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ |
2096 | rq->cmd_flags |= (bio->bi_rw & 3); | 2099 | rq->cmd_flags |= (bio->bi_rw & 3); |
2097 | 2100 | ||
2098 | if (bio_has_data(bio)) { | 2101 | if (bio_has_data(bio)) { |
2099 | rq->nr_phys_segments = bio_phys_segments(q, bio); | 2102 | rq->nr_phys_segments = bio_phys_segments(q, bio); |
2100 | rq->buffer = bio_data(bio); | 2103 | rq->buffer = bio_data(bio); |
2101 | } | 2104 | } |
2102 | rq->current_nr_sectors = bio_cur_sectors(bio); | 2105 | rq->current_nr_sectors = bio_cur_sectors(bio); |
2103 | rq->hard_cur_sectors = rq->current_nr_sectors; | 2106 | rq->hard_cur_sectors = rq->current_nr_sectors; |
2104 | rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); | 2107 | rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); |
2105 | rq->data_len = bio->bi_size; | 2108 | rq->data_len = bio->bi_size; |
2106 | 2109 | ||
2107 | rq->bio = rq->biotail = bio; | 2110 | rq->bio = rq->biotail = bio; |
2108 | 2111 | ||
2109 | if (bio->bi_bdev) | 2112 | if (bio->bi_bdev) |
2110 | rq->rq_disk = bio->bi_bdev->bd_disk; | 2113 | rq->rq_disk = bio->bi_bdev->bd_disk; |
2111 | } | 2114 | } |
2112 | 2115 | ||
2113 | /** | 2116 | /** |
2114 | * blk_lld_busy - Check if underlying low-level drivers of a device are busy | 2117 | * blk_lld_busy - Check if underlying low-level drivers of a device are busy |
2115 | * @q : the queue of the device being checked | 2118 | * @q : the queue of the device being checked |
2116 | * | 2119 | * |
2117 | * Description: | 2120 | * Description: |
2118 | * Check if underlying low-level drivers of a device are busy. | 2121 | * Check if underlying low-level drivers of a device are busy. |
2119 | * If the drivers want to export their busy state, they must set own | 2122 | * If the drivers want to export their busy state, they must set own |
2120 | * exporting function using blk_queue_lld_busy() first. | 2123 | * exporting function using blk_queue_lld_busy() first. |
2121 | * | 2124 | * |
2122 | * Basically, this function is used only by request stacking drivers | 2125 | * Basically, this function is used only by request stacking drivers |
2123 | * to stop dispatching requests to underlying devices when underlying | 2126 | * to stop dispatching requests to underlying devices when underlying |
2124 | * devices are busy. This behavior helps more I/O merging on the queue | 2127 | * devices are busy. This behavior helps more I/O merging on the queue |
2125 | * of the request stacking driver and prevents I/O throughput regression | 2128 | * of the request stacking driver and prevents I/O throughput regression |
2126 | * on burst I/O load. | 2129 | * on burst I/O load. |
2127 | * | 2130 | * |
2128 | * Return: | 2131 | * Return: |
2129 | * 0 - Not busy (The request stacking driver should dispatch request) | 2132 | * 0 - Not busy (The request stacking driver should dispatch request) |
2130 | * 1 - Busy (The request stacking driver should stop dispatching request) | 2133 | * 1 - Busy (The request stacking driver should stop dispatching request) |
2131 | */ | 2134 | */ |
2132 | int blk_lld_busy(struct request_queue *q) | 2135 | int blk_lld_busy(struct request_queue *q) |
2133 | { | 2136 | { |
2134 | if (q->lld_busy_fn) | 2137 | if (q->lld_busy_fn) |
2135 | return q->lld_busy_fn(q); | 2138 | return q->lld_busy_fn(q); |
2136 | 2139 | ||
2137 | return 0; | 2140 | return 0; |
2138 | } | 2141 | } |
2139 | EXPORT_SYMBOL_GPL(blk_lld_busy); | 2142 | EXPORT_SYMBOL_GPL(blk_lld_busy); |
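A sketch of the export side mentioned in the description, assuming a driver-private mydev structure hung off q->queuedata; the saturation test is illustrative only.

	#include <linux/blkdev.h>

	struct mydev {
		unsigned int outstanding;
		unsigned int queue_depth;
	};

	/* Sketch: report "busy" once the device is saturated. */
	static int mydev_lld_busy(struct request_queue *q)
	{
		struct mydev *dev = q->queuedata;

		return dev->outstanding >= dev->queue_depth;
	}

	/* at queue setup time:
	 *	blk_queue_lld_busy(q, mydev_lld_busy);
	 */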
2140 | 2143 | ||
2141 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) | 2144 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) |
2142 | { | 2145 | { |
2143 | return queue_work(kblockd_workqueue, work); | 2146 | return queue_work(kblockd_workqueue, work); |
2144 | } | 2147 | } |
2145 | EXPORT_SYMBOL(kblockd_schedule_work); | 2148 | EXPORT_SYMBOL(kblockd_schedule_work); |
2146 | 2149 | ||
2147 | void kblockd_flush_work(struct work_struct *work) | 2150 | void kblockd_flush_work(struct work_struct *work) |
2148 | { | 2151 | { |
2149 | cancel_work_sync(work); | 2152 | cancel_work_sync(work); |
2150 | } | 2153 | } |
2151 | EXPORT_SYMBOL(kblockd_flush_work); | 2154 | EXPORT_SYMBOL(kblockd_flush_work); |
2152 | 2155 | ||
2153 | int __init blk_dev_init(void) | 2156 | int __init blk_dev_init(void) |
2154 | { | 2157 | { |
2155 | kblockd_workqueue = create_workqueue("kblockd"); | 2158 | kblockd_workqueue = create_workqueue("kblockd"); |
2156 | if (!kblockd_workqueue) | 2159 | if (!kblockd_workqueue) |
2157 | panic("Failed to create kblockd\n"); | 2160 | panic("Failed to create kblockd\n"); |
2158 | 2161 | ||
2159 | request_cachep = kmem_cache_create("blkdev_requests", | 2162 | request_cachep = kmem_cache_create("blkdev_requests", |
2160 | sizeof(struct request), 0, SLAB_PANIC, NULL); | 2163 | sizeof(struct request), 0, SLAB_PANIC, NULL); |
2161 | 2164 | ||
2162 | blk_requestq_cachep = kmem_cache_create("blkdev_queue", | 2165 | blk_requestq_cachep = kmem_cache_create("blkdev_queue", |
2163 | sizeof(struct request_queue), 0, SLAB_PANIC, NULL); | 2166 | sizeof(struct request_queue), 0, SLAB_PANIC, NULL); |
2164 | 2167 | ||
2165 | return 0; | 2168 | return 0; |
2166 | } | 2169 | } |
2167 | 2170 | ||
2168 | 2171 |
fs/buffer.c
1 | /* | 1 | /* |
2 | * linux/fs/buffer.c | 2 | * linux/fs/buffer.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992, 2002 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 2002 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 | 8 | * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 |
9 | * | 9 | * |
10 | * Removed a lot of unnecessary code and simplified things now that | 10 | * Removed a lot of unnecessary code and simplified things now that |
11 | * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 | 11 | * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 |
12 | * | 12 | * |
13 | * Speed up hash, lru, and free list operations. Use gfp() for allocating | 13 | * Speed up hash, lru, and free list operations. Use gfp() for allocating |
14 | * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM | 14 | * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM |
15 | * | 15 | * |
16 | * Added 32k buffer block sizes - these are required for older ARM systems. - RMK | 16 | * Added 32k buffer block sizes - these are required for older ARM systems. - RMK |
17 | * | 17 | * |
18 | * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> | 18 | * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/kernel.h> | 21 | #include <linux/kernel.h> |
22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
23 | #include <linux/fs.h> | 23 | #include <linux/fs.h> |
24 | #include <linux/mm.h> | 24 | #include <linux/mm.h> |
25 | #include <linux/percpu.h> | 25 | #include <linux/percpu.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/capability.h> | 27 | #include <linux/capability.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
30 | #include <linux/quotaops.h> | 30 | #include <linux/quotaops.h> |
31 | #include <linux/highmem.h> | 31 | #include <linux/highmem.h> |
32 | #include <linux/module.h> | 32 | #include <linux/module.h> |
33 | #include <linux/writeback.h> | 33 | #include <linux/writeback.h> |
34 | #include <linux/hash.h> | 34 | #include <linux/hash.h> |
35 | #include <linux/suspend.h> | 35 | #include <linux/suspend.h> |
36 | #include <linux/buffer_head.h> | 36 | #include <linux/buffer_head.h> |
37 | #include <linux/task_io_accounting_ops.h> | 37 | #include <linux/task_io_accounting_ops.h> |
38 | #include <linux/bio.h> | 38 | #include <linux/bio.h> |
39 | #include <linux/notifier.h> | 39 | #include <linux/notifier.h> |
40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
41 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
42 | #include <linux/mpage.h> | 42 | #include <linux/mpage.h> |
43 | #include <linux/bit_spinlock.h> | 43 | #include <linux/bit_spinlock.h> |
44 | 44 | ||
45 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); | 45 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); |
46 | 46 | ||
47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) | 47 | #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) |
48 | 48 | ||
49 | inline void | 49 | inline void |
50 | init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) | 50 | init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) |
51 | { | 51 | { |
52 | bh->b_end_io = handler; | 52 | bh->b_end_io = handler; |
53 | bh->b_private = private; | 53 | bh->b_private = private; |
54 | } | 54 | } |
55 | 55 | ||
56 | static int sync_buffer(void *word) | 56 | static int sync_buffer(void *word) |
57 | { | 57 | { |
58 | struct block_device *bd; | 58 | struct block_device *bd; |
59 | struct buffer_head *bh | 59 | struct buffer_head *bh |
60 | = container_of(word, struct buffer_head, b_state); | 60 | = container_of(word, struct buffer_head, b_state); |
61 | 61 | ||
62 | smp_mb(); | 62 | smp_mb(); |
63 | bd = bh->b_bdev; | 63 | bd = bh->b_bdev; |
64 | if (bd) | 64 | if (bd) |
65 | blk_run_address_space(bd->bd_inode->i_mapping); | 65 | blk_run_address_space(bd->bd_inode->i_mapping); |
66 | io_schedule(); | 66 | io_schedule(); |
67 | return 0; | 67 | return 0; |
68 | } | 68 | } |
69 | 69 | ||
70 | void __lock_buffer(struct buffer_head *bh) | 70 | void __lock_buffer(struct buffer_head *bh) |
71 | { | 71 | { |
72 | wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, | 72 | wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, |
73 | TASK_UNINTERRUPTIBLE); | 73 | TASK_UNINTERRUPTIBLE); |
74 | } | 74 | } |
75 | EXPORT_SYMBOL(__lock_buffer); | 75 | EXPORT_SYMBOL(__lock_buffer); |
76 | 76 | ||
77 | void unlock_buffer(struct buffer_head *bh) | 77 | void unlock_buffer(struct buffer_head *bh) |
78 | { | 78 | { |
79 | clear_bit_unlock(BH_Lock, &bh->b_state); | 79 | clear_bit_unlock(BH_Lock, &bh->b_state); |
80 | smp_mb__after_clear_bit(); | 80 | smp_mb__after_clear_bit(); |
81 | wake_up_bit(&bh->b_state, BH_Lock); | 81 | wake_up_bit(&bh->b_state, BH_Lock); |
82 | } | 82 | } |
83 | 83 | ||
84 | /* | 84 | /* |
85 | * Block until a buffer comes unlocked. This doesn't stop it | 85 | * Block until a buffer comes unlocked. This doesn't stop it |
86 | * from becoming locked again - you have to lock it yourself | 86 | * from becoming locked again - you have to lock it yourself |
87 | * if you want to preserve its state. | 87 | * if you want to preserve its state. |
88 | */ | 88 | */ |
89 | void __wait_on_buffer(struct buffer_head * bh) | 89 | void __wait_on_buffer(struct buffer_head * bh) |
90 | { | 90 | { |
91 | wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); | 91 | wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); |
92 | } | 92 | } |
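For orientation, a small sketch of the usual caller-side pattern; wait_on_buffer() is the inline wrapper from buffer_head.h that falls through to __wait_on_buffer() when the buffer is locked.

	/* sketch: wait for in-flight I/O, then examine the result */
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		/* the read or write failed; handle the error */ ;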
93 | 93 | ||
94 | static void | 94 | static void |
95 | __clear_page_buffers(struct page *page) | 95 | __clear_page_buffers(struct page *page) |
96 | { | 96 | { |
97 | ClearPagePrivate(page); | 97 | ClearPagePrivate(page); |
98 | set_page_private(page, 0); | 98 | set_page_private(page, 0); |
99 | page_cache_release(page); | 99 | page_cache_release(page); |
100 | } | 100 | } |
101 | 101 | ||
102 | |||
103 | static int quiet_error(struct buffer_head *bh) | ||
104 | { | ||
105 | if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) | ||
106 | return 0; | ||
107 | return 1; | ||
108 | } | ||
109 | |||
110 | |||
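quiet_error() returns 1 either when the buffer was explicitly marked quiet or when the printk ratelimit has kicked in. The BH_Quiet bit itself is set elsewhere in this patch when the completing bio carries the quiet flag that originated from a REQ_QUIET request; roughly, and as a sketch rather than the exact hunk:

	/* in the bh end_io path, where bh == bio->bi_private (sketch): */
	if (test_bit(BIO_QUIET, &bio->bi_flags))
		set_bit(BH_Quiet, &bh->b_state);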
102 | static void buffer_io_error(struct buffer_head *bh) | 111 | static void buffer_io_error(struct buffer_head *bh) |
103 | { | 112 | { |
104 | char b[BDEVNAME_SIZE]; | 113 | char b[BDEVNAME_SIZE]; |
105 | |||
106 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", | 114 | printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", |
107 | bdevname(bh->b_bdev, b), | 115 | bdevname(bh->b_bdev, b), |
108 | (unsigned long long)bh->b_blocknr); | 116 | (unsigned long long)bh->b_blocknr); |
109 | } | 117 | } |
110 | 118 | ||
111 | /* | 119 | /* |
112 | * End-of-IO handler helper function which does not touch the bh after | 120 | * End-of-IO handler helper function which does not touch the bh after |
113 | * unlocking it. | 121 | * unlocking it. |
114 | * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but | 122 | * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but |
115 | * a race there is benign: unlock_buffer() only uses the bh's address for | 123 | * a race there is benign: unlock_buffer() only uses the bh's address for |
116 | * hashing after unlocking the buffer, so it doesn't actually touch the bh | 124 | * hashing after unlocking the buffer, so it doesn't actually touch the bh |
117 | * itself. | 125 | * itself. |
118 | */ | 126 | */ |
119 | static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) | 127 | static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) |
120 | { | 128 | { |
121 | if (uptodate) { | 129 | if (uptodate) { |
122 | set_buffer_uptodate(bh); | 130 | set_buffer_uptodate(bh); |
123 | } else { | 131 | } else { |
124 | /* This happens, due to failed READA attempts. */ | 132 | /* This happens, due to failed READA attempts. */ |
125 | clear_buffer_uptodate(bh); | 133 | clear_buffer_uptodate(bh); |
126 | } | 134 | } |
127 | unlock_buffer(bh); | 135 | unlock_buffer(bh); |
128 | } | 136 | } |
129 | 137 | ||
130 | /* | 138 | /* |
131 | * Default synchronous end-of-IO handler.. Just mark it up-to-date and | 139 | * Default synchronous end-of-IO handler.. Just mark it up-to-date and |
132 | * unlock the buffer. This is what ll_rw_block uses too. | 140 | * unlock the buffer. This is what ll_rw_block uses too. |
133 | */ | 141 | */ |
134 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate) | 142 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate) |
135 | { | 143 | { |
136 | __end_buffer_read_notouch(bh, uptodate); | 144 | __end_buffer_read_notouch(bh, uptodate); |
137 | put_bh(bh); | 145 | put_bh(bh); |
138 | } | 146 | } |
139 | 147 | ||
140 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) | 148 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate) |
141 | { | 149 | { |
142 | char b[BDEVNAME_SIZE]; | 150 | char b[BDEVNAME_SIZE]; |
143 | 151 | ||
144 | if (uptodate) { | 152 | if (uptodate) { |
145 | set_buffer_uptodate(bh); | 153 | set_buffer_uptodate(bh); |
146 | } else { | 154 | } else { |
147 | if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { | 155 | if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) { |
148 | buffer_io_error(bh); | 156 | buffer_io_error(bh); |
149 | printk(KERN_WARNING "lost page write due to " | 157 | printk(KERN_WARNING "lost page write due to " |
150 | "I/O error on %s\n", | 158 | "I/O error on %s\n", |
151 | bdevname(bh->b_bdev, b)); | 159 | bdevname(bh->b_bdev, b)); |
152 | } | 160 | } |
153 | set_buffer_write_io_error(bh); | 161 | set_buffer_write_io_error(bh); |
154 | clear_buffer_uptodate(bh); | 162 | clear_buffer_uptodate(bh); |
155 | } | 163 | } |
156 | unlock_buffer(bh); | 164 | unlock_buffer(bh); |
157 | put_bh(bh); | 165 | put_bh(bh); |
158 | } | 166 | } |
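A sketch of how this completion handler is typically wired up for a synchronous write of a single buffer, mirroring what sync_dirty_buffer() does; treat it as illustrative rather than part of this change.

	lock_buffer(bh);
	get_bh(bh);			/* end_buffer_write_sync() drops it */
	bh->b_end_io = end_buffer_write_sync;
	submit_bh(WRITE, bh);
	wait_on_buffer(bh);
	if (!buffer_uptodate(bh))
		/* lost page write; the handler also set write_io_error */ ;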
159 | 167 | ||
160 | /* | 168 | /* |
161 | * Write out and wait upon all the dirty data associated with a block | 169 | * Write out and wait upon all the dirty data associated with a block |
162 | * device via its mapping. Does not take the superblock lock. | 170 | * device via its mapping. Does not take the superblock lock. |
163 | */ | 171 | */ |
164 | int sync_blockdev(struct block_device *bdev) | 172 | int sync_blockdev(struct block_device *bdev) |
165 | { | 173 | { |
166 | int ret = 0; | 174 | int ret = 0; |
167 | 175 | ||
168 | if (bdev) | 176 | if (bdev) |
169 | ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); | 177 | ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); |
170 | return ret; | 178 | return ret; |
171 | } | 179 | } |
172 | EXPORT_SYMBOL(sync_blockdev); | 180 | EXPORT_SYMBOL(sync_blockdev); |
173 | 181 | ||
174 | /* | 182 | /* |
175 | * Write out and wait upon all dirty data associated with this | 183 | * Write out and wait upon all dirty data associated with this |
176 | * device. Filesystem data as well as the underlying block | 184 | * device. Filesystem data as well as the underlying block |
177 | * device. Takes the superblock lock. | 185 | * device. Takes the superblock lock. |
178 | */ | 186 | */ |
179 | int fsync_bdev(struct block_device *bdev) | 187 | int fsync_bdev(struct block_device *bdev) |
180 | { | 188 | { |
181 | struct super_block *sb = get_super(bdev); | 189 | struct super_block *sb = get_super(bdev); |
182 | if (sb) { | 190 | if (sb) { |
183 | int res = fsync_super(sb); | 191 | int res = fsync_super(sb); |
184 | drop_super(sb); | 192 | drop_super(sb); |
185 | return res; | 193 | return res; |
186 | } | 194 | } |
187 | return sync_blockdev(bdev); | 195 | return sync_blockdev(bdev); |
188 | } | 196 | } |
189 | 197 | ||
190 | /** | 198 | /** |
191 | * freeze_bdev -- lock a filesystem and force it into a consistent state | 199 | * freeze_bdev -- lock a filesystem and force it into a consistent state |
192 | * @bdev: blockdevice to lock | 200 | * @bdev: blockdevice to lock |
193 | * | 201 | * |
194 | * This takes the block device bd_mount_sem to make sure no new mounts | 202 | * This takes the block device bd_mount_sem to make sure no new mounts |
195 | * happen on bdev until thaw_bdev() is called. | 203 | * happen on bdev until thaw_bdev() is called. |
196 | * If a superblock is found on this device, we take the s_umount semaphore | 204 | * If a superblock is found on this device, we take the s_umount semaphore |
197 | * on it to make sure nobody unmounts until the snapshot creation is done. | 205 | * on it to make sure nobody unmounts until the snapshot creation is done. |
198 | */ | 206 | */ |
199 | struct super_block *freeze_bdev(struct block_device *bdev) | 207 | struct super_block *freeze_bdev(struct block_device *bdev) |
200 | { | 208 | { |
201 | struct super_block *sb; | 209 | struct super_block *sb; |
202 | 210 | ||
203 | down(&bdev->bd_mount_sem); | 211 | down(&bdev->bd_mount_sem); |
204 | sb = get_super(bdev); | 212 | sb = get_super(bdev); |
205 | if (sb && !(sb->s_flags & MS_RDONLY)) { | 213 | if (sb && !(sb->s_flags & MS_RDONLY)) { |
206 | sb->s_frozen = SB_FREEZE_WRITE; | 214 | sb->s_frozen = SB_FREEZE_WRITE; |
207 | smp_wmb(); | 215 | smp_wmb(); |
208 | 216 | ||
209 | __fsync_super(sb); | 217 | __fsync_super(sb); |
210 | 218 | ||
211 | sb->s_frozen = SB_FREEZE_TRANS; | 219 | sb->s_frozen = SB_FREEZE_TRANS; |
212 | smp_wmb(); | 220 | smp_wmb(); |
213 | 221 | ||
214 | sync_blockdev(sb->s_bdev); | 222 | sync_blockdev(sb->s_bdev); |
215 | 223 | ||
216 | if (sb->s_op->write_super_lockfs) | 224 | if (sb->s_op->write_super_lockfs) |
217 | sb->s_op->write_super_lockfs(sb); | 225 | sb->s_op->write_super_lockfs(sb); |
218 | } | 226 | } |
219 | 227 | ||
220 | sync_blockdev(bdev); | 228 | sync_blockdev(bdev); |
221 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ | 229 | return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ |
222 | } | 230 | } |
223 | EXPORT_SYMBOL(freeze_bdev); | 231 | EXPORT_SYMBOL(freeze_bdev); |
224 | 232 | ||
225 | /** | 233 | /** |
226 | * thaw_bdev -- unlock filesystem | 234 | * thaw_bdev -- unlock filesystem |
227 | * @bdev: blockdevice to unlock | 235 | * @bdev: blockdevice to unlock |
228 | * @sb: associated superblock | 236 | * @sb: associated superblock |
229 | * | 237 | * |
230 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). | 238 | * Unlocks the filesystem and marks it writeable again after freeze_bdev(). |
231 | */ | 239 | */ |
232 | void thaw_bdev(struct block_device *bdev, struct super_block *sb) | 240 | void thaw_bdev(struct block_device *bdev, struct super_block *sb) |
233 | { | 241 | { |
234 | if (sb) { | 242 | if (sb) { |
235 | BUG_ON(sb->s_bdev != bdev); | 243 | BUG_ON(sb->s_bdev != bdev); |
236 | 244 | ||
237 | if (sb->s_op->unlockfs) | 245 | if (sb->s_op->unlockfs) |
238 | sb->s_op->unlockfs(sb); | 246 | sb->s_op->unlockfs(sb); |
239 | sb->s_frozen = SB_UNFROZEN; | 247 | sb->s_frozen = SB_UNFROZEN; |
240 | smp_wmb(); | 248 | smp_wmb(); |
241 | wake_up(&sb->s_wait_unfrozen); | 249 | wake_up(&sb->s_wait_unfrozen); |
242 | drop_super(sb); | 250 | drop_super(sb); |
243 | } | 251 | } |
244 | 252 | ||
245 | up(&bdev->bd_mount_sem); | 253 | up(&bdev->bd_mount_sem); |
246 | } | 254 | } |
247 | EXPORT_SYMBOL(thaw_bdev); | 255 | EXPORT_SYMBOL(thaw_bdev); |
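Taken together, a freeze/thaw sketch as a snapshot-style caller might use it; the snapshot step itself is a placeholder.

	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* blocks new mounts, syncs, freezes fs */
	/* ... take the point-in-time copy of bdev here ... */
	thaw_bdev(bdev, sb);		/* unfreezes and releases bd_mount_sem */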
248 | 256 | ||
249 | /* | 257 | /* |
250 | * Various filesystems appear to want __find_get_block to be non-blocking. | 258 | * Various filesystems appear to want __find_get_block to be non-blocking. |
251 | * But it's the page lock which protects the buffers. To get around this, | 259 | * But it's the page lock which protects the buffers. To get around this, |
252 | * we get exclusion from try_to_free_buffers with the blockdev mapping's | 260 | * we get exclusion from try_to_free_buffers with the blockdev mapping's |
253 | * private_lock. | 261 | * private_lock. |
254 | * | 262 | * |
255 | * Hack idea: for the blockdev mapping, i_bufferlist_lock contention | 263 | * Hack idea: for the blockdev mapping, i_bufferlist_lock contention |
256 | * may be quite high. This code could TryLock the page, and if that | 264 | * may be quite high. This code could TryLock the page, and if that |
257 | * succeeds, there is no need to take private_lock. (But if | 265 | * succeeds, there is no need to take private_lock. (But if |
258 | * private_lock is contended then so is mapping->tree_lock). | 266 | * private_lock is contended then so is mapping->tree_lock). |
259 | */ | 267 | */ |
260 | static struct buffer_head * | 268 | static struct buffer_head * |
261 | __find_get_block_slow(struct block_device *bdev, sector_t block) | 269 | __find_get_block_slow(struct block_device *bdev, sector_t block) |
262 | { | 270 | { |
263 | struct inode *bd_inode = bdev->bd_inode; | 271 | struct inode *bd_inode = bdev->bd_inode; |
264 | struct address_space *bd_mapping = bd_inode->i_mapping; | 272 | struct address_space *bd_mapping = bd_inode->i_mapping; |
265 | struct buffer_head *ret = NULL; | 273 | struct buffer_head *ret = NULL; |
266 | pgoff_t index; | 274 | pgoff_t index; |
267 | struct buffer_head *bh; | 275 | struct buffer_head *bh; |
268 | struct buffer_head *head; | 276 | struct buffer_head *head; |
269 | struct page *page; | 277 | struct page *page; |
270 | int all_mapped = 1; | 278 | int all_mapped = 1; |
271 | 279 | ||
272 | index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); | 280 | index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); |
273 | page = find_get_page(bd_mapping, index); | 281 | page = find_get_page(bd_mapping, index); |
274 | if (!page) | 282 | if (!page) |
275 | goto out; | 283 | goto out; |
276 | 284 | ||
277 | spin_lock(&bd_mapping->private_lock); | 285 | spin_lock(&bd_mapping->private_lock); |
278 | if (!page_has_buffers(page)) | 286 | if (!page_has_buffers(page)) |
279 | goto out_unlock; | 287 | goto out_unlock; |
280 | head = page_buffers(page); | 288 | head = page_buffers(page); |
281 | bh = head; | 289 | bh = head; |
282 | do { | 290 | do { |
283 | if (bh->b_blocknr == block) { | 291 | if (bh->b_blocknr == block) { |
284 | ret = bh; | 292 | ret = bh; |
285 | get_bh(bh); | 293 | get_bh(bh); |
286 | goto out_unlock; | 294 | goto out_unlock; |
287 | } | 295 | } |
288 | if (!buffer_mapped(bh)) | 296 | if (!buffer_mapped(bh)) |
289 | all_mapped = 0; | 297 | all_mapped = 0; |
290 | bh = bh->b_this_page; | 298 | bh = bh->b_this_page; |
291 | } while (bh != head); | 299 | } while (bh != head); |
292 | 300 | ||
293 | /* we might be here because some of the buffers on this page are | 301 | /* we might be here because some of the buffers on this page are |
294 | * not mapped. This is due to various races between | 302 | * not mapped. This is due to various races between |
295 | * file io on the block device and getblk. It gets dealt with | 303 | * file io on the block device and getblk. It gets dealt with |
296 | * elsewhere, don't buffer_error if we had some unmapped buffers | 304 | * elsewhere, don't buffer_error if we had some unmapped buffers |
297 | */ | 305 | */ |
298 | if (all_mapped) { | 306 | if (all_mapped) { |
299 | printk("__find_get_block_slow() failed. " | 307 | printk("__find_get_block_slow() failed. " |
300 | "block=%llu, b_blocknr=%llu\n", | 308 | "block=%llu, b_blocknr=%llu\n", |
301 | (unsigned long long)block, | 309 | (unsigned long long)block, |
302 | (unsigned long long)bh->b_blocknr); | 310 | (unsigned long long)bh->b_blocknr); |
303 | printk("b_state=0x%08lx, b_size=%zu\n", | 311 | printk("b_state=0x%08lx, b_size=%zu\n", |
304 | bh->b_state, bh->b_size); | 312 | bh->b_state, bh->b_size); |
305 | printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); | 313 | printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); |
306 | } | 314 | } |
307 | out_unlock: | 315 | out_unlock: |
308 | spin_unlock(&bd_mapping->private_lock); | 316 | spin_unlock(&bd_mapping->private_lock); |
309 | page_cache_release(page); | 317 | page_cache_release(page); |
310 | out: | 318 | out: |
311 | return ret; | 319 | return ret; |
312 | } | 320 | } |
313 | 321 | ||
314 | /* If invalidate_buffers() will trash dirty buffers, it means some kind | 322 | /* If invalidate_buffers() will trash dirty buffers, it means some kind |
315 | of fs corruption is going on. Trashing dirty data always implies losing | 323 | of fs corruption is going on. Trashing dirty data always implies losing |
316 | information that was supposed to be just stored on the physical layer | 324 | information that was supposed to be just stored on the physical layer |
317 | by the user. | 325 | by the user. |
318 | 326 | ||
319 | Thus invalidate_buffers in general usage is not allowed to trash | 327 | Thus invalidate_buffers in general usage is not allowed to trash |
320 | dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to | 328 | dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to |
321 | be preserved. These buffers are simply skipped. | 329 | be preserved. These buffers are simply skipped. |
322 | 330 | ||
323 | We also skip buffers which are still in use. For example this can | 331 | We also skip buffers which are still in use. For example this can |
324 | happen if a userspace program is reading the block device. | 332 | happen if a userspace program is reading the block device. |
325 | 333 | ||
326 | NOTE: In the case where the user removed a removable-media-disk even if | 334 | NOTE: In the case where the user removed a removable-media-disk even if |
327 | there's still dirty data not synced on disk (due to a bug in the device driver | 335 | there's still dirty data not synced on disk (due to a bug in the device driver |
328 | or due to an error of the user), by not destroying the dirty buffers we could | 336 | or due to an error of the user), by not destroying the dirty buffers we could |
329 | generate corruption also on the next media inserted, thus a parameter is | 337 | generate corruption also on the next media inserted, thus a parameter is |
330 | necessary to handle this case in the most safe way possible (trying | 338 | necessary to handle this case in the most safe way possible (trying |
331 | to not corrupt also the new disk inserted with the data belonging to | 339 | to not corrupt also the new disk inserted with the data belonging to |
332 | the old now corrupted disk). Also for the ramdisk the natural thing | 340 | the old now corrupted disk). Also for the ramdisk the natural thing |
333 | to do in order to release the ramdisk memory is to destroy dirty buffers. | 341 | to do in order to release the ramdisk memory is to destroy dirty buffers. |
334 | 342 | ||
335 | These are two special cases. Normal usage implies that the device driver | 343 | These are two special cases. Normal usage implies that the device driver |
336 | issues a sync on the device (without waiting for I/O completion) and | 344 | issues a sync on the device (without waiting for I/O completion) and |
337 | then an invalidate_buffers call that doesn't trash dirty buffers. | 345 | then an invalidate_buffers call that doesn't trash dirty buffers. |
338 | 346 | ||
339 | For handling cache coherency with the blkdev pagecache the 'update' case | 347 | For handling cache coherency with the blkdev pagecache the 'update' case |
340 | has been introduced. It is needed to re-read from disk any pinned | 348 | has been introduced. It is needed to re-read from disk any pinned |
341 | buffer. NOTE: re-reading from disk is destructive so we can do it only | 349 | buffer. NOTE: re-reading from disk is destructive so we can do it only |
342 | when we assume nobody is changing the buffercache under our I/O and when | 350 | when we assume nobody is changing the buffercache under our I/O and when |
343 | we think the disk contains more recent information than the buffercache. | 351 | we think the disk contains more recent information than the buffercache. |
344 | The update == 1 pass marks the buffers we need to update, the update == 2 | 352 | The update == 1 pass marks the buffers we need to update, the update == 2 |
345 | pass does the actual I/O. */ | 353 | pass does the actual I/O. */ |
346 | void invalidate_bdev(struct block_device *bdev) | 354 | void invalidate_bdev(struct block_device *bdev) |
347 | { | 355 | { |
348 | struct address_space *mapping = bdev->bd_inode->i_mapping; | 356 | struct address_space *mapping = bdev->bd_inode->i_mapping; |
349 | 357 | ||
350 | if (mapping->nrpages == 0) | 358 | if (mapping->nrpages == 0) |
351 | return; | 359 | return; |
352 | 360 | ||
353 | invalidate_bh_lrus(); | 361 | invalidate_bh_lrus(); |
354 | invalidate_mapping_pages(mapping, 0, -1); | 362 | invalidate_mapping_pages(mapping, 0, -1); |
355 | } | 363 | } |
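As the comment block above spells out, the normal pattern is to write back dirty data first and only then invalidate, since invalidate_bdev() skips dirty and in-use buffers. A minimal sketch of such a caller, assuming a media-change style path; the function name is hypothetical, with sync_blockdev() as the usual helper for the write-back step:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Hedged sketch only: flush whatever is dirty, then drop the clean
 * pagecache for the device. hypothetical_media_change() is an
 * illustrative name, not a kernel API.
 */
static void hypothetical_media_change(struct block_device *bdev)
{
	sync_blockdev(bdev);	/* write dirty buffers to the media first */
	invalidate_bdev(bdev);	/* then invalidate the now-clean pagecache */
}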
356 | 364 | ||
357 | /* | 365 | /* |
358 | * Kick pdflush then try to free up some ZONE_NORMAL memory. | 366 | * Kick pdflush then try to free up some ZONE_NORMAL memory. |
359 | */ | 367 | */ |
360 | static void free_more_memory(void) | 368 | static void free_more_memory(void) |
361 | { | 369 | { |
362 | struct zone *zone; | 370 | struct zone *zone; |
363 | int nid; | 371 | int nid; |
364 | 372 | ||
365 | wakeup_pdflush(1024); | 373 | wakeup_pdflush(1024); |
366 | yield(); | 374 | yield(); |
367 | 375 | ||
368 | for_each_online_node(nid) { | 376 | for_each_online_node(nid) { |
369 | (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), | 377 | (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), |
370 | gfp_zone(GFP_NOFS), NULL, | 378 | gfp_zone(GFP_NOFS), NULL, |
371 | &zone); | 379 | &zone); |
372 | if (zone) | 380 | if (zone) |
373 | try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, | 381 | try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, |
374 | GFP_NOFS); | 382 | GFP_NOFS); |
375 | } | 383 | } |
376 | } | 384 | } |
377 | 385 | ||
378 | /* | 386 | /* |
379 | * I/O completion handler for block_read_full_page() - pages | 387 | * I/O completion handler for block_read_full_page() - pages |
380 | * which come unlocked at the end of I/O. | 388 | * which come unlocked at the end of I/O. |
381 | */ | 389 | */ |
382 | static void end_buffer_async_read(struct buffer_head *bh, int uptodate) | 390 | static void end_buffer_async_read(struct buffer_head *bh, int uptodate) |
383 | { | 391 | { |
384 | unsigned long flags; | 392 | unsigned long flags; |
385 | struct buffer_head *first; | 393 | struct buffer_head *first; |
386 | struct buffer_head *tmp; | 394 | struct buffer_head *tmp; |
387 | struct page *page; | 395 | struct page *page; |
388 | int page_uptodate = 1; | 396 | int page_uptodate = 1; |
389 | 397 | ||
390 | BUG_ON(!buffer_async_read(bh)); | 398 | BUG_ON(!buffer_async_read(bh)); |
391 | 399 | ||
392 | page = bh->b_page; | 400 | page = bh->b_page; |
393 | if (uptodate) { | 401 | if (uptodate) { |
394 | set_buffer_uptodate(bh); | 402 | set_buffer_uptodate(bh); |
395 | } else { | 403 | } else { |
396 | clear_buffer_uptodate(bh); | 404 | clear_buffer_uptodate(bh); |
397 | if (printk_ratelimit()) | 405 | if (!quiet_error(bh)) |
398 | buffer_io_error(bh); | 406 | buffer_io_error(bh); |
399 | SetPageError(page); | 407 | SetPageError(page); |
400 | } | 408 | } |
401 | 409 | ||
402 | /* | 410 | /* |
403 | * Be _very_ careful from here on. Bad things can happen if | 411 | * Be _very_ careful from here on. Bad things can happen if |
404 | * two buffer heads end IO at almost the same time and both | 412 | * two buffer heads end IO at almost the same time and both |
405 | * decide that the page is now completely done. | 413 | * decide that the page is now completely done. |
406 | */ | 414 | */ |
407 | first = page_buffers(page); | 415 | first = page_buffers(page); |
408 | local_irq_save(flags); | 416 | local_irq_save(flags); |
409 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | 417 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); |
410 | clear_buffer_async_read(bh); | 418 | clear_buffer_async_read(bh); |
411 | unlock_buffer(bh); | 419 | unlock_buffer(bh); |
412 | tmp = bh; | 420 | tmp = bh; |
413 | do { | 421 | do { |
414 | if (!buffer_uptodate(tmp)) | 422 | if (!buffer_uptodate(tmp)) |
415 | page_uptodate = 0; | 423 | page_uptodate = 0; |
416 | if (buffer_async_read(tmp)) { | 424 | if (buffer_async_read(tmp)) { |
417 | BUG_ON(!buffer_locked(tmp)); | 425 | BUG_ON(!buffer_locked(tmp)); |
418 | goto still_busy; | 426 | goto still_busy; |
419 | } | 427 | } |
420 | tmp = tmp->b_this_page; | 428 | tmp = tmp->b_this_page; |
421 | } while (tmp != bh); | 429 | } while (tmp != bh); |
422 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 430 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
423 | local_irq_restore(flags); | 431 | local_irq_restore(flags); |
424 | 432 | ||
425 | /* | 433 | /* |
426 | * If none of the buffers had errors and they are all | 434 | * If none of the buffers had errors and they are all |
427 | * uptodate then we can set the page uptodate. | 435 | * uptodate then we can set the page uptodate. |
428 | */ | 436 | */ |
429 | if (page_uptodate && !PageError(page)) | 437 | if (page_uptodate && !PageError(page)) |
430 | SetPageUptodate(page); | 438 | SetPageUptodate(page); |
431 | unlock_page(page); | 439 | unlock_page(page); |
432 | return; | 440 | return; |
433 | 441 | ||
434 | still_busy: | 442 | still_busy: |
435 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 443 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
436 | local_irq_restore(flags); | 444 | local_irq_restore(flags); |
437 | return; | 445 | return; |
438 | } | 446 | } |
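The hunk above swaps the plain printk_ratelimit() test for quiet_error(bh), and end_buffer_async_write() below gets the same treatment. The helper itself is added elsewhere in this patch and is not visible in this hunk; a minimal sketch consistent with how it is called here, assuming a new BH_Quiet state bit that the bio completion path sets when the bio carried BIO_QUIET (propagated from the request's REQ_QUIET flag):

#include <linux/kernel.h>
#include <linux/buffer_head.h>

/*
 * Sketch only, not the literal hunk: suppress the error printk either
 * because the buffer was flagged quiet or because we are being rate
 * limited anyway.
 */
static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;	/* not flagged quiet, not rate limited: print */
	return 1;		/* stay quiet */
}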
439 | 447 | ||
440 | /* | 448 | /* |
441 | * Completion handler for block_write_full_page() - pages which are unlocked | 449 | * Completion handler for block_write_full_page() - pages which are unlocked |
442 | * during I/O, and which have PageWriteback cleared upon I/O completion. | 450 | * during I/O, and which have PageWriteback cleared upon I/O completion. |
443 | */ | 451 | */ |
444 | static void end_buffer_async_write(struct buffer_head *bh, int uptodate) | 452 | static void end_buffer_async_write(struct buffer_head *bh, int uptodate) |
445 | { | 453 | { |
446 | char b[BDEVNAME_SIZE]; | 454 | char b[BDEVNAME_SIZE]; |
447 | unsigned long flags; | 455 | unsigned long flags; |
448 | struct buffer_head *first; | 456 | struct buffer_head *first; |
449 | struct buffer_head *tmp; | 457 | struct buffer_head *tmp; |
450 | struct page *page; | 458 | struct page *page; |
451 | 459 | ||
452 | BUG_ON(!buffer_async_write(bh)); | 460 | BUG_ON(!buffer_async_write(bh)); |
453 | 461 | ||
454 | page = bh->b_page; | 462 | page = bh->b_page; |
455 | if (uptodate) { | 463 | if (uptodate) { |
456 | set_buffer_uptodate(bh); | 464 | set_buffer_uptodate(bh); |
457 | } else { | 465 | } else { |
458 | if (printk_ratelimit()) { | 466 | if (!quiet_error(bh)) { |
459 | buffer_io_error(bh); | 467 | buffer_io_error(bh); |
460 | printk(KERN_WARNING "lost page write due to " | 468 | printk(KERN_WARNING "lost page write due to " |
461 | "I/O error on %s\n", | 469 | "I/O error on %s\n", |
462 | bdevname(bh->b_bdev, b)); | 470 | bdevname(bh->b_bdev, b)); |
463 | } | 471 | } |
464 | set_bit(AS_EIO, &page->mapping->flags); | 472 | set_bit(AS_EIO, &page->mapping->flags); |
465 | set_buffer_write_io_error(bh); | 473 | set_buffer_write_io_error(bh); |
466 | clear_buffer_uptodate(bh); | 474 | clear_buffer_uptodate(bh); |
467 | SetPageError(page); | 475 | SetPageError(page); |
468 | } | 476 | } |
469 | 477 | ||
470 | first = page_buffers(page); | 478 | first = page_buffers(page); |
471 | local_irq_save(flags); | 479 | local_irq_save(flags); |
472 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | 480 | bit_spin_lock(BH_Uptodate_Lock, &first->b_state); |
473 | 481 | ||
474 | clear_buffer_async_write(bh); | 482 | clear_buffer_async_write(bh); |
475 | unlock_buffer(bh); | 483 | unlock_buffer(bh); |
476 | tmp = bh->b_this_page; | 484 | tmp = bh->b_this_page; |
477 | while (tmp != bh) { | 485 | while (tmp != bh) { |
478 | if (buffer_async_write(tmp)) { | 486 | if (buffer_async_write(tmp)) { |
479 | BUG_ON(!buffer_locked(tmp)); | 487 | BUG_ON(!buffer_locked(tmp)); |
480 | goto still_busy; | 488 | goto still_busy; |
481 | } | 489 | } |
482 | tmp = tmp->b_this_page; | 490 | tmp = tmp->b_this_page; |
483 | } | 491 | } |
484 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 492 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
485 | local_irq_restore(flags); | 493 | local_irq_restore(flags); |
486 | end_page_writeback(page); | 494 | end_page_writeback(page); |
487 | return; | 495 | return; |
488 | 496 | ||
489 | still_busy: | 497 | still_busy: |
490 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | 498 | bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); |
491 | local_irq_restore(flags); | 499 | local_irq_restore(flags); |
492 | return; | 500 | return; |
493 | } | 501 | } |
494 | 502 | ||
495 | /* | 503 | /* |
496 | * If a page's buffers are under async readin (end_buffer_async_read | 504 | * If a page's buffers are under async readin (end_buffer_async_read |
497 | * completion) then there is a possibility that another thread of | 505 | * completion) then there is a possibility that another thread of |
498 | * control could lock one of the buffers after it has completed | 506 | * control could lock one of the buffers after it has completed |
499 | * but while some of the other buffers have not completed. This | 507 | * but while some of the other buffers have not completed. This |
500 | * locked buffer would confuse end_buffer_async_read() into not unlocking | 508 | * locked buffer would confuse end_buffer_async_read() into not unlocking |
501 | * the page. So the absence of BH_Async_Read tells end_buffer_async_read() | 509 | * the page. So the absence of BH_Async_Read tells end_buffer_async_read() |
502 | * that this buffer is not under async I/O. | 510 | * that this buffer is not under async I/O. |
503 | * | 511 | * |
504 | * The page comes unlocked when it has no locked buffer_async buffers | 512 | * The page comes unlocked when it has no locked buffer_async buffers |
505 | * left. | 513 | * left. |
506 | * | 514 | * |
507 | * PageLocked prevents anyone from starting new async I/O reads on any of | 515 | * PageLocked prevents anyone from starting new async I/O reads on any of |
508 | * the buffers. | 516 | * the buffers. |
509 | * | 517 | * |
510 | * PageWriteback is used to prevent simultaneous writeout of the same | 518 | * PageWriteback is used to prevent simultaneous writeout of the same |
511 | * page. | 519 | * page. |
512 | * | 520 | * |
513 | * PageLocked prevents anyone from starting writeback of a page which is | 521 | * PageLocked prevents anyone from starting writeback of a page which is |
514 | * under read I/O (PageWriteback is only ever set against a locked page). | 522 | * under read I/O (PageWriteback is only ever set against a locked page). |
515 | */ | 523 | */ |
516 | static void mark_buffer_async_read(struct buffer_head *bh) | 524 | static void mark_buffer_async_read(struct buffer_head *bh) |
517 | { | 525 | { |
518 | bh->b_end_io = end_buffer_async_read; | 526 | bh->b_end_io = end_buffer_async_read; |
519 | set_buffer_async_read(bh); | 527 | set_buffer_async_read(bh); |
520 | } | 528 | } |
521 | 529 | ||
522 | void mark_buffer_async_write(struct buffer_head *bh) | 530 | void mark_buffer_async_write(struct buffer_head *bh) |
523 | { | 531 | { |
524 | bh->b_end_io = end_buffer_async_write; | 532 | bh->b_end_io = end_buffer_async_write; |
525 | set_buffer_async_write(bh); | 533 | set_buffer_async_write(bh); |
526 | } | 534 | } |
527 | EXPORT_SYMBOL(mark_buffer_async_write); | 535 | EXPORT_SYMBOL(mark_buffer_async_write); |
528 | 536 | ||
529 | 537 | ||
530 | /* | 538 | /* |
531 | * fs/buffer.c contains helper functions for buffer-backed address space's | 539 | * fs/buffer.c contains helper functions for buffer-backed address space's |
532 | * fsync functions. A common requirement for buffer-based filesystems is | 540 | * fsync functions. A common requirement for buffer-based filesystems is |
533 | * that certain data from the backing blockdev needs to be written out for | 541 | * that certain data from the backing blockdev needs to be written out for |
534 | * a successful fsync(). For example, ext2 indirect blocks need to be | 542 | * a successful fsync(). For example, ext2 indirect blocks need to be |
535 | * written back and waited upon before fsync() returns. | 543 | * written back and waited upon before fsync() returns. |
536 | * | 544 | * |
537 | * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), | 545 | * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), |
538 | * inode_has_buffers() and invalidate_inode_buffers() are provided for the | 546 | * inode_has_buffers() and invalidate_inode_buffers() are provided for the |
539 | * management of a list of dependent buffers at ->i_mapping->private_list. | 547 | * management of a list of dependent buffers at ->i_mapping->private_list. |
540 | * | 548 | * |
541 | * Locking is a little subtle: try_to_free_buffers() will remove buffers | 549 | * Locking is a little subtle: try_to_free_buffers() will remove buffers |
542 | * from their controlling inode's queue when they are being freed. But | 550 | * from their controlling inode's queue when they are being freed. But |
543 | * try_to_free_buffers() will be operating against the *blockdev* mapping | 551 | * try_to_free_buffers() will be operating against the *blockdev* mapping |
544 | * at the time, not against the S_ISREG file which depends on those buffers. | 552 | * at the time, not against the S_ISREG file which depends on those buffers. |
545 | * So the locking for private_list is via the private_lock in the address_space | 553 | * So the locking for private_list is via the private_lock in the address_space |
546 | * which backs the buffers. Which is different from the address_space | 554 | * which backs the buffers. Which is different from the address_space |
547 | * against which the buffers are listed. So for a particular address_space, | 555 | * against which the buffers are listed. So for a particular address_space, |
548 | * mapping->private_lock does *not* protect mapping->private_list! In fact, | 556 | * mapping->private_lock does *not* protect mapping->private_list! In fact, |
549 | * mapping->private_list will always be protected by the backing blockdev's | 557 | * mapping->private_list will always be protected by the backing blockdev's |
550 | * ->private_lock. | 558 | * ->private_lock. |
551 | * | 559 | * |
552 | * Which introduces a requirement: all buffers on an address_space's | 560 | * Which introduces a requirement: all buffers on an address_space's |
553 | * ->private_list must be from the same address_space: the blockdev's. | 561 | * ->private_list must be from the same address_space: the blockdev's. |
554 | * | 562 | * |
555 | * address_spaces which do not place buffers at ->private_list via these | 563 | * address_spaces which do not place buffers at ->private_list via these |
556 | * utility functions are free to use private_lock and private_list for | 564 | * utility functions are free to use private_lock and private_list for |
557 | * whatever they want. The only requirement is that list_empty(private_list) | 565 | * whatever they want. The only requirement is that list_empty(private_list) |
558 | * be true at clear_inode() time. | 566 | * be true at clear_inode() time. |
559 | * | 567 | * |
560 | * FIXME: clear_inode should not call invalidate_inode_buffers(). The | 568 | * FIXME: clear_inode should not call invalidate_inode_buffers(). The |
561 | * filesystems should do that. invalidate_inode_buffers() should just go | 569 | * filesystems should do that. invalidate_inode_buffers() should just go |
562 | * BUG_ON(!list_empty). | 570 | * BUG_ON(!list_empty). |
563 | * | 571 | * |
564 | * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should | 572 | * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should |
565 | * take an address_space, not an inode. And it should be called | 573 | * take an address_space, not an inode. And it should be called |
566 | * mark_buffer_dirty_fsync() to clearly define why those buffers are being | 574 | * mark_buffer_dirty_fsync() to clearly define why those buffers are being |
567 | * queued up. | 575 | * queued up. |
568 | * | 576 | * |
569 | * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the | 577 | * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the |
570 | * list if it is already on a list. Because if the buffer is on a list, | 578 | * list if it is already on a list. Because if the buffer is on a list, |
571 | * it *must* already be on the right one. If not, the filesystem is being | 579 | * it *must* already be on the right one. If not, the filesystem is being |
572 | * silly. This will save a ton of locking. But first we have to ensure | 580 | * silly. This will save a ton of locking. But first we have to ensure |
573 | * that buffers are taken *off* the old inode's list when they are freed | 581 | * that buffers are taken *off* the old inode's list when they are freed |
574 | * (presumably in truncate). That requires careful auditing of all | 582 | * (presumably in truncate). That requires careful auditing of all |
575 | * filesystems (do it inside bforget()). It could also be done by bringing | 583 | * filesystems (do it inside bforget()). It could also be done by bringing |
576 | * b_inode back. | 584 | * b_inode back. |
577 | */ | 585 | */ |
578 | 586 | ||
579 | /* | 587 | /* |
580 | * The buffer's backing address_space's private_lock must be held | 588 | * The buffer's backing address_space's private_lock must be held |
581 | */ | 589 | */ |
582 | static void __remove_assoc_queue(struct buffer_head *bh) | 590 | static void __remove_assoc_queue(struct buffer_head *bh) |
583 | { | 591 | { |
584 | list_del_init(&bh->b_assoc_buffers); | 592 | list_del_init(&bh->b_assoc_buffers); |
585 | WARN_ON(!bh->b_assoc_map); | 593 | WARN_ON(!bh->b_assoc_map); |
586 | if (buffer_write_io_error(bh)) | 594 | if (buffer_write_io_error(bh)) |
587 | set_bit(AS_EIO, &bh->b_assoc_map->flags); | 595 | set_bit(AS_EIO, &bh->b_assoc_map->flags); |
588 | bh->b_assoc_map = NULL; | 596 | bh->b_assoc_map = NULL; |
589 | } | 597 | } |
590 | 598 | ||
591 | int inode_has_buffers(struct inode *inode) | 599 | int inode_has_buffers(struct inode *inode) |
592 | { | 600 | { |
593 | return !list_empty(&inode->i_data.private_list); | 601 | return !list_empty(&inode->i_data.private_list); |
594 | } | 602 | } |
595 | 603 | ||
596 | /* | 604 | /* |
597 | * osync is designed to support O_SYNC io. It waits synchronously for | 605 | * osync is designed to support O_SYNC io. It waits synchronously for |
598 | * all already-submitted IO to complete, but does not queue any new | 606 | * all already-submitted IO to complete, but does not queue any new |
599 | * writes to the disk. | 607 | * writes to the disk. |
600 | * | 608 | * |
601 | * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as | 609 | * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as |
602 | * you dirty the buffers, and then use osync_inode_buffers to wait for | 610 | * you dirty the buffers, and then use osync_inode_buffers to wait for |
603 | * completion. Any other dirty buffers which are not yet queued for | 611 | * completion. Any other dirty buffers which are not yet queued for |
604 | * write will not be flushed to disk by the osync. | 612 | * write will not be flushed to disk by the osync. |
605 | */ | 613 | */ |
606 | static int osync_buffers_list(spinlock_t *lock, struct list_head *list) | 614 | static int osync_buffers_list(spinlock_t *lock, struct list_head *list) |
607 | { | 615 | { |
608 | struct buffer_head *bh; | 616 | struct buffer_head *bh; |
609 | struct list_head *p; | 617 | struct list_head *p; |
610 | int err = 0; | 618 | int err = 0; |
611 | 619 | ||
612 | spin_lock(lock); | 620 | spin_lock(lock); |
613 | repeat: | 621 | repeat: |
614 | list_for_each_prev(p, list) { | 622 | list_for_each_prev(p, list) { |
615 | bh = BH_ENTRY(p); | 623 | bh = BH_ENTRY(p); |
616 | if (buffer_locked(bh)) { | 624 | if (buffer_locked(bh)) { |
617 | get_bh(bh); | 625 | get_bh(bh); |
618 | spin_unlock(lock); | 626 | spin_unlock(lock); |
619 | wait_on_buffer(bh); | 627 | wait_on_buffer(bh); |
620 | if (!buffer_uptodate(bh)) | 628 | if (!buffer_uptodate(bh)) |
621 | err = -EIO; | 629 | err = -EIO; |
622 | brelse(bh); | 630 | brelse(bh); |
623 | spin_lock(lock); | 631 | spin_lock(lock); |
624 | goto repeat; | 632 | goto repeat; |
625 | } | 633 | } |
626 | } | 634 | } |
627 | spin_unlock(lock); | 635 | spin_unlock(lock); |
628 | return err; | 636 | return err; |
629 | } | 637 | } |
630 | 638 | ||
631 | /** | 639 | /** |
632 | * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers | 640 | * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers |
633 | * @mapping: the mapping which wants those buffers written | 641 | * @mapping: the mapping which wants those buffers written |
634 | * | 642 | * |
635 | * Starts I/O against the buffers at mapping->private_list, and waits upon | 643 | * Starts I/O against the buffers at mapping->private_list, and waits upon |
636 | * that I/O. | 644 | * that I/O. |
637 | * | 645 | * |
638 | * Basically, this is a convenience function for fsync(). | 646 | * Basically, this is a convenience function for fsync(). |
639 | * @mapping is a file or directory which needs those buffers to be written for | 647 | * @mapping is a file or directory which needs those buffers to be written for |
640 | * a successful fsync(). | 648 | * a successful fsync(). |
641 | */ | 649 | */ |
642 | int sync_mapping_buffers(struct address_space *mapping) | 650 | int sync_mapping_buffers(struct address_space *mapping) |
643 | { | 651 | { |
644 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 652 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
645 | 653 | ||
646 | if (buffer_mapping == NULL || list_empty(&mapping->private_list)) | 654 | if (buffer_mapping == NULL || list_empty(&mapping->private_list)) |
647 | return 0; | 655 | return 0; |
648 | 656 | ||
649 | return fsync_buffers_list(&buffer_mapping->private_lock, | 657 | return fsync_buffers_list(&buffer_mapping->private_lock, |
650 | &mapping->private_list); | 658 | &mapping->private_list); |
651 | } | 659 | } |
652 | EXPORT_SYMBOL(sync_mapping_buffers); | 660 | EXPORT_SYMBOL(sync_mapping_buffers); |
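Per the kernel-doc above, this is the convenience entry point a buffer-based filesystem calls from its fsync path to write out and wait on the blockdev buffers it queued on ->private_list. A minimal sketch of such a caller, assuming the ->fsync(file, dentry, datasync) shape of this kernel generation; the function name is hypothetical:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Hedged sketch of a filesystem ->fsync implementation: the metadata
 * buffers queued via mark_buffer_dirty_inode() end up on
 * inode->i_mapping->private_list, and this writes and waits on them.
 */
static int hypothetical_fsync(struct file *file, struct dentry *dentry,
			      int datasync)
{
	struct inode *inode = dentry->d_inode;

	return sync_mapping_buffers(inode->i_mapping);
}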
653 | 661 | ||
654 | /* | 662 | /* |
655 | * Called when we've recently written block `bblock', and it is known that | 663 | * Called when we've recently written block `bblock', and it is known that |
656 | * `bblock' was for a buffer_boundary() buffer. This means that the block at | 664 | * `bblock' was for a buffer_boundary() buffer. This means that the block at |
657 | * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's | 665 | * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's |
658 | * dirty, schedule it for IO. So that indirects merge nicely with their data. | 666 | * dirty, schedule it for IO. So that indirects merge nicely with their data. |
659 | */ | 667 | */ |
660 | void write_boundary_block(struct block_device *bdev, | 668 | void write_boundary_block(struct block_device *bdev, |
661 | sector_t bblock, unsigned blocksize) | 669 | sector_t bblock, unsigned blocksize) |
662 | { | 670 | { |
663 | struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); | 671 | struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); |
664 | if (bh) { | 672 | if (bh) { |
665 | if (buffer_dirty(bh)) | 673 | if (buffer_dirty(bh)) |
666 | ll_rw_block(WRITE, 1, &bh); | 674 | ll_rw_block(WRITE, 1, &bh); |
667 | put_bh(bh); | 675 | put_bh(bh); |
668 | } | 676 | } |
669 | } | 677 | } |
670 | 678 | ||
671 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) | 679 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) |
672 | { | 680 | { |
673 | struct address_space *mapping = inode->i_mapping; | 681 | struct address_space *mapping = inode->i_mapping; |
674 | struct address_space *buffer_mapping = bh->b_page->mapping; | 682 | struct address_space *buffer_mapping = bh->b_page->mapping; |
675 | 683 | ||
676 | mark_buffer_dirty(bh); | 684 | mark_buffer_dirty(bh); |
677 | if (!mapping->assoc_mapping) { | 685 | if (!mapping->assoc_mapping) { |
678 | mapping->assoc_mapping = buffer_mapping; | 686 | mapping->assoc_mapping = buffer_mapping; |
679 | } else { | 687 | } else { |
680 | BUG_ON(mapping->assoc_mapping != buffer_mapping); | 688 | BUG_ON(mapping->assoc_mapping != buffer_mapping); |
681 | } | 689 | } |
682 | if (!bh->b_assoc_map) { | 690 | if (!bh->b_assoc_map) { |
683 | spin_lock(&buffer_mapping->private_lock); | 691 | spin_lock(&buffer_mapping->private_lock); |
684 | list_move_tail(&bh->b_assoc_buffers, | 692 | list_move_tail(&bh->b_assoc_buffers, |
685 | &mapping->private_list); | 693 | &mapping->private_list); |
686 | bh->b_assoc_map = mapping; | 694 | bh->b_assoc_map = mapping; |
687 | spin_unlock(&buffer_mapping->private_lock); | 695 | spin_unlock(&buffer_mapping->private_lock); |
688 | } | 696 | } |
689 | } | 697 | } |
690 | EXPORT_SYMBOL(mark_buffer_dirty_inode); | 698 | EXPORT_SYMBOL(mark_buffer_dirty_inode); |
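mark_buffer_dirty_inode() is the producer side of the same machinery: it dirties the buffer and hangs it off the regular file inode's ->private_list so that the fsync sketch above will later flush it. A hedged illustration, with a hypothetical helper name:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Sketch only: tie a dirtied metadata buffer (e.g. an indirect block)
 * to the file's inode so sync_mapping_buffers() covers it at fsync time.
 */
static void hypothetical_dirty_metadata(struct inode *inode,
					struct buffer_head *bh)
{
	mark_buffer_dirty_inode(bh, inode);
}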
691 | 699 | ||
692 | /* | 700 | /* |
693 | * Mark the page dirty, and set it dirty in the radix tree, and mark the inode | 701 | * Mark the page dirty, and set it dirty in the radix tree, and mark the inode |
694 | * dirty. | 702 | * dirty. |
695 | * | 703 | * |
696 | * If warn is true, then emit a warning if the page is not uptodate and has | 704 | * If warn is true, then emit a warning if the page is not uptodate and has |
697 | * not been truncated. | 705 | * not been truncated. |
698 | */ | 706 | */ |
699 | static int __set_page_dirty(struct page *page, | 707 | static int __set_page_dirty(struct page *page, |
700 | struct address_space *mapping, int warn) | 708 | struct address_space *mapping, int warn) |
701 | { | 709 | { |
702 | if (unlikely(!mapping)) | 710 | if (unlikely(!mapping)) |
703 | return !TestSetPageDirty(page); | 711 | return !TestSetPageDirty(page); |
704 | 712 | ||
705 | if (TestSetPageDirty(page)) | 713 | if (TestSetPageDirty(page)) |
706 | return 0; | 714 | return 0; |
707 | 715 | ||
708 | spin_lock_irq(&mapping->tree_lock); | 716 | spin_lock_irq(&mapping->tree_lock); |
709 | if (page->mapping) { /* Race with truncate? */ | 717 | if (page->mapping) { /* Race with truncate? */ |
710 | WARN_ON_ONCE(warn && !PageUptodate(page)); | 718 | WARN_ON_ONCE(warn && !PageUptodate(page)); |
711 | 719 | ||
712 | if (mapping_cap_account_dirty(mapping)) { | 720 | if (mapping_cap_account_dirty(mapping)) { |
713 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 721 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
714 | __inc_bdi_stat(mapping->backing_dev_info, | 722 | __inc_bdi_stat(mapping->backing_dev_info, |
715 | BDI_RECLAIMABLE); | 723 | BDI_RECLAIMABLE); |
716 | task_io_account_write(PAGE_CACHE_SIZE); | 724 | task_io_account_write(PAGE_CACHE_SIZE); |
717 | } | 725 | } |
718 | radix_tree_tag_set(&mapping->page_tree, | 726 | radix_tree_tag_set(&mapping->page_tree, |
719 | page_index(page), PAGECACHE_TAG_DIRTY); | 727 | page_index(page), PAGECACHE_TAG_DIRTY); |
720 | } | 728 | } |
721 | spin_unlock_irq(&mapping->tree_lock); | 729 | spin_unlock_irq(&mapping->tree_lock); |
722 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 730 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
723 | 731 | ||
724 | return 1; | 732 | return 1; |
725 | } | 733 | } |
726 | 734 | ||
727 | /* | 735 | /* |
728 | * Add a page to the dirty page list. | 736 | * Add a page to the dirty page list. |
729 | * | 737 | * |
730 | * It is a sad fact of life that this function is called from several places | 738 | * It is a sad fact of life that this function is called from several places |
731 | * deeply under spinlocking. It may not sleep. | 739 | * deeply under spinlocking. It may not sleep. |
732 | * | 740 | * |
733 | * If the page has buffers, the uptodate buffers are set dirty, to preserve | 741 | * If the page has buffers, the uptodate buffers are set dirty, to preserve |
734 | * dirty-state coherency between the page and the buffers. If the page does | 742 | * dirty-state coherency between the page and the buffers. If the page does |
735 | * not have buffers then when they are later attached they will all be set | 743 | * not have buffers then when they are later attached they will all be set |
736 | * dirty. | 744 | * dirty. |
737 | * | 745 | * |
738 | * The buffers are dirtied before the page is dirtied. There's a small race | 746 | * The buffers are dirtied before the page is dirtied. There's a small race |
739 | * window in which a writepage caller may see the page cleanness but not the | 747 | * window in which a writepage caller may see the page cleanness but not the |
740 | * buffer dirtiness. That's fine. If this code were to set the page dirty | 748 | * buffer dirtiness. That's fine. If this code were to set the page dirty |
741 | * before the buffers, a concurrent writepage caller could clear the page dirty | 749 | * before the buffers, a concurrent writepage caller could clear the page dirty |
742 | * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean | 750 | * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean |
743 | * page on the dirty page list. | 751 | * page on the dirty page list. |
744 | * | 752 | * |
745 | * We use private_lock to lock against try_to_free_buffers while using the | 753 | * We use private_lock to lock against try_to_free_buffers while using the |
746 | * page's buffer list. Also use this to protect against clean buffers being | 754 | * page's buffer list. Also use this to protect against clean buffers being |
747 | * added to the page after it was set dirty. | 755 | * added to the page after it was set dirty. |
748 | * | 756 | * |
749 | * FIXME: may need to call ->reservepage here as well. That's rather up to the | 757 | * FIXME: may need to call ->reservepage here as well. That's rather up to the |
750 | * address_space though. | 758 | * address_space though. |
751 | */ | 759 | */ |
752 | int __set_page_dirty_buffers(struct page *page) | 760 | int __set_page_dirty_buffers(struct page *page) |
753 | { | 761 | { |
754 | struct address_space *mapping = page_mapping(page); | 762 | struct address_space *mapping = page_mapping(page); |
755 | 763 | ||
756 | if (unlikely(!mapping)) | 764 | if (unlikely(!mapping)) |
757 | return !TestSetPageDirty(page); | 765 | return !TestSetPageDirty(page); |
758 | 766 | ||
759 | spin_lock(&mapping->private_lock); | 767 | spin_lock(&mapping->private_lock); |
760 | if (page_has_buffers(page)) { | 768 | if (page_has_buffers(page)) { |
761 | struct buffer_head *head = page_buffers(page); | 769 | struct buffer_head *head = page_buffers(page); |
762 | struct buffer_head *bh = head; | 770 | struct buffer_head *bh = head; |
763 | 771 | ||
764 | do { | 772 | do { |
765 | set_buffer_dirty(bh); | 773 | set_buffer_dirty(bh); |
766 | bh = bh->b_this_page; | 774 | bh = bh->b_this_page; |
767 | } while (bh != head); | 775 | } while (bh != head); |
768 | } | 776 | } |
769 | spin_unlock(&mapping->private_lock); | 777 | spin_unlock(&mapping->private_lock); |
770 | 778 | ||
771 | return __set_page_dirty(page, mapping, 1); | 779 | return __set_page_dirty(page, mapping, 1); |
772 | } | 780 | } |
773 | EXPORT_SYMBOL(__set_page_dirty_buffers); | 781 | EXPORT_SYMBOL(__set_page_dirty_buffers); |
774 | 782 | ||
775 | /* | 783 | /* |
776 | * Write out and wait upon a list of buffers. | 784 | * Write out and wait upon a list of buffers. |
777 | * | 785 | * |
778 | * We have conflicting pressures: we want to make sure that all | 786 | * We have conflicting pressures: we want to make sure that all |
779 | * initially dirty buffers get waited on, but that any subsequently | 787 | * initially dirty buffers get waited on, but that any subsequently |
780 | * dirtied buffers don't. After all, we don't want fsync to last | 788 | * dirtied buffers don't. After all, we don't want fsync to last |
781 | * forever if somebody is actively writing to the file. | 789 | * forever if somebody is actively writing to the file. |
782 | * | 790 | * |
783 | * Do this in two main stages: first we copy dirty buffers to a | 791 | * Do this in two main stages: first we copy dirty buffers to a |
784 | * temporary inode list, queueing the writes as we go. Then we clean | 792 | * temporary inode list, queueing the writes as we go. Then we clean |
785 | * up, waiting for those writes to complete. | 793 | * up, waiting for those writes to complete. |
786 | * | 794 | * |
787 | * During this second stage, any subsequent updates to the file may end | 795 | * During this second stage, any subsequent updates to the file may end |
788 | * up refiling the buffer on the original inode's dirty list again, so | 796 | * up refiling the buffer on the original inode's dirty list again, so |
789 | * there is a chance we will end up with a buffer queued for write but | 797 | * there is a chance we will end up with a buffer queued for write but |
790 | * not yet completed on that list. So, as a final cleanup we go through | 798 | * not yet completed on that list. So, as a final cleanup we go through |
791 | * the osync code to catch these locked, dirty buffers without requeuing | 799 | * the osync code to catch these locked, dirty buffers without requeuing |
792 | * any newly dirty buffers for write. | 800 | * any newly dirty buffers for write. |
793 | */ | 801 | */ |
794 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) | 802 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) |
795 | { | 803 | { |
796 | struct buffer_head *bh; | 804 | struct buffer_head *bh; |
797 | struct list_head tmp; | 805 | struct list_head tmp; |
798 | struct address_space *mapping; | 806 | struct address_space *mapping; |
799 | int err = 0, err2; | 807 | int err = 0, err2; |
800 | 808 | ||
801 | INIT_LIST_HEAD(&tmp); | 809 | INIT_LIST_HEAD(&tmp); |
802 | 810 | ||
803 | spin_lock(lock); | 811 | spin_lock(lock); |
804 | while (!list_empty(list)) { | 812 | while (!list_empty(list)) { |
805 | bh = BH_ENTRY(list->next); | 813 | bh = BH_ENTRY(list->next); |
806 | mapping = bh->b_assoc_map; | 814 | mapping = bh->b_assoc_map; |
807 | __remove_assoc_queue(bh); | 815 | __remove_assoc_queue(bh); |
808 | /* Avoid race with mark_buffer_dirty_inode() which does | 816 | /* Avoid race with mark_buffer_dirty_inode() which does |
809 | * a lockless check and we rely on seeing the dirty bit */ | 817 | * a lockless check and we rely on seeing the dirty bit */ |
810 | smp_mb(); | 818 | smp_mb(); |
811 | if (buffer_dirty(bh) || buffer_locked(bh)) { | 819 | if (buffer_dirty(bh) || buffer_locked(bh)) { |
812 | list_add(&bh->b_assoc_buffers, &tmp); | 820 | list_add(&bh->b_assoc_buffers, &tmp); |
813 | bh->b_assoc_map = mapping; | 821 | bh->b_assoc_map = mapping; |
814 | if (buffer_dirty(bh)) { | 822 | if (buffer_dirty(bh)) { |
815 | get_bh(bh); | 823 | get_bh(bh); |
816 | spin_unlock(lock); | 824 | spin_unlock(lock); |
817 | /* | 825 | /* |
818 | * Ensure any pending I/O completes so that | 826 | * Ensure any pending I/O completes so that |
819 | * ll_rw_block() actually writes the current | 827 | * ll_rw_block() actually writes the current |
820 | * contents - it is a noop if I/O is still in | 828 | * contents - it is a noop if I/O is still in |
821 | * flight on potentially older contents. | 829 | * flight on potentially older contents. |
822 | */ | 830 | */ |
823 | ll_rw_block(SWRITE_SYNC, 1, &bh); | 831 | ll_rw_block(SWRITE_SYNC, 1, &bh); |
824 | brelse(bh); | 832 | brelse(bh); |
825 | spin_lock(lock); | 833 | spin_lock(lock); |
826 | } | 834 | } |
827 | } | 835 | } |
828 | } | 836 | } |
829 | 837 | ||
830 | while (!list_empty(&tmp)) { | 838 | while (!list_empty(&tmp)) { |
831 | bh = BH_ENTRY(tmp.prev); | 839 | bh = BH_ENTRY(tmp.prev); |
832 | get_bh(bh); | 840 | get_bh(bh); |
833 | mapping = bh->b_assoc_map; | 841 | mapping = bh->b_assoc_map; |
834 | __remove_assoc_queue(bh); | 842 | __remove_assoc_queue(bh); |
835 | /* Avoid race with mark_buffer_dirty_inode() which does | 843 | /* Avoid race with mark_buffer_dirty_inode() which does |
836 | * a lockless check and we rely on seeing the dirty bit */ | 844 | * a lockless check and we rely on seeing the dirty bit */ |
837 | smp_mb(); | 845 | smp_mb(); |
838 | if (buffer_dirty(bh)) { | 846 | if (buffer_dirty(bh)) { |
839 | list_add(&bh->b_assoc_buffers, | 847 | list_add(&bh->b_assoc_buffers, |
840 | &mapping->private_list); | 848 | &mapping->private_list); |
841 | bh->b_assoc_map = mapping; | 849 | bh->b_assoc_map = mapping; |
842 | } | 850 | } |
843 | spin_unlock(lock); | 851 | spin_unlock(lock); |
844 | wait_on_buffer(bh); | 852 | wait_on_buffer(bh); |
845 | if (!buffer_uptodate(bh)) | 853 | if (!buffer_uptodate(bh)) |
846 | err = -EIO; | 854 | err = -EIO; |
847 | brelse(bh); | 855 | brelse(bh); |
848 | spin_lock(lock); | 856 | spin_lock(lock); |
849 | } | 857 | } |
850 | 858 | ||
851 | spin_unlock(lock); | 859 | spin_unlock(lock); |
852 | err2 = osync_buffers_list(lock, list); | 860 | err2 = osync_buffers_list(lock, list); |
853 | if (err) | 861 | if (err) |
854 | return err; | 862 | return err; |
855 | else | 863 | else |
856 | return err2; | 864 | return err2; |
857 | } | 865 | } |
858 | 866 | ||
859 | /* | 867 | /* |
860 | * Invalidate any and all dirty buffers on a given inode. We are | 868 | * Invalidate any and all dirty buffers on a given inode. We are |
861 | * probably unmounting the fs, but that doesn't mean we have already | 869 | * probably unmounting the fs, but that doesn't mean we have already |
862 | * done a sync(). Just drop the buffers from the inode list. | 870 | * done a sync(). Just drop the buffers from the inode list. |
863 | * | 871 | * |
864 | * NOTE: we take the inode's blockdev's mapping's private_lock. Which | 872 | * NOTE: we take the inode's blockdev's mapping's private_lock. Which |
865 | * assumes that all the buffers are against the blockdev. Not true | 873 | * assumes that all the buffers are against the blockdev. Not true |
866 | * for reiserfs. | 874 | * for reiserfs. |
867 | */ | 875 | */ |
868 | void invalidate_inode_buffers(struct inode *inode) | 876 | void invalidate_inode_buffers(struct inode *inode) |
869 | { | 877 | { |
870 | if (inode_has_buffers(inode)) { | 878 | if (inode_has_buffers(inode)) { |
871 | struct address_space *mapping = &inode->i_data; | 879 | struct address_space *mapping = &inode->i_data; |
872 | struct list_head *list = &mapping->private_list; | 880 | struct list_head *list = &mapping->private_list; |
873 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 881 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
874 | 882 | ||
875 | spin_lock(&buffer_mapping->private_lock); | 883 | spin_lock(&buffer_mapping->private_lock); |
876 | while (!list_empty(list)) | 884 | while (!list_empty(list)) |
877 | __remove_assoc_queue(BH_ENTRY(list->next)); | 885 | __remove_assoc_queue(BH_ENTRY(list->next)); |
878 | spin_unlock(&buffer_mapping->private_lock); | 886 | spin_unlock(&buffer_mapping->private_lock); |
879 | } | 887 | } |
880 | } | 888 | } |
881 | EXPORT_SYMBOL(invalidate_inode_buffers); | 889 | EXPORT_SYMBOL(invalidate_inode_buffers); |
882 | 890 | ||
883 | /* | 891 | /* |
884 | * Remove any clean buffers from the inode's buffer list. This is called | 892 | * Remove any clean buffers from the inode's buffer list. This is called |
885 | * when we're trying to free the inode itself. Those buffers can pin it. | 893 | * when we're trying to free the inode itself. Those buffers can pin it. |
886 | * | 894 | * |
887 | * Returns true if all buffers were removed. | 895 | * Returns true if all buffers were removed. |
888 | */ | 896 | */ |
889 | int remove_inode_buffers(struct inode *inode) | 897 | int remove_inode_buffers(struct inode *inode) |
890 | { | 898 | { |
891 | int ret = 1; | 899 | int ret = 1; |
892 | 900 | ||
893 | if (inode_has_buffers(inode)) { | 901 | if (inode_has_buffers(inode)) { |
894 | struct address_space *mapping = &inode->i_data; | 902 | struct address_space *mapping = &inode->i_data; |
895 | struct list_head *list = &mapping->private_list; | 903 | struct list_head *list = &mapping->private_list; |
896 | struct address_space *buffer_mapping = mapping->assoc_mapping; | 904 | struct address_space *buffer_mapping = mapping->assoc_mapping; |
897 | 905 | ||
898 | spin_lock(&buffer_mapping->private_lock); | 906 | spin_lock(&buffer_mapping->private_lock); |
899 | while (!list_empty(list)) { | 907 | while (!list_empty(list)) { |
900 | struct buffer_head *bh = BH_ENTRY(list->next); | 908 | struct buffer_head *bh = BH_ENTRY(list->next); |
901 | if (buffer_dirty(bh)) { | 909 | if (buffer_dirty(bh)) { |
902 | ret = 0; | 910 | ret = 0; |
903 | break; | 911 | break; |
904 | } | 912 | } |
905 | __remove_assoc_queue(bh); | 913 | __remove_assoc_queue(bh); |
906 | } | 914 | } |
907 | spin_unlock(&buffer_mapping->private_lock); | 915 | spin_unlock(&buffer_mapping->private_lock); |
908 | } | 916 | } |
909 | return ret; | 917 | return ret; |
910 | } | 918 | } |
911 | 919 | ||
912 | /* | 920 | /* |
913 | * Create the appropriate buffers when given a page for data area and | 921 | * Create the appropriate buffers when given a page for data area and |
914 | * the size of each buffer. Use the bh->b_this_page linked list to | 922 | * the size of each buffer. Use the bh->b_this_page linked list to |
915 | * follow the buffers created. Return NULL if unable to create more | 923 | * follow the buffers created. Return NULL if unable to create more |
916 | * buffers. | 924 | * buffers. |
917 | * | 925 | * |
918 | * The retry flag is used to differentiate async IO (paging, swapping) | 926 | * The retry flag is used to differentiate async IO (paging, swapping) |
919 | * which may not fail from ordinary buffer allocations. | 927 | * which may not fail from ordinary buffer allocations. |
920 | */ | 928 | */ |
921 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 929 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
922 | int retry) | 930 | int retry) |
923 | { | 931 | { |
924 | struct buffer_head *bh, *head; | 932 | struct buffer_head *bh, *head; |
925 | long offset; | 933 | long offset; |
926 | 934 | ||
927 | try_again: | 935 | try_again: |
928 | head = NULL; | 936 | head = NULL; |
929 | offset = PAGE_SIZE; | 937 | offset = PAGE_SIZE; |
930 | while ((offset -= size) >= 0) { | 938 | while ((offset -= size) >= 0) { |
931 | bh = alloc_buffer_head(GFP_NOFS); | 939 | bh = alloc_buffer_head(GFP_NOFS); |
932 | if (!bh) | 940 | if (!bh) |
933 | goto no_grow; | 941 | goto no_grow; |
934 | 942 | ||
935 | bh->b_bdev = NULL; | 943 | bh->b_bdev = NULL; |
936 | bh->b_this_page = head; | 944 | bh->b_this_page = head; |
937 | bh->b_blocknr = -1; | 945 | bh->b_blocknr = -1; |
938 | head = bh; | 946 | head = bh; |
939 | 947 | ||
940 | bh->b_state = 0; | 948 | bh->b_state = 0; |
941 | atomic_set(&bh->b_count, 0); | 949 | atomic_set(&bh->b_count, 0); |
942 | bh->b_private = NULL; | 950 | bh->b_private = NULL; |
943 | bh->b_size = size; | 951 | bh->b_size = size; |
944 | 952 | ||
945 | /* Link the buffer to its page */ | 953 | /* Link the buffer to its page */ |
946 | set_bh_page(bh, page, offset); | 954 | set_bh_page(bh, page, offset); |
947 | 955 | ||
948 | init_buffer(bh, NULL, NULL); | 956 | init_buffer(bh, NULL, NULL); |
949 | } | 957 | } |
950 | return head; | 958 | return head; |
951 | /* | 959 | /* |
952 | * In case anything failed, we just free everything we got. | 960 | * In case anything failed, we just free everything we got. |
953 | */ | 961 | */ |
954 | no_grow: | 962 | no_grow: |
955 | if (head) { | 963 | if (head) { |
956 | do { | 964 | do { |
957 | bh = head; | 965 | bh = head; |
958 | head = head->b_this_page; | 966 | head = head->b_this_page; |
959 | free_buffer_head(bh); | 967 | free_buffer_head(bh); |
960 | } while (head); | 968 | } while (head); |
961 | } | 969 | } |
962 | 970 | ||
963 | /* | 971 | /* |
964 | * Return failure for non-async IO requests. Async IO requests | 972 | * Return failure for non-async IO requests. Async IO requests |
965 | * are not allowed to fail, so we have to wait until buffer heads | 973 | * are not allowed to fail, so we have to wait until buffer heads |
966 | * become available. But we don't want tasks sleeping with | 974 | * become available. But we don't want tasks sleeping with |
967 | * partially complete buffers, so all were released above. | 975 | * partially complete buffers, so all were released above. |
968 | */ | 976 | */ |
969 | if (!retry) | 977 | if (!retry) |
970 | return NULL; | 978 | return NULL; |
971 | 979 | ||
972 | /* We're _really_ low on memory. Now we just | 980 | /* We're _really_ low on memory. Now we just |
973 | * wait for old buffer heads to become free due to | 981 | * wait for old buffer heads to become free due to |
974 | * finishing IO. Since this is an async request and | 982 | * finishing IO. Since this is an async request and |
975 | * the reserve list is empty, we're sure there are | 983 | * the reserve list is empty, we're sure there are |
976 | * async buffer heads in use. | 984 | * async buffer heads in use. |
977 | */ | 985 | */ |
978 | free_more_memory(); | 986 | free_more_memory(); |
979 | goto try_again; | 987 | goto try_again; |
980 | } | 988 | } |
981 | EXPORT_SYMBOL_GPL(alloc_page_buffers); | 989 | EXPORT_SYMBOL_GPL(alloc_page_buffers); |
982 | 990 | ||
983 | static inline void | 991 | static inline void |
984 | link_dev_buffers(struct page *page, struct buffer_head *head) | 992 | link_dev_buffers(struct page *page, struct buffer_head *head) |
985 | { | 993 | { |
986 | struct buffer_head *bh, *tail; | 994 | struct buffer_head *bh, *tail; |
987 | 995 | ||
988 | bh = head; | 996 | bh = head; |
989 | do { | 997 | do { |
990 | tail = bh; | 998 | tail = bh; |
991 | bh = bh->b_this_page; | 999 | bh = bh->b_this_page; |
992 | } while (bh); | 1000 | } while (bh); |
993 | tail->b_this_page = head; | 1001 | tail->b_this_page = head; |
994 | attach_page_buffers(page, head); | 1002 | attach_page_buffers(page, head); |
995 | } | 1003 | } |
996 | 1004 | ||
997 | /* | 1005 | /* |
998 | * Initialise the state of a blockdev page's buffers. | 1006 | * Initialise the state of a blockdev page's buffers. |
999 | */ | 1007 | */ |
1000 | static void | 1008 | static void |
1001 | init_page_buffers(struct page *page, struct block_device *bdev, | 1009 | init_page_buffers(struct page *page, struct block_device *bdev, |
1002 | sector_t block, int size) | 1010 | sector_t block, int size) |
1003 | { | 1011 | { |
1004 | struct buffer_head *head = page_buffers(page); | 1012 | struct buffer_head *head = page_buffers(page); |
1005 | struct buffer_head *bh = head; | 1013 | struct buffer_head *bh = head; |
1006 | int uptodate = PageUptodate(page); | 1014 | int uptodate = PageUptodate(page); |
1007 | 1015 | ||
1008 | do { | 1016 | do { |
1009 | if (!buffer_mapped(bh)) { | 1017 | if (!buffer_mapped(bh)) { |
1010 | init_buffer(bh, NULL, NULL); | 1018 | init_buffer(bh, NULL, NULL); |
1011 | bh->b_bdev = bdev; | 1019 | bh->b_bdev = bdev; |
1012 | bh->b_blocknr = block; | 1020 | bh->b_blocknr = block; |
1013 | if (uptodate) | 1021 | if (uptodate) |
1014 | set_buffer_uptodate(bh); | 1022 | set_buffer_uptodate(bh); |
1015 | set_buffer_mapped(bh); | 1023 | set_buffer_mapped(bh); |
1016 | } | 1024 | } |
1017 | block++; | 1025 | block++; |
1018 | bh = bh->b_this_page; | 1026 | bh = bh->b_this_page; |
1019 | } while (bh != head); | 1027 | } while (bh != head); |
1020 | } | 1028 | } |
1021 | 1029 | ||
1022 | /* | 1030 | /* |
1023 | * Create the page-cache page that contains the requested block. | 1031 | * Create the page-cache page that contains the requested block. |
1024 | * | 1032 | * |
1025 | * This is used purely for blockdev mappings. | 1033 | * This is used purely for blockdev mappings. |
1026 | */ | 1034 | */ |
1027 | static struct page * | 1035 | static struct page * |
1028 | grow_dev_page(struct block_device *bdev, sector_t block, | 1036 | grow_dev_page(struct block_device *bdev, sector_t block, |
1029 | pgoff_t index, int size) | 1037 | pgoff_t index, int size) |
1030 | { | 1038 | { |
1031 | struct inode *inode = bdev->bd_inode; | 1039 | struct inode *inode = bdev->bd_inode; |
1032 | struct page *page; | 1040 | struct page *page; |
1033 | struct buffer_head *bh; | 1041 | struct buffer_head *bh; |
1034 | 1042 | ||
1035 | page = find_or_create_page(inode->i_mapping, index, | 1043 | page = find_or_create_page(inode->i_mapping, index, |
1036 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); | 1044 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); |
1037 | if (!page) | 1045 | if (!page) |
1038 | return NULL; | 1046 | return NULL; |
1039 | 1047 | ||
1040 | BUG_ON(!PageLocked(page)); | 1048 | BUG_ON(!PageLocked(page)); |
1041 | 1049 | ||
1042 | if (page_has_buffers(page)) { | 1050 | if (page_has_buffers(page)) { |
1043 | bh = page_buffers(page); | 1051 | bh = page_buffers(page); |
1044 | if (bh->b_size == size) { | 1052 | if (bh->b_size == size) { |
1045 | init_page_buffers(page, bdev, block, size); | 1053 | init_page_buffers(page, bdev, block, size); |
1046 | return page; | 1054 | return page; |
1047 | } | 1055 | } |
1048 | if (!try_to_free_buffers(page)) | 1056 | if (!try_to_free_buffers(page)) |
1049 | goto failed; | 1057 | goto failed; |
1050 | } | 1058 | } |
1051 | 1059 | ||
1052 | /* | 1060 | /* |
1053 | * Allocate some buffers for this page | 1061 | * Allocate some buffers for this page |
1054 | */ | 1062 | */ |
1055 | bh = alloc_page_buffers(page, size, 0); | 1063 | bh = alloc_page_buffers(page, size, 0); |
1056 | if (!bh) | 1064 | if (!bh) |
1057 | goto failed; | 1065 | goto failed; |
1058 | 1066 | ||
1059 | /* | 1067 | /* |
1060 | * Link the page to the buffers and initialise them. Take the | 1068 | * Link the page to the buffers and initialise them. Take the |
1061 | * lock to be atomic wrt __find_get_block(), which does not | 1069 | * lock to be atomic wrt __find_get_block(), which does not |
1062 | * run under the page lock. | 1070 | * run under the page lock. |
1063 | */ | 1071 | */ |
1064 | spin_lock(&inode->i_mapping->private_lock); | 1072 | spin_lock(&inode->i_mapping->private_lock); |
1065 | link_dev_buffers(page, bh); | 1073 | link_dev_buffers(page, bh); |
1066 | init_page_buffers(page, bdev, block, size); | 1074 | init_page_buffers(page, bdev, block, size); |
1067 | spin_unlock(&inode->i_mapping->private_lock); | 1075 | spin_unlock(&inode->i_mapping->private_lock); |
1068 | return page; | 1076 | return page; |
1069 | 1077 | ||
1070 | failed: | 1078 | failed: |
1071 | BUG(); | 1079 | BUG(); |
1072 | unlock_page(page); | 1080 | unlock_page(page); |
1073 | page_cache_release(page); | 1081 | page_cache_release(page); |
1074 | return NULL; | 1082 | return NULL; |
1075 | } | 1083 | } |
1076 | 1084 | ||
1077 | /* | 1085 | /* |
1078 | * Create buffers for the specified block device block's page. If | 1086 | * Create buffers for the specified block device block's page. If |
1079 | * that page was dirty, the buffers are set dirty also. | 1087 | * that page was dirty, the buffers are set dirty also. |
1080 | */ | 1088 | */ |
1081 | static int | 1089 | static int |
1082 | grow_buffers(struct block_device *bdev, sector_t block, int size) | 1090 | grow_buffers(struct block_device *bdev, sector_t block, int size) |
1083 | { | 1091 | { |
1084 | struct page *page; | 1092 | struct page *page; |
1085 | pgoff_t index; | 1093 | pgoff_t index; |
1086 | int sizebits; | 1094 | int sizebits; |
1087 | 1095 | ||
1088 | sizebits = -1; | 1096 | sizebits = -1; |
1089 | do { | 1097 | do { |
1090 | sizebits++; | 1098 | sizebits++; |
1091 | } while ((size << sizebits) < PAGE_SIZE); | 1099 | } while ((size << sizebits) < PAGE_SIZE); |
1092 | 1100 | ||
1093 | index = block >> sizebits; | 1101 | index = block >> sizebits; |
1094 | 1102 | ||
1095 | /* | 1103 | /* |
1096 | * Check for a block which wants to lie outside our maximum possible | 1104 | * Check for a block which wants to lie outside our maximum possible |
1097 | * pagecache index. (this comparison is done using sector_t types). | 1105 | * pagecache index. (this comparison is done using sector_t types). |
1098 | */ | 1106 | */ |
1099 | if (unlikely(index != block >> sizebits)) { | 1107 | if (unlikely(index != block >> sizebits)) { |
1100 | char b[BDEVNAME_SIZE]; | 1108 | char b[BDEVNAME_SIZE]; |
1101 | 1109 | ||
1102 | printk(KERN_ERR "%s: requested out-of-range block %llu for " | 1110 | printk(KERN_ERR "%s: requested out-of-range block %llu for " |
1103 | "device %s\n", | 1111 | "device %s\n", |
1104 | __func__, (unsigned long long)block, | 1112 | __func__, (unsigned long long)block, |
1105 | bdevname(bdev, b)); | 1113 | bdevname(bdev, b)); |
1106 | return -EIO; | 1114 | return -EIO; |
1107 | } | 1115 | } |
1108 | block = index << sizebits; | 1116 | block = index << sizebits; |
1109 | /* Create a page with the proper size buffers.. */ | 1117 | /* Create a page with the proper size buffers.. */ |
1110 | page = grow_dev_page(bdev, block, index, size); | 1118 | page = grow_dev_page(bdev, block, index, size); |
1111 | if (!page) | 1119 | if (!page) |
1112 | return 0; | 1120 | return 0; |
1113 | unlock_page(page); | 1121 | unlock_page(page); |
1114 | page_cache_release(page); | 1122 | page_cache_release(page); |
1115 | return 1; | 1123 | return 1; |
1116 | } | 1124 | } |
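As a worked example of the index arithmetic above (assuming 4 KiB pages): for size = 1024 the loop leaves sizebits = 2, so one page backs four blocks. A request for block 4001 therefore uses page index 4001 >> 2 = 1000, and grow_dev_page() is called with block = 1000 << 2 = 4000, the first block covered by that page.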
1117 | 1125 | ||
1118 | static struct buffer_head * | 1126 | static struct buffer_head * |
1119 | __getblk_slow(struct block_device *bdev, sector_t block, int size) | 1127 | __getblk_slow(struct block_device *bdev, sector_t block, int size) |
1120 | { | 1128 | { |
1121 | /* Size must be a multiple of the hard sector size */ | 1129 | /* Size must be a multiple of the hard sector size */ |
1122 | if (unlikely(size & (bdev_hardsect_size(bdev)-1) || | 1130 | if (unlikely(size & (bdev_hardsect_size(bdev)-1) || |
1123 | (size < 512 || size > PAGE_SIZE))) { | 1131 | (size < 512 || size > PAGE_SIZE))) { |
1124 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", | 1132 | printk(KERN_ERR "getblk(): invalid block size %d requested\n", |
1125 | size); | 1133 | size); |
1126 | printk(KERN_ERR "hardsect size: %d\n", | 1134 | printk(KERN_ERR "hardsect size: %d\n", |
1127 | bdev_hardsect_size(bdev)); | 1135 | bdev_hardsect_size(bdev)); |
1128 | 1136 | ||
1129 | dump_stack(); | 1137 | dump_stack(); |
1130 | return NULL; | 1138 | return NULL; |
1131 | } | 1139 | } |
1132 | 1140 | ||
1133 | for (;;) { | 1141 | for (;;) { |
1134 | struct buffer_head * bh; | 1142 | struct buffer_head * bh; |
1135 | int ret; | 1143 | int ret; |
1136 | 1144 | ||
1137 | bh = __find_get_block(bdev, block, size); | 1145 | bh = __find_get_block(bdev, block, size); |
1138 | if (bh) | 1146 | if (bh) |
1139 | return bh; | 1147 | return bh; |
1140 | 1148 | ||
1141 | ret = grow_buffers(bdev, block, size); | 1149 | ret = grow_buffers(bdev, block, size); |
1142 | if (ret < 0) | 1150 | if (ret < 0) |
1143 | return NULL; | 1151 | return NULL; |
1144 | if (ret == 0) | 1152 | if (ret == 0) |
1145 | free_more_memory(); | 1153 | free_more_memory(); |
1146 | } | 1154 | } |
1147 | } | 1155 | } |
1148 | 1156 | ||
1149 | /* | 1157 | /* |
1150 | * The relationship between dirty buffers and dirty pages: | 1158 | * The relationship between dirty buffers and dirty pages: |
1151 | * | 1159 | * |
1152 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and | 1160 | * Whenever a page has any dirty buffers, the page's dirty bit is set, and |
1153 | * the page is tagged dirty in its radix tree. | 1161 | * the page is tagged dirty in its radix tree. |
1154 | * | 1162 | * |
1155 | * At all times, the dirtiness of the buffers represents the dirtiness of | 1163 | * At all times, the dirtiness of the buffers represents the dirtiness of |
1156 | * subsections of the page. If the page has buffers, the page dirty bit is | 1164 | * subsections of the page. If the page has buffers, the page dirty bit is |
1157 | * merely a hint about the true dirty state. | 1165 | * merely a hint about the true dirty state. |
1158 | * | 1166 | * |
1159 | * When a page is set dirty in its entirety, all its buffers are marked dirty | 1167 | * When a page is set dirty in its entirety, all its buffers are marked dirty |
1160 | * (if the page has buffers). | 1168 | * (if the page has buffers). |
1161 | * | 1169 | * |
1162 | * When a buffer is marked dirty, its page is dirtied, but the page's other | 1170 | * When a buffer is marked dirty, its page is dirtied, but the page's other |
1163 | * buffers are not. | 1171 | * buffers are not. |
1164 | * | 1172 | * |
1165 | * Also: when blockdev buffers are explicitly read with bread(), they | 1173 | * Also: when blockdev buffers are explicitly read with bread(), they |
1166 | * individually become uptodate. But their backing page remains not | 1174 | * individually become uptodate. But their backing page remains not |
1167 | * uptodate - even if all of its buffers are uptodate. A subsequent | 1175 | * uptodate - even if all of its buffers are uptodate. A subsequent |
1168 | * block_read_full_page() against that page will discover all the uptodate | 1176 | * block_read_full_page() against that page will discover all the uptodate |
1169 | * buffers, will set the page uptodate and will perform no I/O. | 1177 | * buffers, will set the page uptodate and will perform no I/O. |
1170 | */ | 1178 | */ |
1171 | 1179 | ||
1172 | /** | 1180 | /** |
1173 | * mark_buffer_dirty - mark a buffer_head as needing writeout | 1181 | * mark_buffer_dirty - mark a buffer_head as needing writeout |
1174 | * @bh: the buffer_head to mark dirty | 1182 | * @bh: the buffer_head to mark dirty |
1175 | * | 1183 | * |
1176 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set its | 1184 | * mark_buffer_dirty() will set the dirty bit against the buffer, then set its |
1177 | * backing page dirty, then tag the page as dirty in its address_space's radix | 1185 | * backing page dirty, then tag the page as dirty in its address_space's radix |
1178 | * tree and then attach the address_space's inode to its superblock's dirty | 1186 | * tree and then attach the address_space's inode to its superblock's dirty |
1179 | * inode list. | 1187 | * inode list. |
1180 | * | 1188 | * |
1181 | * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, | 1189 | * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, |
1182 | * mapping->tree_lock and the global inode_lock. | 1190 | * mapping->tree_lock and the global inode_lock. |
1183 | */ | 1191 | */ |
1184 | void mark_buffer_dirty(struct buffer_head *bh) | 1192 | void mark_buffer_dirty(struct buffer_head *bh) |
1185 | { | 1193 | { |
1186 | WARN_ON_ONCE(!buffer_uptodate(bh)); | 1194 | WARN_ON_ONCE(!buffer_uptodate(bh)); |
1187 | 1195 | ||
1188 | /* | 1196 | /* |
1189 | * Very *carefully* optimize the it-is-already-dirty case. | 1197 | * Very *carefully* optimize the it-is-already-dirty case. |
1190 | * | 1198 | * |
1191 | * Don't let the final "is it dirty" escape to before we | 1199 | * Don't let the final "is it dirty" escape to before we |
1192 | * perhaps modified the buffer. | 1200 | * perhaps modified the buffer. |
1193 | */ | 1201 | */ |
1194 | if (buffer_dirty(bh)) { | 1202 | if (buffer_dirty(bh)) { |
1195 | smp_mb(); | 1203 | smp_mb(); |
1196 | if (buffer_dirty(bh)) | 1204 | if (buffer_dirty(bh)) |
1197 | return; | 1205 | return; |
1198 | } | 1206 | } |
1199 | 1207 | ||
1200 | if (!test_set_buffer_dirty(bh)) | 1208 | if (!test_set_buffer_dirty(bh)) |
1201 | __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); | 1209 | __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); |
1202 | } | 1210 | } |
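A minimal sketch of the read-modify-write pattern that ends in mark_buffer_dirty(); the myfs_ helper name and the field being updated are assumptions for illustration, only the buffer-layer calls are the real API:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <asm/byteorder.h>

/* sketch: update one 32-bit field at the start of a metadata block */
static int myfs_write_counter(struct super_block *sb, sector_t blocknr, u32 value)
{
        struct buffer_head *bh = sb_bread(sb, blocknr); /* NULL on I/O error */

        if (!bh)
                return -EIO;

        *(__le32 *)bh->b_data = cpu_to_le32(value);     /* modify the cached copy */
        mark_buffer_dirty(bh);          /* dirty the bh, its page and the inode */
        brelse(bh);                     /* periodic writeback writes it out */
        return 0;
}

If the change has to reach the disk before returning, sync_dirty_buffer(bh) can be called after mark_buffer_dirty() instead of relying on writeback.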
1203 | 1211 | ||
1204 | /* | 1212 | /* |
1205 | * Decrement a buffer_head's reference count. If all buffers against a page | 1213 | * Decrement a buffer_head's reference count. If all buffers against a page |
1206 | * have zero reference count, are clean and unlocked, and if the page is clean | 1214 | * have zero reference count, are clean and unlocked, and if the page is clean |
1207 | * and unlocked then try_to_free_buffers() may strip the buffers from the page | 1215 | * and unlocked then try_to_free_buffers() may strip the buffers from the page |
1208 | * in preparation for freeing it (sometimes, rarely, buffers are removed from | 1216 | * in preparation for freeing it (sometimes, rarely, buffers are removed from |
1209 | * a page but it ends up not being freed, and buffers may later be reattached). | 1217 | * a page but it ends up not being freed, and buffers may later be reattached). |
1210 | */ | 1218 | */ |
1211 | void __brelse(struct buffer_head * buf) | 1219 | void __brelse(struct buffer_head * buf) |
1212 | { | 1220 | { |
1213 | if (atomic_read(&buf->b_count)) { | 1221 | if (atomic_read(&buf->b_count)) { |
1214 | put_bh(buf); | 1222 | put_bh(buf); |
1215 | return; | 1223 | return; |
1216 | } | 1224 | } |
1217 | WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); | 1225 | WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); |
1218 | } | 1226 | } |
1219 | 1227 | ||
1220 | /* | 1228 | /* |
1221 | * bforget() is like brelse(), except it discards any | 1229 | * bforget() is like brelse(), except it discards any |
1222 | * potentially dirty data. | 1230 | * potentially dirty data. |
1223 | */ | 1231 | */ |
1224 | void __bforget(struct buffer_head *bh) | 1232 | void __bforget(struct buffer_head *bh) |
1225 | { | 1233 | { |
1226 | clear_buffer_dirty(bh); | 1234 | clear_buffer_dirty(bh); |
1227 | if (bh->b_assoc_map) { | 1235 | if (bh->b_assoc_map) { |
1228 | struct address_space *buffer_mapping = bh->b_page->mapping; | 1236 | struct address_space *buffer_mapping = bh->b_page->mapping; |
1229 | 1237 | ||
1230 | spin_lock(&buffer_mapping->private_lock); | 1238 | spin_lock(&buffer_mapping->private_lock); |
1231 | list_del_init(&bh->b_assoc_buffers); | 1239 | list_del_init(&bh->b_assoc_buffers); |
1232 | bh->b_assoc_map = NULL; | 1240 | bh->b_assoc_map = NULL; |
1233 | spin_unlock(&buffer_mapping->private_lock); | 1241 | spin_unlock(&buffer_mapping->private_lock); |
1234 | } | 1242 | } |
1235 | __brelse(bh); | 1243 | __brelse(bh); |
1236 | } | 1244 | } |
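As a concrete example of when this is the right call: if a filesystem frees an on-disk block whose cached buffer it dirtied earlier, dropping that buffer with bforget() rather than brelse() keeps the stale dirty data from ever being written over the freed block.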
1237 | 1245 | ||
1238 | static struct buffer_head *__bread_slow(struct buffer_head *bh) | 1246 | static struct buffer_head *__bread_slow(struct buffer_head *bh) |
1239 | { | 1247 | { |
1240 | lock_buffer(bh); | 1248 | lock_buffer(bh); |
1241 | if (buffer_uptodate(bh)) { | 1249 | if (buffer_uptodate(bh)) { |
1242 | unlock_buffer(bh); | 1250 | unlock_buffer(bh); |
1243 | return bh; | 1251 | return bh; |
1244 | } else { | 1252 | } else { |
1245 | get_bh(bh); | 1253 | get_bh(bh); |
1246 | bh->b_end_io = end_buffer_read_sync; | 1254 | bh->b_end_io = end_buffer_read_sync; |
1247 | submit_bh(READ, bh); | 1255 | submit_bh(READ, bh); |
1248 | wait_on_buffer(bh); | 1256 | wait_on_buffer(bh); |
1249 | if (buffer_uptodate(bh)) | 1257 | if (buffer_uptodate(bh)) |
1250 | return bh; | 1258 | return bh; |
1251 | } | 1259 | } |
1252 | brelse(bh); | 1260 | brelse(bh); |
1253 | return NULL; | 1261 | return NULL; |
1254 | } | 1262 | } |
1255 | 1263 | ||
1256 | /* | 1264 | /* |
1257 | * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block(). | 1265 | * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block(). |
1258 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their | 1266 | * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their |
1259 | * refcount elevated by one when they're in an LRU. A buffer can only appear | 1267 | * refcount elevated by one when they're in an LRU. A buffer can only appear |
1260 | * once in a particular CPU's LRU. A single buffer can be present in multiple | 1268 | * once in a particular CPU's LRU. A single buffer can be present in multiple |
1261 | * CPU's LRUs at the same time. | 1269 | * CPU's LRUs at the same time. |
1262 | * | 1270 | * |
1263 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and | 1271 | * This is a transparent caching front-end to sb_bread(), sb_getblk() and |
1264 | * sb_find_get_block(). | 1272 | * sb_find_get_block(). |
1265 | * | 1273 | * |
1266 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use | 1274 | * The LRUs themselves only need locking against invalidate_bh_lrus. We use |
1267 | * a local interrupt disable for that. | 1275 | * a local interrupt disable for that. |
1268 | */ | 1276 | */ |
1269 | 1277 | ||
1270 | #define BH_LRU_SIZE 8 | 1278 | #define BH_LRU_SIZE 8 |
1271 | 1279 | ||
1272 | struct bh_lru { | 1280 | struct bh_lru { |
1273 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1281 | struct buffer_head *bhs[BH_LRU_SIZE]; |
1274 | }; | 1282 | }; |
1275 | 1283 | ||
1276 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; | 1284 | static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; |
1277 | 1285 | ||
1278 | #ifdef CONFIG_SMP | 1286 | #ifdef CONFIG_SMP |
1279 | #define bh_lru_lock() local_irq_disable() | 1287 | #define bh_lru_lock() local_irq_disable() |
1280 | #define bh_lru_unlock() local_irq_enable() | 1288 | #define bh_lru_unlock() local_irq_enable() |
1281 | #else | 1289 | #else |
1282 | #define bh_lru_lock() preempt_disable() | 1290 | #define bh_lru_lock() preempt_disable() |
1283 | #define bh_lru_unlock() preempt_enable() | 1291 | #define bh_lru_unlock() preempt_enable() |
1284 | #endif | 1292 | #endif |
1285 | 1293 | ||
1286 | static inline void check_irqs_on(void) | 1294 | static inline void check_irqs_on(void) |
1287 | { | 1295 | { |
1288 | #ifdef irqs_disabled | 1296 | #ifdef irqs_disabled |
1289 | BUG_ON(irqs_disabled()); | 1297 | BUG_ON(irqs_disabled()); |
1290 | #endif | 1298 | #endif |
1291 | } | 1299 | } |
1292 | 1300 | ||
1293 | /* | 1301 | /* |
1294 | * The LRU management algorithm is dopey-but-simple. Sorry. | 1302 | * The LRU management algorithm is dopey-but-simple. Sorry. |
1295 | */ | 1303 | */ |
1296 | static void bh_lru_install(struct buffer_head *bh) | 1304 | static void bh_lru_install(struct buffer_head *bh) |
1297 | { | 1305 | { |
1298 | struct buffer_head *evictee = NULL; | 1306 | struct buffer_head *evictee = NULL; |
1299 | struct bh_lru *lru; | 1307 | struct bh_lru *lru; |
1300 | 1308 | ||
1301 | check_irqs_on(); | 1309 | check_irqs_on(); |
1302 | bh_lru_lock(); | 1310 | bh_lru_lock(); |
1303 | lru = &__get_cpu_var(bh_lrus); | 1311 | lru = &__get_cpu_var(bh_lrus); |
1304 | if (lru->bhs[0] != bh) { | 1312 | if (lru->bhs[0] != bh) { |
1305 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1313 | struct buffer_head *bhs[BH_LRU_SIZE]; |
1306 | int in; | 1314 | int in; |
1307 | int out = 0; | 1315 | int out = 0; |
1308 | 1316 | ||
1309 | get_bh(bh); | 1317 | get_bh(bh); |
1310 | bhs[out++] = bh; | 1318 | bhs[out++] = bh; |
1311 | for (in = 0; in < BH_LRU_SIZE; in++) { | 1319 | for (in = 0; in < BH_LRU_SIZE; in++) { |
1312 | struct buffer_head *bh2 = lru->bhs[in]; | 1320 | struct buffer_head *bh2 = lru->bhs[in]; |
1313 | 1321 | ||
1314 | if (bh2 == bh) { | 1322 | if (bh2 == bh) { |
1315 | __brelse(bh2); | 1323 | __brelse(bh2); |
1316 | } else { | 1324 | } else { |
1317 | if (out >= BH_LRU_SIZE) { | 1325 | if (out >= BH_LRU_SIZE) { |
1318 | BUG_ON(evictee != NULL); | 1326 | BUG_ON(evictee != NULL); |
1319 | evictee = bh2; | 1327 | evictee = bh2; |
1320 | } else { | 1328 | } else { |
1321 | bhs[out++] = bh2; | 1329 | bhs[out++] = bh2; |
1322 | } | 1330 | } |
1323 | } | 1331 | } |
1324 | } | 1332 | } |
1325 | while (out < BH_LRU_SIZE) | 1333 | while (out < BH_LRU_SIZE) |
1326 | bhs[out++] = NULL; | 1334 | bhs[out++] = NULL; |
1327 | memcpy(lru->bhs, bhs, sizeof(bhs)); | 1335 | memcpy(lru->bhs, bhs, sizeof(bhs)); |
1328 | } | 1336 | } |
1329 | bh_lru_unlock(); | 1337 | bh_lru_unlock(); |
1330 | 1338 | ||
1331 | if (evictee) | 1339 | if (evictee) |
1332 | __brelse(evictee); | 1340 | __brelse(evictee); |
1333 | } | 1341 | } |
1334 | 1342 | ||
1335 | /* | 1343 | /* |
1336 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. | 1344 | * Look up the bh in this cpu's LRU. If it's there, move it to the head. |
1337 | */ | 1345 | */ |
1338 | static struct buffer_head * | 1346 | static struct buffer_head * |
1339 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) | 1347 | lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) |
1340 | { | 1348 | { |
1341 | struct buffer_head *ret = NULL; | 1349 | struct buffer_head *ret = NULL; |
1342 | struct bh_lru *lru; | 1350 | struct bh_lru *lru; |
1343 | unsigned int i; | 1351 | unsigned int i; |
1344 | 1352 | ||
1345 | check_irqs_on(); | 1353 | check_irqs_on(); |
1346 | bh_lru_lock(); | 1354 | bh_lru_lock(); |
1347 | lru = &__get_cpu_var(bh_lrus); | 1355 | lru = &__get_cpu_var(bh_lrus); |
1348 | for (i = 0; i < BH_LRU_SIZE; i++) { | 1356 | for (i = 0; i < BH_LRU_SIZE; i++) { |
1349 | struct buffer_head *bh = lru->bhs[i]; | 1357 | struct buffer_head *bh = lru->bhs[i]; |
1350 | 1358 | ||
1351 | if (bh && bh->b_bdev == bdev && | 1359 | if (bh && bh->b_bdev == bdev && |
1352 | bh->b_blocknr == block && bh->b_size == size) { | 1360 | bh->b_blocknr == block && bh->b_size == size) { |
1353 | if (i) { | 1361 | if (i) { |
1354 | while (i) { | 1362 | while (i) { |
1355 | lru->bhs[i] = lru->bhs[i - 1]; | 1363 | lru->bhs[i] = lru->bhs[i - 1]; |
1356 | i--; | 1364 | i--; |
1357 | } | 1365 | } |
1358 | lru->bhs[0] = bh; | 1366 | lru->bhs[0] = bh; |
1359 | } | 1367 | } |
1360 | get_bh(bh); | 1368 | get_bh(bh); |
1361 | ret = bh; | 1369 | ret = bh; |
1362 | break; | 1370 | break; |
1363 | } | 1371 | } |
1364 | } | 1372 | } |
1365 | bh_lru_unlock(); | 1373 | bh_lru_unlock(); |
1366 | return ret; | 1374 | return ret; |
1367 | } | 1375 | } |
1368 | 1376 | ||
1369 | /* | 1377 | /* |
1370 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh | 1378 | * Perform a pagecache lookup for the matching buffer. If it's there, refresh |
1371 | * it in the LRU and mark it as accessed. If it is not present then return | 1379 | * it in the LRU and mark it as accessed. If it is not present then return |
1372 | * NULL | 1380 | * NULL |
1373 | */ | 1381 | */ |
1374 | struct buffer_head * | 1382 | struct buffer_head * |
1375 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) | 1383 | __find_get_block(struct block_device *bdev, sector_t block, unsigned size) |
1376 | { | 1384 | { |
1377 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); | 1385 | struct buffer_head *bh = lookup_bh_lru(bdev, block, size); |
1378 | 1386 | ||
1379 | if (bh == NULL) { | 1387 | if (bh == NULL) { |
1380 | bh = __find_get_block_slow(bdev, block); | 1388 | bh = __find_get_block_slow(bdev, block); |
1381 | if (bh) | 1389 | if (bh) |
1382 | bh_lru_install(bh); | 1390 | bh_lru_install(bh); |
1383 | } | 1391 | } |
1384 | if (bh) | 1392 | if (bh) |
1385 | touch_buffer(bh); | 1393 | touch_buffer(bh); |
1386 | return bh; | 1394 | return bh; |
1387 | } | 1395 | } |
1388 | EXPORT_SYMBOL(__find_get_block); | 1396 | EXPORT_SYMBOL(__find_get_block); |
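A short sketch of the lookup-only semantics: __find_get_block() never creates buffers and never issues I/O, so it can be used to peek at cache state. The helper name and the 4096-byte block size are assumptions:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* sketch: is this block already cached and dirty? (no allocation, no I/O) */
static int myfs_block_is_dirty(struct block_device *bdev, sector_t blocknr)
{
        struct buffer_head *bh = __find_get_block(bdev, blocknr, 4096);
        int dirty = 0;

        if (bh) {
                dirty = buffer_dirty(bh);
                brelse(bh);             /* drop the reference the lookup took */
        }
        return dirty;
}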
1389 | 1397 | ||
1390 | /* | 1398 | /* |
1391 | * __getblk will locate (and, if necessary, create) the buffer_head | 1399 | * __getblk will locate (and, if necessary, create) the buffer_head |
1392 | * which corresponds to the passed block_device, block and size. The | 1400 | * which corresponds to the passed block_device, block and size. The |
1393 | * returned buffer has its reference count incremented. | 1401 | * returned buffer has its reference count incremented. |
1394 | * | 1402 | * |
1395 | * __getblk() cannot fail - it just keeps trying. If you pass it an | 1403 | * __getblk() cannot fail - it just keeps trying. If you pass it an |
1396 | * illegal block number, __getblk() will happily return a buffer_head | 1404 | * illegal block number, __getblk() will happily return a buffer_head |
1397 | * which represents the non-existent block. Very weird. | 1405 | * which represents the non-existent block. Very weird. |
1398 | * | 1406 | * |
1399 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() | 1407 | * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() |
1400 | * attempt is failing. FIXME, perhaps? | 1408 | * attempt is failing. FIXME, perhaps? |
1401 | */ | 1409 | */ |
1402 | struct buffer_head * | 1410 | struct buffer_head * |
1403 | __getblk(struct block_device *bdev, sector_t block, unsigned size) | 1411 | __getblk(struct block_device *bdev, sector_t block, unsigned size) |
1404 | { | 1412 | { |
1405 | struct buffer_head *bh = __find_get_block(bdev, block, size); | 1413 | struct buffer_head *bh = __find_get_block(bdev, block, size); |
1406 | 1414 | ||
1407 | might_sleep(); | 1415 | might_sleep(); |
1408 | if (bh == NULL) | 1416 | if (bh == NULL) |
1409 | bh = __getblk_slow(bdev, block, size); | 1417 | bh = __getblk_slow(bdev, block, size); |
1410 | return bh; | 1418 | return bh; |
1411 | } | 1419 | } |
1412 | EXPORT_SYMBOL(__getblk); | 1420 | EXPORT_SYMBOL(__getblk); |
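A hedged sketch of the pattern this behaviour serves: when the caller is about to overwrite the whole block there is nothing worth reading, so sb_getblk()/__getblk() is used instead of __bread(). The myfs_ helper is hypothetical; the buffer-layer calls are real:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/string.h>

/* sketch: zero out a block we are about to reuse - no read needed */
static int myfs_zero_block(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh = sb_getblk(sb, blocknr);        /* no disk read */
        int err;

        if (!bh)                        /* only happens for an invalid size */
                return -EIO;

        lock_buffer(bh);
        memset(bh->b_data, 0, bh->b_size);
        set_buffer_uptodate(bh);        /* the in-memory copy is now the truth */
        unlock_buffer(bh);

        mark_buffer_dirty(bh);
        err = sync_dirty_buffer(bh);    /* optional: push it out right away */
        brelse(bh);
        return err;
}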
1413 | 1421 | ||
1414 | /* | 1422 | /* |
1415 | * Do async read-ahead on a buffer.. | 1423 | * Do async read-ahead on a buffer.. |
1416 | */ | 1424 | */ |
1417 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) | 1425 | void __breadahead(struct block_device *bdev, sector_t block, unsigned size) |
1418 | { | 1426 | { |
1419 | struct buffer_head *bh = __getblk(bdev, block, size); | 1427 | struct buffer_head *bh = __getblk(bdev, block, size); |
1420 | if (likely(bh)) { | 1428 | if (likely(bh)) { |
1421 | ll_rw_block(READA, 1, &bh); | 1429 | ll_rw_block(READA, 1, &bh); |
1422 | brelse(bh); | 1430 | brelse(bh); |
1423 | } | 1431 | } |
1424 | } | 1432 | } |
1425 | EXPORT_SYMBOL(__breadahead); | 1433 | EXPORT_SYMBOL(__breadahead); |
1426 | 1434 | ||
1427 | /** | 1435 | /** |
1428 | * __bread() - reads a specified block and returns the bh | 1436 | * __bread() - reads a specified block and returns the bh |
1429 | * @bdev: the block_device to read from | 1437 | * @bdev: the block_device to read from |
1430 | * @block: number of block | 1438 | * @block: number of block |
1431 | * @size: size (in bytes) to read | 1439 | * @size: size (in bytes) to read |
1432 | * | 1440 | * |
1433 | * Reads a specified block, and returns buffer head that contains it. | 1441 | * Reads a specified block, and returns buffer head that contains it. |
1434 | * It returns NULL if the block was unreadable. | 1442 | * It returns NULL if the block was unreadable. |
1435 | */ | 1443 | */ |
1436 | struct buffer_head * | 1444 | struct buffer_head * |
1437 | __bread(struct block_device *bdev, sector_t block, unsigned size) | 1445 | __bread(struct block_device *bdev, sector_t block, unsigned size) |
1438 | { | 1446 | { |
1439 | struct buffer_head *bh = __getblk(bdev, block, size); | 1447 | struct buffer_head *bh = __getblk(bdev, block, size); |
1440 | 1448 | ||
1441 | if (likely(bh) && !buffer_uptodate(bh)) | 1449 | if (likely(bh) && !buffer_uptodate(bh)) |
1442 | bh = __bread_slow(bh); | 1450 | bh = __bread_slow(bh); |
1443 | return bh; | 1451 | return bh; |
1444 | } | 1452 | } |
1445 | EXPORT_SYMBOL(__bread); | 1453 | EXPORT_SYMBOL(__bread); |
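A small illustrative sketch combining the two calls above: kick off asynchronous read-ahead for the block we expect to need next, then read the current one synchronously. The helper name and the sequential-access assumption are made up:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* sketch: read blocknr and hint that blocknr + 1 is wanted soon */
static struct buffer_head *myfs_read_seq_block(struct super_block *sb,
                                               sector_t blocknr)
{
        sb_breadahead(sb, blocknr + 1);         /* async, nobody waits on it */
        return sb_bread(sb, blocknr);           /* NULL if the block is unreadable */
}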
1446 | 1454 | ||
1447 | /* | 1455 | /* |
1448 | * invalidate_bh_lrus() is called rarely - but not only at unmount. | 1456 | * invalidate_bh_lrus() is called rarely - but not only at unmount. |
1449 | * This doesn't race because it runs in each cpu either in irq | 1457 | * This doesn't race because it runs in each cpu either in irq |
1450 | * or with preempt disabled. | 1458 | * or with preempt disabled. |
1451 | */ | 1459 | */ |
1452 | static void invalidate_bh_lru(void *arg) | 1460 | static void invalidate_bh_lru(void *arg) |
1453 | { | 1461 | { |
1454 | struct bh_lru *b = &get_cpu_var(bh_lrus); | 1462 | struct bh_lru *b = &get_cpu_var(bh_lrus); |
1455 | int i; | 1463 | int i; |
1456 | 1464 | ||
1457 | for (i = 0; i < BH_LRU_SIZE; i++) { | 1465 | for (i = 0; i < BH_LRU_SIZE; i++) { |
1458 | brelse(b->bhs[i]); | 1466 | brelse(b->bhs[i]); |
1459 | b->bhs[i] = NULL; | 1467 | b->bhs[i] = NULL; |
1460 | } | 1468 | } |
1461 | put_cpu_var(bh_lrus); | 1469 | put_cpu_var(bh_lrus); |
1462 | } | 1470 | } |
1463 | 1471 | ||
1464 | void invalidate_bh_lrus(void) | 1472 | void invalidate_bh_lrus(void) |
1465 | { | 1473 | { |
1466 | on_each_cpu(invalidate_bh_lru, NULL, 1); | 1474 | on_each_cpu(invalidate_bh_lru, NULL, 1); |
1467 | } | 1475 | } |
1468 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); | 1476 | EXPORT_SYMBOL_GPL(invalidate_bh_lrus); |
1469 | 1477 | ||
1470 | void set_bh_page(struct buffer_head *bh, | 1478 | void set_bh_page(struct buffer_head *bh, |
1471 | struct page *page, unsigned long offset) | 1479 | struct page *page, unsigned long offset) |
1472 | { | 1480 | { |
1473 | bh->b_page = page; | 1481 | bh->b_page = page; |
1474 | BUG_ON(offset >= PAGE_SIZE); | 1482 | BUG_ON(offset >= PAGE_SIZE); |
1475 | if (PageHighMem(page)) | 1483 | if (PageHighMem(page)) |
1476 | /* | 1484 | /* |
1477 | * This catches illegal uses and preserves the offset: | 1485 | * This catches illegal uses and preserves the offset: |
1478 | */ | 1486 | */ |
1479 | bh->b_data = (char *)(0 + offset); | 1487 | bh->b_data = (char *)(0 + offset); |
1480 | else | 1488 | else |
1481 | bh->b_data = page_address(page) + offset; | 1489 | bh->b_data = page_address(page) + offset; |
1482 | } | 1490 | } |
1483 | EXPORT_SYMBOL(set_bh_page); | 1491 | EXPORT_SYMBOL(set_bh_page); |
1484 | 1492 | ||
1485 | /* | 1493 | /* |
1486 | * Called when truncating a buffer on a page completely. | 1494 | * Called when truncating a buffer on a page completely. |
1487 | */ | 1495 | */ |
1488 | static void discard_buffer(struct buffer_head * bh) | 1496 | static void discard_buffer(struct buffer_head * bh) |
1489 | { | 1497 | { |
1490 | lock_buffer(bh); | 1498 | lock_buffer(bh); |
1491 | clear_buffer_dirty(bh); | 1499 | clear_buffer_dirty(bh); |
1492 | bh->b_bdev = NULL; | 1500 | bh->b_bdev = NULL; |
1493 | clear_buffer_mapped(bh); | 1501 | clear_buffer_mapped(bh); |
1494 | clear_buffer_req(bh); | 1502 | clear_buffer_req(bh); |
1495 | clear_buffer_new(bh); | 1503 | clear_buffer_new(bh); |
1496 | clear_buffer_delay(bh); | 1504 | clear_buffer_delay(bh); |
1497 | clear_buffer_unwritten(bh); | 1505 | clear_buffer_unwritten(bh); |
1498 | unlock_buffer(bh); | 1506 | unlock_buffer(bh); |
1499 | } | 1507 | } |
1500 | 1508 | ||
1501 | /** | 1509 | /** |
1502 | * block_invalidatepage - invalidate part or all of a buffer-backed page | 1510 | * block_invalidatepage - invalidate part or all of a buffer-backed page |
1503 | * | 1511 | * |
1504 | * @page: the page which is affected | 1512 | * @page: the page which is affected |
1505 | * @offset: the index of the truncation point | 1513 | * @offset: the index of the truncation point |
1506 | * | 1514 | * |
1507 | * block_invalidatepage() is called when all or part of the page has become | 1515 | * block_invalidatepage() is called when all or part of the page has become |
1508 | * invalidated by a truncate operation. | 1516 | * invalidated by a truncate operation. |
1509 | * | 1517 | * |
1510 | * block_invalidatepage() does not have to release all buffers, but it must | 1518 | * block_invalidatepage() does not have to release all buffers, but it must |
1511 | * ensure that no dirty buffer is left outside @offset and that no I/O | 1519 | * ensure that no dirty buffer is left outside @offset and that no I/O |
1512 | * is underway against any of the blocks which are outside the truncation | 1520 | * is underway against any of the blocks which are outside the truncation |
1513 | * point. Because the caller is about to free (and possibly reuse) those | 1521 | * point. Because the caller is about to free (and possibly reuse) those |
1514 | * blocks on-disk. | 1522 | * blocks on-disk. |
1515 | */ | 1523 | */ |
1516 | void block_invalidatepage(struct page *page, unsigned long offset) | 1524 | void block_invalidatepage(struct page *page, unsigned long offset) |
1517 | { | 1525 | { |
1518 | struct buffer_head *head, *bh, *next; | 1526 | struct buffer_head *head, *bh, *next; |
1519 | unsigned int curr_off = 0; | 1527 | unsigned int curr_off = 0; |
1520 | 1528 | ||
1521 | BUG_ON(!PageLocked(page)); | 1529 | BUG_ON(!PageLocked(page)); |
1522 | if (!page_has_buffers(page)) | 1530 | if (!page_has_buffers(page)) |
1523 | goto out; | 1531 | goto out; |
1524 | 1532 | ||
1525 | head = page_buffers(page); | 1533 | head = page_buffers(page); |
1526 | bh = head; | 1534 | bh = head; |
1527 | do { | 1535 | do { |
1528 | unsigned int next_off = curr_off + bh->b_size; | 1536 | unsigned int next_off = curr_off + bh->b_size; |
1529 | next = bh->b_this_page; | 1537 | next = bh->b_this_page; |
1530 | 1538 | ||
1531 | /* | 1539 | /* |
1532 | * is this block fully invalidated? | 1540 | * is this block fully invalidated? |
1533 | */ | 1541 | */ |
1534 | if (offset <= curr_off) | 1542 | if (offset <= curr_off) |
1535 | discard_buffer(bh); | 1543 | discard_buffer(bh); |
1536 | curr_off = next_off; | 1544 | curr_off = next_off; |
1537 | bh = next; | 1545 | bh = next; |
1538 | } while (bh != head); | 1546 | } while (bh != head); |
1539 | 1547 | ||
1540 | /* | 1548 | /* |
1541 | * We release buffers only if the entire page is being invalidated. | 1549 | * We release buffers only if the entire page is being invalidated. |
1542 | * The get_block cached value has been unconditionally invalidated, | 1550 | * The get_block cached value has been unconditionally invalidated, |
1543 | * so real IO is not possible anymore. | 1551 | * so real IO is not possible anymore. |
1544 | */ | 1552 | */ |
1545 | if (offset == 0) | 1553 | if (offset == 0) |
1546 | try_to_release_page(page, 0); | 1554 | try_to_release_page(page, 0); |
1547 | out: | 1555 | out: |
1548 | return; | 1556 | return; |
1549 | } | 1557 | } |
1550 | EXPORT_SYMBOL(block_invalidatepage); | 1558 | EXPORT_SYMBOL(block_invalidatepage); |
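For context, a hedged sketch of where this hook sits: truncate reaches ->invalidatepage through the address_space_operations, and block_invalidatepage() is the stock implementation for plain buffer-backed pages (it is also the fallback used when the method is left NULL). A filesystem with no extra per-page state could wire it up directly; the aops name is made up and the other methods are elided:

#include <linux/fs.h>
#include <linux/buffer_head.h>

static const struct address_space_operations myfs_aops = {
        /* .readpage, .writepage, .write_begin, .write_end, ... */
        .invalidatepage = block_invalidatepage,
};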
1551 | 1559 | ||
1552 | /* | 1560 | /* |
1553 | * We attach and possibly dirty the buffers atomically wrt | 1561 | * We attach and possibly dirty the buffers atomically wrt |
1554 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers | 1562 | * __set_page_dirty_buffers() via private_lock. try_to_free_buffers |
1555 | * is already excluded via the page lock. | 1563 | * is already excluded via the page lock. |
1556 | */ | 1564 | */ |
1557 | void create_empty_buffers(struct page *page, | 1565 | void create_empty_buffers(struct page *page, |
1558 | unsigned long blocksize, unsigned long b_state) | 1566 | unsigned long blocksize, unsigned long b_state) |
1559 | { | 1567 | { |
1560 | struct buffer_head *bh, *head, *tail; | 1568 | struct buffer_head *bh, *head, *tail; |
1561 | 1569 | ||
1562 | head = alloc_page_buffers(page, blocksize, 1); | 1570 | head = alloc_page_buffers(page, blocksize, 1); |
1563 | bh = head; | 1571 | bh = head; |
1564 | do { | 1572 | do { |
1565 | bh->b_state |= b_state; | 1573 | bh->b_state |= b_state; |
1566 | tail = bh; | 1574 | tail = bh; |
1567 | bh = bh->b_this_page; | 1575 | bh = bh->b_this_page; |
1568 | } while (bh); | 1576 | } while (bh); |
1569 | tail->b_this_page = head; | 1577 | tail->b_this_page = head; |
1570 | 1578 | ||
1571 | spin_lock(&page->mapping->private_lock); | 1579 | spin_lock(&page->mapping->private_lock); |
1572 | if (PageUptodate(page) || PageDirty(page)) { | 1580 | if (PageUptodate(page) || PageDirty(page)) { |
1573 | bh = head; | 1581 | bh = head; |
1574 | do { | 1582 | do { |
1575 | if (PageDirty(page)) | 1583 | if (PageDirty(page)) |
1576 | set_buffer_dirty(bh); | 1584 | set_buffer_dirty(bh); |
1577 | if (PageUptodate(page)) | 1585 | if (PageUptodate(page)) |
1578 | set_buffer_uptodate(bh); | 1586 | set_buffer_uptodate(bh); |
1579 | bh = bh->b_this_page; | 1587 | bh = bh->b_this_page; |
1580 | } while (bh != head); | 1588 | } while (bh != head); |
1581 | } | 1589 | } |
1582 | attach_page_buffers(page, head); | 1590 | attach_page_buffers(page, head); |
1583 | spin_unlock(&page->mapping->private_lock); | 1591 | spin_unlock(&page->mapping->private_lock); |
1584 | } | 1592 | } |
1585 | EXPORT_SYMBOL(create_empty_buffers); | 1593 | EXPORT_SYMBOL(create_empty_buffers); |
1586 | 1594 | ||
1587 | /* | 1595 | /* |
1588 | * We are taking a block for data and we don't want any output from any | 1596 | * We are taking a block for data and we don't want any output from any |
1589 | * buffer-cache aliases starting from return from that function and | 1597 | * buffer-cache aliases starting from return from that function and |
1590 | * until the moment when something will explicitly mark the buffer | 1598 | * until the moment when something will explicitly mark the buffer |
1591 | * dirty (hopefully that will not happen until we free that block ;-) | 1599 | * dirty (hopefully that will not happen until we free that block ;-) |
1592 | * We don't even need to mark it not-uptodate - nobody can expect | 1600 | * We don't even need to mark it not-uptodate - nobody can expect |
1593 | * anything from a newly allocated buffer anyway. We used to use | 1601 | * anything from a newly allocated buffer anyway. We used to use |
1594 | * unmap_buffer() for such invalidation, but that was wrong. We definitely | 1602 | * unmap_buffer() for such invalidation, but that was wrong. We definitely |
1595 | * don't want to mark the alias unmapped, for example - it would confuse | 1603 | * don't want to mark the alias unmapped, for example - it would confuse |
1596 | * anyone who might pick it with bread() afterwards... | 1604 | * anyone who might pick it with bread() afterwards... |
1597 | * | 1605 | * |
1598 | * Also.. Note that bforget() doesn't lock the buffer. So there can | 1606 | * Also.. Note that bforget() doesn't lock the buffer. So there can |
1599 | * be writeout I/O going on against recently-freed buffers. We don't | 1607 | * be writeout I/O going on against recently-freed buffers. We don't |
1600 | * wait on that I/O in bforget() - it's more efficient to wait on the I/O | 1608 | * wait on that I/O in bforget() - it's more efficient to wait on the I/O |
1601 | * only if we really need to. That happens here. | 1609 | * only if we really need to. That happens here. |
1602 | */ | 1610 | */ |
1603 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) | 1611 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block) |
1604 | { | 1612 | { |
1605 | struct buffer_head *old_bh; | 1613 | struct buffer_head *old_bh; |
1606 | 1614 | ||
1607 | might_sleep(); | 1615 | might_sleep(); |
1608 | 1616 | ||
1609 | old_bh = __find_get_block_slow(bdev, block); | 1617 | old_bh = __find_get_block_slow(bdev, block); |
1610 | if (old_bh) { | 1618 | if (old_bh) { |
1611 | clear_buffer_dirty(old_bh); | 1619 | clear_buffer_dirty(old_bh); |
1612 | wait_on_buffer(old_bh); | 1620 | wait_on_buffer(old_bh); |
1613 | clear_buffer_req(old_bh); | 1621 | clear_buffer_req(old_bh); |
1614 | __brelse(old_bh); | 1622 | __brelse(old_bh); |
1615 | } | 1623 | } |
1616 | } | 1624 | } |
1617 | EXPORT_SYMBOL(unmap_underlying_metadata); | 1625 | EXPORT_SYMBOL(unmap_underlying_metadata); |
1618 | 1626 | ||
1619 | /* | 1627 | /* |
1620 | * NOTE! All mapped/uptodate combinations are valid: | 1628 | * NOTE! All mapped/uptodate combinations are valid: |
1621 | * | 1629 | * |
1622 | * Mapped Uptodate Meaning | 1630 | * Mapped Uptodate Meaning |
1623 | * | 1631 | * |
1624 | * No No "unknown" - must do get_block() | 1632 | * No No "unknown" - must do get_block() |
1625 | * No Yes "hole" - zero-filled | 1633 | * No Yes "hole" - zero-filled |
1626 | * Yes No "allocated" - allocated on disk, not read in | 1634 | * Yes No "allocated" - allocated on disk, not read in |
1627 | * Yes Yes "valid" - allocated and up-to-date in memory. | 1635 | * Yes Yes "valid" - allocated and up-to-date in memory. |
1628 | * | 1636 | * |
1629 | * "Dirty" is valid only with the last case (mapped+uptodate). | 1637 | * "Dirty" is valid only with the last case (mapped+uptodate). |
1630 | */ | 1638 | */ |
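A hedged sketch of what the table means to a read path; the function and parameter names below are illustrative, not an existing kernel interface:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* sketch: bring one buffer into a readable state, per the table above */
static int myfs_make_bh_readable(struct inode *inode, sector_t iblock,
                                 struct buffer_head *bh, get_block_t *get_block)
{
        if (!buffer_mapped(bh) && !buffer_uptodate(bh))
                return get_block(inode, iblock, bh, 0); /* "unknown": ask the fs */

        if (buffer_mapped(bh) && !buffer_uptodate(bh)) {
                ll_rw_block(READ, 1, &bh);              /* "allocated": read it in */
                wait_on_buffer(bh);
                return buffer_uptodate(bh) ? 0 : -EIO;
        }

        /* "hole" (zero-filled) or "valid": the in-memory data is already usable */
        return 0;
}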
1631 | 1639 | ||
1632 | /* | 1640 | /* |
1633 | * While block_write_full_page is writing back the dirty buffers under | 1641 | * While block_write_full_page is writing back the dirty buffers under |
1634 | * the page lock, whoever dirtied the buffers may decide to clean them | 1642 | * the page lock, whoever dirtied the buffers may decide to clean them |
1635 | * again at any time. We handle that by only looking at the buffer | 1643 | * again at any time. We handle that by only looking at the buffer |
1636 | * state inside lock_buffer(). | 1644 | * state inside lock_buffer(). |
1637 | * | 1645 | * |
1638 | * If block_write_full_page() is called for regular writeback | 1646 | * If block_write_full_page() is called for regular writeback |
1639 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a | 1647 | * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a |
1640 | * locked buffer. This only can happen if someone has written the buffer | 1648 | * locked buffer. This only can happen if someone has written the buffer |
1641 | * directly, with submit_bh(). At the address_space level PageWriteback | 1649 | * directly, with submit_bh(). At the address_space level PageWriteback |
1642 | * prevents this contention from occurring. | 1650 | * prevents this contention from occurring. |
1643 | */ | 1651 | */ |
1644 | static int __block_write_full_page(struct inode *inode, struct page *page, | 1652 | static int __block_write_full_page(struct inode *inode, struct page *page, |
1645 | get_block_t *get_block, struct writeback_control *wbc) | 1653 | get_block_t *get_block, struct writeback_control *wbc) |
1646 | { | 1654 | { |
1647 | int err; | 1655 | int err; |
1648 | sector_t block; | 1656 | sector_t block; |
1649 | sector_t last_block; | 1657 | sector_t last_block; |
1650 | struct buffer_head *bh, *head; | 1658 | struct buffer_head *bh, *head; |
1651 | const unsigned blocksize = 1 << inode->i_blkbits; | 1659 | const unsigned blocksize = 1 << inode->i_blkbits; |
1652 | int nr_underway = 0; | 1660 | int nr_underway = 0; |
1653 | 1661 | ||
1654 | BUG_ON(!PageLocked(page)); | 1662 | BUG_ON(!PageLocked(page)); |
1655 | 1663 | ||
1656 | last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; | 1664 | last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; |
1657 | 1665 | ||
1658 | if (!page_has_buffers(page)) { | 1666 | if (!page_has_buffers(page)) { |
1659 | create_empty_buffers(page, blocksize, | 1667 | create_empty_buffers(page, blocksize, |
1660 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | 1668 | (1 << BH_Dirty)|(1 << BH_Uptodate)); |
1661 | } | 1669 | } |
1662 | 1670 | ||
1663 | /* | 1671 | /* |
1664 | * Be very careful. We have no exclusion from __set_page_dirty_buffers | 1672 | * Be very careful. We have no exclusion from __set_page_dirty_buffers |
1665 | * here, and the (potentially unmapped) buffers may become dirty at | 1673 | * here, and the (potentially unmapped) buffers may become dirty at |
1666 | * any time. If a buffer becomes dirty here after we've inspected it | 1674 | * any time. If a buffer becomes dirty here after we've inspected it |
1667 | * then we just miss that fact, and the page stays dirty. | 1675 | * then we just miss that fact, and the page stays dirty. |
1668 | * | 1676 | * |
1669 | * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; | 1677 | * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; |
1670 | * handle that here by just cleaning them. | 1678 | * handle that here by just cleaning them. |
1671 | */ | 1679 | */ |
1672 | 1680 | ||
1673 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 1681 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
1674 | head = page_buffers(page); | 1682 | head = page_buffers(page); |
1675 | bh = head; | 1683 | bh = head; |
1676 | 1684 | ||
1677 | /* | 1685 | /* |
1678 | * Get all the dirty buffers mapped to disk addresses and | 1686 | * Get all the dirty buffers mapped to disk addresses and |
1679 | * handle any aliases from the underlying blockdev's mapping. | 1687 | * handle any aliases from the underlying blockdev's mapping. |
1680 | */ | 1688 | */ |
1681 | do { | 1689 | do { |
1682 | if (block > last_block) { | 1690 | if (block > last_block) { |
1683 | /* | 1691 | /* |
1684 | * mapped buffers outside i_size will occur, because | 1692 | * mapped buffers outside i_size will occur, because |
1685 | * this page can be outside i_size when there is a | 1693 | * this page can be outside i_size when there is a |
1686 | * truncate in progress. | 1694 | * truncate in progress. |
1687 | */ | 1695 | */ |
1688 | /* | 1696 | /* |
1689 | * The buffer was zeroed by block_write_full_page() | 1697 | * The buffer was zeroed by block_write_full_page() |
1690 | */ | 1698 | */ |
1691 | clear_buffer_dirty(bh); | 1699 | clear_buffer_dirty(bh); |
1692 | set_buffer_uptodate(bh); | 1700 | set_buffer_uptodate(bh); |
1693 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && | 1701 | } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && |
1694 | buffer_dirty(bh)) { | 1702 | buffer_dirty(bh)) { |
1695 | WARN_ON(bh->b_size != blocksize); | 1703 | WARN_ON(bh->b_size != blocksize); |
1696 | err = get_block(inode, block, bh, 1); | 1704 | err = get_block(inode, block, bh, 1); |
1697 | if (err) | 1705 | if (err) |
1698 | goto recover; | 1706 | goto recover; |
1699 | clear_buffer_delay(bh); | 1707 | clear_buffer_delay(bh); |
1700 | if (buffer_new(bh)) { | 1708 | if (buffer_new(bh)) { |
1701 | /* blockdev mappings never come here */ | 1709 | /* blockdev mappings never come here */ |
1702 | clear_buffer_new(bh); | 1710 | clear_buffer_new(bh); |
1703 | unmap_underlying_metadata(bh->b_bdev, | 1711 | unmap_underlying_metadata(bh->b_bdev, |
1704 | bh->b_blocknr); | 1712 | bh->b_blocknr); |
1705 | } | 1713 | } |
1706 | } | 1714 | } |
1707 | bh = bh->b_this_page; | 1715 | bh = bh->b_this_page; |
1708 | block++; | 1716 | block++; |
1709 | } while (bh != head); | 1717 | } while (bh != head); |
1710 | 1718 | ||
1711 | do { | 1719 | do { |
1712 | if (!buffer_mapped(bh)) | 1720 | if (!buffer_mapped(bh)) |
1713 | continue; | 1721 | continue; |
1714 | /* | 1722 | /* |
1715 | * If it's a fully non-blocking write attempt and we cannot | 1723 | * If it's a fully non-blocking write attempt and we cannot |
1716 | * lock the buffer then redirty the page. Note that this can | 1724 | * lock the buffer then redirty the page. Note that this can |
1717 | * potentially cause a busy-wait loop from pdflush and kswapd | 1725 | * potentially cause a busy-wait loop from pdflush and kswapd |
1718 | * activity, but those code paths have their own higher-level | 1726 | * activity, but those code paths have their own higher-level |
1719 | * throttling. | 1727 | * throttling. |
1720 | */ | 1728 | */ |
1721 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 1729 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { |
1722 | lock_buffer(bh); | 1730 | lock_buffer(bh); |
1723 | } else if (!trylock_buffer(bh)) { | 1731 | } else if (!trylock_buffer(bh)) { |
1724 | redirty_page_for_writepage(wbc, page); | 1732 | redirty_page_for_writepage(wbc, page); |
1725 | continue; | 1733 | continue; |
1726 | } | 1734 | } |
1727 | if (test_clear_buffer_dirty(bh)) { | 1735 | if (test_clear_buffer_dirty(bh)) { |
1728 | mark_buffer_async_write(bh); | 1736 | mark_buffer_async_write(bh); |
1729 | } else { | 1737 | } else { |
1730 | unlock_buffer(bh); | 1738 | unlock_buffer(bh); |
1731 | } | 1739 | } |
1732 | } while ((bh = bh->b_this_page) != head); | 1740 | } while ((bh = bh->b_this_page) != head); |
1733 | 1741 | ||
1734 | /* | 1742 | /* |
1735 | * The page and its buffers are protected by PageWriteback(), so we can | 1743 | * The page and its buffers are protected by PageWriteback(), so we can |
1736 | * drop the bh refcounts early. | 1744 | * drop the bh refcounts early. |
1737 | */ | 1745 | */ |
1738 | BUG_ON(PageWriteback(page)); | 1746 | BUG_ON(PageWriteback(page)); |
1739 | set_page_writeback(page); | 1747 | set_page_writeback(page); |
1740 | 1748 | ||
1741 | do { | 1749 | do { |
1742 | struct buffer_head *next = bh->b_this_page; | 1750 | struct buffer_head *next = bh->b_this_page; |
1743 | if (buffer_async_write(bh)) { | 1751 | if (buffer_async_write(bh)) { |
1744 | submit_bh(WRITE, bh); | 1752 | submit_bh(WRITE, bh); |
1745 | nr_underway++; | 1753 | nr_underway++; |
1746 | } | 1754 | } |
1747 | bh = next; | 1755 | bh = next; |
1748 | } while (bh != head); | 1756 | } while (bh != head); |
1749 | unlock_page(page); | 1757 | unlock_page(page); |
1750 | 1758 | ||
1751 | err = 0; | 1759 | err = 0; |
1752 | done: | 1760 | done: |
1753 | if (nr_underway == 0) { | 1761 | if (nr_underway == 0) { |
1754 | /* | 1762 | /* |
1755 | * The page was marked dirty, but the buffers were | 1763 | * The page was marked dirty, but the buffers were |
1756 | * clean. Someone wrote them back by hand with | 1764 | * clean. Someone wrote them back by hand with |
1757 | * ll_rw_block/submit_bh. A rare case. | 1765 | * ll_rw_block/submit_bh. A rare case. |
1758 | */ | 1766 | */ |
1759 | end_page_writeback(page); | 1767 | end_page_writeback(page); |
1760 | 1768 | ||
1761 | /* | 1769 | /* |
1762 | * The page and buffer_heads can be released at any time from | 1770 | * The page and buffer_heads can be released at any time from |
1763 | * here on. | 1771 | * here on. |
1764 | */ | 1772 | */ |
1765 | } | 1773 | } |
1766 | return err; | 1774 | return err; |
1767 | 1775 | ||
1768 | recover: | 1776 | recover: |
1769 | /* | 1777 | /* |
1770 | * ENOSPC, or some other error. We may already have added some | 1778 | * ENOSPC, or some other error. We may already have added some |
1771 | * blocks to the file, so we need to write these out to avoid | 1779 | * blocks to the file, so we need to write these out to avoid |
1772 | * exposing stale data. | 1780 | * exposing stale data. |
1773 | * The page is currently locked and not marked for writeback | 1781 | * The page is currently locked and not marked for writeback |
1774 | */ | 1782 | */ |
1775 | bh = head; | 1783 | bh = head; |
1776 | /* Recovery: lock and submit the mapped buffers */ | 1784 | /* Recovery: lock and submit the mapped buffers */ |
1777 | do { | 1785 | do { |
1778 | if (buffer_mapped(bh) && buffer_dirty(bh) && | 1786 | if (buffer_mapped(bh) && buffer_dirty(bh) && |
1779 | !buffer_delay(bh)) { | 1787 | !buffer_delay(bh)) { |
1780 | lock_buffer(bh); | 1788 | lock_buffer(bh); |
1781 | mark_buffer_async_write(bh); | 1789 | mark_buffer_async_write(bh); |
1782 | } else { | 1790 | } else { |
1783 | /* | 1791 | /* |
1784 | * The buffer may have been set dirty during | 1792 | * The buffer may have been set dirty during |
1785 | * attachment to a dirty page. | 1793 | * attachment to a dirty page. |
1786 | */ | 1794 | */ |
1787 | clear_buffer_dirty(bh); | 1795 | clear_buffer_dirty(bh); |
1788 | } | 1796 | } |
1789 | } while ((bh = bh->b_this_page) != head); | 1797 | } while ((bh = bh->b_this_page) != head); |
1790 | SetPageError(page); | 1798 | SetPageError(page); |
1791 | BUG_ON(PageWriteback(page)); | 1799 | BUG_ON(PageWriteback(page)); |
1792 | mapping_set_error(page->mapping, err); | 1800 | mapping_set_error(page->mapping, err); |
1793 | set_page_writeback(page); | 1801 | set_page_writeback(page); |
1794 | do { | 1802 | do { |
1795 | struct buffer_head *next = bh->b_this_page; | 1803 | struct buffer_head *next = bh->b_this_page; |
1796 | if (buffer_async_write(bh)) { | 1804 | if (buffer_async_write(bh)) { |
1797 | clear_buffer_dirty(bh); | 1805 | clear_buffer_dirty(bh); |
1798 | submit_bh(WRITE, bh); | 1806 | submit_bh(WRITE, bh); |
1799 | nr_underway++; | 1807 | nr_underway++; |
1800 | } | 1808 | } |
1801 | bh = next; | 1809 | bh = next; |
1802 | } while (bh != head); | 1810 | } while (bh != head); |
1803 | unlock_page(page); | 1811 | unlock_page(page); |
1804 | goto done; | 1812 | goto done; |
1805 | } | 1813 | } |
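For orientation, a minimal sketch of how a filesystem normally reaches the helper above: its ->writepage forwards to block_write_full_page() with its own get_block routine. Everything prefixed myfs_ is hypothetical, and the identity block mapping is only a stand-in:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* hypothetical mapping routine: file block i happens to be disk block i */
static int myfs_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create)
{
        map_bh(bh_result, inode->i_sb, iblock);
        return 0;
}

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, myfs_get_block, wbc);
}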
1806 | 1814 | ||
1807 | /* | 1815 | /* |
1808 | * If a page has any new buffers, zero them out here, and mark them uptodate | 1816 | * If a page has any new buffers, zero them out here, and mark them uptodate |
1809 | * and dirty so they'll be written out (in order to prevent uninitialised | 1817 | * and dirty so they'll be written out (in order to prevent uninitialised |
1810 | * block data from leaking). And clear the new bit. | 1818 | * block data from leaking). And clear the new bit. |
1811 | */ | 1819 | */ |
1812 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) | 1820 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) |
1813 | { | 1821 | { |
1814 | unsigned int block_start, block_end; | 1822 | unsigned int block_start, block_end; |
1815 | struct buffer_head *head, *bh; | 1823 | struct buffer_head *head, *bh; |
1816 | 1824 | ||
1817 | BUG_ON(!PageLocked(page)); | 1825 | BUG_ON(!PageLocked(page)); |
1818 | if (!page_has_buffers(page)) | 1826 | if (!page_has_buffers(page)) |
1819 | return; | 1827 | return; |
1820 | 1828 | ||
1821 | bh = head = page_buffers(page); | 1829 | bh = head = page_buffers(page); |
1822 | block_start = 0; | 1830 | block_start = 0; |
1823 | do { | 1831 | do { |
1824 | block_end = block_start + bh->b_size; | 1832 | block_end = block_start + bh->b_size; |
1825 | 1833 | ||
1826 | if (buffer_new(bh)) { | 1834 | if (buffer_new(bh)) { |
1827 | if (block_end > from && block_start < to) { | 1835 | if (block_end > from && block_start < to) { |
1828 | if (!PageUptodate(page)) { | 1836 | if (!PageUptodate(page)) { |
1829 | unsigned start, size; | 1837 | unsigned start, size; |
1830 | 1838 | ||
1831 | start = max(from, block_start); | 1839 | start = max(from, block_start); |
1832 | size = min(to, block_end) - start; | 1840 | size = min(to, block_end) - start; |
1833 | 1841 | ||
1834 | zero_user(page, start, size); | 1842 | zero_user(page, start, size); |
1835 | set_buffer_uptodate(bh); | 1843 | set_buffer_uptodate(bh); |
1836 | } | 1844 | } |
1837 | 1845 | ||
1838 | clear_buffer_new(bh); | 1846 | clear_buffer_new(bh); |
1839 | mark_buffer_dirty(bh); | 1847 | mark_buffer_dirty(bh); |
1840 | } | 1848 | } |
1841 | } | 1849 | } |
1842 | 1850 | ||
1843 | block_start = block_end; | 1851 | block_start = block_end; |
1844 | bh = bh->b_this_page; | 1852 | bh = bh->b_this_page; |
1845 | } while (bh != head); | 1853 | } while (bh != head); |
1846 | } | 1854 | } |
1847 | EXPORT_SYMBOL(page_zero_new_buffers); | 1855 | EXPORT_SYMBOL(page_zero_new_buffers); |
1848 | 1856 | ||
1849 | static int __block_prepare_write(struct inode *inode, struct page *page, | 1857 | static int __block_prepare_write(struct inode *inode, struct page *page, |
1850 | unsigned from, unsigned to, get_block_t *get_block) | 1858 | unsigned from, unsigned to, get_block_t *get_block) |
1851 | { | 1859 | { |
1852 | unsigned block_start, block_end; | 1860 | unsigned block_start, block_end; |
1853 | sector_t block; | 1861 | sector_t block; |
1854 | int err = 0; | 1862 | int err = 0; |
1855 | unsigned blocksize, bbits; | 1863 | unsigned blocksize, bbits; |
1856 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; | 1864 | struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; |
1857 | 1865 | ||
1858 | BUG_ON(!PageLocked(page)); | 1866 | BUG_ON(!PageLocked(page)); |
1859 | BUG_ON(from > PAGE_CACHE_SIZE); | 1867 | BUG_ON(from > PAGE_CACHE_SIZE); |
1860 | BUG_ON(to > PAGE_CACHE_SIZE); | 1868 | BUG_ON(to > PAGE_CACHE_SIZE); |
1861 | BUG_ON(from > to); | 1869 | BUG_ON(from > to); |
1862 | 1870 | ||
1863 | blocksize = 1 << inode->i_blkbits; | 1871 | blocksize = 1 << inode->i_blkbits; |
1864 | if (!page_has_buffers(page)) | 1872 | if (!page_has_buffers(page)) |
1865 | create_empty_buffers(page, blocksize, 0); | 1873 | create_empty_buffers(page, blocksize, 0); |
1866 | head = page_buffers(page); | 1874 | head = page_buffers(page); |
1867 | 1875 | ||
1868 | bbits = inode->i_blkbits; | 1876 | bbits = inode->i_blkbits; |
1869 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); | 1877 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); |
1870 | 1878 | ||
1871 | for(bh = head, block_start = 0; bh != head || !block_start; | 1879 | for(bh = head, block_start = 0; bh != head || !block_start; |
1872 | block++, block_start=block_end, bh = bh->b_this_page) { | 1880 | block++, block_start=block_end, bh = bh->b_this_page) { |
1873 | block_end = block_start + blocksize; | 1881 | block_end = block_start + blocksize; |
1874 | if (block_end <= from || block_start >= to) { | 1882 | if (block_end <= from || block_start >= to) { |
1875 | if (PageUptodate(page)) { | 1883 | if (PageUptodate(page)) { |
1876 | if (!buffer_uptodate(bh)) | 1884 | if (!buffer_uptodate(bh)) |
1877 | set_buffer_uptodate(bh); | 1885 | set_buffer_uptodate(bh); |
1878 | } | 1886 | } |
1879 | continue; | 1887 | continue; |
1880 | } | 1888 | } |
1881 | if (buffer_new(bh)) | 1889 | if (buffer_new(bh)) |
1882 | clear_buffer_new(bh); | 1890 | clear_buffer_new(bh); |
1883 | if (!buffer_mapped(bh)) { | 1891 | if (!buffer_mapped(bh)) { |
1884 | WARN_ON(bh->b_size != blocksize); | 1892 | WARN_ON(bh->b_size != blocksize); |
1885 | err = get_block(inode, block, bh, 1); | 1893 | err = get_block(inode, block, bh, 1); |
1886 | if (err) | 1894 | if (err) |
1887 | break; | 1895 | break; |
1888 | if (buffer_new(bh)) { | 1896 | if (buffer_new(bh)) { |
1889 | unmap_underlying_metadata(bh->b_bdev, | 1897 | unmap_underlying_metadata(bh->b_bdev, |
1890 | bh->b_blocknr); | 1898 | bh->b_blocknr); |
1891 | if (PageUptodate(page)) { | 1899 | if (PageUptodate(page)) { |
1892 | clear_buffer_new(bh); | 1900 | clear_buffer_new(bh); |
1893 | set_buffer_uptodate(bh); | 1901 | set_buffer_uptodate(bh); |
1894 | mark_buffer_dirty(bh); | 1902 | mark_buffer_dirty(bh); |
1895 | continue; | 1903 | continue; |
1896 | } | 1904 | } |
1897 | if (block_end > to || block_start < from) | 1905 | if (block_end > to || block_start < from) |
1898 | zero_user_segments(page, | 1906 | zero_user_segments(page, |
1899 | to, block_end, | 1907 | to, block_end, |
1900 | block_start, from); | 1908 | block_start, from); |
1901 | continue; | 1909 | continue; |
1902 | } | 1910 | } |
1903 | } | 1911 | } |
1904 | if (PageUptodate(page)) { | 1912 | if (PageUptodate(page)) { |
1905 | if (!buffer_uptodate(bh)) | 1913 | if (!buffer_uptodate(bh)) |
1906 | set_buffer_uptodate(bh); | 1914 | set_buffer_uptodate(bh); |
1907 | continue; | 1915 | continue; |
1908 | } | 1916 | } |
1909 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && | 1917 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && |
1910 | !buffer_unwritten(bh) && | 1918 | !buffer_unwritten(bh) && |
1911 | (block_start < from || block_end > to)) { | 1919 | (block_start < from || block_end > to)) { |
1912 | ll_rw_block(READ, 1, &bh); | 1920 | ll_rw_block(READ, 1, &bh); |
1913 | *wait_bh++=bh; | 1921 | *wait_bh++=bh; |
1914 | } | 1922 | } |
1915 | } | 1923 | } |
1916 | /* | 1924 | /* |
1917 | * If we issued read requests - let them complete. | 1925 | * If we issued read requests - let them complete. |
1918 | */ | 1926 | */ |
1919 | while(wait_bh > wait) { | 1927 | while(wait_bh > wait) { |
1920 | wait_on_buffer(*--wait_bh); | 1928 | wait_on_buffer(*--wait_bh); |
1921 | if (!buffer_uptodate(*wait_bh)) | 1929 | if (!buffer_uptodate(*wait_bh)) |
1922 | err = -EIO; | 1930 | err = -EIO; |
1923 | } | 1931 | } |
1924 | if (unlikely(err)) | 1932 | if (unlikely(err)) |
1925 | page_zero_new_buffers(page, from, to); | 1933 | page_zero_new_buffers(page, from, to); |
1926 | return err; | 1934 | return err; |
1927 | } | 1935 | } |
1928 | 1936 | ||
1929 | static int __block_commit_write(struct inode *inode, struct page *page, | 1937 | static int __block_commit_write(struct inode *inode, struct page *page, |
1930 | unsigned from, unsigned to) | 1938 | unsigned from, unsigned to) |
1931 | { | 1939 | { |
1932 | unsigned block_start, block_end; | 1940 | unsigned block_start, block_end; |
1933 | int partial = 0; | 1941 | int partial = 0; |
1934 | unsigned blocksize; | 1942 | unsigned blocksize; |
1935 | struct buffer_head *bh, *head; | 1943 | struct buffer_head *bh, *head; |
1936 | 1944 | ||
1937 | blocksize = 1 << inode->i_blkbits; | 1945 | blocksize = 1 << inode->i_blkbits; |
1938 | 1946 | ||
1939 | for(bh = head = page_buffers(page), block_start = 0; | 1947 | for(bh = head = page_buffers(page), block_start = 0; |
1940 | bh != head || !block_start; | 1948 | bh != head || !block_start; |
1941 | block_start=block_end, bh = bh->b_this_page) { | 1949 | block_start=block_end, bh = bh->b_this_page) { |
1942 | block_end = block_start + blocksize; | 1950 | block_end = block_start + blocksize; |
1943 | if (block_end <= from || block_start >= to) { | 1951 | if (block_end <= from || block_start >= to) { |
1944 | if (!buffer_uptodate(bh)) | 1952 | if (!buffer_uptodate(bh)) |
1945 | partial = 1; | 1953 | partial = 1; |
1946 | } else { | 1954 | } else { |
1947 | set_buffer_uptodate(bh); | 1955 | set_buffer_uptodate(bh); |
1948 | mark_buffer_dirty(bh); | 1956 | mark_buffer_dirty(bh); |
1949 | } | 1957 | } |
1950 | clear_buffer_new(bh); | 1958 | clear_buffer_new(bh); |
1951 | } | 1959 | } |
1952 | 1960 | ||
1953 | /* | 1961 | /* |
1954 | * If this is a partial write which happened to make all buffers | 1962 | * If this is a partial write which happened to make all buffers |
1955 | * uptodate then we can optimize away a bogus readpage() for | 1963 | * uptodate then we can optimize away a bogus readpage() for |
1956 | * the next read(). Here we 'discover' whether the page went | 1964 | * the next read(). Here we 'discover' whether the page went |
1957 | * uptodate as a result of this (potentially partial) write. | 1965 | * uptodate as a result of this (potentially partial) write. |
1958 | */ | 1966 | */ |
1959 | if (!partial) | 1967 | if (!partial) |
1960 | SetPageUptodate(page); | 1968 | SetPageUptodate(page); |
1961 | return 0; | 1969 | return 0; |
1962 | } | 1970 | } |
1963 | 1971 | ||
1964 | /* | 1972 | /* |
1965 | * block_write_begin takes care of the basic task of block allocation and | 1973 | * block_write_begin takes care of the basic task of block allocation and |
1966 | * bringing partial write blocks uptodate first. | 1974 | * bringing partial write blocks uptodate first. |
1967 | * | 1975 | * |
1968 | * If *pagep is not NULL, then block_write_begin uses the locked page | 1976 | * If *pagep is not NULL, then block_write_begin uses the locked page |
1969 | * at *pagep rather than allocating its own. In this case, the page will | 1977 | * at *pagep rather than allocating its own. In this case, the page will |
1970 | * not be unlocked or deallocated on failure. | 1978 | * not be unlocked or deallocated on failure. |
1971 | */ | 1979 | */ |
1972 | int block_write_begin(struct file *file, struct address_space *mapping, | 1980 | int block_write_begin(struct file *file, struct address_space *mapping, |
1973 | loff_t pos, unsigned len, unsigned flags, | 1981 | loff_t pos, unsigned len, unsigned flags, |
1974 | struct page **pagep, void **fsdata, | 1982 | struct page **pagep, void **fsdata, |
1975 | get_block_t *get_block) | 1983 | get_block_t *get_block) |
1976 | { | 1984 | { |
1977 | struct inode *inode = mapping->host; | 1985 | struct inode *inode = mapping->host; |
1978 | int status = 0; | 1986 | int status = 0; |
1979 | struct page *page; | 1987 | struct page *page; |
1980 | pgoff_t index; | 1988 | pgoff_t index; |
1981 | unsigned start, end; | 1989 | unsigned start, end; |
1982 | int ownpage = 0; | 1990 | int ownpage = 0; |
1983 | 1991 | ||
1984 | index = pos >> PAGE_CACHE_SHIFT; | 1992 | index = pos >> PAGE_CACHE_SHIFT; |
1985 | start = pos & (PAGE_CACHE_SIZE - 1); | 1993 | start = pos & (PAGE_CACHE_SIZE - 1); |
1986 | end = start + len; | 1994 | end = start + len; |
1987 | 1995 | ||
1988 | page = *pagep; | 1996 | page = *pagep; |
1989 | if (page == NULL) { | 1997 | if (page == NULL) { |
1990 | ownpage = 1; | 1998 | ownpage = 1; |
1991 | page = __grab_cache_page(mapping, index); | 1999 | page = __grab_cache_page(mapping, index); |
1992 | if (!page) { | 2000 | if (!page) { |
1993 | status = -ENOMEM; | 2001 | status = -ENOMEM; |
1994 | goto out; | 2002 | goto out; |
1995 | } | 2003 | } |
1996 | *pagep = page; | 2004 | *pagep = page; |
1997 | } else | 2005 | } else |
1998 | BUG_ON(!PageLocked(page)); | 2006 | BUG_ON(!PageLocked(page)); |
1999 | 2007 | ||
2000 | status = __block_prepare_write(inode, page, start, end, get_block); | 2008 | status = __block_prepare_write(inode, page, start, end, get_block); |
2001 | if (unlikely(status)) { | 2009 | if (unlikely(status)) { |
2002 | ClearPageUptodate(page); | 2010 | ClearPageUptodate(page); |
2003 | 2011 | ||
2004 | if (ownpage) { | 2012 | if (ownpage) { |
2005 | unlock_page(page); | 2013 | unlock_page(page); |
2006 | page_cache_release(page); | 2014 | page_cache_release(page); |
2007 | *pagep = NULL; | 2015 | *pagep = NULL; |
2008 | 2016 | ||
2009 | /* | 2017 | /* |
2010 | * prepare_write() may have instantiated a few blocks | 2018 | * prepare_write() may have instantiated a few blocks |
2011 | * outside i_size. Trim these off again. Don't need | 2019 | * outside i_size. Trim these off again. Don't need |
2012 | * i_size_read because we hold i_mutex. | 2020 | * i_size_read because we hold i_mutex. |
2013 | */ | 2021 | */ |
2014 | if (pos + len > inode->i_size) | 2022 | if (pos + len > inode->i_size) |
2015 | vmtruncate(inode, inode->i_size); | 2023 | vmtruncate(inode, inode->i_size); |
2016 | } | 2024 | } |
2017 | goto out; | 2025 | goto out; |
2018 | } | 2026 | } |
2019 | 2027 | ||
2020 | out: | 2028 | out: |
2021 | return status; | 2029 | return status; |
2022 | } | 2030 | } |
2023 | EXPORT_SYMBOL(block_write_begin); | 2031 | EXPORT_SYMBOL(block_write_begin); |
2024 | 2032 | ||
2025 | int block_write_end(struct file *file, struct address_space *mapping, | 2033 | int block_write_end(struct file *file, struct address_space *mapping, |
2026 | loff_t pos, unsigned len, unsigned copied, | 2034 | loff_t pos, unsigned len, unsigned copied, |
2027 | struct page *page, void *fsdata) | 2035 | struct page *page, void *fsdata) |
2028 | { | 2036 | { |
2029 | struct inode *inode = mapping->host; | 2037 | struct inode *inode = mapping->host; |
2030 | unsigned start; | 2038 | unsigned start; |
2031 | 2039 | ||
2032 | start = pos & (PAGE_CACHE_SIZE - 1); | 2040 | start = pos & (PAGE_CACHE_SIZE - 1); |
2033 | 2041 | ||
2034 | if (unlikely(copied < len)) { | 2042 | if (unlikely(copied < len)) { |
2035 | /* | 2043 | /* |
2036 | * The buffers that were written will now be uptodate, so we | 2044 | * The buffers that were written will now be uptodate, so we |
2037 | * don't have to worry about a readpage reading them and | 2045 | * don't have to worry about a readpage reading them and |
2038 | * overwriting a partial write. However if we have encountered | 2046 | * overwriting a partial write. However if we have encountered |
2039 | * a short write and only partially written into a buffer, it | 2047 | * a short write and only partially written into a buffer, it |
2040 | * will not be marked uptodate, so a readpage might come in and | 2048 | * will not be marked uptodate, so a readpage might come in and |
2041 | * destroy our partial write. | 2049 | * destroy our partial write. |
2042 | * | 2050 | * |
2043 | * Do the simplest thing, and just treat any short write to a | 2051 | * Do the simplest thing, and just treat any short write to a |
2044 | * non uptodate page as a zero-length write, and force the | 2052 | * non uptodate page as a zero-length write, and force the |
2045 | * caller to redo the whole thing. | 2053 | * caller to redo the whole thing. |
2046 | */ | 2054 | */ |
2047 | if (!PageUptodate(page)) | 2055 | if (!PageUptodate(page)) |
2048 | copied = 0; | 2056 | copied = 0; |
2049 | 2057 | ||
2050 | page_zero_new_buffers(page, start+copied, start+len); | 2058 | page_zero_new_buffers(page, start+copied, start+len); |
2051 | } | 2059 | } |
2052 | flush_dcache_page(page); | 2060 | flush_dcache_page(page); |
2053 | 2061 | ||
2054 | /* This could be a short (even 0-length) commit */ | 2062 | /* This could be a short (even 0-length) commit */ |
2055 | __block_commit_write(inode, page, start, start+copied); | 2063 | __block_commit_write(inode, page, start, start+copied); |
2056 | 2064 | ||
2057 | return copied; | 2065 | return copied; |
2058 | } | 2066 | } |
2059 | EXPORT_SYMBOL(block_write_end); | 2067 | EXPORT_SYMBOL(block_write_end); |
2060 | 2068 | ||
2061 | int generic_write_end(struct file *file, struct address_space *mapping, | 2069 | int generic_write_end(struct file *file, struct address_space *mapping, |
2062 | loff_t pos, unsigned len, unsigned copied, | 2070 | loff_t pos, unsigned len, unsigned copied, |
2063 | struct page *page, void *fsdata) | 2071 | struct page *page, void *fsdata) |
2064 | { | 2072 | { |
2065 | struct inode *inode = mapping->host; | 2073 | struct inode *inode = mapping->host; |
2066 | int i_size_changed = 0; | 2074 | int i_size_changed = 0; |
2067 | 2075 | ||
2068 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | 2076 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); |
2069 | 2077 | ||
2070 | /* | 2078 | /* |
2071 | * No need to use i_size_read() here, the i_size | 2079 | * No need to use i_size_read() here, the i_size |
2072 | * cannot change under us because we hold i_mutex. | 2080 | * cannot change under us because we hold i_mutex. |
2073 | * | 2081 | * |
2074 | * But it's important to update i_size while still holding page lock: | 2082 | * But it's important to update i_size while still holding page lock: |
2075 | * page writeout could otherwise come in and zero beyond i_size. | 2083 | * page writeout could otherwise come in and zero beyond i_size. |
2076 | */ | 2084 | */ |
2077 | if (pos+copied > inode->i_size) { | 2085 | if (pos+copied > inode->i_size) { |
2078 | i_size_write(inode, pos+copied); | 2086 | i_size_write(inode, pos+copied); |
2079 | i_size_changed = 1; | 2087 | i_size_changed = 1; |
2080 | } | 2088 | } |
2081 | 2089 | ||
2082 | unlock_page(page); | 2090 | unlock_page(page); |
2083 | page_cache_release(page); | 2091 | page_cache_release(page); |
2084 | 2092 | ||
2085 | /* | 2093 | /* |
2086 | * Don't mark the inode dirty under page lock. First, it unnecessarily | 2094 | * Don't mark the inode dirty under page lock. First, it unnecessarily |
2087 | * makes the holding time of page lock longer. Second, it forces lock | 2095 | * makes the holding time of page lock longer. Second, it forces lock |
2088 | * ordering of page lock and transaction start for journaling | 2096 | * ordering of page lock and transaction start for journaling |
2089 | * filesystems. | 2097 | * filesystems. |
2090 | */ | 2098 | */ |
2091 | if (i_size_changed) | 2099 | if (i_size_changed) |
2092 | mark_inode_dirty(inode); | 2100 | mark_inode_dirty(inode); |
2093 | 2101 | ||
2094 | return copied; | 2102 | return copied; |
2095 | } | 2103 | } |
2096 | EXPORT_SYMBOL(generic_write_end); | 2104 | EXPORT_SYMBOL(generic_write_end); |
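For context (a sketch, not part of this commit): block_write_begin() and generic_write_end() are the usual building blocks behind a filesystem's ->write_begin/->write_end address_space operations. A minimal wiring might look like the following, where myfs_get_block() and myfs_readpage() are hypothetical stand-ins for a real filesystem's block mapper and readpage:

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	/* Let block_write_begin() grab and lock its own page cache page. */
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
};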
2097 | 2105 | ||
2098 | /* | 2106 | /* |
2099 | * block_is_partially_uptodate checks whether buffers within a page are | 2107 | * block_is_partially_uptodate checks whether buffers within a page are |
2100 | * uptodate or not. | 2108 | * uptodate or not. |
2101 | * | 2109 | * |
2102 | * Returns true if all buffers which correspond to a file portion | 2110 | * Returns true if all buffers which correspond to a file portion |
2103 | * we want to read are uptodate. | 2111 | * we want to read are uptodate. |
2104 | */ | 2112 | */ |
2105 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | 2113 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, |
2106 | unsigned long from) | 2114 | unsigned long from) |
2107 | { | 2115 | { |
2108 | struct inode *inode = page->mapping->host; | 2116 | struct inode *inode = page->mapping->host; |
2109 | unsigned block_start, block_end, blocksize; | 2117 | unsigned block_start, block_end, blocksize; |
2110 | unsigned to; | 2118 | unsigned to; |
2111 | struct buffer_head *bh, *head; | 2119 | struct buffer_head *bh, *head; |
2112 | int ret = 1; | 2120 | int ret = 1; |
2113 | 2121 | ||
2114 | if (!page_has_buffers(page)) | 2122 | if (!page_has_buffers(page)) |
2115 | return 0; | 2123 | return 0; |
2116 | 2124 | ||
2117 | blocksize = 1 << inode->i_blkbits; | 2125 | blocksize = 1 << inode->i_blkbits; |
2118 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); | 2126 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); |
2119 | to = from + to; | 2127 | to = from + to; |
2120 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) | 2128 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) |
2121 | return 0; | 2129 | return 0; |
2122 | 2130 | ||
2123 | head = page_buffers(page); | 2131 | head = page_buffers(page); |
2124 | bh = head; | 2132 | bh = head; |
2125 | block_start = 0; | 2133 | block_start = 0; |
2126 | do { | 2134 | do { |
2127 | block_end = block_start + blocksize; | 2135 | block_end = block_start + blocksize; |
2128 | if (block_end > from && block_start < to) { | 2136 | if (block_end > from && block_start < to) { |
2129 | if (!buffer_uptodate(bh)) { | 2137 | if (!buffer_uptodate(bh)) { |
2130 | ret = 0; | 2138 | ret = 0; |
2131 | break; | 2139 | break; |
2132 | } | 2140 | } |
2133 | if (block_end >= to) | 2141 | if (block_end >= to) |
2134 | break; | 2142 | break; |
2135 | } | 2143 | } |
2136 | block_start = block_end; | 2144 | block_start = block_end; |
2137 | bh = bh->b_this_page; | 2145 | bh = bh->b_this_page; |
2138 | } while (bh != head); | 2146 | } while (bh != head); |
2139 | 2147 | ||
2140 | return ret; | 2148 | return ret; |
2141 | } | 2149 | } |
2142 | EXPORT_SYMBOL(block_is_partially_uptodate); | 2150 | EXPORT_SYMBOL(block_is_partially_uptodate); |
2143 | 2151 | ||
2144 | /* | 2152 | /* |
2145 | * Generic "read page" function for block devices that have the normal | 2153 | * Generic "read page" function for block devices that have the normal |
2146 | * get_block functionality. This is most of the block device filesystems. | 2154 | * get_block functionality. This is most of the block device filesystems. |
2147 | * Reads the page asynchronously --- the unlock_buffer() and | 2155 | * Reads the page asynchronously --- the unlock_buffer() and |
2148 | * set/clear_buffer_uptodate() functions propagate buffer state into the | 2156 | * set/clear_buffer_uptodate() functions propagate buffer state into the |
2149 | * page struct once IO has completed. | 2157 | * page struct once IO has completed. |
2150 | */ | 2158 | */ |
2151 | int block_read_full_page(struct page *page, get_block_t *get_block) | 2159 | int block_read_full_page(struct page *page, get_block_t *get_block) |
2152 | { | 2160 | { |
2153 | struct inode *inode = page->mapping->host; | 2161 | struct inode *inode = page->mapping->host; |
2154 | sector_t iblock, lblock; | 2162 | sector_t iblock, lblock; |
2155 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; | 2163 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; |
2156 | unsigned int blocksize; | 2164 | unsigned int blocksize; |
2157 | int nr, i; | 2165 | int nr, i; |
2158 | int fully_mapped = 1; | 2166 | int fully_mapped = 1; |
2159 | 2167 | ||
2160 | BUG_ON(!PageLocked(page)); | 2168 | BUG_ON(!PageLocked(page)); |
2161 | blocksize = 1 << inode->i_blkbits; | 2169 | blocksize = 1 << inode->i_blkbits; |
2162 | if (!page_has_buffers(page)) | 2170 | if (!page_has_buffers(page)) |
2163 | create_empty_buffers(page, blocksize, 0); | 2171 | create_empty_buffers(page, blocksize, 0); |
2164 | head = page_buffers(page); | 2172 | head = page_buffers(page); |
2165 | 2173 | ||
2166 | iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2174 | iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2167 | lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; | 2175 | lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; |
2168 | bh = head; | 2176 | bh = head; |
2169 | nr = 0; | 2177 | nr = 0; |
2170 | i = 0; | 2178 | i = 0; |
2171 | 2179 | ||
2172 | do { | 2180 | do { |
2173 | if (buffer_uptodate(bh)) | 2181 | if (buffer_uptodate(bh)) |
2174 | continue; | 2182 | continue; |
2175 | 2183 | ||
2176 | if (!buffer_mapped(bh)) { | 2184 | if (!buffer_mapped(bh)) { |
2177 | int err = 0; | 2185 | int err = 0; |
2178 | 2186 | ||
2179 | fully_mapped = 0; | 2187 | fully_mapped = 0; |
2180 | if (iblock < lblock) { | 2188 | if (iblock < lblock) { |
2181 | WARN_ON(bh->b_size != blocksize); | 2189 | WARN_ON(bh->b_size != blocksize); |
2182 | err = get_block(inode, iblock, bh, 0); | 2190 | err = get_block(inode, iblock, bh, 0); |
2183 | if (err) | 2191 | if (err) |
2184 | SetPageError(page); | 2192 | SetPageError(page); |
2185 | } | 2193 | } |
2186 | if (!buffer_mapped(bh)) { | 2194 | if (!buffer_mapped(bh)) { |
2187 | zero_user(page, i * blocksize, blocksize); | 2195 | zero_user(page, i * blocksize, blocksize); |
2188 | if (!err) | 2196 | if (!err) |
2189 | set_buffer_uptodate(bh); | 2197 | set_buffer_uptodate(bh); |
2190 | continue; | 2198 | continue; |
2191 | } | 2199 | } |
2192 | /* | 2200 | /* |
2193 | * get_block() might have updated the buffer | 2201 | * get_block() might have updated the buffer |
2194 | * synchronously | 2202 | * synchronously |
2195 | */ | 2203 | */ |
2196 | if (buffer_uptodate(bh)) | 2204 | if (buffer_uptodate(bh)) |
2197 | continue; | 2205 | continue; |
2198 | } | 2206 | } |
2199 | arr[nr++] = bh; | 2207 | arr[nr++] = bh; |
2200 | } while (i++, iblock++, (bh = bh->b_this_page) != head); | 2208 | } while (i++, iblock++, (bh = bh->b_this_page) != head); |
2201 | 2209 | ||
2202 | if (fully_mapped) | 2210 | if (fully_mapped) |
2203 | SetPageMappedToDisk(page); | 2211 | SetPageMappedToDisk(page); |
2204 | 2212 | ||
2205 | if (!nr) { | 2213 | if (!nr) { |
2206 | /* | 2214 | /* |
2207 | * All buffers are uptodate - we can set the page uptodate | 2215 | * All buffers are uptodate - we can set the page uptodate |
2208 | * as well. But not if get_block() returned an error. | 2216 | * as well. But not if get_block() returned an error. |
2209 | */ | 2217 | */ |
2210 | if (!PageError(page)) | 2218 | if (!PageError(page)) |
2211 | SetPageUptodate(page); | 2219 | SetPageUptodate(page); |
2212 | unlock_page(page); | 2220 | unlock_page(page); |
2213 | return 0; | 2221 | return 0; |
2214 | } | 2222 | } |
2215 | 2223 | ||
2216 | /* Stage two: lock the buffers */ | 2224 | /* Stage two: lock the buffers */ |
2217 | for (i = 0; i < nr; i++) { | 2225 | for (i = 0; i < nr; i++) { |
2218 | bh = arr[i]; | 2226 | bh = arr[i]; |
2219 | lock_buffer(bh); | 2227 | lock_buffer(bh); |
2220 | mark_buffer_async_read(bh); | 2228 | mark_buffer_async_read(bh); |
2221 | } | 2229 | } |
2222 | 2230 | ||
2223 | /* | 2231 | /* |
2224 | * Stage 3: start the IO. Check for uptodateness | 2232 | * Stage 3: start the IO. Check for uptodateness |
2225 | * inside the buffer lock in case another process reading | 2233 | * inside the buffer lock in case another process reading |
2226 | * the underlying blockdev brought it uptodate (the sct fix). | 2234 | * the underlying blockdev brought it uptodate (the sct fix). |
2227 | */ | 2235 | */ |
2228 | for (i = 0; i < nr; i++) { | 2236 | for (i = 0; i < nr; i++) { |
2229 | bh = arr[i]; | 2237 | bh = arr[i]; |
2230 | if (buffer_uptodate(bh)) | 2238 | if (buffer_uptodate(bh)) |
2231 | end_buffer_async_read(bh, 1); | 2239 | end_buffer_async_read(bh, 1); |
2232 | else | 2240 | else |
2233 | submit_bh(READ, bh); | 2241 | submit_bh(READ, bh); |
2234 | } | 2242 | } |
2235 | return 0; | 2243 | return 0; |
2236 | } | 2244 | } |
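For context (a sketch, not part of this commit): block_read_full_page() is typically the entire body of a buffer-backed filesystem's ->readpage, reusing the hypothetical myfs_get_block() from the earlier sketch:

static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}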
2237 | 2245 | ||
2238 | /* utility function for filesystems that need to do work on expanding | 2246 | /* utility function for filesystems that need to do work on expanding |
2239 | * truncates. Uses filesystem pagecache writes to allow the filesystem to | 2247 | * truncates. Uses filesystem pagecache writes to allow the filesystem to |
2240 | * deal with the hole. | 2248 | * deal with the hole. |
2241 | */ | 2249 | */ |
2242 | int generic_cont_expand_simple(struct inode *inode, loff_t size) | 2250 | int generic_cont_expand_simple(struct inode *inode, loff_t size) |
2243 | { | 2251 | { |
2244 | struct address_space *mapping = inode->i_mapping; | 2252 | struct address_space *mapping = inode->i_mapping; |
2245 | struct page *page; | 2253 | struct page *page; |
2246 | void *fsdata; | 2254 | void *fsdata; |
2247 | unsigned long limit; | 2255 | unsigned long limit; |
2248 | int err; | 2256 | int err; |
2249 | 2257 | ||
2250 | err = -EFBIG; | 2258 | err = -EFBIG; |
2251 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 2259 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; |
2252 | if (limit != RLIM_INFINITY && size > (loff_t)limit) { | 2260 | if (limit != RLIM_INFINITY && size > (loff_t)limit) { |
2253 | send_sig(SIGXFSZ, current, 0); | 2261 | send_sig(SIGXFSZ, current, 0); |
2254 | goto out; | 2262 | goto out; |
2255 | } | 2263 | } |
2256 | if (size > inode->i_sb->s_maxbytes) | 2264 | if (size > inode->i_sb->s_maxbytes) |
2257 | goto out; | 2265 | goto out; |
2258 | 2266 | ||
2259 | err = pagecache_write_begin(NULL, mapping, size, 0, | 2267 | err = pagecache_write_begin(NULL, mapping, size, 0, |
2260 | AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, | 2268 | AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, |
2261 | &page, &fsdata); | 2269 | &page, &fsdata); |
2262 | if (err) | 2270 | if (err) |
2263 | goto out; | 2271 | goto out; |
2264 | 2272 | ||
2265 | err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); | 2273 | err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); |
2266 | BUG_ON(err > 0); | 2274 | BUG_ON(err > 0); |
2267 | 2275 | ||
2268 | out: | 2276 | out: |
2269 | return err; | 2277 | return err; |
2270 | } | 2278 | } |
2271 | 2279 | ||
2272 | static int cont_expand_zero(struct file *file, struct address_space *mapping, | 2280 | static int cont_expand_zero(struct file *file, struct address_space *mapping, |
2273 | loff_t pos, loff_t *bytes) | 2281 | loff_t pos, loff_t *bytes) |
2274 | { | 2282 | { |
2275 | struct inode *inode = mapping->host; | 2283 | struct inode *inode = mapping->host; |
2276 | unsigned blocksize = 1 << inode->i_blkbits; | 2284 | unsigned blocksize = 1 << inode->i_blkbits; |
2277 | struct page *page; | 2285 | struct page *page; |
2278 | void *fsdata; | 2286 | void *fsdata; |
2279 | pgoff_t index, curidx; | 2287 | pgoff_t index, curidx; |
2280 | loff_t curpos; | 2288 | loff_t curpos; |
2281 | unsigned zerofrom, offset, len; | 2289 | unsigned zerofrom, offset, len; |
2282 | int err = 0; | 2290 | int err = 0; |
2283 | 2291 | ||
2284 | index = pos >> PAGE_CACHE_SHIFT; | 2292 | index = pos >> PAGE_CACHE_SHIFT; |
2285 | offset = pos & ~PAGE_CACHE_MASK; | 2293 | offset = pos & ~PAGE_CACHE_MASK; |
2286 | 2294 | ||
2287 | while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { | 2295 | while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { |
2288 | zerofrom = curpos & ~PAGE_CACHE_MASK; | 2296 | zerofrom = curpos & ~PAGE_CACHE_MASK; |
2289 | if (zerofrom & (blocksize-1)) { | 2297 | if (zerofrom & (blocksize-1)) { |
2290 | *bytes |= (blocksize-1); | 2298 | *bytes |= (blocksize-1); |
2291 | (*bytes)++; | 2299 | (*bytes)++; |
2292 | } | 2300 | } |
2293 | len = PAGE_CACHE_SIZE - zerofrom; | 2301 | len = PAGE_CACHE_SIZE - zerofrom; |
2294 | 2302 | ||
2295 | err = pagecache_write_begin(file, mapping, curpos, len, | 2303 | err = pagecache_write_begin(file, mapping, curpos, len, |
2296 | AOP_FLAG_UNINTERRUPTIBLE, | 2304 | AOP_FLAG_UNINTERRUPTIBLE, |
2297 | &page, &fsdata); | 2305 | &page, &fsdata); |
2298 | if (err) | 2306 | if (err) |
2299 | goto out; | 2307 | goto out; |
2300 | zero_user(page, zerofrom, len); | 2308 | zero_user(page, zerofrom, len); |
2301 | err = pagecache_write_end(file, mapping, curpos, len, len, | 2309 | err = pagecache_write_end(file, mapping, curpos, len, len, |
2302 | page, fsdata); | 2310 | page, fsdata); |
2303 | if (err < 0) | 2311 | if (err < 0) |
2304 | goto out; | 2312 | goto out; |
2305 | BUG_ON(err != len); | 2313 | BUG_ON(err != len); |
2306 | err = 0; | 2314 | err = 0; |
2307 | 2315 | ||
2308 | balance_dirty_pages_ratelimited(mapping); | 2316 | balance_dirty_pages_ratelimited(mapping); |
2309 | } | 2317 | } |
2310 | 2318 | ||
2311 | /* page covers the boundary, find the boundary offset */ | 2319 | /* page covers the boundary, find the boundary offset */ |
2312 | if (index == curidx) { | 2320 | if (index == curidx) { |
2313 | zerofrom = curpos & ~PAGE_CACHE_MASK; | 2321 | zerofrom = curpos & ~PAGE_CACHE_MASK; |
2314 | /* if we will expand the thing last block will be filled */ | 2322 | /* if we will expand the thing last block will be filled */ |
2315 | if (offset <= zerofrom) { | 2323 | if (offset <= zerofrom) { |
2316 | goto out; | 2324 | goto out; |
2317 | } | 2325 | } |
2318 | if (zerofrom & (blocksize-1)) { | 2326 | if (zerofrom & (blocksize-1)) { |
2319 | *bytes |= (blocksize-1); | 2327 | *bytes |= (blocksize-1); |
2320 | (*bytes)++; | 2328 | (*bytes)++; |
2321 | } | 2329 | } |
2322 | len = offset - zerofrom; | 2330 | len = offset - zerofrom; |
2323 | 2331 | ||
2324 | err = pagecache_write_begin(file, mapping, curpos, len, | 2332 | err = pagecache_write_begin(file, mapping, curpos, len, |
2325 | AOP_FLAG_UNINTERRUPTIBLE, | 2333 | AOP_FLAG_UNINTERRUPTIBLE, |
2326 | &page, &fsdata); | 2334 | &page, &fsdata); |
2327 | if (err) | 2335 | if (err) |
2328 | goto out; | 2336 | goto out; |
2329 | zero_user(page, zerofrom, len); | 2337 | zero_user(page, zerofrom, len); |
2330 | err = pagecache_write_end(file, mapping, curpos, len, len, | 2338 | err = pagecache_write_end(file, mapping, curpos, len, len, |
2331 | page, fsdata); | 2339 | page, fsdata); |
2332 | if (err < 0) | 2340 | if (err < 0) |
2333 | goto out; | 2341 | goto out; |
2334 | BUG_ON(err != len); | 2342 | BUG_ON(err != len); |
2335 | err = 0; | 2343 | err = 0; |
2336 | } | 2344 | } |
2337 | out: | 2345 | out: |
2338 | return err; | 2346 | return err; |
2339 | } | 2347 | } |
2340 | 2348 | ||
2341 | /* | 2349 | /* |
2342 | * For moronic filesystems that do not allow holes in file. | 2350 | * For moronic filesystems that do not allow holes in file. |
2343 | * We may have to extend the file. | 2351 | * We may have to extend the file. |
2344 | */ | 2352 | */ |
2345 | int cont_write_begin(struct file *file, struct address_space *mapping, | 2353 | int cont_write_begin(struct file *file, struct address_space *mapping, |
2346 | loff_t pos, unsigned len, unsigned flags, | 2354 | loff_t pos, unsigned len, unsigned flags, |
2347 | struct page **pagep, void **fsdata, | 2355 | struct page **pagep, void **fsdata, |
2348 | get_block_t *get_block, loff_t *bytes) | 2356 | get_block_t *get_block, loff_t *bytes) |
2349 | { | 2357 | { |
2350 | struct inode *inode = mapping->host; | 2358 | struct inode *inode = mapping->host; |
2351 | unsigned blocksize = 1 << inode->i_blkbits; | 2359 | unsigned blocksize = 1 << inode->i_blkbits; |
2352 | unsigned zerofrom; | 2360 | unsigned zerofrom; |
2353 | int err; | 2361 | int err; |
2354 | 2362 | ||
2355 | err = cont_expand_zero(file, mapping, pos, bytes); | 2363 | err = cont_expand_zero(file, mapping, pos, bytes); |
2356 | if (err) | 2364 | if (err) |
2357 | goto out; | 2365 | goto out; |
2358 | 2366 | ||
2359 | zerofrom = *bytes & ~PAGE_CACHE_MASK; | 2367 | zerofrom = *bytes & ~PAGE_CACHE_MASK; |
2360 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { | 2368 | if (pos+len > *bytes && zerofrom & (blocksize-1)) { |
2361 | *bytes |= (blocksize-1); | 2369 | *bytes |= (blocksize-1); |
2362 | (*bytes)++; | 2370 | (*bytes)++; |
2363 | } | 2371 | } |
2364 | 2372 | ||
2365 | *pagep = NULL; | 2373 | *pagep = NULL; |
2366 | err = block_write_begin(file, mapping, pos, len, | 2374 | err = block_write_begin(file, mapping, pos, len, |
2367 | flags, pagep, fsdata, get_block); | 2375 | flags, pagep, fsdata, get_block); |
2368 | out: | 2376 | out: |
2369 | return err; | 2377 | return err; |
2370 | } | 2378 | } |
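For context (a sketch, not part of this commit): cont_write_begin() serves filesystems that cannot represent holes, so a write past the current allocation first gets zero-filled up to a per-inode watermark. A hedged caller sketch; contfs_get_block(), struct contfs_inode_info and its i_disk_bytes field are hypothetical stand-ins for the filesystem's block mapper and its "bytes allocated so far" counter (FAT keeps the equivalent in mmu_private):

static int contfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	struct contfs_inode_info *ci = CONTFS_I(mapping->host);	/* hypothetical */

	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				contfs_get_block, &ci->i_disk_bytes);
}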
2371 | 2379 | ||
2372 | int block_prepare_write(struct page *page, unsigned from, unsigned to, | 2380 | int block_prepare_write(struct page *page, unsigned from, unsigned to, |
2373 | get_block_t *get_block) | 2381 | get_block_t *get_block) |
2374 | { | 2382 | { |
2375 | struct inode *inode = page->mapping->host; | 2383 | struct inode *inode = page->mapping->host; |
2376 | int err = __block_prepare_write(inode, page, from, to, get_block); | 2384 | int err = __block_prepare_write(inode, page, from, to, get_block); |
2377 | if (err) | 2385 | if (err) |
2378 | ClearPageUptodate(page); | 2386 | ClearPageUptodate(page); |
2379 | return err; | 2387 | return err; |
2380 | } | 2388 | } |
2381 | 2389 | ||
2382 | int block_commit_write(struct page *page, unsigned from, unsigned to) | 2390 | int block_commit_write(struct page *page, unsigned from, unsigned to) |
2383 | { | 2391 | { |
2384 | struct inode *inode = page->mapping->host; | 2392 | struct inode *inode = page->mapping->host; |
2385 | __block_commit_write(inode,page,from,to); | 2393 | __block_commit_write(inode,page,from,to); |
2386 | return 0; | 2394 | return 0; |
2387 | } | 2395 | } |
2388 | 2396 | ||
2389 | /* | 2397 | /* |
2390 | * block_page_mkwrite() is not allowed to change the file size as it gets | 2398 | * block_page_mkwrite() is not allowed to change the file size as it gets |
2391 | * called from a page fault handler when a page is first dirtied. Hence we must | 2399 | * called from a page fault handler when a page is first dirtied. Hence we must |
2392 | * be careful to check for EOF conditions here. We set the page up correctly | 2400 | * be careful to check for EOF conditions here. We set the page up correctly |
2393 | * for a written page which means we get ENOSPC checking when writing into | 2401 | * for a written page which means we get ENOSPC checking when writing into |
2394 | * holes and correct delalloc and unwritten extent mapping on filesystems that | 2402 | * holes and correct delalloc and unwritten extent mapping on filesystems that |
2395 | * support these features. | 2403 | * support these features. |
2396 | * | 2404 | * |
2397 | * We are not allowed to take the i_mutex here so we have to play games to | 2405 | * We are not allowed to take the i_mutex here so we have to play games to |
2398 | * protect against truncate races as the page could now be beyond EOF. Because | 2406 | * protect against truncate races as the page could now be beyond EOF. Because |
2399 | * vmtruncate() writes the inode size before removing pages, once we have the | 2407 | * vmtruncate() writes the inode size before removing pages, once we have the |
2400 | * page lock we can determine safely if the page is beyond EOF. If it is not | 2408 | * page lock we can determine safely if the page is beyond EOF. If it is not |
2401 | * beyond EOF, then the page is guaranteed safe against truncation until we | 2409 | * beyond EOF, then the page is guaranteed safe against truncation until we |
2402 | * unlock the page. | 2410 | * unlock the page. |
2403 | */ | 2411 | */ |
2404 | int | 2412 | int |
2405 | block_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 2413 | block_page_mkwrite(struct vm_area_struct *vma, struct page *page, |
2406 | get_block_t get_block) | 2414 | get_block_t get_block) |
2407 | { | 2415 | { |
2408 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 2416 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
2409 | unsigned long end; | 2417 | unsigned long end; |
2410 | loff_t size; | 2418 | loff_t size; |
2411 | int ret = -EINVAL; | 2419 | int ret = -EINVAL; |
2412 | 2420 | ||
2413 | lock_page(page); | 2421 | lock_page(page); |
2414 | size = i_size_read(inode); | 2422 | size = i_size_read(inode); |
2415 | if ((page->mapping != inode->i_mapping) || | 2423 | if ((page->mapping != inode->i_mapping) || |
2416 | (page_offset(page) > size)) { | 2424 | (page_offset(page) > size)) { |
2417 | /* page got truncated out from underneath us */ | 2425 | /* page got truncated out from underneath us */ |
2418 | goto out_unlock; | 2426 | goto out_unlock; |
2419 | } | 2427 | } |
2420 | 2428 | ||
2421 | /* page is wholly or partially inside EOF */ | 2429 | /* page is wholly or partially inside EOF */ |
2422 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) | 2430 | if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) |
2423 | end = size & ~PAGE_CACHE_MASK; | 2431 | end = size & ~PAGE_CACHE_MASK; |
2424 | else | 2432 | else |
2425 | end = PAGE_CACHE_SIZE; | 2433 | end = PAGE_CACHE_SIZE; |
2426 | 2434 | ||
2427 | ret = block_prepare_write(page, 0, end, get_block); | 2435 | ret = block_prepare_write(page, 0, end, get_block); |
2428 | if (!ret) | 2436 | if (!ret) |
2429 | ret = block_commit_write(page, 0, end); | 2437 | ret = block_commit_write(page, 0, end); |
2430 | 2438 | ||
2431 | out_unlock: | 2439 | out_unlock: |
2432 | unlock_page(page); | 2440 | unlock_page(page); |
2433 | return ret; | 2441 | return ret; |
2434 | } | 2442 | } |
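For context (a sketch, not part of this commit): block_page_mkwrite() is meant to sit behind a filesystem's vm_operations_struct ->page_mkwrite handler, so that mmap writes into holes get the same allocation and ENOSPC checking as write(). A sketch assuming the (vma, page) ->page_mkwrite signature of this kernel generation, again with the hypothetical myfs_get_block():

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	return block_page_mkwrite(vma, page, myfs_get_block);
}

static struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};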
2435 | 2443 | ||
2436 | /* | 2444 | /* |
2437 | * nobh_write_begin()'s prereads are special: the buffer_heads are freed | 2445 | * nobh_write_begin()'s prereads are special: the buffer_heads are freed |
2438 | * immediately, while under the page lock. So it needs a special end_io | 2446 | * immediately, while under the page lock. So it needs a special end_io |
2439 | * handler which does not touch the bh after unlocking it. | 2447 | * handler which does not touch the bh after unlocking it. |
2440 | */ | 2448 | */ |
2441 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) | 2449 | static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) |
2442 | { | 2450 | { |
2443 | __end_buffer_read_notouch(bh, uptodate); | 2451 | __end_buffer_read_notouch(bh, uptodate); |
2444 | } | 2452 | } |
2445 | 2453 | ||
2446 | /* | 2454 | /* |
2447 | * Attach the singly-linked list of buffers created by nobh_write_begin, to | 2455 | * Attach the singly-linked list of buffers created by nobh_write_begin, to |
2448 | * the page (converting it to circular linked list and taking care of page | 2456 | * the page (converting it to circular linked list and taking care of page |
2449 | * dirty races). | 2457 | * dirty races). |
2450 | */ | 2458 | */ |
2451 | static void attach_nobh_buffers(struct page *page, struct buffer_head *head) | 2459 | static void attach_nobh_buffers(struct page *page, struct buffer_head *head) |
2452 | { | 2460 | { |
2453 | struct buffer_head *bh; | 2461 | struct buffer_head *bh; |
2454 | 2462 | ||
2455 | BUG_ON(!PageLocked(page)); | 2463 | BUG_ON(!PageLocked(page)); |
2456 | 2464 | ||
2457 | spin_lock(&page->mapping->private_lock); | 2465 | spin_lock(&page->mapping->private_lock); |
2458 | bh = head; | 2466 | bh = head; |
2459 | do { | 2467 | do { |
2460 | if (PageDirty(page)) | 2468 | if (PageDirty(page)) |
2461 | set_buffer_dirty(bh); | 2469 | set_buffer_dirty(bh); |
2462 | if (!bh->b_this_page) | 2470 | if (!bh->b_this_page) |
2463 | bh->b_this_page = head; | 2471 | bh->b_this_page = head; |
2464 | bh = bh->b_this_page; | 2472 | bh = bh->b_this_page; |
2465 | } while (bh != head); | 2473 | } while (bh != head); |
2466 | attach_page_buffers(page, head); | 2474 | attach_page_buffers(page, head); |
2467 | spin_unlock(&page->mapping->private_lock); | 2475 | spin_unlock(&page->mapping->private_lock); |
2468 | } | 2476 | } |
2469 | 2477 | ||
2470 | /* | 2478 | /* |
2471 | * On entry, the page is fully not uptodate. | 2479 | * On entry, the page is fully not uptodate. |
2472 | * On exit the page is fully uptodate in the areas outside (from,to) | 2480 | * On exit the page is fully uptodate in the areas outside (from,to) |
2473 | */ | 2481 | */ |
2474 | int nobh_write_begin(struct file *file, struct address_space *mapping, | 2482 | int nobh_write_begin(struct file *file, struct address_space *mapping, |
2475 | loff_t pos, unsigned len, unsigned flags, | 2483 | loff_t pos, unsigned len, unsigned flags, |
2476 | struct page **pagep, void **fsdata, | 2484 | struct page **pagep, void **fsdata, |
2477 | get_block_t *get_block) | 2485 | get_block_t *get_block) |
2478 | { | 2486 | { |
2479 | struct inode *inode = mapping->host; | 2487 | struct inode *inode = mapping->host; |
2480 | const unsigned blkbits = inode->i_blkbits; | 2488 | const unsigned blkbits = inode->i_blkbits; |
2481 | const unsigned blocksize = 1 << blkbits; | 2489 | const unsigned blocksize = 1 << blkbits; |
2482 | struct buffer_head *head, *bh; | 2490 | struct buffer_head *head, *bh; |
2483 | struct page *page; | 2491 | struct page *page; |
2484 | pgoff_t index; | 2492 | pgoff_t index; |
2485 | unsigned from, to; | 2493 | unsigned from, to; |
2486 | unsigned block_in_page; | 2494 | unsigned block_in_page; |
2487 | unsigned block_start, block_end; | 2495 | unsigned block_start, block_end; |
2488 | sector_t block_in_file; | 2496 | sector_t block_in_file; |
2489 | int nr_reads = 0; | 2497 | int nr_reads = 0; |
2490 | int ret = 0; | 2498 | int ret = 0; |
2491 | int is_mapped_to_disk = 1; | 2499 | int is_mapped_to_disk = 1; |
2492 | 2500 | ||
2493 | index = pos >> PAGE_CACHE_SHIFT; | 2501 | index = pos >> PAGE_CACHE_SHIFT; |
2494 | from = pos & (PAGE_CACHE_SIZE - 1); | 2502 | from = pos & (PAGE_CACHE_SIZE - 1); |
2495 | to = from + len; | 2503 | to = from + len; |
2496 | 2504 | ||
2497 | page = __grab_cache_page(mapping, index); | 2505 | page = __grab_cache_page(mapping, index); |
2498 | if (!page) | 2506 | if (!page) |
2499 | return -ENOMEM; | 2507 | return -ENOMEM; |
2500 | *pagep = page; | 2508 | *pagep = page; |
2501 | *fsdata = NULL; | 2509 | *fsdata = NULL; |
2502 | 2510 | ||
2503 | if (page_has_buffers(page)) { | 2511 | if (page_has_buffers(page)) { |
2504 | unlock_page(page); | 2512 | unlock_page(page); |
2505 | page_cache_release(page); | 2513 | page_cache_release(page); |
2506 | *pagep = NULL; | 2514 | *pagep = NULL; |
2507 | return block_write_begin(file, mapping, pos, len, flags, pagep, | 2515 | return block_write_begin(file, mapping, pos, len, flags, pagep, |
2508 | fsdata, get_block); | 2516 | fsdata, get_block); |
2509 | } | 2517 | } |
2510 | 2518 | ||
2511 | if (PageMappedToDisk(page)) | 2519 | if (PageMappedToDisk(page)) |
2512 | return 0; | 2520 | return 0; |
2513 | 2521 | ||
2514 | /* | 2522 | /* |
2515 | * Allocate buffers so that we can keep track of state, and potentially | 2523 | * Allocate buffers so that we can keep track of state, and potentially |
2516 | * attach them to the page if an error occurs. In the common case of | 2524 | * attach them to the page if an error occurs. In the common case of |
2517 | * no error, they will just be freed again without ever being attached | 2525 | * no error, they will just be freed again without ever being attached |
2518 | * to the page (which is all OK, because we're under the page lock). | 2526 | * to the page (which is all OK, because we're under the page lock). |
2519 | * | 2527 | * |
2520 | * Be careful: the buffer linked list is a NULL terminated one, rather | 2528 | * Be careful: the buffer linked list is a NULL terminated one, rather |
2521 | * than the circular one we're used to. | 2529 | * than the circular one we're used to. |
2522 | */ | 2530 | */ |
2523 | head = alloc_page_buffers(page, blocksize, 0); | 2531 | head = alloc_page_buffers(page, blocksize, 0); |
2524 | if (!head) { | 2532 | if (!head) { |
2525 | ret = -ENOMEM; | 2533 | ret = -ENOMEM; |
2526 | goto out_release; | 2534 | goto out_release; |
2527 | } | 2535 | } |
2528 | 2536 | ||
2529 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); | 2537 | block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); |
2530 | 2538 | ||
2531 | /* | 2539 | /* |
2532 | * We loop across all blocks in the page, whether or not they are | 2540 | * We loop across all blocks in the page, whether or not they are |
2533 | * part of the affected region. This is so we can discover if the | 2541 | * part of the affected region. This is so we can discover if the |
2534 | * page is fully mapped-to-disk. | 2542 | * page is fully mapped-to-disk. |
2535 | */ | 2543 | */ |
2536 | for (block_start = 0, block_in_page = 0, bh = head; | 2544 | for (block_start = 0, block_in_page = 0, bh = head; |
2537 | block_start < PAGE_CACHE_SIZE; | 2545 | block_start < PAGE_CACHE_SIZE; |
2538 | block_in_page++, block_start += blocksize, bh = bh->b_this_page) { | 2546 | block_in_page++, block_start += blocksize, bh = bh->b_this_page) { |
2539 | int create; | 2547 | int create; |
2540 | 2548 | ||
2541 | block_end = block_start + blocksize; | 2549 | block_end = block_start + blocksize; |
2542 | bh->b_state = 0; | 2550 | bh->b_state = 0; |
2543 | create = 1; | 2551 | create = 1; |
2544 | if (block_start >= to) | 2552 | if (block_start >= to) |
2545 | create = 0; | 2553 | create = 0; |
2546 | ret = get_block(inode, block_in_file + block_in_page, | 2554 | ret = get_block(inode, block_in_file + block_in_page, |
2547 | bh, create); | 2555 | bh, create); |
2548 | if (ret) | 2556 | if (ret) |
2549 | goto failed; | 2557 | goto failed; |
2550 | if (!buffer_mapped(bh)) | 2558 | if (!buffer_mapped(bh)) |
2551 | is_mapped_to_disk = 0; | 2559 | is_mapped_to_disk = 0; |
2552 | if (buffer_new(bh)) | 2560 | if (buffer_new(bh)) |
2553 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | 2561 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); |
2554 | if (PageUptodate(page)) { | 2562 | if (PageUptodate(page)) { |
2555 | set_buffer_uptodate(bh); | 2563 | set_buffer_uptodate(bh); |
2556 | continue; | 2564 | continue; |
2557 | } | 2565 | } |
2558 | if (buffer_new(bh) || !buffer_mapped(bh)) { | 2566 | if (buffer_new(bh) || !buffer_mapped(bh)) { |
2559 | zero_user_segments(page, block_start, from, | 2567 | zero_user_segments(page, block_start, from, |
2560 | to, block_end); | 2568 | to, block_end); |
2561 | continue; | 2569 | continue; |
2562 | } | 2570 | } |
2563 | if (buffer_uptodate(bh)) | 2571 | if (buffer_uptodate(bh)) |
2564 | continue; /* reiserfs does this */ | 2572 | continue; /* reiserfs does this */ |
2565 | if (block_start < from || block_end > to) { | 2573 | if (block_start < from || block_end > to) { |
2566 | lock_buffer(bh); | 2574 | lock_buffer(bh); |
2567 | bh->b_end_io = end_buffer_read_nobh; | 2575 | bh->b_end_io = end_buffer_read_nobh; |
2568 | submit_bh(READ, bh); | 2576 | submit_bh(READ, bh); |
2569 | nr_reads++; | 2577 | nr_reads++; |
2570 | } | 2578 | } |
2571 | } | 2579 | } |
2572 | 2580 | ||
2573 | if (nr_reads) { | 2581 | if (nr_reads) { |
2574 | /* | 2582 | /* |
2575 | * The page is locked, so these buffers are protected from | 2583 | * The page is locked, so these buffers are protected from |
2576 | * any VM or truncate activity. Hence we don't need to care | 2584 | * any VM or truncate activity. Hence we don't need to care |
2577 | * for the buffer_head refcounts. | 2585 | * for the buffer_head refcounts. |
2578 | */ | 2586 | */ |
2579 | for (bh = head; bh; bh = bh->b_this_page) { | 2587 | for (bh = head; bh; bh = bh->b_this_page) { |
2580 | wait_on_buffer(bh); | 2588 | wait_on_buffer(bh); |
2581 | if (!buffer_uptodate(bh)) | 2589 | if (!buffer_uptodate(bh)) |
2582 | ret = -EIO; | 2590 | ret = -EIO; |
2583 | } | 2591 | } |
2584 | if (ret) | 2592 | if (ret) |
2585 | goto failed; | 2593 | goto failed; |
2586 | } | 2594 | } |
2587 | 2595 | ||
2588 | if (is_mapped_to_disk) | 2596 | if (is_mapped_to_disk) |
2589 | SetPageMappedToDisk(page); | 2597 | SetPageMappedToDisk(page); |
2590 | 2598 | ||
2591 | *fsdata = head; /* to be released by nobh_write_end */ | 2599 | *fsdata = head; /* to be released by nobh_write_end */ |
2592 | 2600 | ||
2593 | return 0; | 2601 | return 0; |
2594 | 2602 | ||
2595 | failed: | 2603 | failed: |
2596 | BUG_ON(!ret); | 2604 | BUG_ON(!ret); |
2597 | /* | 2605 | /* |
2598 | * Error recovery is a bit difficult. We need to zero out blocks that | 2606 | * Error recovery is a bit difficult. We need to zero out blocks that |
2599 | * were newly allocated, and dirty them to ensure they get written out. | 2607 | * were newly allocated, and dirty them to ensure they get written out. |
2600 | * Buffers need to be attached to the page at this point, otherwise | 2608 | * Buffers need to be attached to the page at this point, otherwise |
2601 | * the handling of potential IO errors during writeout would be hard | 2609 | * the handling of potential IO errors during writeout would be hard |
2602 | * (could try doing synchronous writeout, but what if that fails too?) | 2610 | * (could try doing synchronous writeout, but what if that fails too?) |
2603 | */ | 2611 | */ |
2604 | attach_nobh_buffers(page, head); | 2612 | attach_nobh_buffers(page, head); |
2605 | page_zero_new_buffers(page, from, to); | 2613 | page_zero_new_buffers(page, from, to); |
2606 | 2614 | ||
2607 | out_release: | 2615 | out_release: |
2608 | unlock_page(page); | 2616 | unlock_page(page); |
2609 | page_cache_release(page); | 2617 | page_cache_release(page); |
2610 | *pagep = NULL; | 2618 | *pagep = NULL; |
2611 | 2619 | ||
2612 | if (pos + len > inode->i_size) | 2620 | if (pos + len > inode->i_size) |
2613 | vmtruncate(inode, inode->i_size); | 2621 | vmtruncate(inode, inode->i_size); |
2614 | 2622 | ||
2615 | return ret; | 2623 | return ret; |
2616 | } | 2624 | } |
2617 | EXPORT_SYMBOL(nobh_write_begin); | 2625 | EXPORT_SYMBOL(nobh_write_begin); |
2618 | 2626 | ||
2619 | int nobh_write_end(struct file *file, struct address_space *mapping, | 2627 | int nobh_write_end(struct file *file, struct address_space *mapping, |
2620 | loff_t pos, unsigned len, unsigned copied, | 2628 | loff_t pos, unsigned len, unsigned copied, |
2621 | struct page *page, void *fsdata) | 2629 | struct page *page, void *fsdata) |
2622 | { | 2630 | { |
2623 | struct inode *inode = page->mapping->host; | 2631 | struct inode *inode = page->mapping->host; |
2624 | struct buffer_head *head = fsdata; | 2632 | struct buffer_head *head = fsdata; |
2625 | struct buffer_head *bh; | 2633 | struct buffer_head *bh; |
2626 | BUG_ON(fsdata != NULL && page_has_buffers(page)); | 2634 | BUG_ON(fsdata != NULL && page_has_buffers(page)); |
2627 | 2635 | ||
2628 | if (unlikely(copied < len) && !page_has_buffers(page)) | 2636 | if (unlikely(copied < len) && !page_has_buffers(page)) |
2629 | attach_nobh_buffers(page, head); | 2637 | attach_nobh_buffers(page, head); |
2630 | if (page_has_buffers(page)) | 2638 | if (page_has_buffers(page)) |
2631 | return generic_write_end(file, mapping, pos, len, | 2639 | return generic_write_end(file, mapping, pos, len, |
2632 | copied, page, fsdata); | 2640 | copied, page, fsdata); |
2633 | 2641 | ||
2634 | SetPageUptodate(page); | 2642 | SetPageUptodate(page); |
2635 | set_page_dirty(page); | 2643 | set_page_dirty(page); |
2636 | if (pos+copied > inode->i_size) { | 2644 | if (pos+copied > inode->i_size) { |
2637 | i_size_write(inode, pos+copied); | 2645 | i_size_write(inode, pos+copied); |
2638 | mark_inode_dirty(inode); | 2646 | mark_inode_dirty(inode); |
2639 | } | 2647 | } |
2640 | 2648 | ||
2641 | unlock_page(page); | 2649 | unlock_page(page); |
2642 | page_cache_release(page); | 2650 | page_cache_release(page); |
2643 | 2651 | ||
2644 | while (head) { | 2652 | while (head) { |
2645 | bh = head; | 2653 | bh = head; |
2646 | head = head->b_this_page; | 2654 | head = head->b_this_page; |
2647 | free_buffer_head(bh); | 2655 | free_buffer_head(bh); |
2648 | } | 2656 | } |
2649 | 2657 | ||
2650 | return copied; | 2658 | return copied; |
2651 | } | 2659 | } |
2652 | EXPORT_SYMBOL(nobh_write_end); | 2660 | EXPORT_SYMBOL(nobh_write_end); |
2653 | 2661 | ||
2654 | /* | 2662 | /* |
2655 | * nobh_writepage() - based on block_full_write_page() except | 2663 | * nobh_writepage() - based on block_full_write_page() except |
2656 | * that it tries to operate without attaching bufferheads to | 2664 | * that it tries to operate without attaching bufferheads to |
2657 | * the page. | 2665 | * the page. |
2658 | */ | 2666 | */ |
2659 | int nobh_writepage(struct page *page, get_block_t *get_block, | 2667 | int nobh_writepage(struct page *page, get_block_t *get_block, |
2660 | struct writeback_control *wbc) | 2668 | struct writeback_control *wbc) |
2661 | { | 2669 | { |
2662 | struct inode * const inode = page->mapping->host; | 2670 | struct inode * const inode = page->mapping->host; |
2663 | loff_t i_size = i_size_read(inode); | 2671 | loff_t i_size = i_size_read(inode); |
2664 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 2672 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
2665 | unsigned offset; | 2673 | unsigned offset; |
2666 | int ret; | 2674 | int ret; |
2667 | 2675 | ||
2668 | /* Is the page fully inside i_size? */ | 2676 | /* Is the page fully inside i_size? */ |
2669 | if (page->index < end_index) | 2677 | if (page->index < end_index) |
2670 | goto out; | 2678 | goto out; |
2671 | 2679 | ||
2672 | /* Is the page fully outside i_size? (truncate in progress) */ | 2680 | /* Is the page fully outside i_size? (truncate in progress) */ |
2673 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2681 | offset = i_size & (PAGE_CACHE_SIZE-1); |
2674 | if (page->index >= end_index+1 || !offset) { | 2682 | if (page->index >= end_index+1 || !offset) { |
2675 | /* | 2683 | /* |
2676 | * The page may have dirty, unmapped buffers. For example, | 2684 | * The page may have dirty, unmapped buffers. For example, |
2677 | * they may have been added in ext3_writepage(). Make them | 2685 | * they may have been added in ext3_writepage(). Make them |
2678 | * freeable here, so the page does not leak. | 2686 | * freeable here, so the page does not leak. |
2679 | */ | 2687 | */ |
2680 | #if 0 | 2688 | #if 0 |
2681 | /* Not really sure about this - do we need this ? */ | 2689 | /* Not really sure about this - do we need this ? */ |
2682 | if (page->mapping->a_ops->invalidatepage) | 2690 | if (page->mapping->a_ops->invalidatepage) |
2683 | page->mapping->a_ops->invalidatepage(page, offset); | 2691 | page->mapping->a_ops->invalidatepage(page, offset); |
2684 | #endif | 2692 | #endif |
2685 | unlock_page(page); | 2693 | unlock_page(page); |
2686 | return 0; /* don't care */ | 2694 | return 0; /* don't care */ |
2687 | } | 2695 | } |
2688 | 2696 | ||
2689 | /* | 2697 | /* |
2690 | * The page straddles i_size. It must be zeroed out on each and every | 2698 | * The page straddles i_size. It must be zeroed out on each and every |
2691 | * writepage invocation because it may be mmapped. "A file is mapped | 2699 | * writepage invocation because it may be mmapped. "A file is mapped |
2692 | * in multiples of the page size. For a file that is not a multiple of | 2700 | * in multiples of the page size. For a file that is not a multiple of |
2693 | * the page size, the remaining memory is zeroed when mapped, and | 2701 | * the page size, the remaining memory is zeroed when mapped, and |
2694 | * writes to that region are not written out to the file." | 2702 | * writes to that region are not written out to the file." |
2695 | */ | 2703 | */ |
2696 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2704 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2697 | out: | 2705 | out: |
2698 | ret = mpage_writepage(page, get_block, wbc); | 2706 | ret = mpage_writepage(page, get_block, wbc); |
2699 | if (ret == -EAGAIN) | 2707 | if (ret == -EAGAIN) |
2700 | ret = __block_write_full_page(inode, page, get_block, wbc); | 2708 | ret = __block_write_full_page(inode, page, get_block, wbc); |
2701 | return ret; | 2709 | return ret; |
2702 | } | 2710 | } |
2703 | EXPORT_SYMBOL(nobh_writepage); | 2711 | EXPORT_SYMBOL(nobh_writepage); |
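For context (a sketch, not part of this commit): the nobh_* helpers let a filesystem avoid attaching buffer_heads on the common write path (ext2's "nobh" mount option is the classic user). A hedged sketch of the wiring, with myfs_get_block() and myfs_readpage() still hypothetical:

static int myfs_nobh_writepage(struct page *page,
				struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}

static int myfs_nobh_write_begin(struct file *file,
			struct address_space *mapping, loff_t pos,
			unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return nobh_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, myfs_get_block);
}

static const struct address_space_operations myfs_nobh_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_nobh_writepage,
	.write_begin	= myfs_nobh_write_begin,
	.write_end	= nobh_write_end,
};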
2704 | 2712 | ||
2705 | int nobh_truncate_page(struct address_space *mapping, | 2713 | int nobh_truncate_page(struct address_space *mapping, |
2706 | loff_t from, get_block_t *get_block) | 2714 | loff_t from, get_block_t *get_block) |
2707 | { | 2715 | { |
2708 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 2716 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
2709 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 2717 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
2710 | unsigned blocksize; | 2718 | unsigned blocksize; |
2711 | sector_t iblock; | 2719 | sector_t iblock; |
2712 | unsigned length, pos; | 2720 | unsigned length, pos; |
2713 | struct inode *inode = mapping->host; | 2721 | struct inode *inode = mapping->host; |
2714 | struct page *page; | 2722 | struct page *page; |
2715 | struct buffer_head map_bh; | 2723 | struct buffer_head map_bh; |
2716 | int err; | 2724 | int err; |
2717 | 2725 | ||
2718 | blocksize = 1 << inode->i_blkbits; | 2726 | blocksize = 1 << inode->i_blkbits; |
2719 | length = offset & (blocksize - 1); | 2727 | length = offset & (blocksize - 1); |
2720 | 2728 | ||
2721 | /* Block boundary? Nothing to do */ | 2729 | /* Block boundary? Nothing to do */ |
2722 | if (!length) | 2730 | if (!length) |
2723 | return 0; | 2731 | return 0; |
2724 | 2732 | ||
2725 | length = blocksize - length; | 2733 | length = blocksize - length; |
2726 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2734 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2727 | 2735 | ||
2728 | page = grab_cache_page(mapping, index); | 2736 | page = grab_cache_page(mapping, index); |
2729 | err = -ENOMEM; | 2737 | err = -ENOMEM; |
2730 | if (!page) | 2738 | if (!page) |
2731 | goto out; | 2739 | goto out; |
2732 | 2740 | ||
2733 | if (page_has_buffers(page)) { | 2741 | if (page_has_buffers(page)) { |
2734 | has_buffers: | 2742 | has_buffers: |
2735 | unlock_page(page); | 2743 | unlock_page(page); |
2736 | page_cache_release(page); | 2744 | page_cache_release(page); |
2737 | return block_truncate_page(mapping, from, get_block); | 2745 | return block_truncate_page(mapping, from, get_block); |
2738 | } | 2746 | } |
2739 | 2747 | ||
2740 | /* Find the buffer that contains "offset" */ | 2748 | /* Find the buffer that contains "offset" */ |
2741 | pos = blocksize; | 2749 | pos = blocksize; |
2742 | while (offset >= pos) { | 2750 | while (offset >= pos) { |
2743 | iblock++; | 2751 | iblock++; |
2744 | pos += blocksize; | 2752 | pos += blocksize; |
2745 | } | 2753 | } |
2746 | 2754 | ||
2747 | err = get_block(inode, iblock, &map_bh, 0); | 2755 | err = get_block(inode, iblock, &map_bh, 0); |
2748 | if (err) | 2756 | if (err) |
2749 | goto unlock; | 2757 | goto unlock; |
2750 | /* unmapped? It's a hole - nothing to do */ | 2758 | /* unmapped? It's a hole - nothing to do */ |
2751 | if (!buffer_mapped(&map_bh)) | 2759 | if (!buffer_mapped(&map_bh)) |
2752 | goto unlock; | 2760 | goto unlock; |
2753 | 2761 | ||
2754 | /* Ok, it's mapped. Make sure it's up-to-date */ | 2762 | /* Ok, it's mapped. Make sure it's up-to-date */ |
2755 | if (!PageUptodate(page)) { | 2763 | if (!PageUptodate(page)) { |
2756 | err = mapping->a_ops->readpage(NULL, page); | 2764 | err = mapping->a_ops->readpage(NULL, page); |
2757 | if (err) { | 2765 | if (err) { |
2758 | page_cache_release(page); | 2766 | page_cache_release(page); |
2759 | goto out; | 2767 | goto out; |
2760 | } | 2768 | } |
2761 | lock_page(page); | 2769 | lock_page(page); |
2762 | if (!PageUptodate(page)) { | 2770 | if (!PageUptodate(page)) { |
2763 | err = -EIO; | 2771 | err = -EIO; |
2764 | goto unlock; | 2772 | goto unlock; |
2765 | } | 2773 | } |
2766 | if (page_has_buffers(page)) | 2774 | if (page_has_buffers(page)) |
2767 | goto has_buffers; | 2775 | goto has_buffers; |
2768 | } | 2776 | } |
2769 | zero_user(page, offset, length); | 2777 | zero_user(page, offset, length); |
2770 | set_page_dirty(page); | 2778 | set_page_dirty(page); |
2771 | err = 0; | 2779 | err = 0; |
2772 | 2780 | ||
2773 | unlock: | 2781 | unlock: |
2774 | unlock_page(page); | 2782 | unlock_page(page); |
2775 | page_cache_release(page); | 2783 | page_cache_release(page); |
2776 | out: | 2784 | out: |
2777 | return err; | 2785 | return err; |
2778 | } | 2786 | } |
2779 | EXPORT_SYMBOL(nobh_truncate_page); | 2787 | EXPORT_SYMBOL(nobh_truncate_page); |
2780 | 2788 | ||
2781 | int block_truncate_page(struct address_space *mapping, | 2789 | int block_truncate_page(struct address_space *mapping, |
2782 | loff_t from, get_block_t *get_block) | 2790 | loff_t from, get_block_t *get_block) |
2783 | { | 2791 | { |
2784 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 2792 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
2785 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 2793 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
2786 | unsigned blocksize; | 2794 | unsigned blocksize; |
2787 | sector_t iblock; | 2795 | sector_t iblock; |
2788 | unsigned length, pos; | 2796 | unsigned length, pos; |
2789 | struct inode *inode = mapping->host; | 2797 | struct inode *inode = mapping->host; |
2790 | struct page *page; | 2798 | struct page *page; |
2791 | struct buffer_head *bh; | 2799 | struct buffer_head *bh; |
2792 | int err; | 2800 | int err; |
2793 | 2801 | ||
2794 | blocksize = 1 << inode->i_blkbits; | 2802 | blocksize = 1 << inode->i_blkbits; |
2795 | length = offset & (blocksize - 1); | 2803 | length = offset & (blocksize - 1); |
2796 | 2804 | ||
2797 | /* Block boundary? Nothing to do */ | 2805 | /* Block boundary? Nothing to do */ |
2798 | if (!length) | 2806 | if (!length) |
2799 | return 0; | 2807 | return 0; |
2800 | 2808 | ||
2801 | length = blocksize - length; | 2809 | length = blocksize - length; |
2802 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | 2810 | iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); |
2803 | 2811 | ||
2804 | page = grab_cache_page(mapping, index); | 2812 | page = grab_cache_page(mapping, index); |
2805 | err = -ENOMEM; | 2813 | err = -ENOMEM; |
2806 | if (!page) | 2814 | if (!page) |
2807 | goto out; | 2815 | goto out; |
2808 | 2816 | ||
2809 | if (!page_has_buffers(page)) | 2817 | if (!page_has_buffers(page)) |
2810 | create_empty_buffers(page, blocksize, 0); | 2818 | create_empty_buffers(page, blocksize, 0); |
2811 | 2819 | ||
2812 | /* Find the buffer that contains "offset" */ | 2820 | /* Find the buffer that contains "offset" */ |
2813 | bh = page_buffers(page); | 2821 | bh = page_buffers(page); |
2814 | pos = blocksize; | 2822 | pos = blocksize; |
2815 | while (offset >= pos) { | 2823 | while (offset >= pos) { |
2816 | bh = bh->b_this_page; | 2824 | bh = bh->b_this_page; |
2817 | iblock++; | 2825 | iblock++; |
2818 | pos += blocksize; | 2826 | pos += blocksize; |
2819 | } | 2827 | } |
2820 | 2828 | ||
2821 | err = 0; | 2829 | err = 0; |
2822 | if (!buffer_mapped(bh)) { | 2830 | if (!buffer_mapped(bh)) { |
2823 | WARN_ON(bh->b_size != blocksize); | 2831 | WARN_ON(bh->b_size != blocksize); |
2824 | err = get_block(inode, iblock, bh, 0); | 2832 | err = get_block(inode, iblock, bh, 0); |
2825 | if (err) | 2833 | if (err) |
2826 | goto unlock; | 2834 | goto unlock; |
2827 | /* unmapped? It's a hole - nothing to do */ | 2835 | /* unmapped? It's a hole - nothing to do */ |
2828 | if (!buffer_mapped(bh)) | 2836 | if (!buffer_mapped(bh)) |
2829 | goto unlock; | 2837 | goto unlock; |
2830 | } | 2838 | } |
2831 | 2839 | ||
2832 | /* Ok, it's mapped. Make sure it's up-to-date */ | 2840 | /* Ok, it's mapped. Make sure it's up-to-date */ |
2833 | if (PageUptodate(page)) | 2841 | if (PageUptodate(page)) |
2834 | set_buffer_uptodate(bh); | 2842 | set_buffer_uptodate(bh); |
2835 | 2843 | ||
2836 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { | 2844 | if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { |
2837 | err = -EIO; | 2845 | err = -EIO; |
2838 | ll_rw_block(READ, 1, &bh); | 2846 | ll_rw_block(READ, 1, &bh); |
2839 | wait_on_buffer(bh); | 2847 | wait_on_buffer(bh); |
2840 | /* Uhhuh. Read error. Complain and punt. */ | 2848 | /* Uhhuh. Read error. Complain and punt. */ |
2841 | if (!buffer_uptodate(bh)) | 2849 | if (!buffer_uptodate(bh)) |
2842 | goto unlock; | 2850 | goto unlock; |
2843 | } | 2851 | } |
2844 | 2852 | ||
2845 | zero_user(page, offset, length); | 2853 | zero_user(page, offset, length); |
2846 | mark_buffer_dirty(bh); | 2854 | mark_buffer_dirty(bh); |
2847 | err = 0; | 2855 | err = 0; |
2848 | 2856 | ||
2849 | unlock: | 2857 | unlock: |
2850 | unlock_page(page); | 2858 | unlock_page(page); |
2851 | page_cache_release(page); | 2859 | page_cache_release(page); |
2852 | out: | 2860 | out: |
2853 | return err; | 2861 | return err; |
2854 | } | 2862 | } |
2855 | 2863 | ||
2856 | /* | 2864 | /* |
2857 | * The generic ->writepage function for buffer-backed address_spaces | 2865 | * The generic ->writepage function for buffer-backed address_spaces |
2858 | */ | 2866 | */ |
2859 | int block_write_full_page(struct page *page, get_block_t *get_block, | 2867 | int block_write_full_page(struct page *page, get_block_t *get_block, |
2860 | struct writeback_control *wbc) | 2868 | struct writeback_control *wbc) |
2861 | { | 2869 | { |
2862 | struct inode * const inode = page->mapping->host; | 2870 | struct inode * const inode = page->mapping->host; |
2863 | loff_t i_size = i_size_read(inode); | 2871 | loff_t i_size = i_size_read(inode); |
2864 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; | 2872 | const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; |
2865 | unsigned offset; | 2873 | unsigned offset; |
2866 | 2874 | ||
2867 | /* Is the page fully inside i_size? */ | 2875 | /* Is the page fully inside i_size? */ |
2868 | if (page->index < end_index) | 2876 | if (page->index < end_index) |
2869 | return __block_write_full_page(inode, page, get_block, wbc); | 2877 | return __block_write_full_page(inode, page, get_block, wbc); |
2870 | 2878 | ||
2871 | /* Is the page fully outside i_size? (truncate in progress) */ | 2879 | /* Is the page fully outside i_size? (truncate in progress) */ |
2872 | offset = i_size & (PAGE_CACHE_SIZE-1); | 2880 | offset = i_size & (PAGE_CACHE_SIZE-1); |
2873 | if (page->index >= end_index+1 || !offset) { | 2881 | if (page->index >= end_index+1 || !offset) { |
2874 | /* | 2882 | /* |
2875 | * The page may have dirty, unmapped buffers. For example, | 2883 | * The page may have dirty, unmapped buffers. For example, |
2876 | * they may have been added in ext3_writepage(). Make them | 2884 | * they may have been added in ext3_writepage(). Make them |
2877 | * freeable here, so the page does not leak. | 2885 | * freeable here, so the page does not leak. |
2878 | */ | 2886 | */ |
2879 | do_invalidatepage(page, 0); | 2887 | do_invalidatepage(page, 0); |
2880 | unlock_page(page); | 2888 | unlock_page(page); |
2881 | return 0; /* don't care */ | 2889 | return 0; /* don't care */ |
2882 | } | 2890 | } |
2883 | 2891 | ||
2884 | /* | 2892 | /* |
2885 | * The page straddles i_size. It must be zeroed out on each and every | 2893 | * The page straddles i_size. It must be zeroed out on each and every |
2886 | * writepage invocation because it may be mmapped. "A file is mapped | 2894 | * writepage invocation because it may be mmapped. "A file is mapped |
2887 | * in multiples of the page size. For a file that is not a multiple of | 2895 | * in multiples of the page size. For a file that is not a multiple of |
2888 | * the page size, the remaining memory is zeroed when mapped, and | 2896 | * the page size, the remaining memory is zeroed when mapped, and |
2889 | * writes to that region are not written out to the file." | 2897 | * writes to that region are not written out to the file." |
2890 | */ | 2898 | */ |
2891 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); | 2899 | zero_user_segment(page, offset, PAGE_CACHE_SIZE); |
2892 | return __block_write_full_page(inode, page, get_block, wbc); | 2900 | return __block_write_full_page(inode, page, get_block, wbc); |
2893 | } | 2901 | } |
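block_write_full_page() is meant to be plugged straight into a filesystem's address_space_operations; the filesystem only supplies its get_block callback. A minimal, hypothetical wiring (foo_writepage, foo_get_block and foo_aops are made-up names, not part of this commit):

/* Hypothetical glue: ->writepage built on block_write_full_page().
 * foo_get_block is assumed to be the filesystem's block-mapping callback. */
static int foo_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, foo_get_block, wbc);
}

static const struct address_space_operations foo_aops = {
        .writepage      = foo_writepage,        /* other methods omitted */
};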
2894 | 2902 | ||
2895 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, | 2903 | sector_t generic_block_bmap(struct address_space *mapping, sector_t block, |
2896 | get_block_t *get_block) | 2904 | get_block_t *get_block) |
2897 | { | 2905 | { |
2898 | struct buffer_head tmp; | 2906 | struct buffer_head tmp; |
2899 | struct inode *inode = mapping->host; | 2907 | struct inode *inode = mapping->host; |
2900 | tmp.b_state = 0; | 2908 | tmp.b_state = 0; |
2901 | tmp.b_blocknr = 0; | 2909 | tmp.b_blocknr = 0; |
2902 | tmp.b_size = 1 << inode->i_blkbits; | 2910 | tmp.b_size = 1 << inode->i_blkbits; |
2903 | get_block(inode, block, &tmp, 0); | 2911 | get_block(inode, block, &tmp, 0); |
2904 | return tmp.b_blocknr; | 2912 | return tmp.b_blocknr; |
2905 | } | 2913 | } |
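generic_block_bmap() lets a filesystem implement ->bmap with a one-line wrapper: it fills a throwaway buffer_head and asks get_block for the mapping without creating blocks. A hypothetical wrapper (foo_get_block is again an assumed name):

/* Hypothetical ->bmap method built on generic_block_bmap(). */
static sector_t foo_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, foo_get_block);
}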
2906 | 2914 | ||
2907 | static void end_bio_bh_io_sync(struct bio *bio, int err) | 2915 | static void end_bio_bh_io_sync(struct bio *bio, int err) |
2908 | { | 2916 | { |
2909 | struct buffer_head *bh = bio->bi_private; | 2917 | struct buffer_head *bh = bio->bi_private; |
2910 | 2918 | ||
2911 | if (err == -EOPNOTSUPP) { | 2919 | if (err == -EOPNOTSUPP) { |
2912 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 2920 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); |
2913 | set_bit(BH_Eopnotsupp, &bh->b_state); | 2921 | set_bit(BH_Eopnotsupp, &bh->b_state); |
2914 | } | 2922 | } |
2923 | |||
2924 | if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) | ||
2925 | set_bit(BH_Quiet, &bh->b_state); | ||
2915 | 2926 | ||
2916 | bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); | 2927 | bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); |
2917 | bio_put(bio); | 2928 | bio_put(bio); |
2918 | } | 2929 | } |
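This hunk is the bio-to-buffer_head half of the REQ_QUIET propagation: a quiet bio tags the buffer with BH_Quiet before the buffer's own b_end_io handler runs. The consumer of BH_Quiet is not visible in this hunk; the fragment below is only a sketch of the intended effect, and the quiet_error() helper name is an assumption rather than code this hunk is known to add:

/* Sketch only: how an error path in a buffer completion handler could
 * honour BH_Quiet. Not the actual fs/buffer.c change, just the idea. */
static int quiet_error(struct buffer_head *bh)
{
        return test_bit(BH_Quiet, &bh->b_state) || !printk_ratelimit();
}

static void example_buffer_io_error(struct buffer_head *bh)
{
        char b[BDEVNAME_SIZE];

        if (quiet_error(bh))
                return;         /* REQ_QUIET made it this far: stay silent */
        printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}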
2919 | 2930 | ||
2920 | int submit_bh(int rw, struct buffer_head * bh) | 2931 | int submit_bh(int rw, struct buffer_head * bh) |
2921 | { | 2932 | { |
2922 | struct bio *bio; | 2933 | struct bio *bio; |
2923 | int ret = 0; | 2934 | int ret = 0; |
2924 | 2935 | ||
2925 | BUG_ON(!buffer_locked(bh)); | 2936 | BUG_ON(!buffer_locked(bh)); |
2926 | BUG_ON(!buffer_mapped(bh)); | 2937 | BUG_ON(!buffer_mapped(bh)); |
2927 | BUG_ON(!bh->b_end_io); | 2938 | BUG_ON(!bh->b_end_io); |
2928 | 2939 | ||
2929 | /* | 2940 | /* |
2930 | * Mask in barrier bit for a write (could be either a WRITE or a | 2941 | * Mask in barrier bit for a write (could be either a WRITE or a |
2931 | * WRITE_SYNC | 2942 | * WRITE_SYNC |
2932 | */ | 2943 | */ |
2933 | if (buffer_ordered(bh) && (rw & WRITE)) | 2944 | if (buffer_ordered(bh) && (rw & WRITE)) |
2934 | rw |= WRITE_BARRIER; | 2945 | rw |= WRITE_BARRIER; |
2935 | 2946 | ||
2936 | /* | 2947 | /* |
2937 | * Only clear out a write error when rewriting | 2948 | * Only clear out a write error when rewriting |
2938 | */ | 2949 | */ |
2939 | if (test_set_buffer_req(bh) && (rw & WRITE)) | 2950 | if (test_set_buffer_req(bh) && (rw & WRITE)) |
2940 | clear_buffer_write_io_error(bh); | 2951 | clear_buffer_write_io_error(bh); |
2941 | 2952 | ||
2942 | /* | 2953 | /* |
2943 | * from here on down, it's all bio -- do the initial mapping, | 2954 | * from here on down, it's all bio -- do the initial mapping, |
2944 | * submit_bio -> generic_make_request may further map this bio around | 2955 | * submit_bio -> generic_make_request may further map this bio around |
2945 | */ | 2956 | */ |
2946 | bio = bio_alloc(GFP_NOIO, 1); | 2957 | bio = bio_alloc(GFP_NOIO, 1); |
2947 | 2958 | ||
2948 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 2959 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
2949 | bio->bi_bdev = bh->b_bdev; | 2960 | bio->bi_bdev = bh->b_bdev; |
2950 | bio->bi_io_vec[0].bv_page = bh->b_page; | 2961 | bio->bi_io_vec[0].bv_page = bh->b_page; |
2951 | bio->bi_io_vec[0].bv_len = bh->b_size; | 2962 | bio->bi_io_vec[0].bv_len = bh->b_size; |
2952 | bio->bi_io_vec[0].bv_offset = bh_offset(bh); | 2963 | bio->bi_io_vec[0].bv_offset = bh_offset(bh); |
2953 | 2964 | ||
2954 | bio->bi_vcnt = 1; | 2965 | bio->bi_vcnt = 1; |
2955 | bio->bi_idx = 0; | 2966 | bio->bi_idx = 0; |
2956 | bio->bi_size = bh->b_size; | 2967 | bio->bi_size = bh->b_size; |
2957 | 2968 | ||
2958 | bio->bi_end_io = end_bio_bh_io_sync; | 2969 | bio->bi_end_io = end_bio_bh_io_sync; |
2959 | bio->bi_private = bh; | 2970 | bio->bi_private = bh; |
2960 | 2971 | ||
2961 | bio_get(bio); | 2972 | bio_get(bio); |
2962 | submit_bio(rw, bio); | 2973 | submit_bio(rw, bio); |
2963 | 2974 | ||
2964 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 2975 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
2965 | ret = -EOPNOTSUPP; | 2976 | ret = -EOPNOTSUPP; |
2966 | 2977 | ||
2967 | bio_put(bio); | 2978 | bio_put(bio); |
2968 | return ret; | 2979 | return ret; |
2969 | } | 2980 | } |
2970 | 2981 | ||
2971 | /** | 2982 | /** |
2972 | * ll_rw_block: low-level access to block devices (DEPRECATED) | 2983 | * ll_rw_block: low-level access to block devices (DEPRECATED) |
2973 | * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) | 2984 | * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) |
2974 | * @nr: number of &struct buffer_heads in the array | 2985 | * @nr: number of &struct buffer_heads in the array |
2975 | * @bhs: array of pointers to &struct buffer_head | 2986 | * @bhs: array of pointers to &struct buffer_head |
2976 | * | 2987 | * |
2977 | * ll_rw_block() takes an array of pointers to &struct buffer_heads, and | 2988 | * ll_rw_block() takes an array of pointers to &struct buffer_heads, and |
2978 | * requests an I/O operation on them, either a %READ or a %WRITE. The third | 2989 | * requests an I/O operation on them, either a %READ or a %WRITE. The third |
2979 | * %SWRITE is like %WRITE only we make sure that the *current* data in buffers | 2990 | * %SWRITE is like %WRITE only we make sure that the *current* data in buffers |
2980 | * are sent to disk. The fourth %READA option is described in the documentation | 2991 | * are sent to disk. The fourth %READA option is described in the documentation |
2981 | * for generic_make_request() which ll_rw_block() calls. | 2992 | * for generic_make_request() which ll_rw_block() calls. |
2982 | * | 2993 | * |
2983 | * This function drops any buffer that it cannot get a lock on (with the | 2994 | * This function drops any buffer that it cannot get a lock on (with the |
2984 | * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be | 2995 | * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be |
2985 | * clean when doing a write request, and any buffer that appears to be | 2996 | * clean when doing a write request, and any buffer that appears to be |
2986 | * up-to-date when doing a read request. Further it marks as clean buffers that | 2997 | * up-to-date when doing a read request. Further it marks as clean buffers that |
2987 | * are processed for writing (the buffer cache won't assume that they are | 2998 | * are processed for writing (the buffer cache won't assume that they are |
2988 | * actually clean until the buffer gets unlocked). | 2999 | * actually clean until the buffer gets unlocked). |
2989 | * | 3000 | * |
2990 | * ll_rw_block sets b_end_io to simple completion handler that marks | 3001 | * ll_rw_block sets b_end_io to simple completion handler that marks |
2991 | * the buffer up-to-date (if appropriate), unlocks the buffer and wakes | 3002 | * the buffer up-to-date (if appropriate), unlocks the buffer and wakes |
2992 | * any waiters. | 3003 | * any waiters. |
2993 | * | 3004 | * |
2994 | * All of the buffers must be for the same device, and must also be a | 3005 | * All of the buffers must be for the same device, and must also be a |
2995 | * multiple of the current approved size for the device. | 3006 | * multiple of the current approved size for the device. |
2996 | */ | 3007 | */ |
2997 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) | 3008 | void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) |
2998 | { | 3009 | { |
2999 | int i; | 3010 | int i; |
3000 | 3011 | ||
3001 | for (i = 0; i < nr; i++) { | 3012 | for (i = 0; i < nr; i++) { |
3002 | struct buffer_head *bh = bhs[i]; | 3013 | struct buffer_head *bh = bhs[i]; |
3003 | 3014 | ||
3004 | if (rw == SWRITE || rw == SWRITE_SYNC) | 3015 | if (rw == SWRITE || rw == SWRITE_SYNC) |
3005 | lock_buffer(bh); | 3016 | lock_buffer(bh); |
3006 | else if (!trylock_buffer(bh)) | 3017 | else if (!trylock_buffer(bh)) |
3007 | continue; | 3018 | continue; |
3008 | 3019 | ||
3009 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { | 3020 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { |
3010 | if (test_clear_buffer_dirty(bh)) { | 3021 | if (test_clear_buffer_dirty(bh)) { |
3011 | bh->b_end_io = end_buffer_write_sync; | 3022 | bh->b_end_io = end_buffer_write_sync; |
3012 | get_bh(bh); | 3023 | get_bh(bh); |
3013 | if (rw == SWRITE_SYNC) | 3024 | if (rw == SWRITE_SYNC) |
3014 | submit_bh(WRITE_SYNC, bh); | 3025 | submit_bh(WRITE_SYNC, bh); |
3015 | else | 3026 | else |
3016 | submit_bh(WRITE, bh); | 3027 | submit_bh(WRITE, bh); |
3017 | continue; | 3028 | continue; |
3018 | } | 3029 | } |
3019 | } else { | 3030 | } else { |
3020 | if (!buffer_uptodate(bh)) { | 3031 | if (!buffer_uptodate(bh)) { |
3021 | bh->b_end_io = end_buffer_read_sync; | 3032 | bh->b_end_io = end_buffer_read_sync; |
3022 | get_bh(bh); | 3033 | get_bh(bh); |
3023 | submit_bh(rw, bh); | 3034 | submit_bh(rw, bh); |
3024 | continue; | 3035 | continue; |
3025 | } | 3036 | } |
3026 | } | 3037 | } |
3027 | unlock_buffer(bh); | 3038 | unlock_buffer(bh); |
3028 | } | 3039 | } |
3029 | } | 3040 | } |
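The kerneldoc above is easier to follow with a caller in mind: submit the whole batch, then wait on and re-check each buffer yourself, because ll_rw_block() silently skips buffers it cannot lock or that look clean/up-to-date. A sketch (bh1 and bh2 stand for buffer heads the caller already holds references on):

/* Sketch: typical ll_rw_block() read pattern. bh1/bh2 are assumed to have
 * been obtained elsewhere and are still referenced by the caller. */
struct buffer_head *bhs[2] = { bh1, bh2 };
int i;

ll_rw_block(READ, 2, bhs);
for (i = 0; i < 2; i++) {
        wait_on_buffer(bhs[i]);
        if (!buffer_uptodate(bhs[i]))
                return -EIO;    /* the read failed or never completed */
}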
3030 | 3041 | ||
3031 | /* | 3042 | /* |
3032 | * For a data-integrity writeout, we need to wait upon any in-progress I/O | 3043 | * For a data-integrity writeout, we need to wait upon any in-progress I/O |
3033 | * and then start new I/O and then wait upon it. The caller must have a ref on | 3044 | * and then start new I/O and then wait upon it. The caller must have a ref on |
3034 | * the buffer_head. | 3045 | * the buffer_head. |
3035 | */ | 3046 | */ |
3036 | int sync_dirty_buffer(struct buffer_head *bh) | 3047 | int sync_dirty_buffer(struct buffer_head *bh) |
3037 | { | 3048 | { |
3038 | int ret = 0; | 3049 | int ret = 0; |
3039 | 3050 | ||
3040 | WARN_ON(atomic_read(&bh->b_count) < 1); | 3051 | WARN_ON(atomic_read(&bh->b_count) < 1); |
3041 | lock_buffer(bh); | 3052 | lock_buffer(bh); |
3042 | if (test_clear_buffer_dirty(bh)) { | 3053 | if (test_clear_buffer_dirty(bh)) { |
3043 | get_bh(bh); | 3054 | get_bh(bh); |
3044 | bh->b_end_io = end_buffer_write_sync; | 3055 | bh->b_end_io = end_buffer_write_sync; |
3045 | ret = submit_bh(WRITE_SYNC, bh); | 3056 | ret = submit_bh(WRITE_SYNC, bh); |
3046 | wait_on_buffer(bh); | 3057 | wait_on_buffer(bh); |
3047 | if (buffer_eopnotsupp(bh)) { | 3058 | if (buffer_eopnotsupp(bh)) { |
3048 | clear_buffer_eopnotsupp(bh); | 3059 | clear_buffer_eopnotsupp(bh); |
3049 | ret = -EOPNOTSUPP; | 3060 | ret = -EOPNOTSUPP; |
3050 | } | 3061 | } |
3051 | if (!ret && !buffer_uptodate(bh)) | 3062 | if (!ret && !buffer_uptodate(bh)) |
3052 | ret = -EIO; | 3063 | ret = -EIO; |
3053 | } else { | 3064 | } else { |
3054 | unlock_buffer(bh); | 3065 | unlock_buffer(bh); |
3055 | } | 3066 | } |
3056 | return ret; | 3067 | return ret; |
3057 | } | 3068 | } |
3058 | 3069 | ||
3059 | /* | 3070 | /* |
3060 | * try_to_free_buffers() checks if all the buffers on this particular page | 3071 | * try_to_free_buffers() checks if all the buffers on this particular page |
3061 | * are unused, and releases them if so. | 3072 | * are unused, and releases them if so. |
3062 | * | 3073 | * |
3063 | * Exclusion against try_to_free_buffers may be obtained by either | 3074 | * Exclusion against try_to_free_buffers may be obtained by either |
3064 | * locking the page or by holding its mapping's private_lock. | 3075 | * locking the page or by holding its mapping's private_lock. |
3065 | * | 3076 | * |
3066 | * If the page is dirty but all the buffers are clean then we need to | 3077 | * If the page is dirty but all the buffers are clean then we need to |
3067 | * be sure to mark the page clean as well. This is because the page | 3078 | * be sure to mark the page clean as well. This is because the page |
3068 | * may be against a block device, and a later reattachment of buffers | 3079 | * may be against a block device, and a later reattachment of buffers |
3069 | * to a dirty page will set *all* buffers dirty. Which would corrupt | 3080 | * to a dirty page will set *all* buffers dirty. Which would corrupt |
3070 | * filesystem data on the same device. | 3081 | * filesystem data on the same device. |
3071 | * | 3082 | * |
3072 | * The same applies to regular filesystem pages: if all the buffers are | 3083 | * The same applies to regular filesystem pages: if all the buffers are |
3073 | * clean then we set the page clean and proceed. To do that, we require | 3084 | * clean then we set the page clean and proceed. To do that, we require |
3074 | * total exclusion from __set_page_dirty_buffers(). That is obtained with | 3085 | * total exclusion from __set_page_dirty_buffers(). That is obtained with |
3075 | * private_lock. | 3086 | * private_lock. |
3076 | * | 3087 | * |
3077 | * try_to_free_buffers() is non-blocking. | 3088 | * try_to_free_buffers() is non-blocking. |
3078 | */ | 3089 | */ |
3079 | static inline int buffer_busy(struct buffer_head *bh) | 3090 | static inline int buffer_busy(struct buffer_head *bh) |
3080 | { | 3091 | { |
3081 | return atomic_read(&bh->b_count) | | 3092 | return atomic_read(&bh->b_count) | |
3082 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); | 3093 | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); |
3083 | } | 3094 | } |
3084 | 3095 | ||
3085 | static int | 3096 | static int |
3086 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) | 3097 | drop_buffers(struct page *page, struct buffer_head **buffers_to_free) |
3087 | { | 3098 | { |
3088 | struct buffer_head *head = page_buffers(page); | 3099 | struct buffer_head *head = page_buffers(page); |
3089 | struct buffer_head *bh; | 3100 | struct buffer_head *bh; |
3090 | 3101 | ||
3091 | bh = head; | 3102 | bh = head; |
3092 | do { | 3103 | do { |
3093 | if (buffer_write_io_error(bh) && page->mapping) | 3104 | if (buffer_write_io_error(bh) && page->mapping) |
3094 | set_bit(AS_EIO, &page->mapping->flags); | 3105 | set_bit(AS_EIO, &page->mapping->flags); |
3095 | if (buffer_busy(bh)) | 3106 | if (buffer_busy(bh)) |
3096 | goto failed; | 3107 | goto failed; |
3097 | bh = bh->b_this_page; | 3108 | bh = bh->b_this_page; |
3098 | } while (bh != head); | 3109 | } while (bh != head); |
3099 | 3110 | ||
3100 | do { | 3111 | do { |
3101 | struct buffer_head *next = bh->b_this_page; | 3112 | struct buffer_head *next = bh->b_this_page; |
3102 | 3113 | ||
3103 | if (bh->b_assoc_map) | 3114 | if (bh->b_assoc_map) |
3104 | __remove_assoc_queue(bh); | 3115 | __remove_assoc_queue(bh); |
3105 | bh = next; | 3116 | bh = next; |
3106 | } while (bh != head); | 3117 | } while (bh != head); |
3107 | *buffers_to_free = head; | 3118 | *buffers_to_free = head; |
3108 | __clear_page_buffers(page); | 3119 | __clear_page_buffers(page); |
3109 | return 1; | 3120 | return 1; |
3110 | failed: | 3121 | failed: |
3111 | return 0; | 3122 | return 0; |
3112 | } | 3123 | } |
3113 | 3124 | ||
3114 | int try_to_free_buffers(struct page *page) | 3125 | int try_to_free_buffers(struct page *page) |
3115 | { | 3126 | { |
3116 | struct address_space * const mapping = page->mapping; | 3127 | struct address_space * const mapping = page->mapping; |
3117 | struct buffer_head *buffers_to_free = NULL; | 3128 | struct buffer_head *buffers_to_free = NULL; |
3118 | int ret = 0; | 3129 | int ret = 0; |
3119 | 3130 | ||
3120 | BUG_ON(!PageLocked(page)); | 3131 | BUG_ON(!PageLocked(page)); |
3121 | if (PageWriteback(page)) | 3132 | if (PageWriteback(page)) |
3122 | return 0; | 3133 | return 0; |
3123 | 3134 | ||
3124 | if (mapping == NULL) { /* can this still happen? */ | 3135 | if (mapping == NULL) { /* can this still happen? */ |
3125 | ret = drop_buffers(page, &buffers_to_free); | 3136 | ret = drop_buffers(page, &buffers_to_free); |
3126 | goto out; | 3137 | goto out; |
3127 | } | 3138 | } |
3128 | 3139 | ||
3129 | spin_lock(&mapping->private_lock); | 3140 | spin_lock(&mapping->private_lock); |
3130 | ret = drop_buffers(page, &buffers_to_free); | 3141 | ret = drop_buffers(page, &buffers_to_free); |
3131 | 3142 | ||
3132 | /* | 3143 | /* |
3133 | * If the filesystem writes its buffers by hand (eg ext3) | 3144 | * If the filesystem writes its buffers by hand (eg ext3) |
3134 | * then we can have clean buffers against a dirty page. We | 3145 | * then we can have clean buffers against a dirty page. We |
3135 | * clean the page here; otherwise the VM will never notice | 3146 | * clean the page here; otherwise the VM will never notice |
3136 | * that the filesystem did any IO at all. | 3147 | * that the filesystem did any IO at all. |
3137 | * | 3148 | * |
3138 | * Also, during truncate, discard_buffer will have marked all | 3149 | * Also, during truncate, discard_buffer will have marked all |
3139 | * the page's buffers clean. We discover that here and clean | 3150 | * the page's buffers clean. We discover that here and clean |
3140 | * the page also. | 3151 | * the page also. |
3141 | * | 3152 | * |
3142 | * private_lock must be held over this entire operation in order | 3153 | * private_lock must be held over this entire operation in order |
3143 | * to synchronise against __set_page_dirty_buffers and prevent the | 3154 | * to synchronise against __set_page_dirty_buffers and prevent the |
3144 | * dirty bit from being lost. | 3155 | * dirty bit from being lost. |
3145 | */ | 3156 | */ |
3146 | if (ret) | 3157 | if (ret) |
3147 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 3158 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
3148 | spin_unlock(&mapping->private_lock); | 3159 | spin_unlock(&mapping->private_lock); |
3149 | out: | 3160 | out: |
3150 | if (buffers_to_free) { | 3161 | if (buffers_to_free) { |
3151 | struct buffer_head *bh = buffers_to_free; | 3162 | struct buffer_head *bh = buffers_to_free; |
3152 | 3163 | ||
3153 | do { | 3164 | do { |
3154 | struct buffer_head *next = bh->b_this_page; | 3165 | struct buffer_head *next = bh->b_this_page; |
3155 | free_buffer_head(bh); | 3166 | free_buffer_head(bh); |
3156 | bh = next; | 3167 | bh = next; |
3157 | } while (bh != buffers_to_free); | 3168 | } while (bh != buffers_to_free); |
3158 | } | 3169 | } |
3159 | return ret; | 3170 | return ret; |
3160 | } | 3171 | } |
3161 | EXPORT_SYMBOL(try_to_free_buffers); | 3172 | EXPORT_SYMBOL(try_to_free_buffers); |
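try_to_free_buffers() is normally reached through a filesystem's ->releasepage method, which is called with the page locked and therefore already satisfies the exclusion rules described above. For many buffer-backed filesystems the method can be as thin as this hedged example (extra checks some filesystems add, e.g. for journalled buffers, are omitted):

/* Hypothetical ->releasepage: defer entirely to try_to_free_buffers(). */
static int foo_releasepage(struct page *page, gfp_t gfp_mask)
{
        return try_to_free_buffers(page);
}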
3162 | 3173 | ||
3163 | void block_sync_page(struct page *page) | 3174 | void block_sync_page(struct page *page) |
3164 | { | 3175 | { |
3165 | struct address_space *mapping; | 3176 | struct address_space *mapping; |
3166 | 3177 | ||
3167 | smp_mb(); | 3178 | smp_mb(); |
3168 | mapping = page_mapping(page); | 3179 | mapping = page_mapping(page); |
3169 | if (mapping) | 3180 | if (mapping) |
3170 | blk_run_backing_dev(mapping->backing_dev_info, page); | 3181 | blk_run_backing_dev(mapping->backing_dev_info, page); |
3171 | } | 3182 | } |
3172 | 3183 | ||
3173 | /* | 3184 | /* |
3174 | * There are no bdflush tunables left. But distributions are | 3185 | * There are no bdflush tunables left. But distributions are |
3175 | * still running obsolete flush daemons, so we terminate them here. | 3186 | * still running obsolete flush daemons, so we terminate them here. |
3176 | * | 3187 | * |
3177 | * Use of bdflush() is deprecated and will be removed in a future kernel. | 3188 | * Use of bdflush() is deprecated and will be removed in a future kernel. |
3178 | * The `pdflush' kernel threads fully replace bdflush daemons and this call. | 3189 | * The `pdflush' kernel threads fully replace bdflush daemons and this call. |
3179 | */ | 3190 | */ |
3180 | asmlinkage long sys_bdflush(int func, long data) | 3191 | asmlinkage long sys_bdflush(int func, long data) |
3181 | { | 3192 | { |
3182 | static int msg_count; | 3193 | static int msg_count; |
3183 | 3194 | ||
3184 | if (!capable(CAP_SYS_ADMIN)) | 3195 | if (!capable(CAP_SYS_ADMIN)) |
3185 | return -EPERM; | 3196 | return -EPERM; |
3186 | 3197 | ||
3187 | if (msg_count < 5) { | 3198 | if (msg_count < 5) { |
3188 | msg_count++; | 3199 | msg_count++; |
3189 | printk(KERN_INFO | 3200 | printk(KERN_INFO |
3190 | "warning: process `%s' used the obsolete bdflush" | 3201 | "warning: process `%s' used the obsolete bdflush" |
3191 | " system call\n", current->comm); | 3202 | " system call\n", current->comm); |
3192 | printk(KERN_INFO "Fix your initscripts?\n"); | 3203 | printk(KERN_INFO "Fix your initscripts?\n"); |
3193 | } | 3204 | } |
3194 | 3205 | ||
3195 | if (func == 1) | 3206 | if (func == 1) |
3196 | do_exit(0); | 3207 | do_exit(0); |
3197 | return 0; | 3208 | return 0; |
3198 | } | 3209 | } |
3199 | 3210 | ||
3200 | /* | 3211 | /* |
3201 | * Buffer-head allocation | 3212 | * Buffer-head allocation |
3202 | */ | 3213 | */ |
3203 | static struct kmem_cache *bh_cachep; | 3214 | static struct kmem_cache *bh_cachep; |
3204 | 3215 | ||
3205 | /* | 3216 | /* |
3206 | * Once the number of bh's in the machine exceeds this level, we start | 3217 | * Once the number of bh's in the machine exceeds this level, we start |
3207 | * stripping them in writeback. | 3218 | * stripping them in writeback. |
3208 | */ | 3219 | */ |
3209 | static int max_buffer_heads; | 3220 | static int max_buffer_heads; |
3210 | 3221 | ||
3211 | int buffer_heads_over_limit; | 3222 | int buffer_heads_over_limit; |
3212 | 3223 | ||
3213 | struct bh_accounting { | 3224 | struct bh_accounting { |
3214 | int nr; /* Number of live bh's */ | 3225 | int nr; /* Number of live bh's */ |
3215 | int ratelimit; /* Limit cacheline bouncing */ | 3226 | int ratelimit; /* Limit cacheline bouncing */ |
3216 | }; | 3227 | }; |
3217 | 3228 | ||
3218 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; | 3229 | static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; |
3219 | 3230 | ||
3220 | static void recalc_bh_state(void) | 3231 | static void recalc_bh_state(void) |
3221 | { | 3232 | { |
3222 | int i; | 3233 | int i; |
3223 | int tot = 0; | 3234 | int tot = 0; |
3224 | 3235 | ||
3225 | if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) | 3236 | if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) |
3226 | return; | 3237 | return; |
3227 | __get_cpu_var(bh_accounting).ratelimit = 0; | 3238 | __get_cpu_var(bh_accounting).ratelimit = 0; |
3228 | for_each_online_cpu(i) | 3239 | for_each_online_cpu(i) |
3229 | tot += per_cpu(bh_accounting, i).nr; | 3240 | tot += per_cpu(bh_accounting, i).nr; |
3230 | buffer_heads_over_limit = (tot > max_buffer_heads); | 3241 | buffer_heads_over_limit = (tot > max_buffer_heads); |
3231 | } | 3242 | } |
3232 | 3243 | ||
3233 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) | 3244 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) |
3234 | { | 3245 | { |
3235 | struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); | 3246 | struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); |
3236 | if (ret) { | 3247 | if (ret) { |
3237 | INIT_LIST_HEAD(&ret->b_assoc_buffers); | 3248 | INIT_LIST_HEAD(&ret->b_assoc_buffers); |
3238 | get_cpu_var(bh_accounting).nr++; | 3249 | get_cpu_var(bh_accounting).nr++; |
3239 | recalc_bh_state(); | 3250 | recalc_bh_state(); |
3240 | put_cpu_var(bh_accounting); | 3251 | put_cpu_var(bh_accounting); |
3241 | } | 3252 | } |
3242 | return ret; | 3253 | return ret; |
3243 | } | 3254 | } |
3244 | EXPORT_SYMBOL(alloc_buffer_head); | 3255 | EXPORT_SYMBOL(alloc_buffer_head); |
3245 | 3256 | ||
3246 | void free_buffer_head(struct buffer_head *bh) | 3257 | void free_buffer_head(struct buffer_head *bh) |
3247 | { | 3258 | { |
3248 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); | 3259 | BUG_ON(!list_empty(&bh->b_assoc_buffers)); |
3249 | kmem_cache_free(bh_cachep, bh); | 3260 | kmem_cache_free(bh_cachep, bh); |
3250 | get_cpu_var(bh_accounting).nr--; | 3261 | get_cpu_var(bh_accounting).nr--; |
3251 | recalc_bh_state(); | 3262 | recalc_bh_state(); |
3252 | put_cpu_var(bh_accounting); | 3263 | put_cpu_var(bh_accounting); |
3253 | } | 3264 | } |
3254 | EXPORT_SYMBOL(free_buffer_head); | 3265 | EXPORT_SYMBOL(free_buffer_head); |
3255 | 3266 | ||
3256 | static void buffer_exit_cpu(int cpu) | 3267 | static void buffer_exit_cpu(int cpu) |
3257 | { | 3268 | { |
3258 | int i; | 3269 | int i; |
3259 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); | 3270 | struct bh_lru *b = &per_cpu(bh_lrus, cpu); |
3260 | 3271 | ||
3261 | for (i = 0; i < BH_LRU_SIZE; i++) { | 3272 | for (i = 0; i < BH_LRU_SIZE; i++) { |
3262 | brelse(b->bhs[i]); | 3273 | brelse(b->bhs[i]); |
3263 | b->bhs[i] = NULL; | 3274 | b->bhs[i] = NULL; |
3264 | } | 3275 | } |
3265 | get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; | 3276 | get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; |
3266 | per_cpu(bh_accounting, cpu).nr = 0; | 3277 | per_cpu(bh_accounting, cpu).nr = 0; |
3267 | put_cpu_var(bh_accounting); | 3278 | put_cpu_var(bh_accounting); |
3268 | } | 3279 | } |
3269 | 3280 | ||
3270 | static int buffer_cpu_notify(struct notifier_block *self, | 3281 | static int buffer_cpu_notify(struct notifier_block *self, |
3271 | unsigned long action, void *hcpu) | 3282 | unsigned long action, void *hcpu) |
3272 | { | 3283 | { |
3273 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) | 3284 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) |
3274 | buffer_exit_cpu((unsigned long)hcpu); | 3285 | buffer_exit_cpu((unsigned long)hcpu); |
3275 | return NOTIFY_OK; | 3286 | return NOTIFY_OK; |
3276 | } | 3287 | } |
3277 | 3288 | ||
3278 | /** | 3289 | /** |
3279 | * bh_uptodate_or_lock - Test whether the buffer is uptodate | 3290 | * bh_uptodate_or_lock - Test whether the buffer is uptodate |
3280 | * @bh: struct buffer_head | 3291 | * @bh: struct buffer_head |
3281 | * | 3292 | * |
3282 | * Return true if the buffer is up-to-date and false, | 3293 | * Return true if the buffer is up-to-date and false, |
3283 | * with the buffer locked, if not. | 3294 | * with the buffer locked, if not. |
3284 | */ | 3295 | */ |
3285 | int bh_uptodate_or_lock(struct buffer_head *bh) | 3296 | int bh_uptodate_or_lock(struct buffer_head *bh) |
3286 | { | 3297 | { |
3287 | if (!buffer_uptodate(bh)) { | 3298 | if (!buffer_uptodate(bh)) { |
3288 | lock_buffer(bh); | 3299 | lock_buffer(bh); |
3289 | if (!buffer_uptodate(bh)) | 3300 | if (!buffer_uptodate(bh)) |
3290 | return 0; | 3301 | return 0; |
3291 | unlock_buffer(bh); | 3302 | unlock_buffer(bh); |
3292 | } | 3303 | } |
3293 | return 1; | 3304 | return 1; |
3294 | } | 3305 | } |
3295 | EXPORT_SYMBOL(bh_uptodate_or_lock); | 3306 | EXPORT_SYMBOL(bh_uptodate_or_lock); |
3296 | 3307 | ||
3297 | /** | 3308 | /** |
3298 | * bh_submit_read - Submit a locked buffer for reading | 3309 | * bh_submit_read - Submit a locked buffer for reading |
3299 | * @bh: struct buffer_head | 3310 | * @bh: struct buffer_head |
3300 | * | 3311 | * |
3301 | * Returns zero on success and -EIO on error. | 3312 | * Returns zero on success and -EIO on error. |
3302 | */ | 3313 | */ |
3303 | int bh_submit_read(struct buffer_head *bh) | 3314 | int bh_submit_read(struct buffer_head *bh) |
3304 | { | 3315 | { |
3305 | BUG_ON(!buffer_locked(bh)); | 3316 | BUG_ON(!buffer_locked(bh)); |
3306 | 3317 | ||
3307 | if (buffer_uptodate(bh)) { | 3318 | if (buffer_uptodate(bh)) { |
3308 | unlock_buffer(bh); | 3319 | unlock_buffer(bh); |
3309 | return 0; | 3320 | return 0; |
3310 | } | 3321 | } |
3311 | 3322 | ||
3312 | get_bh(bh); | 3323 | get_bh(bh); |
3313 | bh->b_end_io = end_buffer_read_sync; | 3324 | bh->b_end_io = end_buffer_read_sync; |
3314 | submit_bh(READ, bh); | 3325 | submit_bh(READ, bh); |
3315 | wait_on_buffer(bh); | 3326 | wait_on_buffer(bh); |
3316 | if (buffer_uptodate(bh)) | 3327 | if (buffer_uptodate(bh)) |
3317 | return 0; | 3328 | return 0; |
3318 | return -EIO; | 3329 | return -EIO; |
3319 | } | 3330 | } |
3320 | EXPORT_SYMBOL(bh_submit_read); | 3331 | EXPORT_SYMBOL(bh_submit_read); |
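bh_uptodate_or_lock() and bh_submit_read() are meant to be used back to back: the former returns 0 with the buffer locked exactly when a read is needed, which is the state the latter insists on. A sketch of the pairing (sb and blocknr are assumed to come from the caller):

/* Sketch: read a block on demand. sb and blocknr are assumptions. */
struct buffer_head *bh = sb_getblk(sb, blocknr);

if (!bh)
        return -ENOMEM;
if (!bh_uptodate_or_lock(bh)) {
        /* locked and not up to date: issue the read */
        if (bh_submit_read(bh) < 0) {
                brelse(bh);
                return -EIO;
        }
}
/* bh is up to date here; brelse(bh) when finished with it */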
3321 | 3332 | ||
3322 | static void | 3333 | static void |
3323 | init_buffer_head(void *data) | 3334 | init_buffer_head(void *data) |
3324 | { | 3335 | { |
3325 | struct buffer_head *bh = data; | 3336 | struct buffer_head *bh = data; |
3326 | 3337 | ||
3327 | memset(bh, 0, sizeof(*bh)); | 3338 | memset(bh, 0, sizeof(*bh)); |
3328 | INIT_LIST_HEAD(&bh->b_assoc_buffers); | 3339 | INIT_LIST_HEAD(&bh->b_assoc_buffers); |
3329 | } | 3340 | } |
3330 | 3341 | ||
3331 | void __init buffer_init(void) | 3342 | void __init buffer_init(void) |
3332 | { | 3343 | { |
3333 | int nrpages; | 3344 | int nrpages; |
3334 | 3345 | ||
3335 | bh_cachep = kmem_cache_create("buffer_head", | 3346 | bh_cachep = kmem_cache_create("buffer_head", |
3336 | sizeof(struct buffer_head), 0, | 3347 | sizeof(struct buffer_head), 0, |
3337 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| | 3348 | (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| |
3338 | SLAB_MEM_SPREAD), | 3349 | SLAB_MEM_SPREAD), |
3339 | init_buffer_head); | 3350 | init_buffer_head); |
3340 | 3351 | ||
3341 | /* | 3352 | /* |
3342 | * Limit the bh occupancy to 10% of ZONE_NORMAL | 3353 | * Limit the bh occupancy to 10% of ZONE_NORMAL |
3343 | */ | 3354 | */ |
3344 | nrpages = (nr_free_buffer_pages() * 10) / 100; | 3355 | nrpages = (nr_free_buffer_pages() * 10) / 100; |
3345 | max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); | 3356 | max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); |
3346 | hotcpu_notifier(buffer_cpu_notify, 0); | 3357 | hotcpu_notifier(buffer_cpu_notify, 0); |
3347 | } | 3358 | } |
3348 | 3359 | ||
3349 | EXPORT_SYMBOL(__bforget); | 3360 | EXPORT_SYMBOL(__bforget); |
3350 | EXPORT_SYMBOL(__brelse); | 3361 | EXPORT_SYMBOL(__brelse); |
3351 | EXPORT_SYMBOL(__wait_on_buffer); | 3362 | EXPORT_SYMBOL(__wait_on_buffer); |
3352 | EXPORT_SYMBOL(block_commit_write); | 3363 | EXPORT_SYMBOL(block_commit_write); |
3353 | EXPORT_SYMBOL(block_prepare_write); | 3364 | EXPORT_SYMBOL(block_prepare_write); |
3354 | EXPORT_SYMBOL(block_page_mkwrite); | 3365 | EXPORT_SYMBOL(block_page_mkwrite); |
3355 | EXPORT_SYMBOL(block_read_full_page); | 3366 | EXPORT_SYMBOL(block_read_full_page); |
3356 | EXPORT_SYMBOL(block_sync_page); | 3367 | EXPORT_SYMBOL(block_sync_page); |
3357 | EXPORT_SYMBOL(block_truncate_page); | 3368 | EXPORT_SYMBOL(block_truncate_page); |
3358 | EXPORT_SYMBOL(block_write_full_page); | 3369 | EXPORT_SYMBOL(block_write_full_page); |
3359 | EXPORT_SYMBOL(cont_write_begin); | 3370 | EXPORT_SYMBOL(cont_write_begin); |
3360 | EXPORT_SYMBOL(end_buffer_read_sync); | 3371 | EXPORT_SYMBOL(end_buffer_read_sync); |
3361 | EXPORT_SYMBOL(end_buffer_write_sync); | 3372 | EXPORT_SYMBOL(end_buffer_write_sync); |
3362 | EXPORT_SYMBOL(file_fsync); | 3373 | EXPORT_SYMBOL(file_fsync); |
3363 | EXPORT_SYMBOL(fsync_bdev); | 3374 | EXPORT_SYMBOL(fsync_bdev); |
3364 | EXPORT_SYMBOL(generic_block_bmap); | 3375 | EXPORT_SYMBOL(generic_block_bmap); |
3365 | EXPORT_SYMBOL(generic_cont_expand_simple); | 3376 | EXPORT_SYMBOL(generic_cont_expand_simple); |
3366 | EXPORT_SYMBOL(init_buffer); | 3377 | EXPORT_SYMBOL(init_buffer); |
3367 | EXPORT_SYMBOL(invalidate_bdev); | 3378 | EXPORT_SYMBOL(invalidate_bdev); |
3368 | EXPORT_SYMBOL(ll_rw_block); | 3379 | EXPORT_SYMBOL(ll_rw_block); |
3369 | EXPORT_SYMBOL(mark_buffer_dirty); | 3380 | EXPORT_SYMBOL(mark_buffer_dirty); |
3370 | EXPORT_SYMBOL(submit_bh); | 3381 | EXPORT_SYMBOL(submit_bh); |
3371 | EXPORT_SYMBOL(sync_dirty_buffer); | 3382 | EXPORT_SYMBOL(sync_dirty_buffer); |
3372 | EXPORT_SYMBOL(unlock_buffer); | 3383 | EXPORT_SYMBOL(unlock_buffer); |
include/linux/bio.h
1 | /* | 1 | /* |
2 | * 2.5 block I/O model | 2 | * 2.5 block I/O model |
3 | * | 3 | * |
4 | * Copyright (C) 2001 Jens Axboe <axboe@suse.de> | 4 | * Copyright (C) 2001 Jens Axboe <axboe@suse.de> |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License version 2 as | 7 | * it under the terms of the GNU General Public License version 2 as |
8 | * published by the Free Software Foundation. | 8 | * published by the Free Software Foundation. |
9 | * | 9 | * |
10 | * This program is distributed in the hope that it will be useful, | 10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | 12 | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
15 | * | 15 | * |
16 | * You should have received a copy of the GNU General Public License | 16 | * You should have received a copy of the GNU General Public License |
17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- | 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- |
19 | */ | 19 | */ |
20 | #ifndef __LINUX_BIO_H | 20 | #ifndef __LINUX_BIO_H |
21 | #define __LINUX_BIO_H | 21 | #define __LINUX_BIO_H |
22 | 22 | ||
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/mempool.h> | 24 | #include <linux/mempool.h> |
25 | #include <linux/ioprio.h> | 25 | #include <linux/ioprio.h> |
26 | 26 | ||
27 | #ifdef CONFIG_BLOCK | 27 | #ifdef CONFIG_BLOCK |
28 | 28 | ||
29 | #include <asm/io.h> | 29 | #include <asm/io.h> |
30 | 30 | ||
31 | #define BIO_DEBUG | 31 | #define BIO_DEBUG |
32 | 32 | ||
33 | #ifdef BIO_DEBUG | 33 | #ifdef BIO_DEBUG |
34 | #define BIO_BUG_ON BUG_ON | 34 | #define BIO_BUG_ON BUG_ON |
35 | #else | 35 | #else |
36 | #define BIO_BUG_ON | 36 | #define BIO_BUG_ON |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | #define BIO_MAX_PAGES 256 | 39 | #define BIO_MAX_PAGES 256 |
40 | #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT) | 40 | #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT) |
41 | #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9) | 41 | #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9) |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * was unsigned short, but we might as well be ready for > 64kB I/O pages | 44 | * was unsigned short, but we might as well be ready for > 64kB I/O pages |
45 | */ | 45 | */ |
46 | struct bio_vec { | 46 | struct bio_vec { |
47 | struct page *bv_page; | 47 | struct page *bv_page; |
48 | unsigned int bv_len; | 48 | unsigned int bv_len; |
49 | unsigned int bv_offset; | 49 | unsigned int bv_offset; |
50 | }; | 50 | }; |
51 | 51 | ||
52 | struct bio_set; | 52 | struct bio_set; |
53 | struct bio; | 53 | struct bio; |
54 | struct bio_integrity_payload; | 54 | struct bio_integrity_payload; |
55 | typedef void (bio_end_io_t) (struct bio *, int); | 55 | typedef void (bio_end_io_t) (struct bio *, int); |
56 | typedef void (bio_destructor_t) (struct bio *); | 56 | typedef void (bio_destructor_t) (struct bio *); |
57 | 57 | ||
58 | /* | 58 | /* |
59 | * main unit of I/O for the block layer and lower layers (ie drivers and | 59 | * main unit of I/O for the block layer and lower layers (ie drivers and |
60 | * stacking drivers) | 60 | * stacking drivers) |
61 | */ | 61 | */ |
62 | struct bio { | 62 | struct bio { |
63 | sector_t bi_sector; /* device address in 512 byte | 63 | sector_t bi_sector; /* device address in 512 byte |
64 | sectors */ | 64 | sectors */ |
65 | struct bio *bi_next; /* request queue link */ | 65 | struct bio *bi_next; /* request queue link */ |
66 | struct block_device *bi_bdev; | 66 | struct block_device *bi_bdev; |
67 | unsigned long bi_flags; /* status, command, etc */ | 67 | unsigned long bi_flags; /* status, command, etc */ |
68 | unsigned long bi_rw; /* bottom bits READ/WRITE, | 68 | unsigned long bi_rw; /* bottom bits READ/WRITE, |
69 | * top bits priority | 69 | * top bits priority |
70 | */ | 70 | */ |
71 | 71 | ||
72 | unsigned short bi_vcnt; /* how many bio_vec's */ | 72 | unsigned short bi_vcnt; /* how many bio_vec's */ |
73 | unsigned short bi_idx; /* current index into bvl_vec */ | 73 | unsigned short bi_idx; /* current index into bvl_vec */ |
74 | 74 | ||
75 | /* Number of segments in this BIO after | 75 | /* Number of segments in this BIO after |
76 | * physical address coalescing is performed. | 76 | * physical address coalescing is performed. |
77 | */ | 77 | */ |
78 | unsigned int bi_phys_segments; | 78 | unsigned int bi_phys_segments; |
79 | 79 | ||
80 | unsigned int bi_size; /* residual I/O count */ | 80 | unsigned int bi_size; /* residual I/O count */ |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * To keep track of the max segment size, we account for the | 83 | * To keep track of the max segment size, we account for the |
84 | * sizes of the first and last mergeable segments in this bio. | 84 | * sizes of the first and last mergeable segments in this bio. |
85 | */ | 85 | */ |
86 | unsigned int bi_seg_front_size; | 86 | unsigned int bi_seg_front_size; |
87 | unsigned int bi_seg_back_size; | 87 | unsigned int bi_seg_back_size; |
88 | 88 | ||
89 | unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ | 89 | unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ |
90 | 90 | ||
91 | unsigned int bi_comp_cpu; /* completion CPU */ | 91 | unsigned int bi_comp_cpu; /* completion CPU */ |
92 | 92 | ||
93 | struct bio_vec *bi_io_vec; /* the actual vec list */ | 93 | struct bio_vec *bi_io_vec; /* the actual vec list */ |
94 | 94 | ||
95 | bio_end_io_t *bi_end_io; | 95 | bio_end_io_t *bi_end_io; |
96 | atomic_t bi_cnt; /* pin count */ | 96 | atomic_t bi_cnt; /* pin count */ |
97 | 97 | ||
98 | void *bi_private; | 98 | void *bi_private; |
99 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 99 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
100 | struct bio_integrity_payload *bi_integrity; /* data integrity */ | 100 | struct bio_integrity_payload *bi_integrity; /* data integrity */ |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | bio_destructor_t *bi_destructor; /* destructor */ | 103 | bio_destructor_t *bi_destructor; /* destructor */ |
104 | }; | 104 | }; |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * bio flags | 107 | * bio flags |
108 | */ | 108 | */ |
109 | #define BIO_UPTODATE 0 /* ok after I/O completion */ | 109 | #define BIO_UPTODATE 0 /* ok after I/O completion */ |
110 | #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ | 110 | #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ |
111 | #define BIO_EOF 2 /* out-of-bounds error */ | 111 | #define BIO_EOF 2 /* out-of-bounds error */ |
112 | #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ | 112 | #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ |
113 | #define BIO_CLONED 4 /* doesn't own data */ | 113 | #define BIO_CLONED 4 /* doesn't own data */ |
114 | #define BIO_BOUNCED 5 /* bio is a bounce bio */ | 114 | #define BIO_BOUNCED 5 /* bio is a bounce bio */ |
115 | #define BIO_USER_MAPPED 6 /* contains user pages */ | 115 | #define BIO_USER_MAPPED 6 /* contains user pages */ |
116 | #define BIO_EOPNOTSUPP 7 /* not supported */ | 116 | #define BIO_EOPNOTSUPP 7 /* not supported */ |
117 | #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ | 117 | #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ |
118 | #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ | 118 | #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ |
119 | #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ | 119 | #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ |
120 | #define BIO_QUIET 11 /* Make BIO Quiet */ | ||
120 | #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) | 121 | #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) |
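BIO_QUIET is the bio-level carrier for the SCSI REQ_QUIET hint. The request-to-bio hand-off itself happens in the block core when a request is completed; the lines below are only a sketch of that idea, and the exact placement inside blk-core.c is not shown in this part of the diff:

/* Sketch: copy the request's quiet hint onto each bio at completion time.
 * rq is a struct request; the placement of this hook is an assumption. */
if (unlikely(rq->cmd_flags & REQ_QUIET))
        set_bit(BIO_QUIET, &bio->bi_flags);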
121 | 122 | ||
122 | /* | 123 | /* |
123 | * top 4 bits of bio flags indicate the pool this bio came from | 124 | * top 4 bits of bio flags indicate the pool this bio came from |
124 | */ | 125 | */ |
125 | #define BIO_POOL_BITS (4) | 126 | #define BIO_POOL_BITS (4) |
126 | #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) | 127 | #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) |
127 | #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) | 128 | #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) |
128 | #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) | 129 | #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) |
129 | 130 | ||
130 | /* | 131 | /* |
131 | * bio bi_rw flags | 132 | * bio bi_rw flags |
132 | * | 133 | * |
133 | * bit 0 -- data direction | 134 | * bit 0 -- data direction |
134 | * If not set, bio is a read from device. If set, it's a write to device. | 135 | * If not set, bio is a read from device. If set, it's a write to device. |
135 | * bit 1 -- rw-ahead when set | 136 | * bit 1 -- rw-ahead when set |
136 | * bit 2 -- barrier | 137 | * bit 2 -- barrier |
137 | * Insert a serialization point in the IO queue, forcing previously | 138 | * Insert a serialization point in the IO queue, forcing previously |
138 | * submitted IO to be completed before this one is issued. | 139 | * submitted IO to be completed before this one is issued. |
139 | * bit 3 -- synchronous I/O hint: the block layer will unplug immediately | 140 | * bit 3 -- synchronous I/O hint: the block layer will unplug immediately |
140 | * Note that this does NOT indicate that the IO itself is sync, just | 141 | * Note that this does NOT indicate that the IO itself is sync, just |
141 | * that the block layer will not postpone issue of this IO by plugging. | 142 | * that the block layer will not postpone issue of this IO by plugging. |
142 | * bit 4 -- metadata request | 143 | * bit 4 -- metadata request |
143 | * Used for tracing to differentiate metadata and data IO. May also | 144 | * Used for tracing to differentiate metadata and data IO. May also |
144 | * get some preferential treatment in the IO scheduler | 145 | * get some preferential treatment in the IO scheduler |
145 | * bit 5 -- discard sectors | 146 | * bit 5 -- discard sectors |
146 | * Informs the lower level device that this range of sectors is no longer | 147 | * Informs the lower level device that this range of sectors is no longer |
147 | * used by the file system and may thus be freed by the device. Used | 148 | * used by the file system and may thus be freed by the device. Used |
148 | * for flash based storage. | 149 | * for flash based storage. |
149 | * bit 6 -- fail fast device errors | 150 | * bit 6 -- fail fast device errors |
150 | * bit 7 -- fail fast transport errors | 151 | * bit 7 -- fail fast transport errors |
151 | * bit 8 -- fail fast driver errors | 152 | * bit 8 -- fail fast driver errors |
152 | * Don't want driver retries for any fast fail whatever the reason. | 153 | * Don't want driver retries for any fast fail whatever the reason. |
153 | */ | 154 | */ |
154 | #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ | 155 | #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ |
155 | #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ | 156 | #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ |
156 | #define BIO_RW_BARRIER 2 | 157 | #define BIO_RW_BARRIER 2 |
157 | #define BIO_RW_SYNC 3 | 158 | #define BIO_RW_SYNC 3 |
158 | #define BIO_RW_META 4 | 159 | #define BIO_RW_META 4 |
159 | #define BIO_RW_DISCARD 5 | 160 | #define BIO_RW_DISCARD 5 |
160 | #define BIO_RW_FAILFAST_DEV 6 | 161 | #define BIO_RW_FAILFAST_DEV 6 |
161 | #define BIO_RW_FAILFAST_TRANSPORT 7 | 162 | #define BIO_RW_FAILFAST_TRANSPORT 7 |
162 | #define BIO_RW_FAILFAST_DRIVER 8 | 163 | #define BIO_RW_FAILFAST_DRIVER 8 |
163 | 164 | ||
164 | /* | 165 | /* |
165 | * upper 16 bits of bi_rw define the io priority of this bio | 166 | * upper 16 bits of bi_rw define the io priority of this bio |
166 | */ | 167 | */ |
167 | #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) | 168 | #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) |
168 | #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) | 169 | #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) |
169 | #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) | 170 | #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) |
170 | 171 | ||
171 | #define bio_set_prio(bio, prio) do { \ | 172 | #define bio_set_prio(bio, prio) do { \ |
172 | WARN_ON(prio >= (1 << IOPRIO_BITS)); \ | 173 | WARN_ON(prio >= (1 << IOPRIO_BITS)); \ |
173 | (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ | 174 | (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ |
174 | (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ | 175 | (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ |
175 | } while (0) | 176 | } while (0) |
176 | 177 | ||
177 | /* | 178 | /* |
178 | * various member access, note that bio_data should of course not be used | 179 | * various member access, note that bio_data should of course not be used |
179 | * on highmem page vectors | 180 | * on highmem page vectors |
180 | */ | 181 | */ |
181 | #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) | 182 | #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) |
182 | #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) | 183 | #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) |
183 | #define bio_page(bio) bio_iovec((bio))->bv_page | 184 | #define bio_page(bio) bio_iovec((bio))->bv_page |
184 | #define bio_offset(bio) bio_iovec((bio))->bv_offset | 185 | #define bio_offset(bio) bio_iovec((bio))->bv_offset |
185 | #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) | 186 | #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) |
186 | #define bio_sectors(bio) ((bio)->bi_size >> 9) | 187 | #define bio_sectors(bio) ((bio)->bi_size >> 9) |
187 | #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) | 188 | #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) |
188 | #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) | 189 | #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) |
189 | #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) | 190 | #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) |
190 | #define bio_failfast_transport(bio) \ | 191 | #define bio_failfast_transport(bio) \ |
191 | ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) | 192 | ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) |
192 | #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER)) | 193 | #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER)) |
193 | #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) | 194 | #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) |
194 | #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) | 195 | #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) |
195 | #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) | 196 | #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) |
196 | #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) | 197 | #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) |
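These accessor macros are the intended way to test the bi_rw bits documented above; callers should not open-code the shifts. A small, purely hypothetical helper as illustration:

/* Sketch: classifying an incoming bio with the accessor macros. */
static void example_classify_bio(struct bio *bio)
{
        if (bio_barrier(bio))
                printk(KERN_DEBUG "barrier: order against previously queued I/O\n");
        if (bio_sync(bio))
                printk(KERN_DEBUG "sync hint: issue without plugging delay\n");
        if (bio_discard(bio))
                printk(KERN_DEBUG "discard: %u sectors no longer in use\n",
                       bio_sectors(bio));
}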
197 | 198 | ||
198 | static inline unsigned int bio_cur_sectors(struct bio *bio) | 199 | static inline unsigned int bio_cur_sectors(struct bio *bio) |
199 | { | 200 | { |
200 | if (bio->bi_vcnt) | 201 | if (bio->bi_vcnt) |
201 | return bio_iovec(bio)->bv_len >> 9; | 202 | return bio_iovec(bio)->bv_len >> 9; |
202 | else /* dataless requests such as discard */ | 203 | else /* dataless requests such as discard */ |
203 | return bio->bi_size >> 9; | 204 | return bio->bi_size >> 9; |
204 | } | 205 | } |
205 | 206 | ||
206 | static inline void *bio_data(struct bio *bio) | 207 | static inline void *bio_data(struct bio *bio) |
207 | { | 208 | { |
208 | if (bio->bi_vcnt) | 209 | if (bio->bi_vcnt) |
209 | return page_address(bio_page(bio)) + bio_offset(bio); | 210 | return page_address(bio_page(bio)) + bio_offset(bio); |
210 | 211 | ||
211 | return NULL; | 212 | return NULL; |
212 | } | 213 | } |
213 | 214 | ||
214 | /* | 215 | /* |
215 | * will die | 216 | * will die |
216 | */ | 217 | */ |
217 | #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) | 218 | #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) |
218 | #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) | 219 | #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) |
219 | 220 | ||
220 | /* | 221 | /* |
221 | * queues that have highmem support enabled may still need to revert to | 222 | * queues that have highmem support enabled may still need to revert to |
222 | * PIO transfers occasionally and thus map high pages temporarily. For | 223 | * PIO transfers occasionally and thus map high pages temporarily. For |
223 | * permanent PIO fall back, user is probably better off disabling highmem | 224 | * permanent PIO fall back, user is probably better off disabling highmem |
224 | * I/O completely on that queue (see ide-dma for example) | 225 | * I/O completely on that queue (see ide-dma for example) |
225 | */ | 226 | */ |
226 | #define __bio_kmap_atomic(bio, idx, kmtype) \ | 227 | #define __bio_kmap_atomic(bio, idx, kmtype) \ |
227 | (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \ | 228 | (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \ |
228 | bio_iovec_idx((bio), (idx))->bv_offset) | 229 | bio_iovec_idx((bio), (idx))->bv_offset) |
229 | 230 | ||
230 | #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype) | 231 | #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype) |
231 | 232 | ||
232 | /* | 233 | /* |
233 | * merge helpers etc | 234 | * merge helpers etc |
234 | */ | 235 | */ |
235 | 236 | ||
236 | #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) | 237 | #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) |
237 | #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) | 238 | #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) |
238 | 239 | ||
239 | /* Default implementation of BIOVEC_PHYS_MERGEABLE */ | 240 | /* Default implementation of BIOVEC_PHYS_MERGEABLE */ |
240 | #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ | 241 | #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ |
241 | ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) | 242 | ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) |
242 | 243 | ||
243 | /* | 244 | /* |
244 | * allow arch override, for eg virtualized architectures (put in asm/io.h) | 245 | * allow arch override, for eg virtualized architectures (put in asm/io.h) |
245 | */ | 246 | */ |
246 | #ifndef BIOVEC_PHYS_MERGEABLE | 247 | #ifndef BIOVEC_PHYS_MERGEABLE |
247 | #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ | 248 | #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ |
248 | __BIOVEC_PHYS_MERGEABLE(vec1, vec2) | 249 | __BIOVEC_PHYS_MERGEABLE(vec1, vec2) |
249 | #endif | 250 | #endif |
250 | 251 | ||
251 | #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ | 252 | #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ |
252 | (((addr1) | (mask)) == (((addr2) - 1) | (mask))) | 253 | (((addr1) | (mask)) == (((addr2) - 1) | (mask))) |
253 | #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ | 254 | #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ |
254 | __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) | 255 | __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) |
255 | #define BIO_SEG_BOUNDARY(q, b1, b2) \ | 256 | #define BIO_SEG_BOUNDARY(q, b1, b2) \ |
256 | BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) | 257 | BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) |
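These merge macros are the building blocks used when the block layer counts physical segments. A minimal sketch under assumptions (count_phys_segments is a hypothetical name, and this is a simplification rather than the actual blk-merge code): a new segment is started unless a bio_vec is physically contiguous with the previous one and the combined range stays inside the queue's segment boundary mask.

static int count_phys_segments(struct request_queue *q, struct bio *bio)
{
	struct bio_vec *bv, *prev = NULL;
	int i, nr_segs = 0;

	bio_for_each_segment(bv, bio, i) {
		/* start a new segment unless this bvec merges with the
		 * previous one physically and within the boundary mask */
		if (!prev || !BIOVEC_PHYS_MERGEABLE(prev, bv) ||
		    !BIOVEC_SEG_BOUNDARY(q, prev, bv))
			nr_segs++;
		prev = bv;
	}
	return nr_segs;
}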
257 | 258 | ||
258 | #define bio_io_error(bio) bio_endio((bio), -EIO) | 259 | #define bio_io_error(bio) bio_endio((bio), -EIO) |
259 | 260 | ||
260 | /* | 261 | /* |
261 | * drivers should not use the __ version unless they _really_ want to | 262 | * drivers should not use the __ version unless they _really_ want to |
262 | * run through the entire bio and not just pending pieces | 263 | * run through the entire bio and not just pending pieces |
263 | */ | 264 | */ |
264 | #define __bio_for_each_segment(bvl, bio, i, start_idx) \ | 265 | #define __bio_for_each_segment(bvl, bio, i, start_idx) \ |
265 | for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ | 266 | for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ |
266 | i < (bio)->bi_vcnt; \ | 267 | i < (bio)->bi_vcnt; \ |
267 | bvl++, i++) | 268 | bvl++, i++) |
268 | 269 | ||
269 | #define bio_for_each_segment(bvl, bio, i) \ | 270 | #define bio_for_each_segment(bvl, bio, i) \ |
270 | __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) | 271 | __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) |
271 | 272 | ||
272 | /* | 273 | /* |
273 | * get a reference to a bio, so it won't disappear. the intended use is | 274 | * get a reference to a bio, so it won't disappear. the intended use is |
274 | * something like: | 275 | * something like: |
275 | * | 276 | * |
276 | * bio_get(bio); | 277 | * bio_get(bio); |
277 | * submit_bio(rw, bio); | 278 | * submit_bio(rw, bio); |
278 | * if (bio->bi_flags ...) | 279 | * if (bio->bi_flags ...) |
279 | * do_something | 280 | * do_something |
280 | * bio_put(bio); | 281 | * bio_put(bio); |
281 | * | 282 | * |
282 | * without the bio_get(), it could potentially complete I/O before submit_bio | 283 | * without the bio_get(), it could potentially complete I/O before submit_bio |
283 | * returns, and then the bio could already be freed memory by the time the if (bio->bi_flags ...) | 284 | * returns, and then the bio could already be freed memory by the time the if (bio->bi_flags ...) |

284 | * runs | 285 | * runs |
285 | */ | 286 | */ |
286 | #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) | 287 | #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) |
287 | 288 | ||
288 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 289 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
289 | /* | 290 | /* |
290 | * bio integrity payload | 291 | * bio integrity payload |
291 | */ | 292 | */ |
292 | struct bio_integrity_payload { | 293 | struct bio_integrity_payload { |
293 | struct bio *bip_bio; /* parent bio */ | 294 | struct bio *bip_bio; /* parent bio */ |
294 | struct bio_vec *bip_vec; /* integrity data vector */ | 295 | struct bio_vec *bip_vec; /* integrity data vector */ |
295 | 296 | ||
296 | sector_t bip_sector; /* virtual start sector */ | 297 | sector_t bip_sector; /* virtual start sector */ |
297 | 298 | ||
298 | void *bip_buf; /* generated integrity data */ | 299 | void *bip_buf; /* generated integrity data */ |
299 | bio_end_io_t *bip_end_io; /* saved I/O completion fn */ | 300 | bio_end_io_t *bip_end_io; /* saved I/O completion fn */ |
300 | 301 | ||
301 | int bip_error; /* saved I/O error */ | 302 | int bip_error; /* saved I/O error */ |
302 | unsigned int bip_size; | 303 | unsigned int bip_size; |
303 | 304 | ||
304 | unsigned short bip_pool; /* pool the ivec came from */ | 305 | unsigned short bip_pool; /* pool the ivec came from */ |
305 | unsigned short bip_vcnt; /* # of integrity bio_vecs */ | 306 | unsigned short bip_vcnt; /* # of integrity bio_vecs */ |
306 | unsigned short bip_idx; /* current bip_vec index */ | 307 | unsigned short bip_idx; /* current bip_vec index */ |
307 | 308 | ||
308 | struct work_struct bip_work; /* I/O completion */ | 309 | struct work_struct bip_work; /* I/O completion */ |
309 | }; | 310 | }; |
310 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 311 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
311 | 312 | ||
312 | /* | 313 | /* |
313 | * A bio_pair is used when we need to split a bio. | 314 | * A bio_pair is used when we need to split a bio. |
314 | * This can only happen for a bio that refers to just one | 315 | * This can only happen for a bio that refers to just one |
315 | * page of data, and in the unusual situation when the | 316 | * page of data, and in the unusual situation when the |
316 | * page crosses a chunk/device boundary | 317 | * page crosses a chunk/device boundary |
317 | * | 318 | * |
318 | * The address of the master bio is stored in bio1.bi_private | 319 | * The address of the master bio is stored in bio1.bi_private |
319 | * The address of the pool the pair was allocated from is stored | 320 | * The address of the pool the pair was allocated from is stored |
320 | * in bio2.bi_private | 321 | * in bio2.bi_private |
321 | */ | 322 | */ |
322 | struct bio_pair { | 323 | struct bio_pair { |
323 | struct bio bio1, bio2; | 324 | struct bio bio1, bio2; |
324 | struct bio_vec bv1, bv2; | 325 | struct bio_vec bv1, bv2; |
325 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 326 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
326 | struct bio_integrity_payload bip1, bip2; | 327 | struct bio_integrity_payload bip1, bip2; |
327 | struct bio_vec iv1, iv2; | 328 | struct bio_vec iv1, iv2; |
328 | #endif | 329 | #endif |
329 | atomic_t cnt; | 330 | atomic_t cnt; |
330 | int error; | 331 | int error; |
331 | }; | 332 | }; |
332 | extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); | 333 | extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); |
333 | extern void bio_pair_release(struct bio_pair *dbio); | 334 | extern void bio_pair_release(struct bio_pair *dbio); |
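A hedged sketch of the intended caller pattern for the pair API (the name submit_split and the first_sectors argument are placeholders; generic_make_request() comes from blkdev.h, and real callers such as raid-style drivers also deal with error paths):

static void submit_split(struct bio *bio, int first_sectors)
{
	struct bio_pair *bp;

	/* first 'first_sectors' sectors go to bp->bio1, the remainder to bp->bio2 */
	bp = bio_split(bio, first_sectors);
	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);	/* drop the submitter's reference on the pair */
}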
334 | 335 | ||
335 | extern struct bio_set *bioset_create(int, int); | 336 | extern struct bio_set *bioset_create(int, int); |
336 | extern void bioset_free(struct bio_set *); | 337 | extern void bioset_free(struct bio_set *); |
337 | 338 | ||
338 | extern struct bio *bio_alloc(gfp_t, int); | 339 | extern struct bio *bio_alloc(gfp_t, int); |
339 | extern struct bio *bio_kmalloc(gfp_t, int); | 340 | extern struct bio *bio_kmalloc(gfp_t, int); |
340 | extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); | 341 | extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); |
341 | extern void bio_put(struct bio *); | 342 | extern void bio_put(struct bio *); |
342 | extern void bio_free(struct bio *, struct bio_set *); | 343 | extern void bio_free(struct bio *, struct bio_set *); |
343 | 344 | ||
344 | extern void bio_endio(struct bio *, int); | 345 | extern void bio_endio(struct bio *, int); |
345 | struct request_queue; | 346 | struct request_queue; |
346 | extern int bio_phys_segments(struct request_queue *, struct bio *); | 347 | extern int bio_phys_segments(struct request_queue *, struct bio *); |
347 | 348 | ||
348 | extern void __bio_clone(struct bio *, struct bio *); | 349 | extern void __bio_clone(struct bio *, struct bio *); |
349 | extern struct bio *bio_clone(struct bio *, gfp_t); | 350 | extern struct bio *bio_clone(struct bio *, gfp_t); |
350 | 351 | ||
351 | extern void bio_init(struct bio *); | 352 | extern void bio_init(struct bio *); |
352 | 353 | ||
353 | extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); | 354 | extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); |
354 | extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, | 355 | extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, |
355 | unsigned int, unsigned int); | 356 | unsigned int, unsigned int); |
356 | extern int bio_get_nr_vecs(struct block_device *); | 357 | extern int bio_get_nr_vecs(struct block_device *); |
357 | extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int); | 358 | extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int); |
358 | extern struct bio *bio_map_user(struct request_queue *, struct block_device *, | 359 | extern struct bio *bio_map_user(struct request_queue *, struct block_device *, |
359 | unsigned long, unsigned int, int, gfp_t); | 360 | unsigned long, unsigned int, int, gfp_t); |
360 | struct sg_iovec; | 361 | struct sg_iovec; |
361 | struct rq_map_data; | 362 | struct rq_map_data; |
362 | extern struct bio *bio_map_user_iov(struct request_queue *, | 363 | extern struct bio *bio_map_user_iov(struct request_queue *, |
363 | struct block_device *, | 364 | struct block_device *, |
364 | struct sg_iovec *, int, int, gfp_t); | 365 | struct sg_iovec *, int, int, gfp_t); |
365 | extern void bio_unmap_user(struct bio *); | 366 | extern void bio_unmap_user(struct bio *); |
366 | extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, | 367 | extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, |
367 | gfp_t); | 368 | gfp_t); |
368 | extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, | 369 | extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, |
369 | gfp_t, int); | 370 | gfp_t, int); |
370 | extern void bio_set_pages_dirty(struct bio *bio); | 371 | extern void bio_set_pages_dirty(struct bio *bio); |
371 | extern void bio_check_pages_dirty(struct bio *bio); | 372 | extern void bio_check_pages_dirty(struct bio *bio); |
372 | extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, | 373 | extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, |
373 | unsigned long, unsigned int, int, gfp_t); | 374 | unsigned long, unsigned int, int, gfp_t); |
374 | extern struct bio *bio_copy_user_iov(struct request_queue *, | 375 | extern struct bio *bio_copy_user_iov(struct request_queue *, |
375 | struct rq_map_data *, struct sg_iovec *, | 376 | struct rq_map_data *, struct sg_iovec *, |
376 | int, int, gfp_t); | 377 | int, int, gfp_t); |
377 | extern int bio_uncopy_user(struct bio *); | 378 | extern int bio_uncopy_user(struct bio *); |
378 | void zero_fill_bio(struct bio *bio); | 379 | void zero_fill_bio(struct bio *bio); |
379 | extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); | 380 | extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); |
380 | extern unsigned int bvec_nr_vecs(unsigned short idx); | 381 | extern unsigned int bvec_nr_vecs(unsigned short idx); |
381 | 382 | ||
382 | /* | 383 | /* |
383 | * Allow queuer to specify a completion CPU for this bio | 384 | * Allow queuer to specify a completion CPU for this bio |
384 | */ | 385 | */ |
385 | static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) | 386 | static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) |
386 | { | 387 | { |
387 | bio->bi_comp_cpu = cpu; | 388 | bio->bi_comp_cpu = cpu; |
388 | } | 389 | } |
389 | 390 | ||
390 | /* | 391 | /* |
391 | * bio_set is used to allow other portions of the IO system to | 392 | * bio_set is used to allow other portions of the IO system to |
392 | * allocate their own private memory pools for bio and iovec structures. | 393 | * allocate their own private memory pools for bio and iovec structures. |
393 | * These memory pools in turn all allocate from the bio_slab | 394 | * These memory pools in turn all allocate from the bio_slab |
394 | * and the bvec_slabs[]. | 395 | * and the bvec_slabs[]. |
395 | */ | 396 | */ |
396 | #define BIO_POOL_SIZE 2 | 397 | #define BIO_POOL_SIZE 2 |
397 | #define BIOVEC_NR_POOLS 6 | 398 | #define BIOVEC_NR_POOLS 6 |
398 | 399 | ||
399 | struct bio_set { | 400 | struct bio_set { |
400 | mempool_t *bio_pool; | 401 | mempool_t *bio_pool; |
401 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 402 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
402 | mempool_t *bio_integrity_pool; | 403 | mempool_t *bio_integrity_pool; |
403 | #endif | 404 | #endif |
404 | mempool_t *bvec_pools[BIOVEC_NR_POOLS]; | 405 | mempool_t *bvec_pools[BIOVEC_NR_POOLS]; |
405 | }; | 406 | }; |
406 | 407 | ||
407 | struct biovec_slab { | 408 | struct biovec_slab { |
408 | int nr_vecs; | 409 | int nr_vecs; |
409 | char *name; | 410 | char *name; |
410 | struct kmem_cache *slab; | 411 | struct kmem_cache *slab; |
411 | }; | 412 | }; |
412 | 413 | ||
413 | extern struct bio_set *fs_bio_set; | 414 | extern struct bio_set *fs_bio_set; |
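bioset_create()/bio_alloc_bioset() exist so a driver can reserve its own mempools instead of competing for the shared fs_bio_set. A rough sketch, assuming the two bioset_create() arguments are the bio and bvec pool sizes (the values and my_* names below are illustrative only):

static struct bio_set *my_bio_set;

static int my_pools_init(void)
{
	my_bio_set = bioset_create(BIO_POOL_SIZE, 2);
	return my_bio_set ? 0 : -ENOMEM;
}

static struct bio *my_bio_alloc(int nr_vecs)
{
	/* GFP_NOIO plus a private pool keeps the I/O path from waiting on
	 * memory reclaim that itself needs to issue I/O */
	return bio_alloc_bioset(GFP_NOIO, nr_vecs, my_bio_set);
}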
414 | 415 | ||
415 | /* | 416 | /* |
416 | * a small number of entries is fine, not going to be performance critical. | 417 | * a small number of entries is fine, not going to be performance critical. |
417 | * basically we just need to survive | 418 | * basically we just need to survive |
418 | */ | 419 | */ |
419 | #define BIO_SPLIT_ENTRIES 2 | 420 | #define BIO_SPLIT_ENTRIES 2 |
420 | 421 | ||
421 | #ifdef CONFIG_HIGHMEM | 422 | #ifdef CONFIG_HIGHMEM |
422 | /* | 423 | /* |
423 | * remember to add offset! and never ever reenable interrupts between a | 424 | * remember to add offset! and never ever reenable interrupts between a |
424 | * bvec_kmap_irq and bvec_kunmap_irq!! | 425 | * bvec_kmap_irq and bvec_kunmap_irq!! |
425 | * | 426 | * |
426 | * This function MUST be inlined - it plays with the CPU interrupt flags. | 427 | * This function MUST be inlined - it plays with the CPU interrupt flags. |
427 | */ | 428 | */ |
428 | static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) | 429 | static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) |
429 | { | 430 | { |
430 | unsigned long addr; | 431 | unsigned long addr; |
431 | 432 | ||
432 | /* | 433 | /* |
433 | * might not be a highmem page, but the preempt/irq count | 434 | * might not be a highmem page, but the preempt/irq count |
434 | * balancing is a lot nicer this way | 435 | * balancing is a lot nicer this way |
435 | */ | 436 | */ |
436 | local_irq_save(*flags); | 437 | local_irq_save(*flags); |
437 | addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); | 438 | addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); |
438 | 439 | ||
439 | BUG_ON(addr & ~PAGE_MASK); | 440 | BUG_ON(addr & ~PAGE_MASK); |
440 | 441 | ||
441 | return (char *) addr + bvec->bv_offset; | 442 | return (char *) addr + bvec->bv_offset; |
442 | } | 443 | } |
443 | 444 | ||
444 | static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) | 445 | static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) |
445 | { | 446 | { |
446 | unsigned long ptr = (unsigned long) buffer & PAGE_MASK; | 447 | unsigned long ptr = (unsigned long) buffer & PAGE_MASK; |
447 | 448 | ||
448 | kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); | 449 | kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); |
449 | local_irq_restore(*flags); | 450 | local_irq_restore(*flags); |
450 | } | 451 | } |
451 | 452 | ||
452 | #else | 453 | #else |
453 | #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) | 454 | #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) |
454 | #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) | 455 | #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) |
455 | #endif | 456 | #endif |
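The irq-disabled kmap pair above is meant for short, bounded loops over a bio's segments. A sketch in the spirit of zero_fill_bio() (declared above); the function name here is hypothetical:

static void my_zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);	/* bv_offset already added */

		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);		/* irqs stay off in between */
	}
}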
456 | 457 | ||
457 | static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, | 458 | static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, |
458 | unsigned long *flags) | 459 | unsigned long *flags) |
459 | { | 460 | { |
460 | return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags); | 461 | return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags); |
461 | } | 462 | } |
462 | #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) | 463 | #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) |
463 | 464 | ||
464 | #define bio_kmap_irq(bio, flags) \ | 465 | #define bio_kmap_irq(bio, flags) \ |
465 | __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) | 466 | __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) |
466 | #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) | 467 | #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) |
467 | 468 | ||
468 | /* | 469 | /* |
469 | * Check whether this bio carries any data or not. A NULL bio is allowed. | 470 | * Check whether this bio carries any data or not. A NULL bio is allowed. |
470 | */ | 471 | */ |
471 | static inline int bio_has_data(struct bio *bio) | 472 | static inline int bio_has_data(struct bio *bio) |
472 | { | 473 | { |
473 | return bio && bio->bi_io_vec != NULL; | 474 | return bio && bio->bi_io_vec != NULL; |
474 | } | 475 | } |
475 | 476 | ||
476 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 477 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
477 | 478 | ||
478 | #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) | 479 | #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) |
479 | #define bip_vec(bip) bip_vec_idx(bip, 0) | 480 | #define bip_vec(bip) bip_vec_idx(bip, 0) |
480 | 481 | ||
481 | #define __bip_for_each_vec(bvl, bip, i, start_idx) \ | 482 | #define __bip_for_each_vec(bvl, bip, i, start_idx) \ |
482 | for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ | 483 | for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ |
483 | i < (bip)->bip_vcnt; \ | 484 | i < (bip)->bip_vcnt; \ |
484 | bvl++, i++) | 485 | bvl++, i++) |
485 | 486 | ||
486 | #define bip_for_each_vec(bvl, bip, i) \ | 487 | #define bip_for_each_vec(bvl, bip, i) \ |
487 | __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) | 488 | __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) |
488 | 489 | ||
489 | #define bio_integrity(bio) (bio->bi_integrity != NULL) | 490 | #define bio_integrity(bio) (bio->bi_integrity != NULL) |
490 | 491 | ||
491 | extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); | 492 | extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); |
492 | extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); | 493 | extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); |
493 | extern void bio_integrity_free(struct bio *, struct bio_set *); | 494 | extern void bio_integrity_free(struct bio *, struct bio_set *); |
494 | extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); | 495 | extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); |
495 | extern int bio_integrity_enabled(struct bio *bio); | 496 | extern int bio_integrity_enabled(struct bio *bio); |
496 | extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); | 497 | extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); |
497 | extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); | 498 | extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); |
498 | extern int bio_integrity_prep(struct bio *); | 499 | extern int bio_integrity_prep(struct bio *); |
499 | extern void bio_integrity_endio(struct bio *, int); | 500 | extern void bio_integrity_endio(struct bio *, int); |
500 | extern void bio_integrity_advance(struct bio *, unsigned int); | 501 | extern void bio_integrity_advance(struct bio *, unsigned int); |
501 | extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); | 502 | extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); |
502 | extern void bio_integrity_split(struct bio *, struct bio_pair *, int); | 503 | extern void bio_integrity_split(struct bio *, struct bio_pair *, int); |
503 | extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); | 504 | extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); |
504 | extern int bioset_integrity_create(struct bio_set *, int); | 505 | extern int bioset_integrity_create(struct bio_set *, int); |
505 | extern void bioset_integrity_free(struct bio_set *); | 506 | extern void bioset_integrity_free(struct bio_set *); |
506 | extern void bio_integrity_init_slab(void); | 507 | extern void bio_integrity_init_slab(void); |
507 | 508 | ||
508 | #else /* CONFIG_BLK_DEV_INTEGRITY */ | 509 | #else /* CONFIG_BLK_DEV_INTEGRITY */ |
509 | 510 | ||
510 | #define bio_integrity(a) (0) | 511 | #define bio_integrity(a) (0) |
511 | #define bioset_integrity_create(a, b) (0) | 512 | #define bioset_integrity_create(a, b) (0) |
512 | #define bio_integrity_prep(a) (0) | 513 | #define bio_integrity_prep(a) (0) |
513 | #define bio_integrity_enabled(a) (0) | 514 | #define bio_integrity_enabled(a) (0) |
514 | #define bio_integrity_clone(a, b, c) (0) | 515 | #define bio_integrity_clone(a, b, c) (0) |
515 | #define bioset_integrity_free(a) do { } while (0) | 516 | #define bioset_integrity_free(a) do { } while (0) |
516 | #define bio_integrity_free(a, b) do { } while (0) | 517 | #define bio_integrity_free(a, b) do { } while (0) |
517 | #define bio_integrity_endio(a, b) do { } while (0) | 518 | #define bio_integrity_endio(a, b) do { } while (0) |
518 | #define bio_integrity_advance(a, b) do { } while (0) | 519 | #define bio_integrity_advance(a, b) do { } while (0) |
519 | #define bio_integrity_trim(a, b, c) do { } while (0) | 520 | #define bio_integrity_trim(a, b, c) do { } while (0) |
520 | #define bio_integrity_split(a, b, c) do { } while (0) | 521 | #define bio_integrity_split(a, b, c) do { } while (0) |
521 | #define bio_integrity_set_tag(a, b, c) do { } while (0) | 522 | #define bio_integrity_set_tag(a, b, c) do { } while (0) |
522 | #define bio_integrity_get_tag(a, b, c) do { } while (0) | 523 | #define bio_integrity_get_tag(a, b, c) do { } while (0) |
523 | #define bio_integrity_init_slab(a) do { } while (0) | 524 | #define bio_integrity_init_slab(a) do { } while (0) |
524 | 525 | ||
525 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 526 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
526 | 527 | ||
527 | #endif /* CONFIG_BLOCK */ | 528 | #endif /* CONFIG_BLOCK */ |
528 | #endif /* __LINUX_BIO_H */ | 529 | #endif /* __LINUX_BIO_H */ |
529 | 530 |
include/linux/buffer_head.h
1 | /* | 1 | /* |
2 | * include/linux/buffer_head.h | 2 | * include/linux/buffer_head.h |
3 | * | 3 | * |
4 | * Everything to do with buffer_heads. | 4 | * Everything to do with buffer_heads. |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #ifndef _LINUX_BUFFER_HEAD_H | 7 | #ifndef _LINUX_BUFFER_HEAD_H |
8 | #define _LINUX_BUFFER_HEAD_H | 8 | #define _LINUX_BUFFER_HEAD_H |
9 | 9 | ||
10 | #include <linux/types.h> | 10 | #include <linux/types.h> |
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/linkage.h> | 12 | #include <linux/linkage.h> |
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/wait.h> | 14 | #include <linux/wait.h> |
15 | #include <asm/atomic.h> | 15 | #include <asm/atomic.h> |
16 | 16 | ||
17 | #ifdef CONFIG_BLOCK | 17 | #ifdef CONFIG_BLOCK |
18 | 18 | ||
19 | enum bh_state_bits { | 19 | enum bh_state_bits { |
20 | BH_Uptodate, /* Contains valid data */ | 20 | BH_Uptodate, /* Contains valid data */ |
21 | BH_Dirty, /* Is dirty */ | 21 | BH_Dirty, /* Is dirty */ |
22 | BH_Lock, /* Is locked */ | 22 | BH_Lock, /* Is locked */ |
23 | BH_Req, /* Has been submitted for I/O */ | 23 | BH_Req, /* Has been submitted for I/O */ |
24 | BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise | 24 | BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise |
25 | * IO completion of other buffers in the page | 25 | * IO completion of other buffers in the page |
26 | */ | 26 | */ |
27 | 27 | ||
28 | BH_Mapped, /* Has a disk mapping */ | 28 | BH_Mapped, /* Has a disk mapping */ |
29 | BH_New, /* Disk mapping was newly created by get_block */ | 29 | BH_New, /* Disk mapping was newly created by get_block */ |
30 | BH_Async_Read, /* Is under end_buffer_async_read I/O */ | 30 | BH_Async_Read, /* Is under end_buffer_async_read I/O */ |
31 | BH_Async_Write, /* Is under end_buffer_async_write I/O */ | 31 | BH_Async_Write, /* Is under end_buffer_async_write I/O */ |
32 | BH_Delay, /* Buffer is not yet allocated on disk */ | 32 | BH_Delay, /* Buffer is not yet allocated on disk */ |
33 | BH_Boundary, /* Block is followed by a discontiguity */ | 33 | BH_Boundary, /* Block is followed by a discontiguity */ |
34 | BH_Write_EIO, /* I/O error on write */ | 34 | BH_Write_EIO, /* I/O error on write */ |
35 | BH_Ordered, /* ordered write */ | 35 | BH_Ordered, /* ordered write */ |
36 | BH_Eopnotsupp, /* operation not supported (barrier) */ | 36 | BH_Eopnotsupp, /* operation not supported (barrier) */ |
37 | BH_Unwritten, /* Buffer is allocated on disk but not written */ | 37 | BH_Unwritten, /* Buffer is allocated on disk but not written */ |
38 | BH_Quiet, /* Buffer Error Printks to be quiet */ | ||
38 | 39 | ||
39 | BH_PrivateStart,/* not a state bit, but the first bit available | 40 | BH_PrivateStart,/* not a state bit, but the first bit available |
40 | * for private allocation by other entities | 41 | * for private allocation by other entities |
41 | */ | 42 | */ |
42 | }; | 43 | }; |
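The new BH_Quiet bit is what carries the REQ_QUIET intent into the buffer layer. The fs/buffer.c side of the patch is not shown in this hunk; the sketch below (helper names assumed for illustration) shows the kind of check that lets an end-I/O path skip the "Buffer I/O error" printk when the submitter asked for quiet:

/* Sketch only: the bit is expected to be set from the bio's quiet flag
 * before the buffer error path runs. */
static int quiet_error(struct buffer_head *bh)
{
	return test_bit(BH_Quiet, &bh->b_state);
}

static void report_buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	if (quiet_error(bh))
		return;		/* submitter set REQ_QUIET, stay silent */

	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}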
43 | 44 | ||
44 | #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) | 45 | #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) |
45 | 46 | ||
46 | struct page; | 47 | struct page; |
47 | struct buffer_head; | 48 | struct buffer_head; |
48 | struct address_space; | 49 | struct address_space; |
49 | typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); | 50 | typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); |
50 | 51 | ||
51 | /* | 52 | /* |
52 | * Historically, a buffer_head was used to map a single block | 53 | * Historically, a buffer_head was used to map a single block |
53 | * within a page, and of course as the unit of I/O through the | 54 | * within a page, and of course as the unit of I/O through the |
54 | * filesystem and block layers. Nowadays the basic I/O unit | 55 | * filesystem and block layers. Nowadays the basic I/O unit |
55 | * is the bio, and buffer_heads are used for extracting block | 56 | * is the bio, and buffer_heads are used for extracting block |
56 | * mappings (via a get_block_t call), for tracking state within | 57 | * mappings (via a get_block_t call), for tracking state within |
57 | * a page (via a page_mapping) and for wrapping bio submission | 58 | * a page (via a page_mapping) and for wrapping bio submission |
58 | * for backward compatibility reasons (e.g. submit_bh). | 59 | * for backward compatibility reasons (e.g. submit_bh). |
59 | */ | 60 | */ |
60 | struct buffer_head { | 61 | struct buffer_head { |
61 | unsigned long b_state; /* buffer state bitmap (see above) */ | 62 | unsigned long b_state; /* buffer state bitmap (see above) */ |
62 | struct buffer_head *b_this_page;/* circular list of page's buffers */ | 63 | struct buffer_head *b_this_page;/* circular list of page's buffers */ |
63 | struct page *b_page; /* the page this bh is mapped to */ | 64 | struct page *b_page; /* the page this bh is mapped to */ |
64 | 65 | ||
65 | sector_t b_blocknr; /* start block number */ | 66 | sector_t b_blocknr; /* start block number */ |
66 | size_t b_size; /* size of mapping */ | 67 | size_t b_size; /* size of mapping */ |
67 | char *b_data; /* pointer to data within the page */ | 68 | char *b_data; /* pointer to data within the page */ |
68 | 69 | ||
69 | struct block_device *b_bdev; | 70 | struct block_device *b_bdev; |
70 | bh_end_io_t *b_end_io; /* I/O completion */ | 71 | bh_end_io_t *b_end_io; /* I/O completion */ |
71 | void *b_private; /* reserved for b_end_io */ | 72 | void *b_private; /* reserved for b_end_io */ |
72 | struct list_head b_assoc_buffers; /* associated with another mapping */ | 73 | struct list_head b_assoc_buffers; /* associated with another mapping */ |
73 | struct address_space *b_assoc_map; /* mapping this buffer is | 74 | struct address_space *b_assoc_map; /* mapping this buffer is |
74 | associated with */ | 75 | associated with */ |
75 | atomic_t b_count; /* users using this buffer_head */ | 76 | atomic_t b_count; /* users using this buffer_head */ |
76 | }; | 77 | }; |
77 | 78 | ||
78 | /* | 79 | /* |
79 | * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() | 80 | * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() |
80 | * and buffer_foo() functions. | 81 | * and buffer_foo() functions. |
81 | */ | 82 | */ |
82 | #define BUFFER_FNS(bit, name) \ | 83 | #define BUFFER_FNS(bit, name) \ |
83 | static inline void set_buffer_##name(struct buffer_head *bh) \ | 84 | static inline void set_buffer_##name(struct buffer_head *bh) \ |
84 | { \ | 85 | { \ |
85 | set_bit(BH_##bit, &(bh)->b_state); \ | 86 | set_bit(BH_##bit, &(bh)->b_state); \ |
86 | } \ | 87 | } \ |
87 | static inline void clear_buffer_##name(struct buffer_head *bh) \ | 88 | static inline void clear_buffer_##name(struct buffer_head *bh) \ |
88 | { \ | 89 | { \ |
89 | clear_bit(BH_##bit, &(bh)->b_state); \ | 90 | clear_bit(BH_##bit, &(bh)->b_state); \ |
90 | } \ | 91 | } \ |
91 | static inline int buffer_##name(const struct buffer_head *bh) \ | 92 | static inline int buffer_##name(const struct buffer_head *bh) \ |
92 | { \ | 93 | { \ |
93 | return test_bit(BH_##bit, &(bh)->b_state); \ | 94 | return test_bit(BH_##bit, &(bh)->b_state); \ |
94 | } | 95 | } |
95 | 96 | ||
96 | /* | 97 | /* |
97 | * test_set_buffer_foo() and test_clear_buffer_foo() | 98 | * test_set_buffer_foo() and test_clear_buffer_foo() |
98 | */ | 99 | */ |
99 | #define TAS_BUFFER_FNS(bit, name) \ | 100 | #define TAS_BUFFER_FNS(bit, name) \ |
100 | static inline int test_set_buffer_##name(struct buffer_head *bh) \ | 101 | static inline int test_set_buffer_##name(struct buffer_head *bh) \ |
101 | { \ | 102 | { \ |
102 | return test_and_set_bit(BH_##bit, &(bh)->b_state); \ | 103 | return test_and_set_bit(BH_##bit, &(bh)->b_state); \ |
103 | } \ | 104 | } \ |
104 | static inline int test_clear_buffer_##name(struct buffer_head *bh) \ | 105 | static inline int test_clear_buffer_##name(struct buffer_head *bh) \ |
105 | { \ | 106 | { \ |
106 | return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ | 107 | return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ |
107 | } \ | 108 | } \ |
108 | 109 | ||
109 | /* | 110 | /* |
110 | * Emit the buffer bitops functions. Note that there are also functions | 111 | * Emit the buffer bitops functions. Note that there are also functions |
111 | * of the form "mark_buffer_foo()". These are higher-level functions which | 112 | * of the form "mark_buffer_foo()". These are higher-level functions which |
112 | * do something in addition to setting a b_state bit. | 113 | * do something in addition to setting a b_state bit. |
113 | */ | 114 | */ |
114 | BUFFER_FNS(Uptodate, uptodate) | 115 | BUFFER_FNS(Uptodate, uptodate) |
115 | BUFFER_FNS(Dirty, dirty) | 116 | BUFFER_FNS(Dirty, dirty) |
116 | TAS_BUFFER_FNS(Dirty, dirty) | 117 | TAS_BUFFER_FNS(Dirty, dirty) |
117 | BUFFER_FNS(Lock, locked) | 118 | BUFFER_FNS(Lock, locked) |
118 | BUFFER_FNS(Req, req) | 119 | BUFFER_FNS(Req, req) |
119 | TAS_BUFFER_FNS(Req, req) | 120 | TAS_BUFFER_FNS(Req, req) |
120 | BUFFER_FNS(Mapped, mapped) | 121 | BUFFER_FNS(Mapped, mapped) |
121 | BUFFER_FNS(New, new) | 122 | BUFFER_FNS(New, new) |
122 | BUFFER_FNS(Async_Read, async_read) | 123 | BUFFER_FNS(Async_Read, async_read) |
123 | BUFFER_FNS(Async_Write, async_write) | 124 | BUFFER_FNS(Async_Write, async_write) |
124 | BUFFER_FNS(Delay, delay) | 125 | BUFFER_FNS(Delay, delay) |
125 | BUFFER_FNS(Boundary, boundary) | 126 | BUFFER_FNS(Boundary, boundary) |
126 | BUFFER_FNS(Write_EIO, write_io_error) | 127 | BUFFER_FNS(Write_EIO, write_io_error) |
127 | BUFFER_FNS(Ordered, ordered) | 128 | BUFFER_FNS(Ordered, ordered) |
128 | BUFFER_FNS(Eopnotsupp, eopnotsupp) | 129 | BUFFER_FNS(Eopnotsupp, eopnotsupp) |
129 | BUFFER_FNS(Unwritten, unwritten) | 130 | BUFFER_FNS(Unwritten, unwritten) |
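Each BUFFER_FNS(bit, name) invocation above expands to three small inline helpers; BUFFER_FNS(Dirty, dirty), for example, yields roughly:

static inline void set_buffer_dirty(struct buffer_head *bh)
{
	set_bit(BH_Dirty, &bh->b_state);
}
static inline void clear_buffer_dirty(struct buffer_head *bh)
{
	clear_bit(BH_Dirty, &bh->b_state);
}
static inline int buffer_dirty(const struct buffer_head *bh)
{
	return test_bit(BH_Dirty, &bh->b_state);
}

Note that this hunk does not add a matching BUFFER_FNS(Quiet, ...) line, so the new bit is presumably tested directly with test_bit(BH_Quiet, &bh->b_state) on the consuming side.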
130 | 131 | ||
131 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) | 132 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) |
132 | #define touch_buffer(bh) mark_page_accessed(bh->b_page) | 133 | #define touch_buffer(bh) mark_page_accessed(bh->b_page) |
133 | 134 | ||
134 | /* If we *know* page->private refers to buffer_heads */ | 135 | /* If we *know* page->private refers to buffer_heads */ |
135 | #define page_buffers(page) \ | 136 | #define page_buffers(page) \ |
136 | ({ \ | 137 | ({ \ |
137 | BUG_ON(!PagePrivate(page)); \ | 138 | BUG_ON(!PagePrivate(page)); \ |
138 | ((struct buffer_head *)page_private(page)); \ | 139 | ((struct buffer_head *)page_private(page)); \ |
139 | }) | 140 | }) |
140 | #define page_has_buffers(page) PagePrivate(page) | 141 | #define page_has_buffers(page) PagePrivate(page) |
141 | 142 | ||
142 | /* | 143 | /* |
143 | * Declarations | 144 | * Declarations |
144 | */ | 145 | */ |
145 | 146 | ||
146 | void mark_buffer_dirty(struct buffer_head *bh); | 147 | void mark_buffer_dirty(struct buffer_head *bh); |
147 | void init_buffer(struct buffer_head *, bh_end_io_t *, void *); | 148 | void init_buffer(struct buffer_head *, bh_end_io_t *, void *); |
148 | void set_bh_page(struct buffer_head *bh, | 149 | void set_bh_page(struct buffer_head *bh, |
149 | struct page *page, unsigned long offset); | 150 | struct page *page, unsigned long offset); |
150 | int try_to_free_buffers(struct page *); | 151 | int try_to_free_buffers(struct page *); |
151 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, | 152 | struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, |
152 | int retry); | 153 | int retry); |
153 | void create_empty_buffers(struct page *, unsigned long, | 154 | void create_empty_buffers(struct page *, unsigned long, |
154 | unsigned long b_state); | 155 | unsigned long b_state); |
155 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate); | 156 | void end_buffer_read_sync(struct buffer_head *bh, int uptodate); |
156 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate); | 157 | void end_buffer_write_sync(struct buffer_head *bh, int uptodate); |
157 | 158 | ||
158 | /* Things to do with buffers at mapping->private_list */ | 159 | /* Things to do with buffers at mapping->private_list */ |
159 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); | 160 | void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); |
160 | int inode_has_buffers(struct inode *); | 161 | int inode_has_buffers(struct inode *); |
161 | void invalidate_inode_buffers(struct inode *); | 162 | void invalidate_inode_buffers(struct inode *); |
162 | int remove_inode_buffers(struct inode *inode); | 163 | int remove_inode_buffers(struct inode *inode); |
163 | int sync_mapping_buffers(struct address_space *mapping); | 164 | int sync_mapping_buffers(struct address_space *mapping); |
164 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block); | 165 | void unmap_underlying_metadata(struct block_device *bdev, sector_t block); |
165 | 166 | ||
166 | void mark_buffer_async_write(struct buffer_head *bh); | 167 | void mark_buffer_async_write(struct buffer_head *bh); |
167 | void invalidate_bdev(struct block_device *); | 168 | void invalidate_bdev(struct block_device *); |
168 | int sync_blockdev(struct block_device *bdev); | 169 | int sync_blockdev(struct block_device *bdev); |
169 | void __wait_on_buffer(struct buffer_head *); | 170 | void __wait_on_buffer(struct buffer_head *); |
170 | wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); | 171 | wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); |
171 | int fsync_bdev(struct block_device *); | 172 | int fsync_bdev(struct block_device *); |
172 | struct super_block *freeze_bdev(struct block_device *); | 173 | struct super_block *freeze_bdev(struct block_device *); |
173 | void thaw_bdev(struct block_device *, struct super_block *); | 174 | void thaw_bdev(struct block_device *, struct super_block *); |
174 | int fsync_super(struct super_block *); | 175 | int fsync_super(struct super_block *); |
175 | int fsync_no_super(struct block_device *); | 176 | int fsync_no_super(struct block_device *); |
176 | struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, | 177 | struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, |
177 | unsigned size); | 178 | unsigned size); |
178 | struct buffer_head *__getblk(struct block_device *bdev, sector_t block, | 179 | struct buffer_head *__getblk(struct block_device *bdev, sector_t block, |
179 | unsigned size); | 180 | unsigned size); |
180 | void __brelse(struct buffer_head *); | 181 | void __brelse(struct buffer_head *); |
181 | void __bforget(struct buffer_head *); | 182 | void __bforget(struct buffer_head *); |
182 | void __breadahead(struct block_device *, sector_t block, unsigned int size); | 183 | void __breadahead(struct block_device *, sector_t block, unsigned int size); |
183 | struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); | 184 | struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); |
184 | void invalidate_bh_lrus(void); | 185 | void invalidate_bh_lrus(void); |
185 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); | 186 | struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); |
186 | void free_buffer_head(struct buffer_head * bh); | 187 | void free_buffer_head(struct buffer_head * bh); |
187 | void unlock_buffer(struct buffer_head *bh); | 188 | void unlock_buffer(struct buffer_head *bh); |
188 | void __lock_buffer(struct buffer_head *bh); | 189 | void __lock_buffer(struct buffer_head *bh); |
189 | void ll_rw_block(int, int, struct buffer_head * bh[]); | 190 | void ll_rw_block(int, int, struct buffer_head * bh[]); |
190 | int sync_dirty_buffer(struct buffer_head *bh); | 191 | int sync_dirty_buffer(struct buffer_head *bh); |
191 | int submit_bh(int, struct buffer_head *); | 192 | int submit_bh(int, struct buffer_head *); |
192 | void write_boundary_block(struct block_device *bdev, | 193 | void write_boundary_block(struct block_device *bdev, |
193 | sector_t bblock, unsigned blocksize); | 194 | sector_t bblock, unsigned blocksize); |
194 | int bh_uptodate_or_lock(struct buffer_head *bh); | 195 | int bh_uptodate_or_lock(struct buffer_head *bh); |
195 | int bh_submit_read(struct buffer_head *bh); | 196 | int bh_submit_read(struct buffer_head *bh); |
196 | 197 | ||
197 | extern int buffer_heads_over_limit; | 198 | extern int buffer_heads_over_limit; |
198 | 199 | ||
199 | /* | 200 | /* |
200 | * Generic address_space_operations implementations for buffer_head-backed | 201 | * Generic address_space_operations implementations for buffer_head-backed |
201 | * address_spaces. | 202 | * address_spaces. |
202 | */ | 203 | */ |
203 | void block_invalidatepage(struct page *page, unsigned long offset); | 204 | void block_invalidatepage(struct page *page, unsigned long offset); |
204 | int block_write_full_page(struct page *page, get_block_t *get_block, | 205 | int block_write_full_page(struct page *page, get_block_t *get_block, |
205 | struct writeback_control *wbc); | 206 | struct writeback_control *wbc); |
206 | int block_read_full_page(struct page*, get_block_t*); | 207 | int block_read_full_page(struct page*, get_block_t*); |
207 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | 208 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, |
208 | unsigned long from); | 209 | unsigned long from); |
209 | int block_write_begin(struct file *, struct address_space *, | 210 | int block_write_begin(struct file *, struct address_space *, |
210 | loff_t, unsigned, unsigned, | 211 | loff_t, unsigned, unsigned, |
211 | struct page **, void **, get_block_t*); | 212 | struct page **, void **, get_block_t*); |
212 | int block_write_end(struct file *, struct address_space *, | 213 | int block_write_end(struct file *, struct address_space *, |
213 | loff_t, unsigned, unsigned, | 214 | loff_t, unsigned, unsigned, |
214 | struct page *, void *); | 215 | struct page *, void *); |
215 | int generic_write_end(struct file *, struct address_space *, | 216 | int generic_write_end(struct file *, struct address_space *, |
216 | loff_t, unsigned, unsigned, | 217 | loff_t, unsigned, unsigned, |
217 | struct page *, void *); | 218 | struct page *, void *); |
218 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); | 219 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); |
219 | int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); | 220 | int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); |
220 | int cont_write_begin(struct file *, struct address_space *, loff_t, | 221 | int cont_write_begin(struct file *, struct address_space *, loff_t, |
221 | unsigned, unsigned, struct page **, void **, | 222 | unsigned, unsigned, struct page **, void **, |
222 | get_block_t *, loff_t *); | 223 | get_block_t *, loff_t *); |
223 | int generic_cont_expand_simple(struct inode *inode, loff_t size); | 224 | int generic_cont_expand_simple(struct inode *inode, loff_t size); |
224 | int block_commit_write(struct page *page, unsigned from, unsigned to); | 225 | int block_commit_write(struct page *page, unsigned from, unsigned to); |
225 | int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 226 | int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, |
226 | get_block_t get_block); | 227 | get_block_t get_block); |
227 | void block_sync_page(struct page *); | 228 | void block_sync_page(struct page *); |
228 | sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); | 229 | sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); |
229 | int block_truncate_page(struct address_space *, loff_t, get_block_t *); | 230 | int block_truncate_page(struct address_space *, loff_t, get_block_t *); |
230 | int file_fsync(struct file *, struct dentry *, int); | 231 | int file_fsync(struct file *, struct dentry *, int); |
231 | int nobh_write_begin(struct file *, struct address_space *, | 232 | int nobh_write_begin(struct file *, struct address_space *, |
232 | loff_t, unsigned, unsigned, | 233 | loff_t, unsigned, unsigned, |
233 | struct page **, void **, get_block_t*); | 234 | struct page **, void **, get_block_t*); |
234 | int nobh_write_end(struct file *, struct address_space *, | 235 | int nobh_write_end(struct file *, struct address_space *, |
235 | loff_t, unsigned, unsigned, | 236 | loff_t, unsigned, unsigned, |
236 | struct page *, void *); | 237 | struct page *, void *); |
237 | int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); | 238 | int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); |
238 | int nobh_writepage(struct page *page, get_block_t *get_block, | 239 | int nobh_writepage(struct page *page, get_block_t *get_block, |
239 | struct writeback_control *wbc); | 240 | struct writeback_control *wbc); |
240 | 241 | ||
241 | void buffer_init(void); | 242 | void buffer_init(void); |
242 | 243 | ||
243 | /* | 244 | /* |
244 | * inline definitions | 245 | * inline definitions |
245 | */ | 246 | */ |
246 | 247 | ||
247 | static inline void attach_page_buffers(struct page *page, | 248 | static inline void attach_page_buffers(struct page *page, |
248 | struct buffer_head *head) | 249 | struct buffer_head *head) |
249 | { | 250 | { |
250 | page_cache_get(page); | 251 | page_cache_get(page); |
251 | SetPagePrivate(page); | 252 | SetPagePrivate(page); |
252 | set_page_private(page, (unsigned long)head); | 253 | set_page_private(page, (unsigned long)head); |
253 | } | 254 | } |
254 | 255 | ||
255 | static inline void get_bh(struct buffer_head *bh) | 256 | static inline void get_bh(struct buffer_head *bh) |
256 | { | 257 | { |
257 | atomic_inc(&bh->b_count); | 258 | atomic_inc(&bh->b_count); |
258 | } | 259 | } |
259 | 260 | ||
260 | static inline void put_bh(struct buffer_head *bh) | 261 | static inline void put_bh(struct buffer_head *bh) |
261 | { | 262 | { |
262 | smp_mb__before_atomic_dec(); | 263 | smp_mb__before_atomic_dec(); |
263 | atomic_dec(&bh->b_count); | 264 | atomic_dec(&bh->b_count); |
264 | } | 265 | } |
265 | 266 | ||
266 | static inline void brelse(struct buffer_head *bh) | 267 | static inline void brelse(struct buffer_head *bh) |
267 | { | 268 | { |
268 | if (bh) | 269 | if (bh) |
269 | __brelse(bh); | 270 | __brelse(bh); |
270 | } | 271 | } |
271 | 272 | ||
272 | static inline void bforget(struct buffer_head *bh) | 273 | static inline void bforget(struct buffer_head *bh) |
273 | { | 274 | { |
274 | if (bh) | 275 | if (bh) |
275 | __bforget(bh); | 276 | __bforget(bh); |
276 | } | 277 | } |
277 | 278 | ||
278 | static inline struct buffer_head * | 279 | static inline struct buffer_head * |
279 | sb_bread(struct super_block *sb, sector_t block) | 280 | sb_bread(struct super_block *sb, sector_t block) |
280 | { | 281 | { |
281 | return __bread(sb->s_bdev, block, sb->s_blocksize); | 282 | return __bread(sb->s_bdev, block, sb->s_blocksize); |
282 | } | 283 | } |
283 | 284 | ||
284 | static inline void | 285 | static inline void |
285 | sb_breadahead(struct super_block *sb, sector_t block) | 286 | sb_breadahead(struct super_block *sb, sector_t block) |
286 | { | 287 | { |
287 | __breadahead(sb->s_bdev, block, sb->s_blocksize); | 288 | __breadahead(sb->s_bdev, block, sb->s_blocksize); |
288 | } | 289 | } |
289 | 290 | ||
290 | static inline struct buffer_head * | 291 | static inline struct buffer_head * |
291 | sb_getblk(struct super_block *sb, sector_t block) | 292 | sb_getblk(struct super_block *sb, sector_t block) |
292 | { | 293 | { |
293 | return __getblk(sb->s_bdev, block, sb->s_blocksize); | 294 | return __getblk(sb->s_bdev, block, sb->s_blocksize); |
294 | } | 295 | } |
295 | 296 | ||
296 | static inline struct buffer_head * | 297 | static inline struct buffer_head * |
297 | sb_find_get_block(struct super_block *sb, sector_t block) | 298 | sb_find_get_block(struct super_block *sb, sector_t block) |
298 | { | 299 | { |
299 | return __find_get_block(sb->s_bdev, block, sb->s_blocksize); | 300 | return __find_get_block(sb->s_bdev, block, sb->s_blocksize); |
300 | } | 301 | } |
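These sb_* wrappers are the usual filesystem entry points; a minimal usage sketch (read_fs_block is a hypothetical helper, with error handling kept to the essentials):

static int read_fs_block(struct super_block *sb, sector_t blocknr, void *buf)
{
	struct buffer_head *bh;

	bh = sb_bread(sb, blocknr);		/* synchronous read, may sleep */
	if (!bh)
		return -EIO;

	memcpy(buf, bh->b_data, sb->s_blocksize);
	brelse(bh);				/* drop the reference taken by __bread */
	return 0;
}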
301 | 302 | ||
302 | static inline void | 303 | static inline void |
303 | map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) | 304 | map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) |
304 | { | 305 | { |
305 | set_buffer_mapped(bh); | 306 | set_buffer_mapped(bh); |
306 | bh->b_bdev = sb->s_bdev; | 307 | bh->b_bdev = sb->s_bdev; |
307 | bh->b_blocknr = block; | 308 | bh->b_blocknr = block; |
308 | bh->b_size = sb->s_blocksize; | 309 | bh->b_size = sb->s_blocksize; |
309 | } | 310 | } |
310 | 311 | ||
311 | /* | 312 | /* |
312 | * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into | 313 | * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into |
313 | * __wait_on_buffer() just to trip a debug check. Because debug code in inline | 314 | * __wait_on_buffer() just to trip a debug check. Because debug code in inline |
314 | * functions is bloaty. | 315 | * functions is bloaty. |
315 | */ | 316 | */ |
316 | static inline void wait_on_buffer(struct buffer_head *bh) | 317 | static inline void wait_on_buffer(struct buffer_head *bh) |
317 | { | 318 | { |
318 | might_sleep(); | 319 | might_sleep(); |
319 | if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0) | 320 | if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0) |
320 | __wait_on_buffer(bh); | 321 | __wait_on_buffer(bh); |
321 | } | 322 | } |
322 | 323 | ||
323 | static inline int trylock_buffer(struct buffer_head *bh) | 324 | static inline int trylock_buffer(struct buffer_head *bh) |
324 | { | 325 | { |
325 | return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); | 326 | return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); |
326 | } | 327 | } |
327 | 328 | ||
328 | static inline void lock_buffer(struct buffer_head *bh) | 329 | static inline void lock_buffer(struct buffer_head *bh) |
329 | { | 330 | { |
330 | might_sleep(); | 331 | might_sleep(); |
331 | if (!trylock_buffer(bh)) | 332 | if (!trylock_buffer(bh)) |
332 | __lock_buffer(bh); | 333 | __lock_buffer(bh); |
333 | } | 334 | } |
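The locking and wait primitives above combine into the classic synchronous-read pattern, roughly what bh_submit_read() and __bread() do internally (a sketch; the function name is hypothetical):

static int read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	get_bh(bh);				/* reference for the in-flight I/O */
	bh->b_end_io = end_buffer_read_sync;	/* unlocks the bh and drops that ref */
	submit_bh(READ, bh);
	wait_on_buffer(bh);

	return buffer_uptodate(bh) ? 0 : -EIO;
}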
334 | 335 | ||
335 | extern int __set_page_dirty_buffers(struct page *page); | 336 | extern int __set_page_dirty_buffers(struct page *page); |
336 | 337 | ||
337 | #else /* CONFIG_BLOCK */ | 338 | #else /* CONFIG_BLOCK */ |
338 | 339 | ||
339 | static inline void buffer_init(void) {} | 340 | static inline void buffer_init(void) {} |
340 | static inline int try_to_free_buffers(struct page *page) { return 1; } | 341 | static inline int try_to_free_buffers(struct page *page) { return 1; } |
341 | static inline int sync_blockdev(struct block_device *bdev) { return 0; } | 342 | static inline int sync_blockdev(struct block_device *bdev) { return 0; } |
342 | static inline int inode_has_buffers(struct inode *inode) { return 0; } | 343 | static inline int inode_has_buffers(struct inode *inode) { return 0; } |
343 | static inline void invalidate_inode_buffers(struct inode *inode) {} | 344 | static inline void invalidate_inode_buffers(struct inode *inode) {} |
344 | static inline int remove_inode_buffers(struct inode *inode) { return 1; } | 345 | static inline int remove_inode_buffers(struct inode *inode) { return 1; } |
345 | static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } | 346 | static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } |
346 | static inline void invalidate_bdev(struct block_device *bdev) {} | 347 | static inline void invalidate_bdev(struct block_device *bdev) {} |
347 | 348 | ||
348 | 349 | ||
349 | #endif /* CONFIG_BLOCK */ | 350 | #endif /* CONFIG_BLOCK */ |
350 | #endif /* _LINUX_BUFFER_HEAD_H */ | 351 | #endif /* _LINUX_BUFFER_HEAD_H */ |
351 | 352 |