Commit b8887e6e8c04bcefb512cdb08fc7e9c310ac847e

Authored by Randy Dunlap
Committed by Linus Torvalds
1 parent 1e5d533142

[PATCH] kernel-docs: fix kernel-doc format problems

Convert to proper kernel-doc format.

Some comments have extra blank lines (not allowed immediately after the function
name) or are missing blank lines (required after all parameters).  The function
summary must be a single line.

A colon (":") in a function description does odd things: kernel-doc mistakenly
treats it as the start of a new section heading.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

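For context, a kernel-doc block in the format this patch moves toward looks roughly like
the sketch below. blk_example_fn and its parameters are invented purely for illustration;
the actual change visible in this file simply drops the stray blank comment line after
blk_queue_find_tag's name.

/**
 * blk_example_fn - one-line summary; no blank comment line may follow it
 * @q: the request queue for the device
 * @flag: behaviour flag
 *
 * Description:
 *     Free-form text goes here, after the blank line that closes the
 *     parameter list.  A colon in this text is risky, since kernel-doc
 *     treats a word followed by ":" at the start of a line as a new
 *     section heading (as with the Description and Notes sections).
 **/
int blk_example_fn(request_queue_t *q, int flag);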
Showing 4 changed files with 4 additions and 5 deletions

drivers/block/ll_rw_blk.c
1 /* 1 /*
2 * linux/drivers/block/ll_rw_blk.c 2 * linux/drivers/block/ll_rw_blk.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
10 */ 10 */
11 11
12 /* 12 /*
13 * This handles all read/write requests to block devices 13 * This handles all read/write requests to block devices
14 */ 14 */
15 #include <linux/config.h> 15 #include <linux/config.h>
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/backing-dev.h> 18 #include <linux/backing-dev.h>
19 #include <linux/bio.h> 19 #include <linux/bio.h>
20 #include <linux/blkdev.h> 20 #include <linux/blkdev.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/kernel_stat.h> 23 #include <linux/kernel_stat.h>
24 #include <linux/string.h> 24 #include <linux/string.h>
25 #include <linux/init.h> 25 #include <linux/init.h>
26 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 26 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
27 #include <linux/completion.h> 27 #include <linux/completion.h>
28 #include <linux/slab.h> 28 #include <linux/slab.h>
29 #include <linux/swap.h> 29 #include <linux/swap.h>
30 #include <linux/writeback.h> 30 #include <linux/writeback.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 32
33 /* 33 /*
34 * for max sense size 34 * for max sense size
35 */ 35 */
36 #include <scsi/scsi_cmnd.h> 36 #include <scsi/scsi_cmnd.h>
37 37
38 static void blk_unplug_work(void *data); 38 static void blk_unplug_work(void *data);
39 static void blk_unplug_timeout(unsigned long data); 39 static void blk_unplug_timeout(unsigned long data);
40 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 40 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
41 41
42 /* 42 /*
43 * For the allocated request tables 43 * For the allocated request tables
44 */ 44 */
45 static kmem_cache_t *request_cachep; 45 static kmem_cache_t *request_cachep;
46 46
47 /* 47 /*
48 * For queue allocation 48 * For queue allocation
49 */ 49 */
50 static kmem_cache_t *requestq_cachep; 50 static kmem_cache_t *requestq_cachep;
51 51
52 /* 52 /*
53 * For io context allocations 53 * For io context allocations
54 */ 54 */
55 static kmem_cache_t *iocontext_cachep; 55 static kmem_cache_t *iocontext_cachep;
56 56
57 static wait_queue_head_t congestion_wqh[2] = { 57 static wait_queue_head_t congestion_wqh[2] = {
58 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 58 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
59 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 59 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
60 }; 60 };
61 61
62 /* 62 /*
63 * Controlling structure to kblockd 63 * Controlling structure to kblockd
64 */ 64 */
65 static struct workqueue_struct *kblockd_workqueue; 65 static struct workqueue_struct *kblockd_workqueue;
66 66
67 unsigned long blk_max_low_pfn, blk_max_pfn; 67 unsigned long blk_max_low_pfn, blk_max_pfn;
68 68
69 EXPORT_SYMBOL(blk_max_low_pfn); 69 EXPORT_SYMBOL(blk_max_low_pfn);
70 EXPORT_SYMBOL(blk_max_pfn); 70 EXPORT_SYMBOL(blk_max_pfn);
71 71
72 /* Amount of time in which a process may batch requests */ 72 /* Amount of time in which a process may batch requests */
73 #define BLK_BATCH_TIME (HZ/50UL) 73 #define BLK_BATCH_TIME (HZ/50UL)
74 74
75 /* Number of requests a "batching" process may submit */ 75 /* Number of requests a "batching" process may submit */
76 #define BLK_BATCH_REQ 32 76 #define BLK_BATCH_REQ 32
77 77
78 /* 78 /*
79 * Return the threshold (number of used requests) at which the queue is 79 * Return the threshold (number of used requests) at which the queue is
80 * considered to be congested. It include a little hysteresis to keep the 80 * considered to be congested. It include a little hysteresis to keep the
81 * context switch rate down. 81 * context switch rate down.
82 */ 82 */
83 static inline int queue_congestion_on_threshold(struct request_queue *q) 83 static inline int queue_congestion_on_threshold(struct request_queue *q)
84 { 84 {
85 return q->nr_congestion_on; 85 return q->nr_congestion_on;
86 } 86 }
87 87
88 /* 88 /*
89 * The threshold at which a queue is considered to be uncongested 89 * The threshold at which a queue is considered to be uncongested
90 */ 90 */
91 static inline int queue_congestion_off_threshold(struct request_queue *q) 91 static inline int queue_congestion_off_threshold(struct request_queue *q)
92 { 92 {
93 return q->nr_congestion_off; 93 return q->nr_congestion_off;
94 } 94 }
95 95
96 static void blk_queue_congestion_threshold(struct request_queue *q) 96 static void blk_queue_congestion_threshold(struct request_queue *q)
97 { 97 {
98 int nr; 98 int nr;
99 99
100 nr = q->nr_requests - (q->nr_requests / 8) + 1; 100 nr = q->nr_requests - (q->nr_requests / 8) + 1;
101 if (nr > q->nr_requests) 101 if (nr > q->nr_requests)
102 nr = q->nr_requests; 102 nr = q->nr_requests;
103 q->nr_congestion_on = nr; 103 q->nr_congestion_on = nr;
104 104
105 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 105 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
106 if (nr < 1) 106 if (nr < 1)
107 nr = 1; 107 nr = 1;
108 q->nr_congestion_off = nr; 108 q->nr_congestion_off = nr;
109 } 109 }
110 110
111 /* 111 /*
112 * A queue has just exitted congestion. Note this in the global counter of 112 * A queue has just exitted congestion. Note this in the global counter of
113 * congested queues, and wake up anyone who was waiting for requests to be 113 * congested queues, and wake up anyone who was waiting for requests to be
114 * put back. 114 * put back.
115 */ 115 */
116 static void clear_queue_congested(request_queue_t *q, int rw) 116 static void clear_queue_congested(request_queue_t *q, int rw)
117 { 117 {
118 enum bdi_state bit; 118 enum bdi_state bit;
119 wait_queue_head_t *wqh = &congestion_wqh[rw]; 119 wait_queue_head_t *wqh = &congestion_wqh[rw];
120 120
121 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 121 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
122 clear_bit(bit, &q->backing_dev_info.state); 122 clear_bit(bit, &q->backing_dev_info.state);
123 smp_mb__after_clear_bit(); 123 smp_mb__after_clear_bit();
124 if (waitqueue_active(wqh)) 124 if (waitqueue_active(wqh))
125 wake_up(wqh); 125 wake_up(wqh);
126 } 126 }
127 127
128 /* 128 /*
129 * A queue has just entered congestion. Flag that in the queue's VM-visible 129 * A queue has just entered congestion. Flag that in the queue's VM-visible
130 * state flags and increment the global gounter of congested queues. 130 * state flags and increment the global gounter of congested queues.
131 */ 131 */
132 static void set_queue_congested(request_queue_t *q, int rw) 132 static void set_queue_congested(request_queue_t *q, int rw)
133 { 133 {
134 enum bdi_state bit; 134 enum bdi_state bit;
135 135
136 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 136 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
137 set_bit(bit, &q->backing_dev_info.state); 137 set_bit(bit, &q->backing_dev_info.state);
138 } 138 }
139 139
140 /** 140 /**
141 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 141 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
142 * @bdev: device 142 * @bdev: device
143 * 143 *
144 * Locates the passed device's request queue and returns the address of its 144 * Locates the passed device's request queue and returns the address of its
145 * backing_dev_info 145 * backing_dev_info
146 * 146 *
147 * Will return NULL if the request queue cannot be located. 147 * Will return NULL if the request queue cannot be located.
148 */ 148 */
149 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 149 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
150 { 150 {
151 struct backing_dev_info *ret = NULL; 151 struct backing_dev_info *ret = NULL;
152 request_queue_t *q = bdev_get_queue(bdev); 152 request_queue_t *q = bdev_get_queue(bdev);
153 153
154 if (q) 154 if (q)
155 ret = &q->backing_dev_info; 155 ret = &q->backing_dev_info;
156 return ret; 156 return ret;
157 } 157 }
158 158
159 EXPORT_SYMBOL(blk_get_backing_dev_info); 159 EXPORT_SYMBOL(blk_get_backing_dev_info);
160 160
161 void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data) 161 void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
162 { 162 {
163 q->activity_fn = fn; 163 q->activity_fn = fn;
164 q->activity_data = data; 164 q->activity_data = data;
165 } 165 }
166 166
167 EXPORT_SYMBOL(blk_queue_activity_fn); 167 EXPORT_SYMBOL(blk_queue_activity_fn);
168 168
169 /** 169 /**
170 * blk_queue_prep_rq - set a prepare_request function for queue 170 * blk_queue_prep_rq - set a prepare_request function for queue
171 * @q: queue 171 * @q: queue
172 * @pfn: prepare_request function 172 * @pfn: prepare_request function
173 * 173 *
174 * It's possible for a queue to register a prepare_request callback which 174 * It's possible for a queue to register a prepare_request callback which
175 * is invoked before the request is handed to the request_fn. The goal of 175 * is invoked before the request is handed to the request_fn. The goal of
176 * the function is to prepare a request for I/O, it can be used to build a 176 * the function is to prepare a request for I/O, it can be used to build a
177 * cdb from the request data for instance. 177 * cdb from the request data for instance.
178 * 178 *
179 */ 179 */
180 void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn) 180 void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
181 { 181 {
182 q->prep_rq_fn = pfn; 182 q->prep_rq_fn = pfn;
183 } 183 }
184 184
185 EXPORT_SYMBOL(blk_queue_prep_rq); 185 EXPORT_SYMBOL(blk_queue_prep_rq);
186 186
187 /** 187 /**
188 * blk_queue_merge_bvec - set a merge_bvec function for queue 188 * blk_queue_merge_bvec - set a merge_bvec function for queue
189 * @q: queue 189 * @q: queue
190 * @mbfn: merge_bvec_fn 190 * @mbfn: merge_bvec_fn
191 * 191 *
192 * Usually queues have static limitations on the max sectors or segments that 192 * Usually queues have static limitations on the max sectors or segments that
193 * we can put in a request. Stacking drivers may have some settings that 193 * we can put in a request. Stacking drivers may have some settings that
194 * are dynamic, and thus we have to query the queue whether it is ok to 194 * are dynamic, and thus we have to query the queue whether it is ok to
195 * add a new bio_vec to a bio at a given offset or not. If the block device 195 * add a new bio_vec to a bio at a given offset or not. If the block device
196 * has such limitations, it needs to register a merge_bvec_fn to control 196 * has such limitations, it needs to register a merge_bvec_fn to control
197 * the size of bio's sent to it. Note that a block device *must* allow a 197 * the size of bio's sent to it. Note that a block device *must* allow a
198 * single page to be added to an empty bio. The block device driver may want 198 * single page to be added to an empty bio. The block device driver may want
199 * to use the bio_split() function to deal with these bio's. By default 199 * to use the bio_split() function to deal with these bio's. By default
200 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 200 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
201 * honored. 201 * honored.
202 */ 202 */
203 void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn) 203 void blk_queue_merge_bvec(request_queue_t *q, merge_bvec_fn *mbfn)
204 { 204 {
205 q->merge_bvec_fn = mbfn; 205 q->merge_bvec_fn = mbfn;
206 } 206 }
207 207
208 EXPORT_SYMBOL(blk_queue_merge_bvec); 208 EXPORT_SYMBOL(blk_queue_merge_bvec);
209 209
210 /** 210 /**
211 * blk_queue_make_request - define an alternate make_request function for a device 211 * blk_queue_make_request - define an alternate make_request function for a device
212 * @q: the request queue for the device to be affected 212 * @q: the request queue for the device to be affected
213 * @mfn: the alternate make_request function 213 * @mfn: the alternate make_request function
214 * 214 *
215 * Description: 215 * Description:
216 * The normal way for &struct bios to be passed to a device 216 * The normal way for &struct bios to be passed to a device
217 * driver is for them to be collected into requests on a request 217 * driver is for them to be collected into requests on a request
218 * queue, and then to allow the device driver to select requests 218 * queue, and then to allow the device driver to select requests
219 * off that queue when it is ready. This works well for many block 219 * off that queue when it is ready. This works well for many block
220 * devices. However some block devices (typically virtual devices 220 * devices. However some block devices (typically virtual devices
221 * such as md or lvm) do not benefit from the processing on the 221 * such as md or lvm) do not benefit from the processing on the
222 * request queue, and are served best by having the requests passed 222 * request queue, and are served best by having the requests passed
223 * directly to them. This can be achieved by providing a function 223 * directly to them. This can be achieved by providing a function
224 * to blk_queue_make_request(). 224 * to blk_queue_make_request().
225 * 225 *
226 * Caveat: 226 * Caveat:
227 * The driver that does this *must* be able to deal appropriately 227 * The driver that does this *must* be able to deal appropriately
228 * with buffers in "highmemory". This can be accomplished by either calling 228 * with buffers in "highmemory". This can be accomplished by either calling
229 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 229 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
230 * blk_queue_bounce() to create a buffer in normal memory. 230 * blk_queue_bounce() to create a buffer in normal memory.
231 **/ 231 **/
232 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) 232 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
233 { 233 {
234 /* 234 /*
235 * set defaults 235 * set defaults
236 */ 236 */
237 q->nr_requests = BLKDEV_MAX_RQ; 237 q->nr_requests = BLKDEV_MAX_RQ;
238 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 238 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
239 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 239 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
240 q->make_request_fn = mfn; 240 q->make_request_fn = mfn;
241 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 241 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
242 q->backing_dev_info.state = 0; 242 q->backing_dev_info.state = 0;
243 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 243 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
244 blk_queue_max_sectors(q, MAX_SECTORS); 244 blk_queue_max_sectors(q, MAX_SECTORS);
245 blk_queue_hardsect_size(q, 512); 245 blk_queue_hardsect_size(q, 512);
246 blk_queue_dma_alignment(q, 511); 246 blk_queue_dma_alignment(q, 511);
247 blk_queue_congestion_threshold(q); 247 blk_queue_congestion_threshold(q);
248 q->nr_batching = BLK_BATCH_REQ; 248 q->nr_batching = BLK_BATCH_REQ;
249 249
250 q->unplug_thresh = 4; /* hmm */ 250 q->unplug_thresh = 4; /* hmm */
251 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 251 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
252 if (q->unplug_delay == 0) 252 if (q->unplug_delay == 0)
253 q->unplug_delay = 1; 253 q->unplug_delay = 1;
254 254
255 INIT_WORK(&q->unplug_work, blk_unplug_work, q); 255 INIT_WORK(&q->unplug_work, blk_unplug_work, q);
256 256
257 q->unplug_timer.function = blk_unplug_timeout; 257 q->unplug_timer.function = blk_unplug_timeout;
258 q->unplug_timer.data = (unsigned long)q; 258 q->unplug_timer.data = (unsigned long)q;
259 259
260 /* 260 /*
261 * by default assume old behaviour and bounce for any highmem page 261 * by default assume old behaviour and bounce for any highmem page
262 */ 262 */
263 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 263 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
264 264
265 blk_queue_activity_fn(q, NULL, NULL); 265 blk_queue_activity_fn(q, NULL, NULL);
266 } 266 }
267 267
268 EXPORT_SYMBOL(blk_queue_make_request); 268 EXPORT_SYMBOL(blk_queue_make_request);
269 269
270 static inline void rq_init(request_queue_t *q, struct request *rq) 270 static inline void rq_init(request_queue_t *q, struct request *rq)
271 { 271 {
272 INIT_LIST_HEAD(&rq->queuelist); 272 INIT_LIST_HEAD(&rq->queuelist);
273 273
274 rq->errors = 0; 274 rq->errors = 0;
275 rq->rq_status = RQ_ACTIVE; 275 rq->rq_status = RQ_ACTIVE;
276 rq->bio = rq->biotail = NULL; 276 rq->bio = rq->biotail = NULL;
277 rq->ioprio = 0; 277 rq->ioprio = 0;
278 rq->buffer = NULL; 278 rq->buffer = NULL;
279 rq->ref_count = 1; 279 rq->ref_count = 1;
280 rq->q = q; 280 rq->q = q;
281 rq->waiting = NULL; 281 rq->waiting = NULL;
282 rq->special = NULL; 282 rq->special = NULL;
283 rq->data_len = 0; 283 rq->data_len = 0;
284 rq->data = NULL; 284 rq->data = NULL;
285 rq->nr_phys_segments = 0; 285 rq->nr_phys_segments = 0;
286 rq->sense = NULL; 286 rq->sense = NULL;
287 rq->end_io = NULL; 287 rq->end_io = NULL;
288 rq->end_io_data = NULL; 288 rq->end_io_data = NULL;
289 } 289 }
290 290
291 /** 291 /**
292 * blk_queue_ordered - does this queue support ordered writes 292 * blk_queue_ordered - does this queue support ordered writes
293 * @q: the request queue 293 * @q: the request queue
294 * @flag: see below 294 * @flag: see below
295 * 295 *
296 * Description: 296 * Description:
297 * For journalled file systems, doing ordered writes on a commit 297 * For journalled file systems, doing ordered writes on a commit
298 * block instead of explicitly doing wait_on_buffer (which is bad 298 * block instead of explicitly doing wait_on_buffer (which is bad
299 * for performance) can be a big win. Block drivers supporting this 299 * for performance) can be a big win. Block drivers supporting this
300 * feature should call this function and indicate so. 300 * feature should call this function and indicate so.
301 * 301 *
302 **/ 302 **/
303 void blk_queue_ordered(request_queue_t *q, int flag) 303 void blk_queue_ordered(request_queue_t *q, int flag)
304 { 304 {
305 switch (flag) { 305 switch (flag) {
306 case QUEUE_ORDERED_NONE: 306 case QUEUE_ORDERED_NONE:
307 if (q->flush_rq) 307 if (q->flush_rq)
308 kmem_cache_free(request_cachep, q->flush_rq); 308 kmem_cache_free(request_cachep, q->flush_rq);
309 q->flush_rq = NULL; 309 q->flush_rq = NULL;
310 q->ordered = flag; 310 q->ordered = flag;
311 break; 311 break;
312 case QUEUE_ORDERED_TAG: 312 case QUEUE_ORDERED_TAG:
313 q->ordered = flag; 313 q->ordered = flag;
314 break; 314 break;
315 case QUEUE_ORDERED_FLUSH: 315 case QUEUE_ORDERED_FLUSH:
316 q->ordered = flag; 316 q->ordered = flag;
317 if (!q->flush_rq) 317 if (!q->flush_rq)
318 q->flush_rq = kmem_cache_alloc(request_cachep, 318 q->flush_rq = kmem_cache_alloc(request_cachep,
319 GFP_KERNEL); 319 GFP_KERNEL);
320 break; 320 break;
321 default: 321 default:
322 printk("blk_queue_ordered: bad value %d\n", flag); 322 printk("blk_queue_ordered: bad value %d\n", flag);
323 break; 323 break;
324 } 324 }
325 } 325 }
326 326
327 EXPORT_SYMBOL(blk_queue_ordered); 327 EXPORT_SYMBOL(blk_queue_ordered);
328 328
329 /** 329 /**
330 * blk_queue_issue_flush_fn - set function for issuing a flush 330 * blk_queue_issue_flush_fn - set function for issuing a flush
331 * @q: the request queue 331 * @q: the request queue
332 * @iff: the function to be called issuing the flush 332 * @iff: the function to be called issuing the flush
333 * 333 *
334 * Description: 334 * Description:
335 * If a driver supports issuing a flush command, the support is notified 335 * If a driver supports issuing a flush command, the support is notified
336 * to the block layer by defining it through this call. 336 * to the block layer by defining it through this call.
337 * 337 *
338 **/ 338 **/
339 void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff) 339 void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
340 { 340 {
341 q->issue_flush_fn = iff; 341 q->issue_flush_fn = iff;
342 } 342 }
343 343
344 EXPORT_SYMBOL(blk_queue_issue_flush_fn); 344 EXPORT_SYMBOL(blk_queue_issue_flush_fn);
345 345
346 /* 346 /*
347 * Cache flushing for ordered writes handling 347 * Cache flushing for ordered writes handling
348 */ 348 */
349 static void blk_pre_flush_end_io(struct request *flush_rq) 349 static void blk_pre_flush_end_io(struct request *flush_rq)
350 { 350 {
351 struct request *rq = flush_rq->end_io_data; 351 struct request *rq = flush_rq->end_io_data;
352 request_queue_t *q = rq->q; 352 request_queue_t *q = rq->q;
353 353
354 elv_completed_request(q, flush_rq); 354 elv_completed_request(q, flush_rq);
355 355
356 rq->flags |= REQ_BAR_PREFLUSH; 356 rq->flags |= REQ_BAR_PREFLUSH;
357 357
358 if (!flush_rq->errors) 358 if (!flush_rq->errors)
359 elv_requeue_request(q, rq); 359 elv_requeue_request(q, rq);
360 else { 360 else {
361 q->end_flush_fn(q, flush_rq); 361 q->end_flush_fn(q, flush_rq);
362 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 362 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
363 q->request_fn(q); 363 q->request_fn(q);
364 } 364 }
365 } 365 }
366 366
367 static void blk_post_flush_end_io(struct request *flush_rq) 367 static void blk_post_flush_end_io(struct request *flush_rq)
368 { 368 {
369 struct request *rq = flush_rq->end_io_data; 369 struct request *rq = flush_rq->end_io_data;
370 request_queue_t *q = rq->q; 370 request_queue_t *q = rq->q;
371 371
372 elv_completed_request(q, flush_rq); 372 elv_completed_request(q, flush_rq);
373 373
374 rq->flags |= REQ_BAR_POSTFLUSH; 374 rq->flags |= REQ_BAR_POSTFLUSH;
375 375
376 q->end_flush_fn(q, flush_rq); 376 q->end_flush_fn(q, flush_rq);
377 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 377 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
378 q->request_fn(q); 378 q->request_fn(q);
379 } 379 }
380 380
381 struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq) 381 struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq)
382 { 382 {
383 struct request *flush_rq = q->flush_rq; 383 struct request *flush_rq = q->flush_rq;
384 384
385 BUG_ON(!blk_barrier_rq(rq)); 385 BUG_ON(!blk_barrier_rq(rq));
386 386
387 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags)) 387 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags))
388 return NULL; 388 return NULL;
389 389
390 rq_init(q, flush_rq); 390 rq_init(q, flush_rq);
391 flush_rq->elevator_private = NULL; 391 flush_rq->elevator_private = NULL;
392 flush_rq->flags = REQ_BAR_FLUSH; 392 flush_rq->flags = REQ_BAR_FLUSH;
393 flush_rq->rq_disk = rq->rq_disk; 393 flush_rq->rq_disk = rq->rq_disk;
394 flush_rq->rl = NULL; 394 flush_rq->rl = NULL;
395 395
396 /* 396 /*
397 * prepare_flush returns 0 if no flush is needed, just mark both 397 * prepare_flush returns 0 if no flush is needed, just mark both
398 * pre and post flush as done in that case 398 * pre and post flush as done in that case
399 */ 399 */
400 if (!q->prepare_flush_fn(q, flush_rq)) { 400 if (!q->prepare_flush_fn(q, flush_rq)) {
401 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH; 401 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
402 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 402 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
403 return rq; 403 return rq;
404 } 404 }
405 405
406 /* 406 /*
407 * some drivers dequeue requests right away, some only after io 407 * some drivers dequeue requests right away, some only after io
408 * completion. make sure the request is dequeued. 408 * completion. make sure the request is dequeued.
409 */ 409 */
410 if (!list_empty(&rq->queuelist)) 410 if (!list_empty(&rq->queuelist))
411 blkdev_dequeue_request(rq); 411 blkdev_dequeue_request(rq);
412 412
413 flush_rq->end_io_data = rq; 413 flush_rq->end_io_data = rq;
414 flush_rq->end_io = blk_pre_flush_end_io; 414 flush_rq->end_io = blk_pre_flush_end_io;
415 415
416 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 416 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
417 return flush_rq; 417 return flush_rq;
418 } 418 }
419 419
420 static void blk_start_post_flush(request_queue_t *q, struct request *rq) 420 static void blk_start_post_flush(request_queue_t *q, struct request *rq)
421 { 421 {
422 struct request *flush_rq = q->flush_rq; 422 struct request *flush_rq = q->flush_rq;
423 423
424 BUG_ON(!blk_barrier_rq(rq)); 424 BUG_ON(!blk_barrier_rq(rq));
425 425
426 rq_init(q, flush_rq); 426 rq_init(q, flush_rq);
427 flush_rq->elevator_private = NULL; 427 flush_rq->elevator_private = NULL;
428 flush_rq->flags = REQ_BAR_FLUSH; 428 flush_rq->flags = REQ_BAR_FLUSH;
429 flush_rq->rq_disk = rq->rq_disk; 429 flush_rq->rq_disk = rq->rq_disk;
430 flush_rq->rl = NULL; 430 flush_rq->rl = NULL;
431 431
432 if (q->prepare_flush_fn(q, flush_rq)) { 432 if (q->prepare_flush_fn(q, flush_rq)) {
433 flush_rq->end_io_data = rq; 433 flush_rq->end_io_data = rq;
434 flush_rq->end_io = blk_post_flush_end_io; 434 flush_rq->end_io = blk_post_flush_end_io;
435 435
436 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 436 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0);
437 q->request_fn(q); 437 q->request_fn(q);
438 } 438 }
439 } 439 }
440 440
441 static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq, 441 static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq,
442 int sectors) 442 int sectors)
443 { 443 {
444 if (sectors > rq->nr_sectors) 444 if (sectors > rq->nr_sectors)
445 sectors = rq->nr_sectors; 445 sectors = rq->nr_sectors;
446 446
447 rq->nr_sectors -= sectors; 447 rq->nr_sectors -= sectors;
448 return rq->nr_sectors; 448 return rq->nr_sectors;
449 } 449 }
450 450
451 static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq, 451 static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq,
452 int sectors, int queue_locked) 452 int sectors, int queue_locked)
453 { 453 {
454 if (q->ordered != QUEUE_ORDERED_FLUSH) 454 if (q->ordered != QUEUE_ORDERED_FLUSH)
455 return 0; 455 return 0;
456 if (!blk_fs_request(rq) || !blk_barrier_rq(rq)) 456 if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
457 return 0; 457 return 0;
458 if (blk_barrier_postflush(rq)) 458 if (blk_barrier_postflush(rq))
459 return 0; 459 return 0;
460 460
461 if (!blk_check_end_barrier(q, rq, sectors)) { 461 if (!blk_check_end_barrier(q, rq, sectors)) {
462 unsigned long flags = 0; 462 unsigned long flags = 0;
463 463
464 if (!queue_locked) 464 if (!queue_locked)
465 spin_lock_irqsave(q->queue_lock, flags); 465 spin_lock_irqsave(q->queue_lock, flags);
466 466
467 blk_start_post_flush(q, rq); 467 blk_start_post_flush(q, rq);
468 468
469 if (!queue_locked) 469 if (!queue_locked)
470 spin_unlock_irqrestore(q->queue_lock, flags); 470 spin_unlock_irqrestore(q->queue_lock, flags);
471 } 471 }
472 472
473 return 1; 473 return 1;
474 } 474 }
475 475
476 /** 476 /**
477 * blk_complete_barrier_rq - complete possible barrier request 477 * blk_complete_barrier_rq - complete possible barrier request
478 * @q: the request queue for the device 478 * @q: the request queue for the device
479 * @rq: the request 479 * @rq: the request
480 * @sectors: number of sectors to complete 480 * @sectors: number of sectors to complete
481 * 481 *
482 * Description: 482 * Description:
483 * Used in driver end_io handling to determine whether to postpone 483 * Used in driver end_io handling to determine whether to postpone
484 * completion of a barrier request until a post flush has been done. This 484 * completion of a barrier request until a post flush has been done. This
485 * is the unlocked variant, used if the caller doesn't already hold the 485 * is the unlocked variant, used if the caller doesn't already hold the
486 * queue lock. 486 * queue lock.
487 **/ 487 **/
488 int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors) 488 int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
489 { 489 {
490 return __blk_complete_barrier_rq(q, rq, sectors, 0); 490 return __blk_complete_barrier_rq(q, rq, sectors, 0);
491 } 491 }
492 EXPORT_SYMBOL(blk_complete_barrier_rq); 492 EXPORT_SYMBOL(blk_complete_barrier_rq);
493 493
494 /** 494 /**
495 * blk_complete_barrier_rq_locked - complete possible barrier request 495 * blk_complete_barrier_rq_locked - complete possible barrier request
496 * @q: the request queue for the device 496 * @q: the request queue for the device
497 * @rq: the request 497 * @rq: the request
498 * @sectors: number of sectors to complete 498 * @sectors: number of sectors to complete
499 * 499 *
500 * Description: 500 * Description:
501 * See blk_complete_barrier_rq(). This variant must be used if the caller 501 * See blk_complete_barrier_rq(). This variant must be used if the caller
502 * holds the queue lock. 502 * holds the queue lock.
503 **/ 503 **/
504 int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq, 504 int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
505 int sectors) 505 int sectors)
506 { 506 {
507 return __blk_complete_barrier_rq(q, rq, sectors, 1); 507 return __blk_complete_barrier_rq(q, rq, sectors, 1);
508 } 508 }
509 EXPORT_SYMBOL(blk_complete_barrier_rq_locked); 509 EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
510 510
511 /** 511 /**
512 * blk_queue_bounce_limit - set bounce buffer limit for queue 512 * blk_queue_bounce_limit - set bounce buffer limit for queue
513 * @q: the request queue for the device 513 * @q: the request queue for the device
514 * @dma_addr: bus address limit 514 * @dma_addr: bus address limit
515 * 515 *
516 * Description: 516 * Description:
517 * Different hardware can have different requirements as to what pages 517 * Different hardware can have different requirements as to what pages
518 * it can do I/O directly to. A low level driver can call 518 * it can do I/O directly to. A low level driver can call
519 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 519 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
520 * buffers for doing I/O to pages residing above @page. By default 520 * buffers for doing I/O to pages residing above @page. By default
521 * the block layer sets this to the highest numbered "low" memory page. 521 * the block layer sets this to the highest numbered "low" memory page.
522 **/ 522 **/
523 void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) 523 void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
524 { 524 {
525 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 525 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
526 526
527 /* 527 /*
528 * set appropriate bounce gfp mask -- unfortunately we don't have a 528 * set appropriate bounce gfp mask -- unfortunately we don't have a
529 * full 4GB zone, so we have to resort to low memory for any bounces. 529 * full 4GB zone, so we have to resort to low memory for any bounces.
530 * ISA has its own < 16MB zone. 530 * ISA has its own < 16MB zone.
531 */ 531 */
532 if (bounce_pfn < blk_max_low_pfn) { 532 if (bounce_pfn < blk_max_low_pfn) {
533 BUG_ON(dma_addr < BLK_BOUNCE_ISA); 533 BUG_ON(dma_addr < BLK_BOUNCE_ISA);
534 init_emergency_isa_pool(); 534 init_emergency_isa_pool();
535 q->bounce_gfp = GFP_NOIO | GFP_DMA; 535 q->bounce_gfp = GFP_NOIO | GFP_DMA;
536 } else 536 } else
537 q->bounce_gfp = GFP_NOIO; 537 q->bounce_gfp = GFP_NOIO;
538 538
539 q->bounce_pfn = bounce_pfn; 539 q->bounce_pfn = bounce_pfn;
540 } 540 }
541 541
542 EXPORT_SYMBOL(blk_queue_bounce_limit); 542 EXPORT_SYMBOL(blk_queue_bounce_limit);
543 543
544 /** 544 /**
545 * blk_queue_max_sectors - set max sectors for a request for this queue 545 * blk_queue_max_sectors - set max sectors for a request for this queue
546 * @q: the request queue for the device 546 * @q: the request queue for the device
547 * @max_sectors: max sectors in the usual 512b unit 547 * @max_sectors: max sectors in the usual 512b unit
548 * 548 *
549 * Description: 549 * Description:
550 * Enables a low level driver to set an upper limit on the size of 550 * Enables a low level driver to set an upper limit on the size of
551 * received requests. 551 * received requests.
552 **/ 552 **/
553 void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors) 553 void blk_queue_max_sectors(request_queue_t *q, unsigned short max_sectors)
554 { 554 {
555 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 555 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
556 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 556 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
557 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 557 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
558 } 558 }
559 559
560 q->max_sectors = q->max_hw_sectors = max_sectors; 560 q->max_sectors = q->max_hw_sectors = max_sectors;
561 } 561 }
562 562
563 EXPORT_SYMBOL(blk_queue_max_sectors); 563 EXPORT_SYMBOL(blk_queue_max_sectors);
564 564
565 /** 565 /**
566 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 566 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
567 * @q: the request queue for the device 567 * @q: the request queue for the device
568 * @max_segments: max number of segments 568 * @max_segments: max number of segments
569 * 569 *
570 * Description: 570 * Description:
571 * Enables a low level driver to set an upper limit on the number of 571 * Enables a low level driver to set an upper limit on the number of
572 * physical data segments in a request. This would be the largest sized 572 * physical data segments in a request. This would be the largest sized
573 * scatter list the driver could handle. 573 * scatter list the driver could handle.
574 **/ 574 **/
575 void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments) 575 void blk_queue_max_phys_segments(request_queue_t *q, unsigned short max_segments)
576 { 576 {
577 if (!max_segments) { 577 if (!max_segments) {
578 max_segments = 1; 578 max_segments = 1;
579 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 579 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
580 } 580 }
581 581
582 q->max_phys_segments = max_segments; 582 q->max_phys_segments = max_segments;
583 } 583 }
584 584
585 EXPORT_SYMBOL(blk_queue_max_phys_segments); 585 EXPORT_SYMBOL(blk_queue_max_phys_segments);
586 586
587 /** 587 /**
588 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 588 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
589 * @q: the request queue for the device 589 * @q: the request queue for the device
590 * @max_segments: max number of segments 590 * @max_segments: max number of segments
591 * 591 *
592 * Description: 592 * Description:
593 * Enables a low level driver to set an upper limit on the number of 593 * Enables a low level driver to set an upper limit on the number of
594 * hw data segments in a request. This would be the largest number of 594 * hw data segments in a request. This would be the largest number of
595 * address/length pairs the host adapter can actually give as once 595 * address/length pairs the host adapter can actually give as once
596 * to the device. 596 * to the device.
597 **/ 597 **/
598 void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments) 598 void blk_queue_max_hw_segments(request_queue_t *q, unsigned short max_segments)
599 { 599 {
600 if (!max_segments) { 600 if (!max_segments) {
601 max_segments = 1; 601 max_segments = 1;
602 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 602 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
603 } 603 }
604 604
605 q->max_hw_segments = max_segments; 605 q->max_hw_segments = max_segments;
606 } 606 }
607 607
608 EXPORT_SYMBOL(blk_queue_max_hw_segments); 608 EXPORT_SYMBOL(blk_queue_max_hw_segments);
609 609
610 /** 610 /**
611 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 611 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
612 * @q: the request queue for the device 612 * @q: the request queue for the device
613 * @max_size: max size of segment in bytes 613 * @max_size: max size of segment in bytes
614 * 614 *
615 * Description: 615 * Description:
616 * Enables a low level driver to set an upper limit on the size of a 616 * Enables a low level driver to set an upper limit on the size of a
617 * coalesced segment 617 * coalesced segment
618 **/ 618 **/
619 void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size) 619 void blk_queue_max_segment_size(request_queue_t *q, unsigned int max_size)
620 { 620 {
621 if (max_size < PAGE_CACHE_SIZE) { 621 if (max_size < PAGE_CACHE_SIZE) {
622 max_size = PAGE_CACHE_SIZE; 622 max_size = PAGE_CACHE_SIZE;
623 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 623 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
624 } 624 }
625 625
626 q->max_segment_size = max_size; 626 q->max_segment_size = max_size;
627 } 627 }
628 628
629 EXPORT_SYMBOL(blk_queue_max_segment_size); 629 EXPORT_SYMBOL(blk_queue_max_segment_size);
630 630
631 /** 631 /**
632 * blk_queue_hardsect_size - set hardware sector size for the queue 632 * blk_queue_hardsect_size - set hardware sector size for the queue
633 * @q: the request queue for the device 633 * @q: the request queue for the device
634 * @size: the hardware sector size, in bytes 634 * @size: the hardware sector size, in bytes
635 * 635 *
636 * Description: 636 * Description:
637 * This should typically be set to the lowest possible sector size 637 * This should typically be set to the lowest possible sector size
638 * that the hardware can operate on (possible without reverting to 638 * that the hardware can operate on (possible without reverting to
639 * even internal read-modify-write operations). Usually the default 639 * even internal read-modify-write operations). Usually the default
640 * of 512 covers most hardware. 640 * of 512 covers most hardware.
641 **/ 641 **/
642 void blk_queue_hardsect_size(request_queue_t *q, unsigned short size) 642 void blk_queue_hardsect_size(request_queue_t *q, unsigned short size)
643 { 643 {
644 q->hardsect_size = size; 644 q->hardsect_size = size;
645 } 645 }
646 646
647 EXPORT_SYMBOL(blk_queue_hardsect_size); 647 EXPORT_SYMBOL(blk_queue_hardsect_size);
648 648
649 /* 649 /*
650 * Returns the minimum that is _not_ zero, unless both are zero. 650 * Returns the minimum that is _not_ zero, unless both are zero.
651 */ 651 */
652 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 652 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
653 653
654 /** 654 /**
655 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 655 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
656 * @t: the stacking driver (top) 656 * @t: the stacking driver (top)
657 * @b: the underlying device (bottom) 657 * @b: the underlying device (bottom)
658 **/ 658 **/
659 void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) 659 void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b)
660 { 660 {
661 /* zero is "infinity" */ 661 /* zero is "infinity" */
662 t->max_sectors = t->max_hw_sectors = 662 t->max_sectors = t->max_hw_sectors =
663 min_not_zero(t->max_sectors,b->max_sectors); 663 min_not_zero(t->max_sectors,b->max_sectors);
664 664
665 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 665 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
666 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 666 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
667 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 667 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
668 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 668 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
669 } 669 }
670 670
671 EXPORT_SYMBOL(blk_queue_stack_limits); 671 EXPORT_SYMBOL(blk_queue_stack_limits);
672 672
673 /** 673 /**
674 * blk_queue_segment_boundary - set boundary rules for segment merging 674 * blk_queue_segment_boundary - set boundary rules for segment merging
675 * @q: the request queue for the device 675 * @q: the request queue for the device
676 * @mask: the memory boundary mask 676 * @mask: the memory boundary mask
677 **/ 677 **/
678 void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask) 678 void blk_queue_segment_boundary(request_queue_t *q, unsigned long mask)
679 { 679 {
680 if (mask < PAGE_CACHE_SIZE - 1) { 680 if (mask < PAGE_CACHE_SIZE - 1) {
681 mask = PAGE_CACHE_SIZE - 1; 681 mask = PAGE_CACHE_SIZE - 1;
682 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 682 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
683 } 683 }
684 684
685 q->seg_boundary_mask = mask; 685 q->seg_boundary_mask = mask;
686 } 686 }
687 687
688 EXPORT_SYMBOL(blk_queue_segment_boundary); 688 EXPORT_SYMBOL(blk_queue_segment_boundary);
689 689
690 /** 690 /**
691 * blk_queue_dma_alignment - set dma length and memory alignment 691 * blk_queue_dma_alignment - set dma length and memory alignment
692 * @q: the request queue for the device 692 * @q: the request queue for the device
693 * @mask: alignment mask 693 * @mask: alignment mask
694 * 694 *
695 * description: 695 * description:
696 * set required memory and length aligment for direct dma transactions. 696 * set required memory and length aligment for direct dma transactions.
697 * this is used when buiding direct io requests for the queue. 697 * this is used when buiding direct io requests for the queue.
698 * 698 *
699 **/ 699 **/
700 void blk_queue_dma_alignment(request_queue_t *q, int mask) 700 void blk_queue_dma_alignment(request_queue_t *q, int mask)
701 { 701 {
702 q->dma_alignment = mask; 702 q->dma_alignment = mask;
703 } 703 }
704 704
705 EXPORT_SYMBOL(blk_queue_dma_alignment); 705 EXPORT_SYMBOL(blk_queue_dma_alignment);
706 706
707 /** 707 /**
708 * blk_queue_find_tag - find a request by its tag and queue 708 * blk_queue_find_tag - find a request by its tag and queue
709 *
710 * @q: The request queue for the device 709 * @q: The request queue for the device
711 * @tag: The tag of the request 710 * @tag: The tag of the request
712 * 711 *
713 * Notes: 712 * Notes:
714 * Should be used when a device returns a tag and you want to match 713 * Should be used when a device returns a tag and you want to match
715 * it with a request. 714 * it with a request.
716 * 715 *
717 * no locks need be held. 716 * no locks need be held.
718 **/ 717 **/
719 struct request *blk_queue_find_tag(request_queue_t *q, int tag) 718 struct request *blk_queue_find_tag(request_queue_t *q, int tag)
720 { 719 {
721 struct blk_queue_tag *bqt = q->queue_tags; 720 struct blk_queue_tag *bqt = q->queue_tags;
722 721
723 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 722 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
724 return NULL; 723 return NULL;
725 724
726 return bqt->tag_index[tag]; 725 return bqt->tag_index[tag];
727 } 726 }
728 727
729 EXPORT_SYMBOL(blk_queue_find_tag); 728 EXPORT_SYMBOL(blk_queue_find_tag);
730 729
731 /** 730 /**
732 * __blk_queue_free_tags - release tag maintenance info 731 * __blk_queue_free_tags - release tag maintenance info
733 * @q: the request queue for the device 732 * @q: the request queue for the device
734 * 733 *
735 * Notes: 734 * Notes:
736 * blk_cleanup_queue() will take care of calling this function, if tagging 735 * blk_cleanup_queue() will take care of calling this function, if tagging
737 * has been used. So there's no need to call this directly. 736 * has been used. So there's no need to call this directly.
738 **/ 737 **/
739 static void __blk_queue_free_tags(request_queue_t *q) 738 static void __blk_queue_free_tags(request_queue_t *q)
740 { 739 {
741 struct blk_queue_tag *bqt = q->queue_tags; 740 struct blk_queue_tag *bqt = q->queue_tags;
742 741
743 if (!bqt) 742 if (!bqt)
744 return; 743 return;
745 744
746 if (atomic_dec_and_test(&bqt->refcnt)) { 745 if (atomic_dec_and_test(&bqt->refcnt)) {
747 BUG_ON(bqt->busy); 746 BUG_ON(bqt->busy);
748 BUG_ON(!list_empty(&bqt->busy_list)); 747 BUG_ON(!list_empty(&bqt->busy_list));
749 748
750 kfree(bqt->tag_index); 749 kfree(bqt->tag_index);
751 bqt->tag_index = NULL; 750 bqt->tag_index = NULL;
752 751
753 kfree(bqt->tag_map); 752 kfree(bqt->tag_map);
754 bqt->tag_map = NULL; 753 bqt->tag_map = NULL;
755 754
756 kfree(bqt); 755 kfree(bqt);
757 } 756 }
758 757
759 q->queue_tags = NULL; 758 q->queue_tags = NULL;
760 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 759 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
761 } 760 }
762 761
763 /** 762 /**
764 * blk_queue_free_tags - release tag maintenance info 763 * blk_queue_free_tags - release tag maintenance info
765 * @q: the request queue for the device 764 * @q: the request queue for the device
766 * 765 *
767 * Notes: 766 * Notes:
768 * This is used to disabled tagged queuing to a device, yet leave 767 * This is used to disabled tagged queuing to a device, yet leave
769 * queue in function. 768 * queue in function.
770 **/ 769 **/
771 void blk_queue_free_tags(request_queue_t *q) 770 void blk_queue_free_tags(request_queue_t *q)
772 { 771 {
773 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 772 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
774 } 773 }
775 774
776 EXPORT_SYMBOL(blk_queue_free_tags); 775 EXPORT_SYMBOL(blk_queue_free_tags);
777 776
778 static int 777 static int
779 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) 778 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
780 { 779 {
781 struct request **tag_index; 780 struct request **tag_index;
782 unsigned long *tag_map; 781 unsigned long *tag_map;
783 int nr_ulongs; 782 int nr_ulongs;
784 783
785 if (depth > q->nr_requests * 2) { 784 if (depth > q->nr_requests * 2) {
786 depth = q->nr_requests * 2; 785 depth = q->nr_requests * 2;
787 printk(KERN_ERR "%s: adjusted depth to %d\n", 786 printk(KERN_ERR "%s: adjusted depth to %d\n",
788 __FUNCTION__, depth); 787 __FUNCTION__, depth);
789 } 788 }
790 789
791 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); 790 tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC);
792 if (!tag_index) 791 if (!tag_index)
793 goto fail; 792 goto fail;
794 793
795 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 794 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
796 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 795 tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
797 if (!tag_map) 796 if (!tag_map)
798 goto fail; 797 goto fail;
799 798
800 memset(tag_index, 0, depth * sizeof(struct request *)); 799 memset(tag_index, 0, depth * sizeof(struct request *));
801 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long)); 800 memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
802 tags->real_max_depth = depth; 801 tags->real_max_depth = depth;
803 tags->max_depth = depth; 802 tags->max_depth = depth;
804 tags->tag_index = tag_index; 803 tags->tag_index = tag_index;
805 tags->tag_map = tag_map; 804 tags->tag_map = tag_map;
806 805
807 return 0; 806 return 0;
808 fail: 807 fail:
809 kfree(tag_index); 808 kfree(tag_index);
810 return -ENOMEM; 809 return -ENOMEM;
811 } 810 }
812 811
813 /** 812 /**
814 * blk_queue_init_tags - initialize the queue tag info 813 * blk_queue_init_tags - initialize the queue tag info
815 * @q: the request queue for the device 814 * @q: the request queue for the device
816 * @depth: the maximum queue depth supported 815 * @depth: the maximum queue depth supported
817 * @tags: the tag to use 816 * @tags: the tag to use
818 **/ 817 **/
819 int blk_queue_init_tags(request_queue_t *q, int depth, 818 int blk_queue_init_tags(request_queue_t *q, int depth,
820 struct blk_queue_tag *tags) 819 struct blk_queue_tag *tags)
821 { 820 {
822 int rc; 821 int rc;
823 822
824 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 823 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
825 824
826 if (!tags && !q->queue_tags) { 825 if (!tags && !q->queue_tags) {
827 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 826 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
828 if (!tags) 827 if (!tags)
829 goto fail; 828 goto fail;
830 829
831 if (init_tag_map(q, tags, depth)) 830 if (init_tag_map(q, tags, depth))
832 goto fail; 831 goto fail;
833 832
834 INIT_LIST_HEAD(&tags->busy_list); 833 INIT_LIST_HEAD(&tags->busy_list);
835 tags->busy = 0; 834 tags->busy = 0;
836 atomic_set(&tags->refcnt, 1); 835 atomic_set(&tags->refcnt, 1);
837 } else if (q->queue_tags) { 836 } else if (q->queue_tags) {
838 if ((rc = blk_queue_resize_tags(q, depth))) 837 if ((rc = blk_queue_resize_tags(q, depth)))
839 return rc; 838 return rc;
840 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 839 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
841 return 0; 840 return 0;
842 } else 841 } else
843 atomic_inc(&tags->refcnt); 842 atomic_inc(&tags->refcnt);
844 843
845 /* 844 /*
846 * assign it, all done 845 * assign it, all done
847 */ 846 */
848 q->queue_tags = tags; 847 q->queue_tags = tags;
849 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 848 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
850 return 0; 849 return 0;
851 fail: 850 fail:
852 kfree(tags); 851 kfree(tags);
853 return -ENOMEM; 852 return -ENOMEM;
854 } 853 }
855 854
856 EXPORT_SYMBOL(blk_queue_init_tags); 855 EXPORT_SYMBOL(blk_queue_init_tags);
857 856
858 /** 857 /**
859 * blk_queue_resize_tags - change the queueing depth 858 * blk_queue_resize_tags - change the queueing depth
860 * @q: the request queue for the device 859 * @q: the request queue for the device
861 * @new_depth: the new max command queueing depth 860 * @new_depth: the new max command queueing depth
862 * 861 *
863 * Notes: 862 * Notes:
864 * Must be called with the queue lock held. 863 * Must be called with the queue lock held.
865 **/ 864 **/
866 int blk_queue_resize_tags(request_queue_t *q, int new_depth) 865 int blk_queue_resize_tags(request_queue_t *q, int new_depth)
867 { 866 {
868 struct blk_queue_tag *bqt = q->queue_tags; 867 struct blk_queue_tag *bqt = q->queue_tags;
869 struct request **tag_index; 868 struct request **tag_index;
870 unsigned long *tag_map; 869 unsigned long *tag_map;
871 int max_depth, nr_ulongs; 870 int max_depth, nr_ulongs;
872 871
873 if (!bqt) 872 if (!bqt)
874 return -ENXIO; 873 return -ENXIO;
875 874
876 /* 875 /*
877 * if we already have large enough real_max_depth. just 876 * if we already have large enough real_max_depth. just
878 * adjust max_depth. *NOTE* as requests with tag value 877 * adjust max_depth. *NOTE* as requests with tag value
879 * between new_depth and real_max_depth can be in-flight, tag 878 * between new_depth and real_max_depth can be in-flight, tag
880 * map can not be shrunk blindly here. 879 * map can not be shrunk blindly here.
881 */ 880 */
882 if (new_depth <= bqt->real_max_depth) { 881 if (new_depth <= bqt->real_max_depth) {
883 bqt->max_depth = new_depth; 882 bqt->max_depth = new_depth;
884 return 0; 883 return 0;
885 } 884 }
886 885
887 /* 886 /*
888 * save the old state info, so we can copy it back 887 * save the old state info, so we can copy it back
889 */ 888 */
890 tag_index = bqt->tag_index; 889 tag_index = bqt->tag_index;
891 tag_map = bqt->tag_map; 890 tag_map = bqt->tag_map;
892 max_depth = bqt->real_max_depth; 891 max_depth = bqt->real_max_depth;
893 892
894 if (init_tag_map(q, bqt, new_depth)) 893 if (init_tag_map(q, bqt, new_depth))
895 return -ENOMEM; 894 return -ENOMEM;
896 895
897 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 896 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
898 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 897 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
899 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 898 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
900 899
901 kfree(tag_index); 900 kfree(tag_index);
902 kfree(tag_map); 901 kfree(tag_map);
903 return 0; 902 return 0;
904 } 903 }
905 904
906 EXPORT_SYMBOL(blk_queue_resize_tags); 905 EXPORT_SYMBOL(blk_queue_resize_tags);
907 906
908 /** 907 /**
909 * blk_queue_end_tag - end tag operations for a request 908 * blk_queue_end_tag - end tag operations for a request
910 * @q: the request queue for the device 909 * @q: the request queue for the device
911 * @rq: the request that has completed 910 * @rq: the request that has completed
912 * 911 *
913 * Description: 912 * Description:
914 * Typically called when end_that_request_first() returns 0, meaning 913 * Typically called when end_that_request_first() returns 0, meaning
915 * all transfers have been done for a request. It's important to call 914 * all transfers have been done for a request. It's important to call
916 * this function before end_that_request_last(), as that will put the 915 * this function before end_that_request_last(), as that will put the
917 * request back on the free list thus corrupting the internal tag list. 916 * request back on the free list thus corrupting the internal tag list.
918 * 917 *
919 * Notes: 918 * Notes:
920 * queue lock must be held. 919 * queue lock must be held.
921 **/ 920 **/
922 void blk_queue_end_tag(request_queue_t *q, struct request *rq) 921 void blk_queue_end_tag(request_queue_t *q, struct request *rq)
923 { 922 {
924 struct blk_queue_tag *bqt = q->queue_tags; 923 struct blk_queue_tag *bqt = q->queue_tags;
925 int tag = rq->tag; 924 int tag = rq->tag;
926 925
927 BUG_ON(tag == -1); 926 BUG_ON(tag == -1);
928 927
929 if (unlikely(tag >= bqt->real_max_depth)) 928 if (unlikely(tag >= bqt->real_max_depth))
930 /* 929 /*
931 * This can happen after tag depth has been reduced. 930 * This can happen after tag depth has been reduced.
932 * FIXME: how about a warning or info message here? 931 * FIXME: how about a warning or info message here?
933 */ 932 */
934 return; 933 return;
935 934
936 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { 935 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
937 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 936 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
938 __FUNCTION__, tag); 937 __FUNCTION__, tag);
939 return; 938 return;
940 } 939 }
941 940
942 list_del_init(&rq->queuelist); 941 list_del_init(&rq->queuelist);
943 rq->flags &= ~REQ_QUEUED; 942 rq->flags &= ~REQ_QUEUED;
944 rq->tag = -1; 943 rq->tag = -1;
945 944
946 if (unlikely(bqt->tag_index[tag] == NULL)) 945 if (unlikely(bqt->tag_index[tag] == NULL))
947 printk(KERN_ERR "%s: tag %d is missing\n", 946 printk(KERN_ERR "%s: tag %d is missing\n",
948 __FUNCTION__, tag); 947 __FUNCTION__, tag);
949 948
950 bqt->tag_index[tag] = NULL; 949 bqt->tag_index[tag] = NULL;
951 bqt->busy--; 950 bqt->busy--;
952 } 951 }
953 952
954 EXPORT_SYMBOL(blk_queue_end_tag); 953 EXPORT_SYMBOL(blk_queue_end_tag);
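A minimal completion-path sketch for a tagged driver, following the ordering in the kernel-doc above: release the tag before end_that_request_last(). The function and variable names are hypothetical, and end_that_request_last() takes only the request in older trees but an extra uptodate argument in later ones, so adjust to the kernel in use.

static void my_complete_rq(request_queue_t *q, struct request *rq, int uptodate)
{
        unsigned long flags;

        /* non-zero return: more segments remain, the request is not finished */
        if (end_that_request_first(rq, uptodate, rq->hard_nr_sectors))
                return;

        spin_lock_irqsave(q->queue_lock, flags);
        blk_queue_end_tag(q, rq);       /* must come before end_that_request_last() */
        end_that_request_last(rq);      /* or end_that_request_last(rq, uptodate) */
        spin_unlock_irqrestore(q->queue_lock, flags);
}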
955 954
956 /** 955 /**
957 * blk_queue_start_tag - find a free tag and assign it 956 * blk_queue_start_tag - find a free tag and assign it
958 * @q: the request queue for the device 957 * @q: the request queue for the device
959 * @rq: the block request that needs tagging 958 * @rq: the block request that needs tagging
960 * 959 *
961 * Description: 960 * Description:
962 * This can either be used as a stand-alone helper, or possibly be 961 * This can either be used as a stand-alone helper, or possibly be
963 * assigned as the queue &prep_rq_fn (in which case &struct request 962 * assigned as the queue &prep_rq_fn (in which case &struct request
964 * automagically gets a tag assigned). Note that this function 963 * automagically gets a tag assigned). Note that this function
965 * assumes that any type of request can be queued! If this is not 964 * assumes that any type of request can be queued! If this is not
966 * true for your device, you must check the request type before 965 * true for your device, you must check the request type before
967 * calling this function. The request will also be removed from 966 * calling this function. The request will also be removed from
968 * the request queue, so it's the driver's responsibility to re-add 967 * the request queue, so it's the driver's responsibility to re-add
969 * it if it should need to be restarted for some reason. 968 * it if it should need to be restarted for some reason.
970 * 969 *
971 * Notes: 970 * Notes:
972 * queue lock must be held. 971 * queue lock must be held.
973 **/ 972 **/
974 int blk_queue_start_tag(request_queue_t *q, struct request *rq) 973 int blk_queue_start_tag(request_queue_t *q, struct request *rq)
975 { 974 {
976 struct blk_queue_tag *bqt = q->queue_tags; 975 struct blk_queue_tag *bqt = q->queue_tags;
977 int tag; 976 int tag;
978 977
979 if (unlikely((rq->flags & REQ_QUEUED))) { 978 if (unlikely((rq->flags & REQ_QUEUED))) {
980 printk(KERN_ERR 979 printk(KERN_ERR
981 "%s: request %p for device [%s] already tagged %d", 980 "%s: request %p for device [%s] already tagged %d",
982 __FUNCTION__, rq, 981 __FUNCTION__, rq,
983 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 982 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
984 BUG(); 983 BUG();
985 } 984 }
986 985
987 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 986 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
988 if (tag >= bqt->max_depth) 987 if (tag >= bqt->max_depth)
989 return 1; 988 return 1;
990 989
991 __set_bit(tag, bqt->tag_map); 990 __set_bit(tag, bqt->tag_map);
992 991
993 rq->flags |= REQ_QUEUED; 992 rq->flags |= REQ_QUEUED;
994 rq->tag = tag; 993 rq->tag = tag;
995 bqt->tag_index[tag] = rq; 994 bqt->tag_index[tag] = rq;
996 blkdev_dequeue_request(rq); 995 blkdev_dequeue_request(rq);
997 list_add(&rq->queuelist, &bqt->busy_list); 996 list_add(&rq->queuelist, &bqt->busy_list);
998 bqt->busy++; 997 bqt->busy++;
999 return 0; 998 return 0;
1000 } 999 }
1001 1000
1002 EXPORT_SYMBOL(blk_queue_start_tag); 1001 EXPORT_SYMBOL(blk_queue_start_tag);
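A minimal request_fn sketch using blk_queue_start_tag() as a stand-alone helper; my_issue_to_hardware() is hypothetical and the tag map is assumed to have been set up earlier with blk_queue_init_tags().

static void my_request_fn(request_queue_t *q)
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                /*
                 * A non-zero return means no free tag: stop issuing and wait
                 * for a completion to release one. On success the request
                 * has been tagged and dequeued for us.
                 */
                if (blk_queue_start_tag(q, rq))
                        break;

                my_issue_to_hardware(rq);       /* hypothetical */
        }
}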
1003 1002
1004 /** 1003 /**
1005 * blk_queue_invalidate_tags - invalidate all pending tags 1004 * blk_queue_invalidate_tags - invalidate all pending tags
1006 * @q: the request queue for the device 1005 * @q: the request queue for the device
1007 * 1006 *
1008 * Description: 1007 * Description:
1009 * Hardware conditions may dictate a need to stop all pending requests. 1008 * Hardware conditions may dictate a need to stop all pending requests.
1010 * In this case, we will safely clear the block side of the tag queue and 1009 * In this case, we will safely clear the block side of the tag queue and
1011 * re-add all requests to the request queue in the right order. 1010 * re-add all requests to the request queue in the right order.
1012 * 1011 *
1013 * Notes: 1012 * Notes:
1014 * queue lock must be held. 1013 * queue lock must be held.
1015 **/ 1014 **/
1016 void blk_queue_invalidate_tags(request_queue_t *q) 1015 void blk_queue_invalidate_tags(request_queue_t *q)
1017 { 1016 {
1018 struct blk_queue_tag *bqt = q->queue_tags; 1017 struct blk_queue_tag *bqt = q->queue_tags;
1019 struct list_head *tmp, *n; 1018 struct list_head *tmp, *n;
1020 struct request *rq; 1019 struct request *rq;
1021 1020
1022 list_for_each_safe(tmp, n, &bqt->busy_list) { 1021 list_for_each_safe(tmp, n, &bqt->busy_list) {
1023 rq = list_entry_rq(tmp); 1022 rq = list_entry_rq(tmp);
1024 1023
1025 if (rq->tag == -1) { 1024 if (rq->tag == -1) {
1026 printk(KERN_ERR 1025 printk(KERN_ERR
1027 "%s: bad tag found on list\n", __FUNCTION__); 1026 "%s: bad tag found on list\n", __FUNCTION__);
1028 list_del_init(&rq->queuelist); 1027 list_del_init(&rq->queuelist);
1029 rq->flags &= ~REQ_QUEUED; 1028 rq->flags &= ~REQ_QUEUED;
1030 } else 1029 } else
1031 blk_queue_end_tag(q, rq); 1030 blk_queue_end_tag(q, rq);
1032 1031
1033 rq->flags &= ~REQ_STARTED; 1032 rq->flags &= ~REQ_STARTED;
1034 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1033 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1035 } 1034 }
1036 } 1035 }
1037 1036
1038 EXPORT_SYMBOL(blk_queue_invalidate_tags); 1037 EXPORT_SYMBOL(blk_queue_invalidate_tags);
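A minimal error-recovery sketch: after a hypothetical controller reset (my_reset_controller()), all outstanding tagged requests are pushed back onto the request queue under the queue lock, then reissued.

static void my_recover(request_queue_t *q)
{
        unsigned long flags;

        my_reset_controller();                  /* hypothetical hardware reset */

        spin_lock_irqsave(q->queue_lock, flags);
        blk_queue_invalidate_tags(q);           /* requeue every pending tagged request */
        spin_unlock_irqrestore(q->queue_lock, flags);

        blk_run_queue(q);                       /* start reissuing them */
}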
1039 1038
1040 static char *rq_flags[] = { 1039 static char *rq_flags[] = {
1041 "REQ_RW", 1040 "REQ_RW",
1042 "REQ_FAILFAST", 1041 "REQ_FAILFAST",
1043 "REQ_SORTED", 1042 "REQ_SORTED",
1044 "REQ_SOFTBARRIER", 1043 "REQ_SOFTBARRIER",
1045 "REQ_HARDBARRIER", 1044 "REQ_HARDBARRIER",
1046 "REQ_CMD", 1045 "REQ_CMD",
1047 "REQ_NOMERGE", 1046 "REQ_NOMERGE",
1048 "REQ_STARTED", 1047 "REQ_STARTED",
1049 "REQ_DONTPREP", 1048 "REQ_DONTPREP",
1050 "REQ_QUEUED", 1049 "REQ_QUEUED",
1051 "REQ_ELVPRIV", 1050 "REQ_ELVPRIV",
1052 "REQ_PC", 1051 "REQ_PC",
1053 "REQ_BLOCK_PC", 1052 "REQ_BLOCK_PC",
1054 "REQ_SENSE", 1053 "REQ_SENSE",
1055 "REQ_FAILED", 1054 "REQ_FAILED",
1056 "REQ_QUIET", 1055 "REQ_QUIET",
1057 "REQ_SPECIAL", 1056 "REQ_SPECIAL",
1058 "REQ_DRIVE_CMD", 1057 "REQ_DRIVE_CMD",
1059 "REQ_DRIVE_TASK", 1058 "REQ_DRIVE_TASK",
1060 "REQ_DRIVE_TASKFILE", 1059 "REQ_DRIVE_TASKFILE",
1061 "REQ_PREEMPT", 1060 "REQ_PREEMPT",
1062 "REQ_PM_SUSPEND", 1061 "REQ_PM_SUSPEND",
1063 "REQ_PM_RESUME", 1062 "REQ_PM_RESUME",
1064 "REQ_PM_SHUTDOWN", 1063 "REQ_PM_SHUTDOWN",
1065 }; 1064 };
1066 1065
1067 void blk_dump_rq_flags(struct request *rq, char *msg) 1066 void blk_dump_rq_flags(struct request *rq, char *msg)
1068 { 1067 {
1069 int bit; 1068 int bit;
1070 1069
1071 printk("%s: dev %s: flags = ", msg, 1070 printk("%s: dev %s: flags = ", msg,
1072 rq->rq_disk ? rq->rq_disk->disk_name : "?"); 1071 rq->rq_disk ? rq->rq_disk->disk_name : "?");
1073 bit = 0; 1072 bit = 0;
1074 do { 1073 do {
1075 if (rq->flags & (1 << bit)) 1074 if (rq->flags & (1 << bit))
1076 printk("%s ", rq_flags[bit]); 1075 printk("%s ", rq_flags[bit]);
1077 bit++; 1076 bit++;
1078 } while (bit < __REQ_NR_BITS); 1077 } while (bit < __REQ_NR_BITS);
1079 1078
1080 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1079 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1081 rq->nr_sectors, 1080 rq->nr_sectors,
1082 rq->current_nr_sectors); 1081 rq->current_nr_sectors);
1083 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1082 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1084 1083
1085 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) { 1084 if (rq->flags & (REQ_BLOCK_PC | REQ_PC)) {
1086 printk("cdb: "); 1085 printk("cdb: ");
1087 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1086 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1088 printk("%02x ", rq->cmd[bit]); 1087 printk("%02x ", rq->cmd[bit]);
1089 printk("\n"); 1088 printk("\n");
1090 } 1089 }
1091 } 1090 }
1092 1091
1093 EXPORT_SYMBOL(blk_dump_rq_flags); 1092 EXPORT_SYMBOL(blk_dump_rq_flags);
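A small debugging sketch: a hypothetical prep_rq_fn that dumps the flags of any request type it does not handle before failing it.

static int my_prep_rq_fn(request_queue_t *q, struct request *rq)
{
        if (!blk_fs_request(rq)) {
                blk_dump_rq_flags(rq, "mydrv: unsupported request");
                return BLKPREP_KILL;
        }
        return BLKPREP_OK;
}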
1094 1093
1095 void blk_recount_segments(request_queue_t *q, struct bio *bio) 1094 void blk_recount_segments(request_queue_t *q, struct bio *bio)
1096 { 1095 {
1097 struct bio_vec *bv, *bvprv = NULL; 1096 struct bio_vec *bv, *bvprv = NULL;
1098 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; 1097 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
1099 int high, highprv = 1; 1098 int high, highprv = 1;
1100 1099
1101 if (unlikely(!bio->bi_io_vec)) 1100 if (unlikely(!bio->bi_io_vec))
1102 return; 1101 return;
1103 1102
1104 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1103 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1105 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; 1104 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
1106 bio_for_each_segment(bv, bio, i) { 1105 bio_for_each_segment(bv, bio, i) {
1107 /* 1106 /*
1108 * the trick here is making sure that a high page is never 1107 * the trick here is making sure that a high page is never
1109 * considered part of another segment, since that might 1108 * considered part of another segment, since that might
1110 * change with the bounce page. 1109 * change with the bounce page.
1111 */ 1110 */
1112 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn; 1111 high = page_to_pfn(bv->bv_page) >= q->bounce_pfn;
1113 if (high || highprv) 1112 if (high || highprv)
1114 goto new_hw_segment; 1113 goto new_hw_segment;
1115 if (cluster) { 1114 if (cluster) {
1116 if (seg_size + bv->bv_len > q->max_segment_size) 1115 if (seg_size + bv->bv_len > q->max_segment_size)
1117 goto new_segment; 1116 goto new_segment;
1118 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1117 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1119 goto new_segment; 1118 goto new_segment;
1120 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1119 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1121 goto new_segment; 1120 goto new_segment;
1122 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1121 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1123 goto new_hw_segment; 1122 goto new_hw_segment;
1124 1123
1125 seg_size += bv->bv_len; 1124 seg_size += bv->bv_len;
1126 hw_seg_size += bv->bv_len; 1125 hw_seg_size += bv->bv_len;
1127 bvprv = bv; 1126 bvprv = bv;
1128 continue; 1127 continue;
1129 } 1128 }
1130 new_segment: 1129 new_segment:
1131 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1130 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1132 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { 1131 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
1133 hw_seg_size += bv->bv_len; 1132 hw_seg_size += bv->bv_len;
1134 } else { 1133 } else {
1135 new_hw_segment: 1134 new_hw_segment:
1136 if (hw_seg_size > bio->bi_hw_front_size) 1135 if (hw_seg_size > bio->bi_hw_front_size)
1137 bio->bi_hw_front_size = hw_seg_size; 1136 bio->bi_hw_front_size = hw_seg_size;
1138 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1137 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1139 nr_hw_segs++; 1138 nr_hw_segs++;
1140 } 1139 }
1141 1140
1142 nr_phys_segs++; 1141 nr_phys_segs++;
1143 bvprv = bv; 1142 bvprv = bv;
1144 seg_size = bv->bv_len; 1143 seg_size = bv->bv_len;
1145 highprv = high; 1144 highprv = high;
1146 } 1145 }
1147 if (hw_seg_size > bio->bi_hw_back_size) 1146 if (hw_seg_size > bio->bi_hw_back_size)
1148 bio->bi_hw_back_size = hw_seg_size; 1147 bio->bi_hw_back_size = hw_seg_size;
1149 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) 1148 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size)
1150 bio->bi_hw_front_size = hw_seg_size; 1149 bio->bi_hw_front_size = hw_seg_size;
1151 bio->bi_phys_segments = nr_phys_segs; 1150 bio->bi_phys_segments = nr_phys_segs;
1152 bio->bi_hw_segments = nr_hw_segs; 1151 bio->bi_hw_segments = nr_hw_segs;
1153 bio->bi_flags |= (1 << BIO_SEG_VALID); 1152 bio->bi_flags |= (1 << BIO_SEG_VALID);
1154 } 1153 }
1155 1154
1156 1155
1157 static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio, 1156 static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
1158 struct bio *nxt) 1157 struct bio *nxt)
1159 { 1158 {
1160 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1159 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1161 return 0; 1160 return 0;
1162 1161
1163 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1162 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1164 return 0; 1163 return 0;
1165 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1164 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1166 return 0; 1165 return 0;
1167 1166
1168 /* 1167 /*
1169 * bio and nxt are contiguous in memory, check if the queue allows 1168 * bio and nxt are contiguous in memory, check if the queue allows
1170 * these two to be merged into one 1169 * these two to be merged into one
1171 */ 1170 */
1172 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1171 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1173 return 1; 1172 return 1;
1174 1173
1175 return 0; 1174 return 0;
1176 } 1175 }
1177 1176
1178 static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio, 1177 static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
1179 struct bio *nxt) 1178 struct bio *nxt)
1180 { 1179 {
1181 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1180 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1182 blk_recount_segments(q, bio); 1181 blk_recount_segments(q, bio);
1183 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1182 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1184 blk_recount_segments(q, nxt); 1183 blk_recount_segments(q, nxt);
1185 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1184 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1186 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + bio->bi_hw_back_size)) 1185 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_front_size + bio->bi_hw_back_size))
1187 return 0; 1186 return 0;
1188 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1187 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1189 return 0; 1188 return 0;
1190 1189
1191 return 1; 1190 return 1;
1192 } 1191 }
1193 1192
1194 /* 1193 /*
1195 * map a request to scatterlist, return number of sg entries set up. Caller 1194 * map a request to scatterlist, return number of sg entries set up. Caller
1196 * must make sure sg can hold rq->nr_phys_segments entries 1195 * must make sure sg can hold rq->nr_phys_segments entries
1197 */ 1196 */
1198 int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg) 1197 int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
1199 { 1198 {
1200 struct bio_vec *bvec, *bvprv; 1199 struct bio_vec *bvec, *bvprv;
1201 struct bio *bio; 1200 struct bio *bio;
1202 int nsegs, i, cluster; 1201 int nsegs, i, cluster;
1203 1202
1204 nsegs = 0; 1203 nsegs = 0;
1205 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1204 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1206 1205
1207 /* 1206 /*
1208 * for each bio in rq 1207 * for each bio in rq
1209 */ 1208 */
1210 bvprv = NULL; 1209 bvprv = NULL;
1211 rq_for_each_bio(bio, rq) { 1210 rq_for_each_bio(bio, rq) {
1212 /* 1211 /*
1213 * for each segment in bio 1212 * for each segment in bio
1214 */ 1213 */
1215 bio_for_each_segment(bvec, bio, i) { 1214 bio_for_each_segment(bvec, bio, i) {
1216 int nbytes = bvec->bv_len; 1215 int nbytes = bvec->bv_len;
1217 1216
1218 if (bvprv && cluster) { 1217 if (bvprv && cluster) {
1219 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1218 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1220 goto new_segment; 1219 goto new_segment;
1221 1220
1222 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1221 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1223 goto new_segment; 1222 goto new_segment;
1224 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1223 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1225 goto new_segment; 1224 goto new_segment;
1226 1225
1227 sg[nsegs - 1].length += nbytes; 1226 sg[nsegs - 1].length += nbytes;
1228 } else { 1227 } else {
1229 new_segment: 1228 new_segment:
1230 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1229 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1231 sg[nsegs].page = bvec->bv_page; 1230 sg[nsegs].page = bvec->bv_page;
1232 sg[nsegs].length = nbytes; 1231 sg[nsegs].length = nbytes;
1233 sg[nsegs].offset = bvec->bv_offset; 1232 sg[nsegs].offset = bvec->bv_offset;
1234 1233
1235 nsegs++; 1234 nsegs++;
1236 } 1235 }
1237 bvprv = bvec; 1236 bvprv = bvec;
1238 } /* segments in bio */ 1237 } /* segments in bio */
1239 } /* bios in rq */ 1238 } /* bios in rq */
1240 1239
1241 return nsegs; 1240 return nsegs;
1242 } 1241 }
1243 1242
1244 EXPORT_SYMBOL(blk_rq_map_sg); 1243 EXPORT_SYMBOL(blk_rq_map_sg);
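A minimal sketch of the mapping step inside the my_issue_to_hardware() referenced in the earlier request_fn sketch; per the comment above, the scatterlist must have room for rq->nr_phys_segments entries, here bounded by an assumed MY_MAX_PHYS_SEGMENTS matching the driver's blk_queue_max_phys_segments() setting.

static void my_issue_to_hardware(struct request *rq)
{
        struct scatterlist sg[MY_MAX_PHYS_SEGMENTS];    /* hypothetical bound */
        int nsegs;

        nsegs = blk_rq_map_sg(rq->q, rq, sg);
        my_program_dma(rq, sg, nsegs);                  /* hypothetical */
}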
1245 1244
1246 /* 1245 /*
1247 * the standard queue merge functions, can be overridden with device 1246 * the standard queue merge functions, can be overridden with device
1248 * specific ones if so desired 1247 * specific ones if so desired
1249 */ 1248 */
1250 1249
1251 static inline int ll_new_mergeable(request_queue_t *q, 1250 static inline int ll_new_mergeable(request_queue_t *q,
1252 struct request *req, 1251 struct request *req,
1253 struct bio *bio) 1252 struct bio *bio)
1254 { 1253 {
1255 int nr_phys_segs = bio_phys_segments(q, bio); 1254 int nr_phys_segs = bio_phys_segments(q, bio);
1256 1255
1257 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1256 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1258 req->flags |= REQ_NOMERGE; 1257 req->flags |= REQ_NOMERGE;
1259 if (req == q->last_merge) 1258 if (req == q->last_merge)
1260 q->last_merge = NULL; 1259 q->last_merge = NULL;
1261 return 0; 1260 return 0;
1262 } 1261 }
1263 1262
1264 /* 1263 /*
1265 * A hw segment is just getting larger, bump just the phys 1264 * A hw segment is just getting larger, bump just the phys
1266 * counter. 1265 * counter.
1267 */ 1266 */
1268 req->nr_phys_segments += nr_phys_segs; 1267 req->nr_phys_segments += nr_phys_segs;
1269 return 1; 1268 return 1;
1270 } 1269 }
1271 1270
1272 static inline int ll_new_hw_segment(request_queue_t *q, 1271 static inline int ll_new_hw_segment(request_queue_t *q,
1273 struct request *req, 1272 struct request *req,
1274 struct bio *bio) 1273 struct bio *bio)
1275 { 1274 {
1276 int nr_hw_segs = bio_hw_segments(q, bio); 1275 int nr_hw_segs = bio_hw_segments(q, bio);
1277 int nr_phys_segs = bio_phys_segments(q, bio); 1276 int nr_phys_segs = bio_phys_segments(q, bio);
1278 1277
1279 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1278 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1280 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1279 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1281 req->flags |= REQ_NOMERGE; 1280 req->flags |= REQ_NOMERGE;
1282 if (req == q->last_merge) 1281 if (req == q->last_merge)
1283 q->last_merge = NULL; 1282 q->last_merge = NULL;
1284 return 0; 1283 return 0;
1285 } 1284 }
1286 1285
1287 /* 1286 /*
1288 * This will form the start of a new hw segment. Bump both 1287 * This will form the start of a new hw segment. Bump both
1289 * counters. 1288 * counters.
1290 */ 1289 */
1291 req->nr_hw_segments += nr_hw_segs; 1290 req->nr_hw_segments += nr_hw_segs;
1292 req->nr_phys_segments += nr_phys_segs; 1291 req->nr_phys_segments += nr_phys_segs;
1293 return 1; 1292 return 1;
1294 } 1293 }
1295 1294
1296 static int ll_back_merge_fn(request_queue_t *q, struct request *req, 1295 static int ll_back_merge_fn(request_queue_t *q, struct request *req,
1297 struct bio *bio) 1296 struct bio *bio)
1298 { 1297 {
1299 int len; 1298 int len;
1300 1299
1301 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1300 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
1302 req->flags |= REQ_NOMERGE; 1301 req->flags |= REQ_NOMERGE;
1303 if (req == q->last_merge) 1302 if (req == q->last_merge)
1304 q->last_merge = NULL; 1303 q->last_merge = NULL;
1305 return 0; 1304 return 0;
1306 } 1305 }
1307 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1306 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1308 blk_recount_segments(q, req->biotail); 1307 blk_recount_segments(q, req->biotail);
1309 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1308 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1310 blk_recount_segments(q, bio); 1309 blk_recount_segments(q, bio);
1311 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1310 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1312 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1311 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1313 !BIOVEC_VIRT_OVERSIZE(len)) { 1312 !BIOVEC_VIRT_OVERSIZE(len)) {
1314 int mergeable = ll_new_mergeable(q, req, bio); 1313 int mergeable = ll_new_mergeable(q, req, bio);
1315 1314
1316 if (mergeable) { 1315 if (mergeable) {
1317 if (req->nr_hw_segments == 1) 1316 if (req->nr_hw_segments == 1)
1318 req->bio->bi_hw_front_size = len; 1317 req->bio->bi_hw_front_size = len;
1319 if (bio->bi_hw_segments == 1) 1318 if (bio->bi_hw_segments == 1)
1320 bio->bi_hw_back_size = len; 1319 bio->bi_hw_back_size = len;
1321 } 1320 }
1322 return mergeable; 1321 return mergeable;
1323 } 1322 }
1324 1323
1325 return ll_new_hw_segment(q, req, bio); 1324 return ll_new_hw_segment(q, req, bio);
1326 } 1325 }
1327 1326
1328 static int ll_front_merge_fn(request_queue_t *q, struct request *req, 1327 static int ll_front_merge_fn(request_queue_t *q, struct request *req,
1329 struct bio *bio) 1328 struct bio *bio)
1330 { 1329 {
1331 int len; 1330 int len;
1332 1331
1333 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) { 1332 if (req->nr_sectors + bio_sectors(bio) > q->max_sectors) {
1334 req->flags |= REQ_NOMERGE; 1333 req->flags |= REQ_NOMERGE;
1335 if (req == q->last_merge) 1334 if (req == q->last_merge)
1336 q->last_merge = NULL; 1335 q->last_merge = NULL;
1337 return 0; 1336 return 0;
1338 } 1337 }
1339 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1338 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1340 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1339 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1341 blk_recount_segments(q, bio); 1340 blk_recount_segments(q, bio);
1342 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1341 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1343 blk_recount_segments(q, req->bio); 1342 blk_recount_segments(q, req->bio);
1344 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1343 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1345 !BIOVEC_VIRT_OVERSIZE(len)) { 1344 !BIOVEC_VIRT_OVERSIZE(len)) {
1346 int mergeable = ll_new_mergeable(q, req, bio); 1345 int mergeable = ll_new_mergeable(q, req, bio);
1347 1346
1348 if (mergeable) { 1347 if (mergeable) {
1349 if (bio->bi_hw_segments == 1) 1348 if (bio->bi_hw_segments == 1)
1350 bio->bi_hw_front_size = len; 1349 bio->bi_hw_front_size = len;
1351 if (req->nr_hw_segments == 1) 1350 if (req->nr_hw_segments == 1)
1352 req->biotail->bi_hw_back_size = len; 1351 req->biotail->bi_hw_back_size = len;
1353 } 1352 }
1354 return mergeable; 1353 return mergeable;
1355 } 1354 }
1356 1355
1357 return ll_new_hw_segment(q, req, bio); 1356 return ll_new_hw_segment(q, req, bio);
1358 } 1357 }
1359 1358
1360 static int ll_merge_requests_fn(request_queue_t *q, struct request *req, 1359 static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
1361 struct request *next) 1360 struct request *next)
1362 { 1361 {
1363 int total_phys_segments; 1362 int total_phys_segments;
1364 int total_hw_segments; 1363 int total_hw_segments;
1365 1364
1366 /* 1365 /*
1367 * First check if either of the requests is a re-queued 1366 * First check if either of the requests is a re-queued
1368 * request; re-queued requests can't be merged. 1367 * request; re-queued requests can't be merged.

1369 */ 1368 */
1370 if (req->special || next->special) 1369 if (req->special || next->special)
1371 return 0; 1370 return 0;
1372 1371
1373 /* 1372 /*
1374 * Will it become too large? 1373 * Will it become too large?
1375 */ 1374 */
1376 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1375 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1377 return 0; 1376 return 0;
1378 1377
1379 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1378 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1380 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1379 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1381 total_phys_segments--; 1380 total_phys_segments--;
1382 1381
1383 if (total_phys_segments > q->max_phys_segments) 1382 if (total_phys_segments > q->max_phys_segments)
1384 return 0; 1383 return 0;
1385 1384
1386 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1385 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1387 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1386 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1388 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1387 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1389 /* 1388 /*
1390 * propagate the combined length to the end of the requests 1389 * propagate the combined length to the end of the requests
1391 */ 1390 */
1392 if (req->nr_hw_segments == 1) 1391 if (req->nr_hw_segments == 1)
1393 req->bio->bi_hw_front_size = len; 1392 req->bio->bi_hw_front_size = len;
1394 if (next->nr_hw_segments == 1) 1393 if (next->nr_hw_segments == 1)
1395 next->biotail->bi_hw_back_size = len; 1394 next->biotail->bi_hw_back_size = len;
1396 total_hw_segments--; 1395 total_hw_segments--;
1397 } 1396 }
1398 1397
1399 if (total_hw_segments > q->max_hw_segments) 1398 if (total_hw_segments > q->max_hw_segments)
1400 return 0; 1399 return 0;
1401 1400
1402 /* Merge is OK... */ 1401 /* Merge is OK... */
1403 req->nr_phys_segments = total_phys_segments; 1402 req->nr_phys_segments = total_phys_segments;
1404 req->nr_hw_segments = total_hw_segments; 1403 req->nr_hw_segments = total_hw_segments;
1405 return 1; 1404 return 1;
1406 } 1405 }
1407 1406
1408 /* 1407 /*
1409 * "plug" the device if there are no outstanding requests: this will 1408 * "plug" the device if there are no outstanding requests: this will
1410 * force the transfer to start only after we have put all the requests 1409 * force the transfer to start only after we have put all the requests
1411 * on the list. 1410 * on the list.
1412 * 1411 *
1413 * This is called with interrupts off and no requests on the queue and 1412 * This is called with interrupts off and no requests on the queue and
1414 * with the queue lock held. 1413 * with the queue lock held.
1415 */ 1414 */
1416 void blk_plug_device(request_queue_t *q) 1415 void blk_plug_device(request_queue_t *q)
1417 { 1416 {
1418 WARN_ON(!irqs_disabled()); 1417 WARN_ON(!irqs_disabled());
1419 1418
1420 /* 1419 /*
1421 * don't plug a stopped queue, it must be paired with blk_start_queue() 1420 * don't plug a stopped queue, it must be paired with blk_start_queue()
1422 * which will restart the queueing 1421 * which will restart the queueing
1423 */ 1422 */
1424 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) 1423 if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
1425 return; 1424 return;
1426 1425
1427 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1426 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1428 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1427 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1429 } 1428 }
1430 1429
1431 EXPORT_SYMBOL(blk_plug_device); 1430 EXPORT_SYMBOL(blk_plug_device);
1432 1431
1433 /* 1432 /*
1434 * remove the queue from the plugged list, if present. called with 1433 * remove the queue from the plugged list, if present. called with
1435 * queue lock held and interrupts disabled. 1434 * queue lock held and interrupts disabled.
1436 */ 1435 */
1437 int blk_remove_plug(request_queue_t *q) 1436 int blk_remove_plug(request_queue_t *q)
1438 { 1437 {
1439 WARN_ON(!irqs_disabled()); 1438 WARN_ON(!irqs_disabled());
1440 1439
1441 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1440 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1442 return 0; 1441 return 0;
1443 1442
1444 del_timer(&q->unplug_timer); 1443 del_timer(&q->unplug_timer);
1445 return 1; 1444 return 1;
1446 } 1445 }
1447 1446
1448 EXPORT_SYMBOL(blk_remove_plug); 1447 EXPORT_SYMBOL(blk_remove_plug);
1449 1448
1450 /* 1449 /*
1451 * remove the plug and let it rip.. 1450 * remove the plug and let it rip..
1452 */ 1451 */
1453 void __generic_unplug_device(request_queue_t *q) 1452 void __generic_unplug_device(request_queue_t *q)
1454 { 1453 {
1455 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))) 1454 if (unlikely(test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)))
1456 return; 1455 return;
1457 1456
1458 if (!blk_remove_plug(q)) 1457 if (!blk_remove_plug(q))
1459 return; 1458 return;
1460 1459
1461 q->request_fn(q); 1460 q->request_fn(q);
1462 } 1461 }
1463 EXPORT_SYMBOL(__generic_unplug_device); 1462 EXPORT_SYMBOL(__generic_unplug_device);
1464 1463
1465 /** 1464 /**
1466 * generic_unplug_device - fire a request queue 1465 * generic_unplug_device - fire a request queue
1467 * @q: The &request_queue_t in question 1466 * @q: The &request_queue_t in question
1468 * 1467 *
1469 * Description: 1468 * Description:
1470 * Linux uses plugging to build bigger request queues before letting 1469 * Linux uses plugging to build bigger request queues before letting
1471 * the device have at them. If a queue is plugged, the I/O scheduler 1470 * the device have at them. If a queue is plugged, the I/O scheduler
1472 * is still adding and merging requests on the queue. Once the queue 1471 * is still adding and merging requests on the queue. Once the queue
1473 * gets unplugged, the request_fn defined for the queue is invoked and 1472 * gets unplugged, the request_fn defined for the queue is invoked and
1474 * transfers started. 1473 * transfers started.
1475 **/ 1474 **/
1476 void generic_unplug_device(request_queue_t *q) 1475 void generic_unplug_device(request_queue_t *q)
1477 { 1476 {
1478 spin_lock_irq(q->queue_lock); 1477 spin_lock_irq(q->queue_lock);
1479 __generic_unplug_device(q); 1478 __generic_unplug_device(q);
1480 spin_unlock_irq(q->queue_lock); 1479 spin_unlock_irq(q->queue_lock);
1481 } 1480 }
1482 EXPORT_SYMBOL(generic_unplug_device); 1481 EXPORT_SYMBOL(generic_unplug_device);
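A sketch of a device-specific unplug function in the same pattern as __generic_unplug_device() above, for a driver that needs to do more than call its request_fn; my_kick_hardware() is hypothetical, and the function would be installed with q->unplug_fn = my_unplug_fn after queue setup.

static void my_unplug_fn(request_queue_t *q)
{
        spin_lock_irq(q->queue_lock);
        if (blk_remove_plug(q))         /* queue was plugged; unplug timer cancelled */
                my_kick_hardware(q);    /* hypothetical: start the pending transfers */
        spin_unlock_irq(q->queue_lock);
}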
1483 1482
1484 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1483 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1485 struct page *page) 1484 struct page *page)
1486 { 1485 {
1487 request_queue_t *q = bdi->unplug_io_data; 1486 request_queue_t *q = bdi->unplug_io_data;
1488 1487
1489 /* 1488 /*
1490 * devices don't necessarily have an ->unplug_fn defined 1489 * devices don't necessarily have an ->unplug_fn defined
1491 */ 1490 */
1492 if (q->unplug_fn) 1491 if (q->unplug_fn)
1493 q->unplug_fn(q); 1492 q->unplug_fn(q);
1494 } 1493 }
1495 1494
1496 static void blk_unplug_work(void *data) 1495 static void blk_unplug_work(void *data)
1497 { 1496 {
1498 request_queue_t *q = data; 1497 request_queue_t *q = data;
1499 1498
1500 q->unplug_fn(q); 1499 q->unplug_fn(q);
1501 } 1500 }
1502 1501
1503 static void blk_unplug_timeout(unsigned long data) 1502 static void blk_unplug_timeout(unsigned long data)
1504 { 1503 {
1505 request_queue_t *q = (request_queue_t *)data; 1504 request_queue_t *q = (request_queue_t *)data;
1506 1505
1507 kblockd_schedule_work(&q->unplug_work); 1506 kblockd_schedule_work(&q->unplug_work);
1508 } 1507 }
1509 1508
1510 /** 1509 /**
1511 * blk_start_queue - restart a previously stopped queue 1510 * blk_start_queue - restart a previously stopped queue
1512 * @q: The &request_queue_t in question 1511 * @q: The &request_queue_t in question
1513 * 1512 *
1514 * Description: 1513 * Description:
1515 * blk_start_queue() will clear the stop flag on the queue, and call 1514 * blk_start_queue() will clear the stop flag on the queue, and call
1516 * the request_fn for the queue if it was in a stopped state when 1515 * the request_fn for the queue if it was in a stopped state when
1517 * entered. Also see blk_stop_queue(). Queue lock must be held. 1516 * entered. Also see blk_stop_queue(). Queue lock must be held.
1518 **/ 1517 **/
1519 void blk_start_queue(request_queue_t *q) 1518 void blk_start_queue(request_queue_t *q)
1520 { 1519 {
1521 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1520 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1522 1521
1523 /* 1522 /*
1524 * one level of recursion is ok and is much faster than kicking 1523 * one level of recursion is ok and is much faster than kicking
1525 * the unplug handling 1524 * the unplug handling
1526 */ 1525 */
1527 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1526 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1528 q->request_fn(q); 1527 q->request_fn(q);
1529 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1528 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1530 } else { 1529 } else {
1531 blk_plug_device(q); 1530 blk_plug_device(q);
1532 kblockd_schedule_work(&q->unplug_work); 1531 kblockd_schedule_work(&q->unplug_work);
1533 } 1532 }
1534 } 1533 }
1535 1534
1536 EXPORT_SYMBOL(blk_start_queue); 1535 EXPORT_SYMBOL(blk_start_queue);
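A minimal sketch of restarting a previously stopped queue from a hypothetical completion path once the hardware can accept work again; the queue lock is held as the kernel-doc requires.

static void my_completion(request_queue_t *q)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        /* ... finish the completed request(s) ... */
        blk_start_queue(q);     /* clears the stopped flag and restarts request_fn */
        spin_unlock_irqrestore(q->queue_lock, flags);
}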
1537 1536
1538 /** 1537 /**
1539 * blk_stop_queue - stop a queue 1538 * blk_stop_queue - stop a queue
1540 * @q: The &request_queue_t in question 1539 * @q: The &request_queue_t in question
1541 * 1540 *
1542 * Description: 1541 * Description:
1543 * The Linux block layer assumes that a block driver will consume all 1542 * The Linux block layer assumes that a block driver will consume all
1544 * entries on the request queue when the request_fn strategy is called. 1543 * entries on the request queue when the request_fn strategy is called.
1545 * Often this will not happen, because of hardware limitations (queue 1544 * Often this will not happen, because of hardware limitations (queue
1546 * depth settings). If a device driver gets a 'queue full' response, 1545 * depth settings). If a device driver gets a 'queue full' response,
1547 * or if it simply chooses not to queue more I/O at one point, it can 1546 * or if it simply chooses not to queue more I/O at one point, it can
1548 * call this function to prevent the request_fn from being called until 1547 * call this function to prevent the request_fn from being called until
1549 * the driver has signalled it's ready to go again. This happens by calling 1548 * the driver has signalled it's ready to go again. This happens by calling
1550 * blk_start_queue() to restart queue operations. Queue lock must be held. 1549 * blk_start_queue() to restart queue operations. Queue lock must be held.
1551 **/ 1550 **/
1552 void blk_stop_queue(request_queue_t *q) 1551 void blk_stop_queue(request_queue_t *q)
1553 { 1552 {
1554 blk_remove_plug(q); 1553 blk_remove_plug(q);
1555 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1554 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1556 } 1555 }
1557 EXPORT_SYMBOL(blk_stop_queue); 1556 EXPORT_SYMBOL(blk_stop_queue);
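A minimal request_fn sketch for the 'queue full' case described above: when the hypothetical my_hardware_full() test fires, the driver stops the queue (the queue lock is already held inside request_fn) and later restarts it from its completion path, as sketched after blk_start_queue().

static void my_throttled_request_fn(request_queue_t *q)
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                if (my_hardware_full(q)) {      /* hypothetical 'queue full' test */
                        blk_stop_queue(q);
                        break;
                }
                blkdev_dequeue_request(rq);
                my_issue_to_hardware(rq);       /* hypothetical, see earlier sketch */
        }
}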
1558 1557
1559 /** 1558 /**
1560 * blk_sync_queue - cancel any pending callbacks on a queue 1559 * blk_sync_queue - cancel any pending callbacks on a queue
1561 * @q: the queue 1560 * @q: the queue
1562 * 1561 *
1563 * Description: 1562 * Description:
1564 * The block layer may perform asynchronous callback activity 1563 * The block layer may perform asynchronous callback activity
1565 * on a queue, such as calling the unplug function after a timeout. 1564 * on a queue, such as calling the unplug function after a timeout.
1566 * A block device may call blk_sync_queue to ensure that any 1565 * A block device may call blk_sync_queue to ensure that any
1567 * such activity is cancelled, thus allowing it to release resources 1566 * such activity is cancelled, thus allowing it to release resources
1568 * the callbacks might use. The caller must already have made sure 1567 * the callbacks might use. The caller must already have made sure
1569 * that its ->make_request_fn will not re-add plugging prior to calling 1568 * that its ->make_request_fn will not re-add plugging prior to calling
1570 * this function. 1569 * this function.
1571 * 1570 *
1572 */ 1571 */
1573 void blk_sync_queue(struct request_queue *q) 1572 void blk_sync_queue(struct request_queue *q)
1574 { 1573 {
1575 del_timer_sync(&q->unplug_timer); 1574 del_timer_sync(&q->unplug_timer);
1576 kblockd_flush(); 1575 kblockd_flush();
1577 } 1576 }
1578 EXPORT_SYMBOL(blk_sync_queue); 1577 EXPORT_SYMBOL(blk_sync_queue);
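A minimal teardown sketch: quiesce whatever might replug the queue (my_quiesce() is hypothetical), then cancel any pending unplug work before freeing resources the callbacks could touch.

static void my_quiesce_queue(request_queue_t *q)
{
        my_quiesce(q);          /* hypothetical: guarantee no further replugging */
        blk_sync_queue(q);      /* kills the unplug timer and flushes kblockd work */
        /* resources used by the unplug callback may now be freed safely */
}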
1579 1578
1580 /** 1579 /**
1581 * blk_run_queue - run a single device queue 1580 * blk_run_queue - run a single device queue
1582 * @q: The queue to run 1581 * @q: The queue to run
1583 */ 1582 */
1584 void blk_run_queue(struct request_queue *q) 1583 void blk_run_queue(struct request_queue *q)
1585 { 1584 {
1586 unsigned long flags; 1585 unsigned long flags;
1587 1586
1588 spin_lock_irqsave(q->queue_lock, flags); 1587 spin_lock_irqsave(q->queue_lock, flags);
1589 blk_remove_plug(q); 1588 blk_remove_plug(q);
1590 if (!elv_queue_empty(q)) 1589 if (!elv_queue_empty(q))
1591 q->request_fn(q); 1590 q->request_fn(q);
1592 spin_unlock_irqrestore(q->queue_lock, flags); 1591 spin_unlock_irqrestore(q->queue_lock, flags);
1593 } 1592 }
1594 EXPORT_SYMBOL(blk_run_queue); 1593 EXPORT_SYMBOL(blk_run_queue);
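A one-line usage sketch: a hypothetical driver path that has just freed internal resources kicks the queue; no locking is needed at the call site because blk_run_queue() takes the queue lock itself.

static void my_resources_freed(request_queue_t *q)
{
        blk_run_queue(q);       /* unplug and rerun request_fn if work is queued */
}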
1595 1594
1596 /** 1595 /**
1597 * blk_cleanup_queue - release a &request_queue_t when it is no longer needed 1596 * blk_cleanup_queue - release a &request_queue_t when it is no longer needed
1598 * @q: the request queue to be released 1597 * @q: the request queue to be released
1599 * 1598 *
1600 * Description: 1599 * Description:
1601 * blk_cleanup_queue is the pair to blk_init_queue() or 1600 * blk_cleanup_queue is the pair to blk_init_queue() or
1602 * blk_queue_make_request(). It should be called when a request queue is 1601 * blk_queue_make_request(). It should be called when a request queue is
1603 * being released; typically when a block device is being de-registered. 1602 * being released; typically when a block device is being de-registered.
1604 * Currently, its primary task is to free all the &struct request 1603 * Currently, its primary task is to free all the &struct request
1605 * structures that were allocated to the queue and the queue itself. 1604 * structures that were allocated to the queue and the queue itself.
1606 * 1605 *
1607 * Caveat: 1606 * Caveat:
1608 * Hopefully the low level driver will have finished any 1607 * Hopefully the low level driver will have finished any
1609 * outstanding requests first... 1608 * outstanding requests first...
1610 **/ 1609 **/
1611 void blk_cleanup_queue(request_queue_t * q) 1610 void blk_cleanup_queue(request_queue_t * q)
1612 { 1611 {
1613 struct request_list *rl = &q->rq; 1612 struct request_list *rl = &q->rq;
1614 1613
1615 if (!atomic_dec_and_test(&q->refcnt)) 1614 if (!atomic_dec_and_test(&q->refcnt))
1616 return; 1615 return;
1617 1616
1618 if (q->elevator) 1617 if (q->elevator)
1619 elevator_exit(q->elevator); 1618 elevator_exit(q->elevator);
1620 1619
1621 blk_sync_queue(q); 1620 blk_sync_queue(q);
1622 1621
1623 if (rl->rq_pool) 1622 if (rl->rq_pool)
1624 mempool_destroy(rl->rq_pool); 1623 mempool_destroy(rl->rq_pool);
1625 1624
1626 if (q->queue_tags) 1625 if (q->queue_tags)
1627 __blk_queue_free_tags(q); 1626 __blk_queue_free_tags(q);
1628 1627
1629 blk_queue_ordered(q, QUEUE_ORDERED_NONE); 1628 blk_queue_ordered(q, QUEUE_ORDERED_NONE);
1630 1629
1631 kmem_cache_free(requestq_cachep, q); 1630 kmem_cache_free(requestq_cachep, q);
1632 } 1631 }
1633 1632
1634 EXPORT_SYMBOL(blk_cleanup_queue); 1633 EXPORT_SYMBOL(blk_cleanup_queue);
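A module-exit sketch pairing blk_cleanup_queue() with an earlier blk_init_queue(); my_gd, my_queue and MY_MAJOR are hypothetical objects registered at init time.

static void __exit my_exit(void)
{
        del_gendisk(my_gd);             /* hypothetical gendisk added earlier */
        put_disk(my_gd);
        blk_cleanup_queue(my_queue);    /* drops this driver's queue reference */
        unregister_blkdev(MY_MAJOR, "mydrv");
}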
1635 1634
1636 static int blk_init_free_list(request_queue_t *q) 1635 static int blk_init_free_list(request_queue_t *q)
1637 { 1636 {
1638 struct request_list *rl = &q->rq; 1637 struct request_list *rl = &q->rq;
1639 1638
1640 rl->count[READ] = rl->count[WRITE] = 0; 1639 rl->count[READ] = rl->count[WRITE] = 0;
1641 rl->starved[READ] = rl->starved[WRITE] = 0; 1640 rl->starved[READ] = rl->starved[WRITE] = 0;
1642 rl->elvpriv = 0; 1641 rl->elvpriv = 0;
1643 init_waitqueue_head(&rl->wait[READ]); 1642 init_waitqueue_head(&rl->wait[READ]);
1644 init_waitqueue_head(&rl->wait[WRITE]); 1643 init_waitqueue_head(&rl->wait[WRITE]);
1645 1644
1646 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1645 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1647 mempool_free_slab, request_cachep, q->node); 1646 mempool_free_slab, request_cachep, q->node);
1648 1647
1649 if (!rl->rq_pool) 1648 if (!rl->rq_pool)
1650 return -ENOMEM; 1649 return -ENOMEM;
1651 1650
1652 return 0; 1651 return 0;
1653 } 1652 }
1654 1653
1655 static int __make_request(request_queue_t *, struct bio *); 1654 static int __make_request(request_queue_t *, struct bio *);
1656 1655
1657 request_queue_t *blk_alloc_queue(gfp_t gfp_mask) 1656 request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
1658 { 1657 {
1659 return blk_alloc_queue_node(gfp_mask, -1); 1658 return blk_alloc_queue_node(gfp_mask, -1);
1660 } 1659 }
1661 EXPORT_SYMBOL(blk_alloc_queue); 1660 EXPORT_SYMBOL(blk_alloc_queue);
1662 1661
1663 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1662 request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1664 { 1663 {
1665 request_queue_t *q; 1664 request_queue_t *q;
1666 1665
1667 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id); 1666 q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
1668 if (!q) 1667 if (!q)
1669 return NULL; 1668 return NULL;
1670 1669
1671 memset(q, 0, sizeof(*q)); 1670 memset(q, 0, sizeof(*q));
1672 init_timer(&q->unplug_timer); 1671 init_timer(&q->unplug_timer);
1673 atomic_set(&q->refcnt, 1); 1672 atomic_set(&q->refcnt, 1);
1674 1673
1675 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1674 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1676 q->backing_dev_info.unplug_io_data = q; 1675 q->backing_dev_info.unplug_io_data = q;
1677 1676
1678 return q; 1677 return q;
1679 } 1678 }
1680 EXPORT_SYMBOL(blk_alloc_queue_node); 1679 EXPORT_SYMBOL(blk_alloc_queue_node);
1681 1680
1682 /** 1681 /**
1683 * blk_init_queue - prepare a request queue for use with a block device 1682 * blk_init_queue - prepare a request queue for use with a block device
1684 * @rfn: The function to be called to process requests that have been 1683 * @rfn: The function to be called to process requests that have been
1685 * placed on the queue. 1684 * placed on the queue.
1686 * @lock: Request queue spin lock 1685 * @lock: Request queue spin lock
1687 * 1686 *
1688 * Description: 1687 * Description:
1689 * If a block device wishes to use the standard request handling procedures, 1688 * If a block device wishes to use the standard request handling procedures,
1690 * which sorts requests and coalesces adjacent requests, then it must 1689 * which sorts requests and coalesces adjacent requests, then it must
1691 * call blk_init_queue(). The function @rfn will be called when there 1690 * call blk_init_queue(). The function @rfn will be called when there
1692 * are requests on the queue that need to be processed. If the device 1691 * are requests on the queue that need to be processed. If the device
1693 * supports plugging, then @rfn may not be called immediately when requests 1692 * supports plugging, then @rfn may not be called immediately when requests
1694 * are available on the queue, but may be called at some time later instead. 1693 * are available on the queue, but may be called at some time later instead.
1695 * Plugged queues are generally unplugged when a buffer belonging to one 1694 * Plugged queues are generally unplugged when a buffer belonging to one
1696 * of the requests on the queue is needed, or due to memory pressure. 1695 * of the requests on the queue is needed, or due to memory pressure.
1697 * 1696 *
1698 * @rfn is not required, or even expected, to remove all requests off the 1697 * @rfn is not required, or even expected, to remove all requests off the
1699 * queue, but only as many as it can handle at a time. If it does leave 1698 * queue, but only as many as it can handle at a time. If it does leave
1700 * requests on the queue, it is responsible for arranging that the requests 1699 * requests on the queue, it is responsible for arranging that the requests
1701 * get dealt with eventually. 1700 * get dealt with eventually.
1702 * 1701 *
1703 * The queue spin lock must be held while manipulating the requests on the 1702 * The queue spin lock must be held while manipulating the requests on the
1704 * request queue. 1703 * request queue.
1705 * 1704 *
1706 * Function returns a pointer to the initialized request queue, or NULL if 1705 * Function returns a pointer to the initialized request queue, or NULL if
1707 * it didn't succeed. 1706 * it didn't succeed.
1708 * 1707 *
1709 * Note: 1708 * Note:
1710 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1709 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1711 * when the block device is deactivated (such as at module unload). 1710 * when the block device is deactivated (such as at module unload).
1712 **/ 1711 **/
1713 1712
1714 request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1713 request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1715 { 1714 {
1716 return blk_init_queue_node(rfn, lock, -1); 1715 return blk_init_queue_node(rfn, lock, -1);
1717 } 1716 }
1718 EXPORT_SYMBOL(blk_init_queue); 1717 EXPORT_SYMBOL(blk_init_queue);
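A probe-time sketch following the kernel-doc above: the driver supplies its request handler and a spinlock, then tunes a few queue limits; my_request_fn (see the earlier sketches) and the limit values are hypothetical.

static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
static request_queue_t *my_queue;

static int __init my_init(void)
{
        my_queue = blk_init_queue(my_request_fn, &my_lock);
        if (!my_queue)
                return -ENOMEM;

        blk_queue_hardsect_size(my_queue, 512);
        blk_queue_max_sectors(my_queue, 255);   /* hypothetical device limit */
        return 0;
}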
1719 1718
1720 request_queue_t * 1719 request_queue_t *
1721 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1720 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1722 { 1721 {
1723 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1722 request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1724 1723
1725 if (!q) 1724 if (!q)
1726 return NULL; 1725 return NULL;
1727 1726
1728 q->node = node_id; 1727 q->node = node_id;
1729 if (blk_init_free_list(q)) 1728 if (blk_init_free_list(q))
1730 goto out_init; 1729 goto out_init;
1731 1730
1732 /* 1731 /*
1733 * if caller didn't supply a lock, they get per-queue locking with 1732 * if caller didn't supply a lock, they get per-queue locking with
1734 * our embedded lock 1733 * our embedded lock
1735 */ 1734 */
1736 if (!lock) { 1735 if (!lock) {
1737 spin_lock_init(&q->__queue_lock); 1736 spin_lock_init(&q->__queue_lock);
1738 lock = &q->__queue_lock; 1737 lock = &q->__queue_lock;
1739 } 1738 }
1740 1739
1741 q->request_fn = rfn; 1740 q->request_fn = rfn;
1742 q->back_merge_fn = ll_back_merge_fn; 1741 q->back_merge_fn = ll_back_merge_fn;
1743 q->front_merge_fn = ll_front_merge_fn; 1742 q->front_merge_fn = ll_front_merge_fn;
1744 q->merge_requests_fn = ll_merge_requests_fn; 1743 q->merge_requests_fn = ll_merge_requests_fn;
1745 q->prep_rq_fn = NULL; 1744 q->prep_rq_fn = NULL;
1746 q->unplug_fn = generic_unplug_device; 1745 q->unplug_fn = generic_unplug_device;
1747 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1746 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1748 q->queue_lock = lock; 1747 q->queue_lock = lock;
1749 1748
1750 blk_queue_segment_boundary(q, 0xffffffff); 1749 blk_queue_segment_boundary(q, 0xffffffff);
1751 1750
1752 blk_queue_make_request(q, __make_request); 1751 blk_queue_make_request(q, __make_request);
1753 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1752 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1754 1753
1755 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1754 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1756 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1755 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1757 1756
1758 /* 1757 /*
1759 * all done 1758 * all done
1760 */ 1759 */
1761 if (!elevator_init(q, NULL)) { 1760 if (!elevator_init(q, NULL)) {
1762 blk_queue_congestion_threshold(q); 1761 blk_queue_congestion_threshold(q);
1763 return q; 1762 return q;
1764 } 1763 }
1765 1764
1766 blk_cleanup_queue(q); 1765 blk_cleanup_queue(q);
1767 out_init: 1766 out_init:
1768 kmem_cache_free(requestq_cachep, q); 1767 kmem_cache_free(requestq_cachep, q);
1769 return NULL; 1768 return NULL;
1770 } 1769 }
1771 EXPORT_SYMBOL(blk_init_queue_node); 1770 EXPORT_SYMBOL(blk_init_queue_node);
1772 1771
1773 int blk_get_queue(request_queue_t *q) 1772 int blk_get_queue(request_queue_t *q)
1774 { 1773 {
1775 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1774 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1776 atomic_inc(&q->refcnt); 1775 atomic_inc(&q->refcnt);
1777 return 0; 1776 return 0;
1778 } 1777 }
1779 1778
1780 return 1; 1779 return 1;
1781 } 1780 }
1782 1781
1783 EXPORT_SYMBOL(blk_get_queue); 1782 EXPORT_SYMBOL(blk_get_queue);
1784 1783
1785 static inline void blk_free_request(request_queue_t *q, struct request *rq) 1784 static inline void blk_free_request(request_queue_t *q, struct request *rq)
1786 { 1785 {
1787 if (rq->flags & REQ_ELVPRIV) 1786 if (rq->flags & REQ_ELVPRIV)
1788 elv_put_request(q, rq); 1787 elv_put_request(q, rq);
1789 mempool_free(rq, q->rq.rq_pool); 1788 mempool_free(rq, q->rq.rq_pool);
1790 } 1789 }
1791 1790
1792 static inline struct request * 1791 static inline struct request *
1793 blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, 1792 blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
1794 int priv, gfp_t gfp_mask) 1793 int priv, gfp_t gfp_mask)
1795 { 1794 {
1796 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1795 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1797 1796
1798 if (!rq) 1797 if (!rq)
1799 return NULL; 1798 return NULL;
1800 1799
1801 /* 1800 /*
1802 * first three bits are identical in rq->flags and bio->bi_rw, 1801 * first three bits are identical in rq->flags and bio->bi_rw,
1803 * see bio.h and blkdev.h 1802 * see bio.h and blkdev.h
1804 */ 1803 */
1805 rq->flags = rw; 1804 rq->flags = rw;
1806 1805
1807 if (priv) { 1806 if (priv) {
1808 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) { 1807 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
1809 mempool_free(rq, q->rq.rq_pool); 1808 mempool_free(rq, q->rq.rq_pool);
1810 return NULL; 1809 return NULL;
1811 } 1810 }
1812 rq->flags |= REQ_ELVPRIV; 1811 rq->flags |= REQ_ELVPRIV;
1813 } 1812 }
1814 1813
1815 return rq; 1814 return rq;
1816 } 1815 }
1817 1816
1818 /* 1817 /*
1819 * ioc_batching returns true if the ioc is a valid batching request and 1818 * ioc_batching returns true if the ioc is a valid batching request and
1820 * should be given priority access to a request. 1819 * should be given priority access to a request.
1821 */ 1820 */
1822 static inline int ioc_batching(request_queue_t *q, struct io_context *ioc) 1821 static inline int ioc_batching(request_queue_t *q, struct io_context *ioc)
1823 { 1822 {
1824 if (!ioc) 1823 if (!ioc)
1825 return 0; 1824 return 0;
1826 1825
1827 /* 1826 /*
1828 * Make sure the process is able to allocate at least 1 request 1827 * Make sure the process is able to allocate at least 1 request
1829 * even if the batch times out, otherwise we could theoretically 1828 * even if the batch times out, otherwise we could theoretically
1830 * lose wakeups. 1829 * lose wakeups.
1831 */ 1830 */
1832 return ioc->nr_batch_requests == q->nr_batching || 1831 return ioc->nr_batch_requests == q->nr_batching ||
1833 (ioc->nr_batch_requests > 0 1832 (ioc->nr_batch_requests > 0
1834 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 1833 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1835 } 1834 }
1836 1835
1837 /* 1836 /*
1838 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 1837 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1839 * will cause the process to be a "batcher" on all queues in the system. This 1838 * will cause the process to be a "batcher" on all queues in the system. This
1840 * is the behaviour we want though - once it gets a wakeup it should be given 1839 * is the behaviour we want though - once it gets a wakeup it should be given
1841 * a nice run. 1840 * a nice run.
1842 */ 1841 */
1843 static void ioc_set_batching(request_queue_t *q, struct io_context *ioc) 1842 static void ioc_set_batching(request_queue_t *q, struct io_context *ioc)
1844 { 1843 {
1845 if (!ioc || ioc_batching(q, ioc)) 1844 if (!ioc || ioc_batching(q, ioc))
1846 return; 1845 return;
1847 1846
1848 ioc->nr_batch_requests = q->nr_batching; 1847 ioc->nr_batch_requests = q->nr_batching;
1849 ioc->last_waited = jiffies; 1848 ioc->last_waited = jiffies;
1850 } 1849 }
1851 1850
1852 static void __freed_request(request_queue_t *q, int rw) 1851 static void __freed_request(request_queue_t *q, int rw)
1853 { 1852 {
1854 struct request_list *rl = &q->rq; 1853 struct request_list *rl = &q->rq;
1855 1854
1856 if (rl->count[rw] < queue_congestion_off_threshold(q)) 1855 if (rl->count[rw] < queue_congestion_off_threshold(q))
1857 clear_queue_congested(q, rw); 1856 clear_queue_congested(q, rw);
1858 1857
1859 if (rl->count[rw] + 1 <= q->nr_requests) { 1858 if (rl->count[rw] + 1 <= q->nr_requests) {
1860 if (waitqueue_active(&rl->wait[rw])) 1859 if (waitqueue_active(&rl->wait[rw]))
1861 wake_up(&rl->wait[rw]); 1860 wake_up(&rl->wait[rw]);
1862 1861
1863 blk_clear_queue_full(q, rw); 1862 blk_clear_queue_full(q, rw);
1864 } 1863 }
1865 } 1864 }
1866 1865
1867 /* 1866 /*
1868 * A request has just been released. Account for it, update the full and 1867 * A request has just been released. Account for it, update the full and
1869 * congestion status, wake up any waiters. Called under q->queue_lock. 1868 * congestion status, wake up any waiters. Called under q->queue_lock.
1870 */ 1869 */
1871 static void freed_request(request_queue_t *q, int rw, int priv) 1870 static void freed_request(request_queue_t *q, int rw, int priv)
1872 { 1871 {
1873 struct request_list *rl = &q->rq; 1872 struct request_list *rl = &q->rq;
1874 1873
1875 rl->count[rw]--; 1874 rl->count[rw]--;
1876 if (priv) 1875 if (priv)
1877 rl->elvpriv--; 1876 rl->elvpriv--;
1878 1877
1879 __freed_request(q, rw); 1878 __freed_request(q, rw);
1880 1879
1881 if (unlikely(rl->starved[rw ^ 1])) 1880 if (unlikely(rl->starved[rw ^ 1]))
1882 __freed_request(q, rw ^ 1); 1881 __freed_request(q, rw ^ 1);
1883 } 1882 }
1884 1883
1885 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 1884 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
1886 /* 1885 /*
1887 * Get a free request, queue_lock must be held. 1886 * Get a free request, queue_lock must be held.
1888 * Returns NULL on failure, with queue_lock held. 1887 * Returns NULL on failure, with queue_lock held.
1889 * Returns !NULL on success, with queue_lock *not held*. 1888 * Returns !NULL on success, with queue_lock *not held*.
1890 */ 1889 */
1891 static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, 1890 static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
1892 gfp_t gfp_mask) 1891 gfp_t gfp_mask)
1893 { 1892 {
1894 struct request *rq = NULL; 1893 struct request *rq = NULL;
1895 struct request_list *rl = &q->rq; 1894 struct request_list *rl = &q->rq;
1896 struct io_context *ioc = current_io_context(GFP_ATOMIC); 1895 struct io_context *ioc = current_io_context(GFP_ATOMIC);
1897 int priv; 1896 int priv;
1898 1897
1899 if (rl->count[rw]+1 >= q->nr_requests) { 1898 if (rl->count[rw]+1 >= q->nr_requests) {
1900 /* 1899 /*
1901 * The queue will fill after this allocation, so set it as 1900 * The queue will fill after this allocation, so set it as
1902 * full, and mark this process as "batching". This process 1901 * full, and mark this process as "batching". This process
1903 * will be allowed to complete a batch of requests, others 1902 * will be allowed to complete a batch of requests, others
1904 * will be blocked. 1903 * will be blocked.
1905 */ 1904 */
1906 if (!blk_queue_full(q, rw)) { 1905 if (!blk_queue_full(q, rw)) {
1907 ioc_set_batching(q, ioc); 1906 ioc_set_batching(q, ioc);
1908 blk_set_queue_full(q, rw); 1907 blk_set_queue_full(q, rw);
1909 } 1908 }
1910 } 1909 }
1911 1910
1912 switch (elv_may_queue(q, rw, bio)) { 1911 switch (elv_may_queue(q, rw, bio)) {
1913 case ELV_MQUEUE_NO: 1912 case ELV_MQUEUE_NO:
1914 goto rq_starved; 1913 goto rq_starved;
1915 case ELV_MQUEUE_MAY: 1914 case ELV_MQUEUE_MAY:
1916 break; 1915 break;
1917 case ELV_MQUEUE_MUST: 1916 case ELV_MQUEUE_MUST:
1918 goto get_rq; 1917 goto get_rq;
1919 } 1918 }
1920 1919
1921 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { 1920 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) {
1922 /* 1921 /*
1923 * The queue is full and the allocating process is not a 1922 * The queue is full and the allocating process is not a
1924 * "batcher", and not exempted by the IO scheduler 1923 * "batcher", and not exempted by the IO scheduler
1925 */ 1924 */
1926 goto out; 1925 goto out;
1927 } 1926 }
1928 1927
1929 get_rq: 1928 get_rq:
1930 /* 1929 /*
1931 * Only allow batching queuers to allocate up to 50% over the defined 1930 * Only allow batching queuers to allocate up to 50% over the defined
1932 * limit of requests, otherwise we could have thousands of requests 1931 * limit of requests, otherwise we could have thousands of requests
1933 * allocated with any setting of ->nr_requests 1932 * allocated with any setting of ->nr_requests
1934 */ 1933 */
1935 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 1934 if (rl->count[rw] >= (3 * q->nr_requests / 2))
1936 goto out; 1935 goto out;
1937 1936
1938 rl->count[rw]++; 1937 rl->count[rw]++;
1939 rl->starved[rw] = 0; 1938 rl->starved[rw] = 0;
1940 if (rl->count[rw] >= queue_congestion_on_threshold(q)) 1939 if (rl->count[rw] >= queue_congestion_on_threshold(q))
1941 set_queue_congested(q, rw); 1940 set_queue_congested(q, rw);
1942 1941
1943 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1942 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1944 if (priv) 1943 if (priv)
1945 rl->elvpriv++; 1944 rl->elvpriv++;
1946 1945
1947 spin_unlock_irq(q->queue_lock); 1946 spin_unlock_irq(q->queue_lock);
1948 1947
1949 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); 1948 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
1950 if (!rq) { 1949 if (!rq) {
1951 /* 1950 /*
1952 * Allocation failed presumably due to memory. Undo anything 1951 * Allocation failed presumably due to memory. Undo anything
1953 * we might have messed up. 1952 * we might have messed up.
1954 * 1953 *
1955 * Allocating task should really be put onto the front of the 1954 * Allocating task should really be put onto the front of the
1956 * wait queue, but this is pretty rare. 1955 * wait queue, but this is pretty rare.
1957 */ 1956 */
1958 spin_lock_irq(q->queue_lock); 1957 spin_lock_irq(q->queue_lock);
1959 freed_request(q, rw, priv); 1958 freed_request(q, rw, priv);
1960 1959
1961 /* 1960 /*
1962 * in the very unlikely event that allocation failed and no 1961 * in the very unlikely event that allocation failed and no
1963 * requests for this direction were pending, mark us starved 1962 * requests for this direction were pending, mark us starved
1964 * so that freeing of a request in the other direction will 1963 * so that freeing of a request in the other direction will
1965 * notice us. Another possible fix would be to split the 1964 * notice us. Another possible fix would be to split the
1966 * rq mempool into READ and WRITE 1965 * rq mempool into READ and WRITE
1967 */ 1966 */
1968 rq_starved: 1967 rq_starved:
1969 if (unlikely(rl->count[rw] == 0)) 1968 if (unlikely(rl->count[rw] == 0))
1970 rl->starved[rw] = 1; 1969 rl->starved[rw] = 1;
1971 1970
1972 goto out; 1971 goto out;
1973 } 1972 }
1974 1973
1975 if (ioc_batching(q, ioc)) 1974 if (ioc_batching(q, ioc))
1976 ioc->nr_batch_requests--; 1975 ioc->nr_batch_requests--;
1977 1976
1978 rq_init(q, rq); 1977 rq_init(q, rq);
1979 rq->rl = rl; 1978 rq->rl = rl;
1980 out: 1979 out:
1981 return rq; 1980 return rq;
1982 } 1981 }
1983 1982
1984 /* 1983 /*
1985 * No available requests for this queue, unplug the device and wait for some 1984 * No available requests for this queue, unplug the device and wait for some
1986 * requests to become available. 1985 * requests to become available.
1987 * 1986 *
1988 * Called with q->queue_lock held, and returns with it unlocked. 1987 * Called with q->queue_lock held, and returns with it unlocked.
1989 */ 1988 */
1990 static struct request *get_request_wait(request_queue_t *q, int rw, 1989 static struct request *get_request_wait(request_queue_t *q, int rw,
1991 struct bio *bio) 1990 struct bio *bio)
1992 { 1991 {
1993 struct request *rq; 1992 struct request *rq;
1994 1993
1995 rq = get_request(q, rw, bio, GFP_NOIO); 1994 rq = get_request(q, rw, bio, GFP_NOIO);
1996 while (!rq) { 1995 while (!rq) {
1997 DEFINE_WAIT(wait); 1996 DEFINE_WAIT(wait);
1998 struct request_list *rl = &q->rq; 1997 struct request_list *rl = &q->rq;
1999 1998
2000 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 1999 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2001 TASK_UNINTERRUPTIBLE); 2000 TASK_UNINTERRUPTIBLE);
2002 2001
2003 rq = get_request(q, rw, bio, GFP_NOIO); 2002 rq = get_request(q, rw, bio, GFP_NOIO);
2004 2003
2005 if (!rq) { 2004 if (!rq) {
2006 struct io_context *ioc; 2005 struct io_context *ioc;
2007 2006
2008 __generic_unplug_device(q); 2007 __generic_unplug_device(q);
2009 spin_unlock_irq(q->queue_lock); 2008 spin_unlock_irq(q->queue_lock);
2010 io_schedule(); 2009 io_schedule();
2011 2010
2012 /* 2011 /*
2013 * After sleeping, we become a "batching" process and 2012 * After sleeping, we become a "batching" process and
2014 * will be able to allocate at least one request, and 2013 * will be able to allocate at least one request, and
2015 * up to a big batch of them for a small period of time. 2014 * up to a big batch of them for a small period of time.
2016 * See ioc_batching, ioc_set_batching 2015 * See ioc_batching, ioc_set_batching
2017 */ 2016 */
2018 ioc = current_io_context(GFP_NOIO); 2017 ioc = current_io_context(GFP_NOIO);
2019 ioc_set_batching(q, ioc); 2018 ioc_set_batching(q, ioc);
2020 2019
2021 spin_lock_irq(q->queue_lock); 2020 spin_lock_irq(q->queue_lock);
2022 } 2021 }
2023 finish_wait(&rl->wait[rw], &wait); 2022 finish_wait(&rl->wait[rw], &wait);
2024 } 2023 }
2025 2024
2026 return rq; 2025 return rq;
2027 } 2026 }
2028 2027
2029 struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask) 2028 struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask)
2030 { 2029 {
2031 struct request *rq; 2030 struct request *rq;
2032 2031
2033 BUG_ON(rw != READ && rw != WRITE); 2032 BUG_ON(rw != READ && rw != WRITE);
2034 2033
2035 spin_lock_irq(q->queue_lock); 2034 spin_lock_irq(q->queue_lock);
2036 if (gfp_mask & __GFP_WAIT) { 2035 if (gfp_mask & __GFP_WAIT) {
2037 rq = get_request_wait(q, rw, NULL); 2036 rq = get_request_wait(q, rw, NULL);
2038 } else { 2037 } else {
2039 rq = get_request(q, rw, NULL, gfp_mask); 2038 rq = get_request(q, rw, NULL, gfp_mask);
2040 if (!rq) 2039 if (!rq)
2041 spin_unlock_irq(q->queue_lock); 2040 spin_unlock_irq(q->queue_lock);
2042 } 2041 }
2043 /* q->queue_lock is unlocked at this point */ 2042 /* q->queue_lock is unlocked at this point */
2044 2043
2045 return rq; 2044 return rq;
2046 } 2045 }
2047 EXPORT_SYMBOL(blk_get_request); 2046 EXPORT_SYMBOL(blk_get_request);
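
As a rough illustration of the allocation/release pairing these exports are meant for, a caller might look like the hypothetical sketch below (the helper name, the use of GFP_KERNEL, and the abbreviated error handling are assumptions, not part of this file):

#include <linux/blkdev.h>
#include <linux/errno.h>

/* Hypothetical caller: grab a request, set it up, give it back. */
static int example_get_put(request_queue_t *q)
{
        struct request *rq;

        /*
         * GFP_KERNEL includes __GFP_WAIT, so this takes the
         * get_request_wait() path and blocks instead of returning NULL.
         */
        rq = blk_get_request(q, WRITE, GFP_KERNEL);
        if (!rq)
                return -ENOMEM; /* only reachable with non-waiting masks */

        /* ... fill in rq->cmd / rq->flags for the driver here ... */

        blk_put_request(rq);
        return 0;
}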
2048 2047
2049 /** 2048 /**
2050 * blk_requeue_request - put a request back on queue 2049 * blk_requeue_request - put a request back on queue
2051 * @q: request queue where request should be inserted 2050 * @q: request queue where request should be inserted
2052 * @rq: request to be inserted 2051 * @rq: request to be inserted
2053 * 2052 *
2054 * Description: 2053 * Description:
2055 * Drivers often keep queueing requests until the hardware cannot accept 2054 * Drivers often keep queueing requests until the hardware cannot accept
2056 * more, when that condition happens we need to put the request back 2055 * more, when that condition happens we need to put the request back
2057 * on the queue. Must be called with queue lock held. 2056 * on the queue. Must be called with queue lock held.
2058 */ 2057 */
2059 void blk_requeue_request(request_queue_t *q, struct request *rq) 2058 void blk_requeue_request(request_queue_t *q, struct request *rq)
2060 { 2059 {
2061 if (blk_rq_tagged(rq)) 2060 if (blk_rq_tagged(rq))
2062 blk_queue_end_tag(q, rq); 2061 blk_queue_end_tag(q, rq);
2063 2062
2064 elv_requeue_request(q, rq); 2063 elv_requeue_request(q, rq);
2065 } 2064 }
2066 2065
2067 EXPORT_SYMBOL(blk_requeue_request); 2066 EXPORT_SYMBOL(blk_requeue_request);
2068 2067
2069 /** 2068 /**
2070 * blk_insert_request - insert a special request into a request queue 2069 * blk_insert_request - insert a special request into a request queue
2071 * @q: request queue where request should be inserted 2070 * @q: request queue where request should be inserted
2072 * @rq: request to be inserted 2071 * @rq: request to be inserted
2073 * @at_head: insert request at head or tail of queue 2072 * @at_head: insert request at head or tail of queue
2074 * @data: private data 2073 * @data: private data
2075 * 2074 *
2076 * Description: 2075 * Description:
2077 * Many block devices need to execute commands asynchronously, so they don't 2076 * Many block devices need to execute commands asynchronously, so they don't
2078 * block the whole kernel from preemption during request execution. This is 2077 * block the whole kernel from preemption during request execution. This is
2079 * accomplished normally by inserting artificial requests tagged as 2078 * accomplished normally by inserting artificial requests tagged as
2080 * REQ_SPECIAL into the corresponding request queue, and letting them be 2079 * REQ_SPECIAL into the corresponding request queue, and letting them be
2081 * scheduled for actual execution by the request queue. 2080 * scheduled for actual execution by the request queue.
2082 * 2081 *
2083 * We have the option of inserting at the head or the tail of the queue. 2082 * We have the option of inserting at the head or the tail of the queue.
2084 * Typically we use the tail for new ioctls and so forth. We use the head 2083 * Typically we use the tail for new ioctls and so forth. We use the head
2085 * of the queue for things like a QUEUE_FULL message from a device, or a 2084 * of the queue for things like a QUEUE_FULL message from a device, or a
2086 * host that is unable to accept a particular command. 2085 * host that is unable to accept a particular command.
2087 */ 2086 */
2088 void blk_insert_request(request_queue_t *q, struct request *rq, 2087 void blk_insert_request(request_queue_t *q, struct request *rq,
2089 int at_head, void *data) 2088 int at_head, void *data)
2090 { 2089 {
2091 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2090 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2092 unsigned long flags; 2091 unsigned long flags;
2093 2092
2094 /* 2093 /*
2095 * tell the I/O scheduler that this isn't a regular read/write (i.e. it 2094 * tell the I/O scheduler that this isn't a regular read/write (i.e. it
2096 * must not attempt merges on this) and that it acts as a soft 2095 * must not attempt merges on this) and that it acts as a soft
2097 * barrier 2096 * barrier
2098 */ 2097 */
2099 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER; 2098 rq->flags |= REQ_SPECIAL | REQ_SOFTBARRIER;
2100 2099
2101 rq->special = data; 2100 rq->special = data;
2102 2101
2103 spin_lock_irqsave(q->queue_lock, flags); 2102 spin_lock_irqsave(q->queue_lock, flags);
2104 2103
2105 /* 2104 /*
2106 * If command is tagged, release the tag 2105 * If command is tagged, release the tag
2107 */ 2106 */
2108 if (blk_rq_tagged(rq)) 2107 if (blk_rq_tagged(rq))
2109 blk_queue_end_tag(q, rq); 2108 blk_queue_end_tag(q, rq);
2110 2109
2111 drive_stat_acct(rq, rq->nr_sectors, 1); 2110 drive_stat_acct(rq, rq->nr_sectors, 1);
2112 __elv_add_request(q, rq, where, 0); 2111 __elv_add_request(q, rq, where, 0);
2113 2112
2114 if (blk_queue_plugged(q)) 2113 if (blk_queue_plugged(q))
2115 __generic_unplug_device(q); 2114 __generic_unplug_device(q);
2116 else 2115 else
2117 q->request_fn(q); 2116 q->request_fn(q);
2118 spin_unlock_irqrestore(q->queue_lock, flags); 2117 spin_unlock_irqrestore(q->queue_lock, flags);
2119 } 2118 }
2120 2119
2121 EXPORT_SYMBOL(blk_insert_request); 2120 EXPORT_SYMBOL(blk_insert_request);
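
A hypothetical driver-side sketch of the pattern described above, reusing blk_get_request() from earlier in this file; the helper name and the driver_cmd cookie are invented for illustration:

#include <linux/blkdev.h>

/*
 * Hypothetical driver helper: push a driver-private command to the head
 * of the queue.  blk_insert_request() tags it REQ_SPECIAL|REQ_SOFTBARRIER
 * and either unplugs the queue or calls the request_fn directly.
 */
static void example_insert_special(request_queue_t *q, void *driver_cmd)
{
        struct request *rq = blk_get_request(q, WRITE, GFP_KERNEL);

        blk_insert_request(q, rq, 1 /* at head */, driver_cmd);
}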
2122 2121
2123 /** 2122 /**
2124 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2123 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2125 * @q: request queue where request should be inserted 2124 * @q: request queue where request should be inserted
2126 * @rq: request structure to fill 2125 * @rq: request structure to fill
2127 * @ubuf: the user buffer 2126 * @ubuf: the user buffer
2128 * @len: length of user data 2127 * @len: length of user data
2129 * 2128 *
2130 * Description: 2129 * Description:
2131 * Data will be mapped directly for zero copy io, if possible. Otherwise 2130 * Data will be mapped directly for zero copy io, if possible. Otherwise
2132 * a kernel bounce buffer is used. 2131 * a kernel bounce buffer is used.
2133 * 2132 *
2134 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2133 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2135 * still in process context. 2134 * still in process context.
2136 * 2135 *
2137 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2136 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2138 * before being submitted to the device, as pages mapped may be out of 2137 * before being submitted to the device, as pages mapped may be out of
2139 * reach. It's the caller's responsibility to make sure this happens. The 2138 * reach. It's the caller's responsibility to make sure this happens. The
2140 * original bio must be passed back in to blk_rq_unmap_user() for proper 2139 * original bio must be passed back in to blk_rq_unmap_user() for proper
2141 * unmapping. 2140 * unmapping.
2142 */ 2141 */
2143 int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf, 2142 int blk_rq_map_user(request_queue_t *q, struct request *rq, void __user *ubuf,
2144 unsigned int len) 2143 unsigned int len)
2145 { 2144 {
2146 unsigned long uaddr; 2145 unsigned long uaddr;
2147 struct bio *bio; 2146 struct bio *bio;
2148 int reading; 2147 int reading;
2149 2148
2150 if (len > (q->max_sectors << 9)) 2149 if (len > (q->max_sectors << 9))
2151 return -EINVAL; 2150 return -EINVAL;
2152 if (!len || !ubuf) 2151 if (!len || !ubuf)
2153 return -EINVAL; 2152 return -EINVAL;
2154 2153
2155 reading = rq_data_dir(rq) == READ; 2154 reading = rq_data_dir(rq) == READ;
2156 2155
2157 /* 2156 /*
2158 * if alignment requirement is satisfied, map in user pages for 2157 * if alignment requirement is satisfied, map in user pages for
2159 * direct dma. else, set up kernel bounce buffers 2158 * direct dma. else, set up kernel bounce buffers
2160 */ 2159 */
2161 uaddr = (unsigned long) ubuf; 2160 uaddr = (unsigned long) ubuf;
2162 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2161 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2163 bio = bio_map_user(q, NULL, uaddr, len, reading); 2162 bio = bio_map_user(q, NULL, uaddr, len, reading);
2164 else 2163 else
2165 bio = bio_copy_user(q, uaddr, len, reading); 2164 bio = bio_copy_user(q, uaddr, len, reading);
2166 2165
2167 if (!IS_ERR(bio)) { 2166 if (!IS_ERR(bio)) {
2168 rq->bio = rq->biotail = bio; 2167 rq->bio = rq->biotail = bio;
2169 blk_rq_bio_prep(q, rq, bio); 2168 blk_rq_bio_prep(q, rq, bio);
2170 2169
2171 rq->buffer = rq->data = NULL; 2170 rq->buffer = rq->data = NULL;
2172 rq->data_len = len; 2171 rq->data_len = len;
2173 return 0; 2172 return 0;
2174 } 2173 }
2175 2174
2176 /* 2175 /*
2177 * bio is the err-ptr 2176 * bio is the err-ptr
2178 */ 2177 */
2179 return PTR_ERR(bio); 2178 return PTR_ERR(bio);
2180 } 2179 }
2181 2180
2182 EXPORT_SYMBOL(blk_rq_map_user); 2181 EXPORT_SYMBOL(blk_rq_map_user);
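
The description above implies a map/execute/unmap round trip; a minimal hypothetical sketch, assuming the request was already allocated and ignoring any blk_queue_bounce() handling for brevity:

#include <linux/blkdev.h>

/* Hypothetical REQ_BLOCK_PC-style round trip over a user buffer. */
static int example_map_user_io(request_queue_t *q, struct gendisk *disk,
                               struct request *rq, void __user *ubuf,
                               unsigned int len)
{
        struct bio *bio;
        int err;

        err = blk_rq_map_user(q, rq, ubuf, len);
        if (err)
                return err;

        /* keep the original bio: blk_rq_unmap_user() needs it back */
        bio = rq->bio;

        err = blk_execute_rq(q, disk, rq, 0);

        /* still in process context, as required by the description above */
        blk_rq_unmap_user(bio, len);
        return err;
}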
2183 2182
2184 /** 2183 /**
2185 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2184 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2186 * @q: request queue where request should be inserted 2185 * @q: request queue where request should be inserted
2187 * @rq: request to map data to 2186 * @rq: request to map data to
2188 * @iov: pointer to the iovec 2187 * @iov: pointer to the iovec
2189 * @iov_count: number of elements in the iovec 2188 * @iov_count: number of elements in the iovec
2190 * 2189 *
2191 * Description: 2190 * Description:
2192 * Data will be mapped directly for zero copy io, if possible. Otherwise 2191 * Data will be mapped directly for zero copy io, if possible. Otherwise
2193 * a kernel bounce buffer is used. 2192 * a kernel bounce buffer is used.
2194 * 2193 *
2195 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2194 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2196 * still in process context. 2195 * still in process context.
2197 * 2196 *
2198 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2197 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2199 * before being submitted to the device, as pages mapped may be out of 2198 * before being submitted to the device, as pages mapped may be out of
2200 * reach. It's the caller's responsibility to make sure this happens. The 2199 * reach. It's the caller's responsibility to make sure this happens. The
2201 * original bio must be passed back in to blk_rq_unmap_user() for proper 2200 * original bio must be passed back in to blk_rq_unmap_user() for proper
2202 * unmapping. 2201 * unmapping.
2203 */ 2202 */
2204 int blk_rq_map_user_iov(request_queue_t *q, struct request *rq, 2203 int blk_rq_map_user_iov(request_queue_t *q, struct request *rq,
2205 struct sg_iovec *iov, int iov_count) 2204 struct sg_iovec *iov, int iov_count)
2206 { 2205 {
2207 struct bio *bio; 2206 struct bio *bio;
2208 2207
2209 if (!iov || iov_count <= 0) 2208 if (!iov || iov_count <= 0)
2210 return -EINVAL; 2209 return -EINVAL;
2211 2210
2212 /* we don't allow misaligned data like bio_map_user() does. If the 2211 /* we don't allow misaligned data like bio_map_user() does. If the
2213 * user is using sg, they're expected to know the alignment constraints 2212 * user is using sg, they're expected to know the alignment constraints
2214 * and respect them accordingly */ 2213 * and respect them accordingly */
2215 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2214 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2216 if (IS_ERR(bio)) 2215 if (IS_ERR(bio))
2217 return PTR_ERR(bio); 2216 return PTR_ERR(bio);
2218 2217
2219 rq->bio = rq->biotail = bio; 2218 rq->bio = rq->biotail = bio;
2220 blk_rq_bio_prep(q, rq, bio); 2219 blk_rq_bio_prep(q, rq, bio);
2221 rq->buffer = rq->data = NULL; 2220 rq->buffer = rq->data = NULL;
2222 rq->data_len = bio->bi_size; 2221 rq->data_len = bio->bi_size;
2223 return 0; 2222 return 0;
2224 } 2223 }
2225 2224
2226 EXPORT_SYMBOL(blk_rq_map_user_iov); 2225 EXPORT_SYMBOL(blk_rq_map_user_iov);
2227 2226
2228 /** 2227 /**
2229 * blk_rq_unmap_user - unmap a request with user data 2228 * blk_rq_unmap_user - unmap a request with user data
2230 * @bio: bio to be unmapped 2229 * @bio: bio to be unmapped
2231 * @ulen: length of user buffer 2230 * @ulen: length of user buffer
2232 * 2231 *
2233 * Description: 2232 * Description:
2234 * Unmap a bio previously mapped by blk_rq_map_user(). 2233 * Unmap a bio previously mapped by blk_rq_map_user().
2235 */ 2234 */
2236 int blk_rq_unmap_user(struct bio *bio, unsigned int ulen) 2235 int blk_rq_unmap_user(struct bio *bio, unsigned int ulen)
2237 { 2236 {
2238 int ret = 0; 2237 int ret = 0;
2239 2238
2240 if (bio) { 2239 if (bio) {
2241 if (bio_flagged(bio, BIO_USER_MAPPED)) 2240 if (bio_flagged(bio, BIO_USER_MAPPED))
2242 bio_unmap_user(bio); 2241 bio_unmap_user(bio);
2243 else 2242 else
2244 ret = bio_uncopy_user(bio); 2243 ret = bio_uncopy_user(bio);
2245 } 2244 }
2246 2245
2247 return 0; 2246 return 0;
2248 } 2247 }
2249 2248
2250 EXPORT_SYMBOL(blk_rq_unmap_user); 2249 EXPORT_SYMBOL(blk_rq_unmap_user);
2251 2250
2252 /** 2251 /**
2253 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2252 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2254 * @q: request queue where request should be inserted 2253 * @q: request queue where request should be inserted
2255 * @rq: request to fill 2254 * @rq: request to fill
2256 * @kbuf: the kernel buffer 2255 * @kbuf: the kernel buffer
2257 * @len: length of user data 2256 * @len: length of user data
2258 * @gfp_mask: memory allocation flags 2257 * @gfp_mask: memory allocation flags
2259 */ 2258 */
2260 int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf, 2259 int blk_rq_map_kern(request_queue_t *q, struct request *rq, void *kbuf,
2261 unsigned int len, gfp_t gfp_mask) 2260 unsigned int len, gfp_t gfp_mask)
2262 { 2261 {
2263 struct bio *bio; 2262 struct bio *bio;
2264 2263
2265 if (len > (q->max_sectors << 9)) 2264 if (len > (q->max_sectors << 9))
2266 return -EINVAL; 2265 return -EINVAL;
2267 if (!len || !kbuf) 2266 if (!len || !kbuf)
2268 return -EINVAL; 2267 return -EINVAL;
2269 2268
2270 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2269 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2271 if (IS_ERR(bio)) 2270 if (IS_ERR(bio))
2272 return PTR_ERR(bio); 2271 return PTR_ERR(bio);
2273 2272
2274 if (rq_data_dir(rq) == WRITE) 2273 if (rq_data_dir(rq) == WRITE)
2275 bio->bi_rw |= (1 << BIO_RW); 2274 bio->bi_rw |= (1 << BIO_RW);
2276 2275
2277 rq->bio = rq->biotail = bio; 2276 rq->bio = rq->biotail = bio;
2278 blk_rq_bio_prep(q, rq, bio); 2277 blk_rq_bio_prep(q, rq, bio);
2279 2278
2280 rq->buffer = rq->data = NULL; 2279 rq->buffer = rq->data = NULL;
2281 rq->data_len = len; 2280 rq->data_len = len;
2282 return 0; 2281 return 0;
2283 } 2282 }
2284 2283
2285 EXPORT_SYMBOL(blk_rq_map_kern); 2284 EXPORT_SYMBOL(blk_rq_map_kern);
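
The kernel-buffer variant follows the same shape; a short hypothetical sketch, assuming rq came from blk_get_request() and kbuf is an ordinary kernel buffer:

#include <linux/blkdev.h>

/* Hypothetical helper: wrap a kernel buffer in a bio and run the request. */
static int example_map_kern_io(request_queue_t *q, struct gendisk *disk,
                               struct request *rq, void *kbuf,
                               unsigned int len)
{
        int err = blk_rq_map_kern(q, rq, kbuf, len, GFP_KERNEL);

        if (err)
                return err;

        /* blk_execute_rq() waits for completion; rq->errors becomes -EIO */
        return blk_execute_rq(q, disk, rq, 0);
}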
2286 2285
2287 /** 2286 /**
2288 * blk_execute_rq_nowait - insert a request into queue for execution 2287 * blk_execute_rq_nowait - insert a request into queue for execution
2289 * @q: queue to insert the request in 2288 * @q: queue to insert the request in
2290 * @bd_disk: matching gendisk 2289 * @bd_disk: matching gendisk
2291 * @rq: request to insert 2290 * @rq: request to insert
2292 * @at_head: insert request at head or tail of queue 2291 * @at_head: insert request at head or tail of queue
2293 * @done: I/O completion handler 2292 * @done: I/O completion handler
2294 * 2293 *
2295 * Description: 2294 * Description:
2296 * Insert a fully prepared request at the back of the io scheduler queue 2295 * Insert a fully prepared request at the back of the io scheduler queue
2297 * for execution. Don't wait for completion. 2296 * for execution. Don't wait for completion.
2298 */ 2297 */
2299 void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, 2298 void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
2300 struct request *rq, int at_head, 2299 struct request *rq, int at_head,
2301 void (*done)(struct request *)) 2300 void (*done)(struct request *))
2302 { 2301 {
2303 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2302 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2304 2303
2305 rq->rq_disk = bd_disk; 2304 rq->rq_disk = bd_disk;
2306 rq->flags |= REQ_NOMERGE; 2305 rq->flags |= REQ_NOMERGE;
2307 rq->end_io = done; 2306 rq->end_io = done;
2308 elv_add_request(q, rq, where, 1); 2307 elv_add_request(q, rq, where, 1);
2309 generic_unplug_device(q); 2308 generic_unplug_device(q);
2310 } 2309 }
2311 2310
2312 /** 2311 /**
2313 * blk_execute_rq - insert a request into queue for execution 2312 * blk_execute_rq - insert a request into queue for execution
2314 * @q: queue to insert the request in 2313 * @q: queue to insert the request in
2315 * @bd_disk: matching gendisk 2314 * @bd_disk: matching gendisk
2316 * @rq: request to insert 2315 * @rq: request to insert
2317 * @at_head: insert request at head or tail of queue 2316 * @at_head: insert request at head or tail of queue
2318 * 2317 *
2319 * Description: 2318 * Description:
2320 * Insert a fully prepared request at the back of the io scheduler queue 2319 * Insert a fully prepared request at the back of the io scheduler queue
2321 * for execution and wait for completion. 2320 * for execution and wait for completion.
2322 */ 2321 */
2323 int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk, 2322 int blk_execute_rq(request_queue_t *q, struct gendisk *bd_disk,
2324 struct request *rq, int at_head) 2323 struct request *rq, int at_head)
2325 { 2324 {
2326 DECLARE_COMPLETION(wait); 2325 DECLARE_COMPLETION(wait);
2327 char sense[SCSI_SENSE_BUFFERSIZE]; 2326 char sense[SCSI_SENSE_BUFFERSIZE];
2328 int err = 0; 2327 int err = 0;
2329 2328
2330 /* 2329 /*
2331 * we need an extra reference to the request, so we can look at 2330 * we need an extra reference to the request, so we can look at
2332 * it after io completion 2331 * it after io completion
2333 */ 2332 */
2334 rq->ref_count++; 2333 rq->ref_count++;
2335 2334
2336 if (!rq->sense) { 2335 if (!rq->sense) {
2337 memset(sense, 0, sizeof(sense)); 2336 memset(sense, 0, sizeof(sense));
2338 rq->sense = sense; 2337 rq->sense = sense;
2339 rq->sense_len = 0; 2338 rq->sense_len = 0;
2340 } 2339 }
2341 2340
2342 rq->waiting = &wait; 2341 rq->waiting = &wait;
2343 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2342 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2344 wait_for_completion(&wait); 2343 wait_for_completion(&wait);
2345 rq->waiting = NULL; 2344 rq->waiting = NULL;
2346 2345
2347 if (rq->errors) 2346 if (rq->errors)
2348 err = -EIO; 2347 err = -EIO;
2349 2348
2350 return err; 2349 return err;
2351 } 2350 }
2352 2351
2353 EXPORT_SYMBOL(blk_execute_rq); 2352 EXPORT_SYMBOL(blk_execute_rq);
2354 2353
2355 /** 2354 /**
2356 * blkdev_issue_flush - queue a flush 2355 * blkdev_issue_flush - queue a flush
2357 * @bdev: blockdev to issue flush for 2356 * @bdev: blockdev to issue flush for
2358 * @error_sector: error sector 2357 * @error_sector: error sector
2359 * 2358 *
2360 * Description: 2359 * Description:
2361 * Issue a flush for the block device in question. Caller can supply 2360 * Issue a flush for the block device in question. Caller can supply
2362 * room for storing the error offset in case of a flush error, if they 2361 * room for storing the error offset in case of a flush error, if they
2363 * wish to. Caller must run wait_for_completion() on its own. 2362 * wish to. Caller must run wait_for_completion() on its own.
2364 */ 2363 */
2365 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2364 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2366 { 2365 {
2367 request_queue_t *q; 2366 request_queue_t *q;
2368 2367
2369 if (bdev->bd_disk == NULL) 2368 if (bdev->bd_disk == NULL)
2370 return -ENXIO; 2369 return -ENXIO;
2371 2370
2372 q = bdev_get_queue(bdev); 2371 q = bdev_get_queue(bdev);
2373 if (!q) 2372 if (!q)
2374 return -ENXIO; 2373 return -ENXIO;
2375 if (!q->issue_flush_fn) 2374 if (!q->issue_flush_fn)
2376 return -EOPNOTSUPP; 2375 return -EOPNOTSUPP;
2377 2376
2378 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2377 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2379 } 2378 }
2380 2379
2381 EXPORT_SYMBOL(blkdev_issue_flush); 2380 EXPORT_SYMBOL(blkdev_issue_flush);
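
A small hypothetical sketch of a caller, treating -EOPNOTSUPP as "no cache to flush" (that policy is an assumption, not something this file dictates):

#include <linux/blkdev.h>

/* Hypothetical caller: flush a block device's write cache, if it has one. */
static int example_flush(struct block_device *bdev)
{
        sector_t error_sector;
        int err = blkdev_issue_flush(bdev, &error_sector);

        if (err == -EOPNOTSUPP)
                err = 0;        /* queue has no issue_flush_fn: nothing to do */
        return err;
}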
2382 2381
2383 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2382 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2384 { 2383 {
2385 int rw = rq_data_dir(rq); 2384 int rw = rq_data_dir(rq);
2386 2385
2387 if (!blk_fs_request(rq) || !rq->rq_disk) 2386 if (!blk_fs_request(rq) || !rq->rq_disk)
2388 return; 2387 return;
2389 2388
2390 if (!new_io) { 2389 if (!new_io) {
2391 __disk_stat_inc(rq->rq_disk, merges[rw]); 2390 __disk_stat_inc(rq->rq_disk, merges[rw]);
2392 } else { 2391 } else {
2393 disk_round_stats(rq->rq_disk); 2392 disk_round_stats(rq->rq_disk);
2394 rq->rq_disk->in_flight++; 2393 rq->rq_disk->in_flight++;
2395 } 2394 }
2396 } 2395 }
2397 2396
2398 /* 2397 /*
2399 * add-request adds a request to the linked list. 2398 * add-request adds a request to the linked list.
2400 * queue lock is held and interrupts disabled, as we muck with the 2399 * queue lock is held and interrupts disabled, as we muck with the
2401 * request queue list. 2400 * request queue list.
2402 */ 2401 */
2403 static inline void add_request(request_queue_t * q, struct request * req) 2402 static inline void add_request(request_queue_t * q, struct request * req)
2404 { 2403 {
2405 drive_stat_acct(req, req->nr_sectors, 1); 2404 drive_stat_acct(req, req->nr_sectors, 1);
2406 2405
2407 if (q->activity_fn) 2406 if (q->activity_fn)
2408 q->activity_fn(q->activity_data, rq_data_dir(req)); 2407 q->activity_fn(q->activity_data, rq_data_dir(req));
2409 2408
2410 /* 2409 /*
2411 * elevator indicated where it wants this request to be 2410 * elevator indicated where it wants this request to be
2412 * inserted at elevator_merge time 2411 * inserted at elevator_merge time
2413 */ 2412 */
2414 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2413 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2415 } 2414 }
2416 2415
2417 /* 2416 /*
2418 * disk_round_stats() - Round off the performance stats on a struct 2417 * disk_round_stats() - Round off the performance stats on a struct
2419 * disk_stats. 2418 * disk_stats.
2420 * 2419 *
2421 * The average IO queue length and utilisation statistics are maintained 2420 * The average IO queue length and utilisation statistics are maintained
2422 * by observing the current state of the queue length and the amount of 2421 * by observing the current state of the queue length and the amount of
2423 * time it has been in this state for. 2422 * time it has been in this state for.
2424 * 2423 *
2425 * Normally, that accounting is done on IO completion, but that can result 2424 * Normally, that accounting is done on IO completion, but that can result
2426 * in more than a second's worth of IO being accounted for within any one 2425 * in more than a second's worth of IO being accounted for within any one
2427 * second, leading to >100% utilisation. To deal with that, we call this 2426 * second, leading to >100% utilisation. To deal with that, we call this
2428 * function to do a round-off before returning the results when reading 2427 * function to do a round-off before returning the results when reading
2429 * /proc/diskstats. This accounts immediately for all queue usage up to 2428 * /proc/diskstats. This accounts immediately for all queue usage up to
2430 * the current jiffies and restarts the counters again. 2429 * the current jiffies and restarts the counters again.
2431 */ 2430 */
2432 void disk_round_stats(struct gendisk *disk) 2431 void disk_round_stats(struct gendisk *disk)
2433 { 2432 {
2434 unsigned long now = jiffies; 2433 unsigned long now = jiffies;
2435 2434
2436 if (now == disk->stamp) 2435 if (now == disk->stamp)
2437 return; 2436 return;
2438 2437
2439 if (disk->in_flight) { 2438 if (disk->in_flight) {
2440 __disk_stat_add(disk, time_in_queue, 2439 __disk_stat_add(disk, time_in_queue,
2441 disk->in_flight * (now - disk->stamp)); 2440 disk->in_flight * (now - disk->stamp));
2442 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2441 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2443 } 2442 }
2444 disk->stamp = now; 2443 disk->stamp = now;
2445 } 2444 }
2446 2445
2447 /* 2446 /*
2448 * queue lock must be held 2447 * queue lock must be held
2449 */ 2448 */
2450 static void __blk_put_request(request_queue_t *q, struct request *req) 2449 static void __blk_put_request(request_queue_t *q, struct request *req)
2451 { 2450 {
2452 struct request_list *rl = req->rl; 2451 struct request_list *rl = req->rl;
2453 2452
2454 if (unlikely(!q)) 2453 if (unlikely(!q))
2455 return; 2454 return;
2456 if (unlikely(--req->ref_count)) 2455 if (unlikely(--req->ref_count))
2457 return; 2456 return;
2458 2457
2459 elv_completed_request(q, req); 2458 elv_completed_request(q, req);
2460 2459
2461 req->rq_status = RQ_INACTIVE; 2460 req->rq_status = RQ_INACTIVE;
2462 req->rl = NULL; 2461 req->rl = NULL;
2463 2462
2464 /* 2463 /*
2465 * Request may not have originated from ll_rw_blk. If not, 2464 * Request may not have originated from ll_rw_blk. If not,
2466 * it didn't come out of our reserved rq pools 2465 * it didn't come out of our reserved rq pools
2467 */ 2466 */
2468 if (rl) { 2467 if (rl) {
2469 int rw = rq_data_dir(req); 2468 int rw = rq_data_dir(req);
2470 int priv = req->flags & REQ_ELVPRIV; 2469 int priv = req->flags & REQ_ELVPRIV;
2471 2470
2472 BUG_ON(!list_empty(&req->queuelist)); 2471 BUG_ON(!list_empty(&req->queuelist));
2473 2472
2474 blk_free_request(q, req); 2473 blk_free_request(q, req);
2475 freed_request(q, rw, priv); 2474 freed_request(q, rw, priv);
2476 } 2475 }
2477 } 2476 }
2478 2477
2479 void blk_put_request(struct request *req) 2478 void blk_put_request(struct request *req)
2480 { 2479 {
2481 unsigned long flags; 2480 unsigned long flags;
2482 request_queue_t *q = req->q; 2481 request_queue_t *q = req->q;
2483 2482
2484 /* 2483 /*
2485 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2484 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2486 * following if (q) test. 2485 * following if (q) test.
2487 */ 2486 */
2488 if (q) { 2487 if (q) {
2489 spin_lock_irqsave(q->queue_lock, flags); 2488 spin_lock_irqsave(q->queue_lock, flags);
2490 __blk_put_request(q, req); 2489 __blk_put_request(q, req);
2491 spin_unlock_irqrestore(q->queue_lock, flags); 2490 spin_unlock_irqrestore(q->queue_lock, flags);
2492 } 2491 }
2493 } 2492 }
2494 2493
2495 EXPORT_SYMBOL(blk_put_request); 2494 EXPORT_SYMBOL(blk_put_request);
2496 2495
2497 /** 2496 /**
2498 * blk_end_sync_rq - executes a completion event on a request 2497 * blk_end_sync_rq - executes a completion event on a request
2499 * @rq: request to complete 2498 * @rq: request to complete
2500 */ 2499 */
2501 void blk_end_sync_rq(struct request *rq) 2500 void blk_end_sync_rq(struct request *rq)
2502 { 2501 {
2503 struct completion *waiting = rq->waiting; 2502 struct completion *waiting = rq->waiting;
2504 2503
2505 rq->waiting = NULL; 2504 rq->waiting = NULL;
2506 __blk_put_request(rq->q, rq); 2505 __blk_put_request(rq->q, rq);
2507 2506
2508 /* 2507 /*
2509 * complete last, if this is a stack request the process (and thus 2508 * complete last, if this is a stack request the process (and thus
2510 * the rq pointer) could be invalid right after this complete() 2509 * the rq pointer) could be invalid right after this complete()
2511 */ 2510 */
2512 complete(waiting); 2511 complete(waiting);
2513 } 2512 }
2514 EXPORT_SYMBOL(blk_end_sync_rq); 2513 EXPORT_SYMBOL(blk_end_sync_rq);
2515 2514
2516 /** 2515 /**
2517 * blk_congestion_wait - wait for a queue to become uncongested 2516 * blk_congestion_wait - wait for a queue to become uncongested
2518 * @rw: READ or WRITE 2517 * @rw: READ or WRITE
2519 * @timeout: timeout in jiffies 2518 * @timeout: timeout in jiffies
2520 * 2519 *
2521 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion. 2520 * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
2522 * If no queues are congested then just wait for the next request to be 2521 * If no queues are congested then just wait for the next request to be
2523 * returned. 2522 * returned.
2524 */ 2523 */
2525 long blk_congestion_wait(int rw, long timeout) 2524 long blk_congestion_wait(int rw, long timeout)
2526 { 2525 {
2527 long ret; 2526 long ret;
2528 DEFINE_WAIT(wait); 2527 DEFINE_WAIT(wait);
2529 wait_queue_head_t *wqh = &congestion_wqh[rw]; 2528 wait_queue_head_t *wqh = &congestion_wqh[rw];
2530 2529
2531 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 2530 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
2532 ret = io_schedule_timeout(timeout); 2531 ret = io_schedule_timeout(timeout);
2533 finish_wait(wqh, &wait); 2532 finish_wait(wqh, &wait);
2534 return ret; 2533 return ret;
2535 } 2534 }
2536 2535
2537 EXPORT_SYMBOL(blk_congestion_wait); 2536 EXPORT_SYMBOL(blk_congestion_wait);
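
A hypothetical throttling call in the style of writeback paths; the HZ/10 timeout is only an example value:

#include <linux/blkdev.h>

/*
 * Hypothetical throttle: sleep until some queue exits congestion, or
 * for at most a tenth of a second.
 */
static void example_throttle_writes(void)
{
        blk_congestion_wait(WRITE, HZ / 10);
}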
2538 2537
2539 /* 2538 /*
2540 * Has to be called with the request spinlock acquired 2539 * Has to be called with the request spinlock acquired
2541 */ 2540 */
2542 static int attempt_merge(request_queue_t *q, struct request *req, 2541 static int attempt_merge(request_queue_t *q, struct request *req,
2543 struct request *next) 2542 struct request *next)
2544 { 2543 {
2545 if (!rq_mergeable(req) || !rq_mergeable(next)) 2544 if (!rq_mergeable(req) || !rq_mergeable(next))
2546 return 0; 2545 return 0;
2547 2546
2548 /* 2547 /*
2549 * not contiguous 2548 * not contiguous
2550 */ 2549 */
2551 if (req->sector + req->nr_sectors != next->sector) 2550 if (req->sector + req->nr_sectors != next->sector)
2552 return 0; 2551 return 0;
2553 2552
2554 if (rq_data_dir(req) != rq_data_dir(next) 2553 if (rq_data_dir(req) != rq_data_dir(next)
2555 || req->rq_disk != next->rq_disk 2554 || req->rq_disk != next->rq_disk
2556 || next->waiting || next->special) 2555 || next->waiting || next->special)
2557 return 0; 2556 return 0;
2558 2557
2559 /* 2558 /*
2560 * If we are allowed to merge, then append bio list 2559 * If we are allowed to merge, then append bio list
2561 * from next to rq and release next. merge_requests_fn 2560 * from next to rq and release next. merge_requests_fn
2562 * will have updated segment counts, update sector 2561 * will have updated segment counts, update sector
2563 * counts here. 2562 * counts here.
2564 */ 2563 */
2565 if (!q->merge_requests_fn(q, req, next)) 2564 if (!q->merge_requests_fn(q, req, next))
2566 return 0; 2565 return 0;
2567 2566
2568 /* 2567 /*
2569 * At this point we have either done a back merge 2568 * At this point we have either done a back merge
2570 * or front merge. We need the smaller start_time of 2569 * or front merge. We need the smaller start_time of
2571 * the merged requests to be the current request 2570 * the merged requests to be the current request
2572 * for accounting purposes. 2571 * for accounting purposes.
2573 */ 2572 */
2574 if (time_after(req->start_time, next->start_time)) 2573 if (time_after(req->start_time, next->start_time))
2575 req->start_time = next->start_time; 2574 req->start_time = next->start_time;
2576 2575
2577 req->biotail->bi_next = next->bio; 2576 req->biotail->bi_next = next->bio;
2578 req->biotail = next->biotail; 2577 req->biotail = next->biotail;
2579 2578
2580 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2579 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2581 2580
2582 elv_merge_requests(q, req, next); 2581 elv_merge_requests(q, req, next);
2583 2582
2584 if (req->rq_disk) { 2583 if (req->rq_disk) {
2585 disk_round_stats(req->rq_disk); 2584 disk_round_stats(req->rq_disk);
2586 req->rq_disk->in_flight--; 2585 req->rq_disk->in_flight--;
2587 } 2586 }
2588 2587
2589 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2588 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2590 2589
2591 __blk_put_request(q, next); 2590 __blk_put_request(q, next);
2592 return 1; 2591 return 1;
2593 } 2592 }
2594 2593
2595 static inline int attempt_back_merge(request_queue_t *q, struct request *rq) 2594 static inline int attempt_back_merge(request_queue_t *q, struct request *rq)
2596 { 2595 {
2597 struct request *next = elv_latter_request(q, rq); 2596 struct request *next = elv_latter_request(q, rq);
2598 2597
2599 if (next) 2598 if (next)
2600 return attempt_merge(q, rq, next); 2599 return attempt_merge(q, rq, next);
2601 2600
2602 return 0; 2601 return 0;
2603 } 2602 }
2604 2603
2605 static inline int attempt_front_merge(request_queue_t *q, struct request *rq) 2604 static inline int attempt_front_merge(request_queue_t *q, struct request *rq)
2606 { 2605 {
2607 struct request *prev = elv_former_request(q, rq); 2606 struct request *prev = elv_former_request(q, rq);
2608 2607
2609 if (prev) 2608 if (prev)
2610 return attempt_merge(q, prev, rq); 2609 return attempt_merge(q, prev, rq);
2611 2610
2612 return 0; 2611 return 0;
2613 } 2612 }
2614 2613
2615 /** 2614 /**
2616 * blk_attempt_remerge - attempt to remerge active head with next request 2615 * blk_attempt_remerge - attempt to remerge active head with next request
2617 * @q: The &request_queue_t belonging to the device 2616 * @q: The &request_queue_t belonging to the device
2618 * @rq: The head request (usually) 2617 * @rq: The head request (usually)
2619 * 2618 *
2620 * Description: 2619 * Description:
2621 * For head-active devices, the queue can easily be unplugged so quickly 2620 * For head-active devices, the queue can easily be unplugged so quickly
2622 * that proper merging is not done on the front request. This may hurt 2621 * that proper merging is not done on the front request. This may hurt
2623 * performance greatly for some devices. The block layer cannot safely 2622 * performance greatly for some devices. The block layer cannot safely
2624 * do merging on that first request for these queues, but the driver can 2623 * do merging on that first request for these queues, but the driver can
2625 * call this function and make it happen anyway. Only the driver knows 2624 * call this function and make it happen anyway. Only the driver knows
2626 * when it is safe to do so. 2625 * when it is safe to do so.
2627 **/ 2626 **/
2628 void blk_attempt_remerge(request_queue_t *q, struct request *rq) 2627 void blk_attempt_remerge(request_queue_t *q, struct request *rq)
2629 { 2628 {
2630 unsigned long flags; 2629 unsigned long flags;
2631 2630
2632 spin_lock_irqsave(q->queue_lock, flags); 2631 spin_lock_irqsave(q->queue_lock, flags);
2633 attempt_back_merge(q, rq); 2632 attempt_back_merge(q, rq);
2634 spin_unlock_irqrestore(q->queue_lock, flags); 2633 spin_unlock_irqrestore(q->queue_lock, flags);
2635 } 2634 }
2636 2635
2637 EXPORT_SYMBOL(blk_attempt_remerge); 2636 EXPORT_SYMBOL(blk_attempt_remerge);
2638 2637
2639 static int __make_request(request_queue_t *q, struct bio *bio) 2638 static int __make_request(request_queue_t *q, struct bio *bio)
2640 { 2639 {
2641 struct request *req; 2640 struct request *req;
2642 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; 2641 int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
2643 unsigned short prio; 2642 unsigned short prio;
2644 sector_t sector; 2643 sector_t sector;
2645 2644
2646 sector = bio->bi_sector; 2645 sector = bio->bi_sector;
2647 nr_sectors = bio_sectors(bio); 2646 nr_sectors = bio_sectors(bio);
2648 cur_nr_sectors = bio_cur_sectors(bio); 2647 cur_nr_sectors = bio_cur_sectors(bio);
2649 prio = bio_prio(bio); 2648 prio = bio_prio(bio);
2650 2649
2651 rw = bio_data_dir(bio); 2650 rw = bio_data_dir(bio);
2652 sync = bio_sync(bio); 2651 sync = bio_sync(bio);
2653 2652
2654 /* 2653 /*
2655 * low level driver can indicate that it wants pages above a 2654 * low level driver can indicate that it wants pages above a
2656 * certain limit bounced to low memory (i.e. for highmem, or even 2655 * certain limit bounced to low memory (i.e. for highmem, or even
2657 * ISA dma in theory) 2656 * ISA dma in theory)
2658 */ 2657 */
2659 blk_queue_bounce(q, &bio); 2658 blk_queue_bounce(q, &bio);
2660 2659
2661 spin_lock_prefetch(q->queue_lock); 2660 spin_lock_prefetch(q->queue_lock);
2662 2661
2663 barrier = bio_barrier(bio); 2662 barrier = bio_barrier(bio);
2664 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { 2663 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) {
2665 err = -EOPNOTSUPP; 2664 err = -EOPNOTSUPP;
2666 goto end_io; 2665 goto end_io;
2667 } 2666 }
2668 2667
2669 spin_lock_irq(q->queue_lock); 2668 spin_lock_irq(q->queue_lock);
2670 2669
2671 if (unlikely(barrier) || elv_queue_empty(q)) 2670 if (unlikely(barrier) || elv_queue_empty(q))
2672 goto get_rq; 2671 goto get_rq;
2673 2672
2674 el_ret = elv_merge(q, &req, bio); 2673 el_ret = elv_merge(q, &req, bio);
2675 switch (el_ret) { 2674 switch (el_ret) {
2676 case ELEVATOR_BACK_MERGE: 2675 case ELEVATOR_BACK_MERGE:
2677 BUG_ON(!rq_mergeable(req)); 2676 BUG_ON(!rq_mergeable(req));
2678 2677
2679 if (!q->back_merge_fn(q, req, bio)) 2678 if (!q->back_merge_fn(q, req, bio))
2680 break; 2679 break;
2681 2680
2682 req->biotail->bi_next = bio; 2681 req->biotail->bi_next = bio;
2683 req->biotail = bio; 2682 req->biotail = bio;
2684 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2683 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2685 req->ioprio = ioprio_best(req->ioprio, prio); 2684 req->ioprio = ioprio_best(req->ioprio, prio);
2686 drive_stat_acct(req, nr_sectors, 0); 2685 drive_stat_acct(req, nr_sectors, 0);
2687 if (!attempt_back_merge(q, req)) 2686 if (!attempt_back_merge(q, req))
2688 elv_merged_request(q, req); 2687 elv_merged_request(q, req);
2689 goto out; 2688 goto out;
2690 2689
2691 case ELEVATOR_FRONT_MERGE: 2690 case ELEVATOR_FRONT_MERGE:
2692 BUG_ON(!rq_mergeable(req)); 2691 BUG_ON(!rq_mergeable(req));
2693 2692
2694 if (!q->front_merge_fn(q, req, bio)) 2693 if (!q->front_merge_fn(q, req, bio))
2695 break; 2694 break;
2696 2695
2697 bio->bi_next = req->bio; 2696 bio->bi_next = req->bio;
2698 req->bio = bio; 2697 req->bio = bio;
2699 2698
2700 /* 2699 /*
2701 * may not be valid. If the low level driver said 2700 * may not be valid. If the low level driver said
2702 * it didn't need a bounce buffer then it had better 2701 * it didn't need a bounce buffer then it had better
2703 * not touch req->buffer either... 2702 * not touch req->buffer either...
2704 */ 2703 */
2705 req->buffer = bio_data(bio); 2704 req->buffer = bio_data(bio);
2706 req->current_nr_sectors = cur_nr_sectors; 2705 req->current_nr_sectors = cur_nr_sectors;
2707 req->hard_cur_sectors = cur_nr_sectors; 2706 req->hard_cur_sectors = cur_nr_sectors;
2708 req->sector = req->hard_sector = sector; 2707 req->sector = req->hard_sector = sector;
2709 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2708 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2710 req->ioprio = ioprio_best(req->ioprio, prio); 2709 req->ioprio = ioprio_best(req->ioprio, prio);
2711 drive_stat_acct(req, nr_sectors, 0); 2710 drive_stat_acct(req, nr_sectors, 0);
2712 if (!attempt_front_merge(q, req)) 2711 if (!attempt_front_merge(q, req))
2713 elv_merged_request(q, req); 2712 elv_merged_request(q, req);
2714 goto out; 2713 goto out;
2715 2714
2716 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 2715 /* ELV_NO_MERGE: elevator says don't/can't merge. */
2717 default: 2716 default:
2718 ; 2717 ;
2719 } 2718 }
2720 2719
2721 get_rq: 2720 get_rq:
2722 /* 2721 /*
2723 * Grab a free request. This might sleep but cannot fail. 2722 * Grab a free request. This might sleep but cannot fail.
2724 * Returns with the queue unlocked. 2723 * Returns with the queue unlocked.
2725 */ 2724 */
2726 req = get_request_wait(q, rw, bio); 2725 req = get_request_wait(q, rw, bio);
2727 2726
2728 /* 2727 /*
2729 * After dropping the lock and possibly sleeping here, our request 2728 * After dropping the lock and possibly sleeping here, our request
2730 * may now be mergeable after it had proven unmergeable (above). 2729 * may now be mergeable after it had proven unmergeable (above).
2731 * We don't worry about that case for efficiency. It won't happen 2730 * We don't worry about that case for efficiency. It won't happen
2732 * often, and the elevators are able to handle it. 2731 * often, and the elevators are able to handle it.
2733 */ 2732 */
2734 2733
2735 req->flags |= REQ_CMD; 2734 req->flags |= REQ_CMD;
2736 2735
2737 /* 2736 /*
2738 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2737 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2739 */ 2738 */
2740 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2739 if (bio_rw_ahead(bio) || bio_failfast(bio))
2741 req->flags |= REQ_FAILFAST; 2740 req->flags |= REQ_FAILFAST;
2742 2741
2743 /* 2742 /*
2744 * REQ_BARRIER implies no merging, but let's make it explicit 2743 * REQ_BARRIER implies no merging, but let's make it explicit
2745 */ 2744 */
2746 if (unlikely(barrier)) 2745 if (unlikely(barrier))
2747 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2746 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2748 2747
2749 req->errors = 0; 2748 req->errors = 0;
2750 req->hard_sector = req->sector = sector; 2749 req->hard_sector = req->sector = sector;
2751 req->hard_nr_sectors = req->nr_sectors = nr_sectors; 2750 req->hard_nr_sectors = req->nr_sectors = nr_sectors;
2752 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors; 2751 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
2753 req->nr_phys_segments = bio_phys_segments(q, bio); 2752 req->nr_phys_segments = bio_phys_segments(q, bio);
2754 req->nr_hw_segments = bio_hw_segments(q, bio); 2753 req->nr_hw_segments = bio_hw_segments(q, bio);
2755 req->buffer = bio_data(bio); /* see ->buffer comment above */ 2754 req->buffer = bio_data(bio); /* see ->buffer comment above */
2756 req->waiting = NULL; 2755 req->waiting = NULL;
2757 req->bio = req->biotail = bio; 2756 req->bio = req->biotail = bio;
2758 req->ioprio = prio; 2757 req->ioprio = prio;
2759 req->rq_disk = bio->bi_bdev->bd_disk; 2758 req->rq_disk = bio->bi_bdev->bd_disk;
2760 req->start_time = jiffies; 2759 req->start_time = jiffies;
2761 2760
2762 spin_lock_irq(q->queue_lock); 2761 spin_lock_irq(q->queue_lock);
2763 if (elv_queue_empty(q)) 2762 if (elv_queue_empty(q))
2764 blk_plug_device(q); 2763 blk_plug_device(q);
2765 add_request(q, req); 2764 add_request(q, req);
2766 out: 2765 out:
2767 if (sync) 2766 if (sync)
2768 __generic_unplug_device(q); 2767 __generic_unplug_device(q);
2769 2768
2770 spin_unlock_irq(q->queue_lock); 2769 spin_unlock_irq(q->queue_lock);
2771 return 0; 2770 return 0;
2772 2771
2773 end_io: 2772 end_io:
2774 bio_endio(bio, nr_sectors << 9, err); 2773 bio_endio(bio, nr_sectors << 9, err);
2775 return 0; 2774 return 0;
2776 } 2775 }
2777 2776
2778 /* 2777 /*
2778 * If bio->bi_bdev is a partition, remap the location 2777 * If bio->bi_bdev is a partition, remap the location
2780 */ 2779 */
2781 static inline void blk_partition_remap(struct bio *bio) 2780 static inline void blk_partition_remap(struct bio *bio)
2782 { 2781 {
2783 struct block_device *bdev = bio->bi_bdev; 2782 struct block_device *bdev = bio->bi_bdev;
2784 2783
2785 if (bdev != bdev->bd_contains) { 2784 if (bdev != bdev->bd_contains) {
2786 struct hd_struct *p = bdev->bd_part; 2785 struct hd_struct *p = bdev->bd_part;
2787 const int rw = bio_data_dir(bio); 2786 const int rw = bio_data_dir(bio);
2788 2787
2789 p->sectors[rw] += bio_sectors(bio); 2788 p->sectors[rw] += bio_sectors(bio);
2790 p->ios[rw]++; 2789 p->ios[rw]++;
2791 2790
2792 bio->bi_sector += p->start_sect; 2791 bio->bi_sector += p->start_sect;
2793 bio->bi_bdev = bdev->bd_contains; 2792 bio->bi_bdev = bdev->bd_contains;
2794 } 2793 }
2795 } 2794 }
2796 2795
2797 static void handle_bad_sector(struct bio *bio) 2796 static void handle_bad_sector(struct bio *bio)
2798 { 2797 {
2799 char b[BDEVNAME_SIZE]; 2798 char b[BDEVNAME_SIZE];
2800 2799
2801 printk(KERN_INFO "attempt to access beyond end of device\n"); 2800 printk(KERN_INFO "attempt to access beyond end of device\n");
2802 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 2801 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
2803 bdevname(bio->bi_bdev, b), 2802 bdevname(bio->bi_bdev, b),
2804 bio->bi_rw, 2803 bio->bi_rw,
2805 (unsigned long long)bio->bi_sector + bio_sectors(bio), 2804 (unsigned long long)bio->bi_sector + bio_sectors(bio),
2806 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 2805 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
2807 2806
2808 set_bit(BIO_EOF, &bio->bi_flags); 2807 set_bit(BIO_EOF, &bio->bi_flags);
2809 } 2808 }
2810 2809
2811 /** 2810 /**
2812 * generic_make_request: hand a buffer to its device driver for I/O 2811 * generic_make_request: hand a buffer to its device driver for I/O
2813 * @bio: The bio describing the location in memory and on the device. 2812 * @bio: The bio describing the location in memory and on the device.
2814 * 2813 *
2815 * generic_make_request() is used to make I/O requests of block 2814 * generic_make_request() is used to make I/O requests of block
2816 * devices. It is passed a &struct bio, which describes the I/O that needs 2815 * devices. It is passed a &struct bio, which describes the I/O that needs
2817 * to be done. 2816 * to be done.
2818 * 2817 *
2819 * generic_make_request() does not return any status. The 2818 * generic_make_request() does not return any status. The
2820 * success/failure status of the request, along with notification of 2819 * success/failure status of the request, along with notification of
2821 * completion, is delivered asynchronously through the bio->bi_end_io 2820 * completion, is delivered asynchronously through the bio->bi_end_io
2822 * function described (one day) elsewhere. 2821 * function described (one day) elsewhere.
2823 * 2822 *
2824 * The caller of generic_make_request must make sure that bi_io_vec 2823 * The caller of generic_make_request must make sure that bi_io_vec
2825 * are set to describe the memory buffer, and that bi_bdev and bi_sector are 2824 * are set to describe the memory buffer, and that bi_bdev and bi_sector are
2826 * set to describe the device address, and the 2825 * set to describe the device address, and the
2827 * bi_end_io and optionally bi_private are set to describe how 2826 * bi_end_io and optionally bi_private are set to describe how
2828 * completion notification should be signaled. 2827 * completion notification should be signaled.
2829 * 2828 *
2830 * generic_make_request and the drivers it calls may use bi_next if this 2829 * generic_make_request and the drivers it calls may use bi_next if this
2831 * bio happens to be merged with someone else, and may change bi_bdev and 2830 * bio happens to be merged with someone else, and may change bi_bdev and
2832 * bi_sector for remaps as it sees fit. So the values of these fields 2831 * bi_sector for remaps as it sees fit. So the values of these fields
2833 * should NOT be depended on after the call to generic_make_request. 2832 * should NOT be depended on after the call to generic_make_request.
2834 */ 2833 */
2835 void generic_make_request(struct bio *bio) 2834 void generic_make_request(struct bio *bio)
2836 { 2835 {
2837 request_queue_t *q; 2836 request_queue_t *q;
2838 sector_t maxsector; 2837 sector_t maxsector;
2839 int ret, nr_sectors = bio_sectors(bio); 2838 int ret, nr_sectors = bio_sectors(bio);
2840 2839
2841 might_sleep(); 2840 might_sleep();
2842 /* Test device or partition size, when known. */ 2841 /* Test device or partition size, when known. */
2843 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 2842 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
2844 if (maxsector) { 2843 if (maxsector) {
2845 sector_t sector = bio->bi_sector; 2844 sector_t sector = bio->bi_sector;
2846 2845
2847 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 2846 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
2848 /* 2847 /*
2849 * This may well happen - the kernel calls bread() 2848 * This may well happen - the kernel calls bread()
2850 * without checking the size of the device, e.g., when 2849 * without checking the size of the device, e.g., when
2851 * mounting a device. 2850 * mounting a device.
2852 */ 2851 */
2853 handle_bad_sector(bio); 2852 handle_bad_sector(bio);
2854 goto end_io; 2853 goto end_io;
2855 } 2854 }
2856 } 2855 }
2857 2856
2858 /* 2857 /*
2859 * Resolve the mapping until finished. (drivers are 2858 * Resolve the mapping until finished. (drivers are
2860 * still free to implement/resolve their own stacking 2859 * still free to implement/resolve their own stacking
2861 * by explicitly returning 0) 2860 * by explicitly returning 0)
2862 * 2861 *
2863 * NOTE: we don't repeat the blk_size check for each new device. 2862 * NOTE: we don't repeat the blk_size check for each new device.
2864 * Stacking drivers are expected to know what they are doing. 2863 * Stacking drivers are expected to know what they are doing.
2865 */ 2864 */
2866 do { 2865 do {
2867 char b[BDEVNAME_SIZE]; 2866 char b[BDEVNAME_SIZE];
2868 2867
2869 q = bdev_get_queue(bio->bi_bdev); 2868 q = bdev_get_queue(bio->bi_bdev);
2870 if (!q) { 2869 if (!q) {
2871 printk(KERN_ERR 2870 printk(KERN_ERR
2872 "generic_make_request: Trying to access " 2871 "generic_make_request: Trying to access "
2873 "nonexistent block-device %s (%Lu)\n", 2872 "nonexistent block-device %s (%Lu)\n",
2874 bdevname(bio->bi_bdev, b), 2873 bdevname(bio->bi_bdev, b),
2875 (long long) bio->bi_sector); 2874 (long long) bio->bi_sector);
2876 end_io: 2875 end_io:
2877 bio_endio(bio, bio->bi_size, -EIO); 2876 bio_endio(bio, bio->bi_size, -EIO);
2878 break; 2877 break;
2879 } 2878 }
2880 2879
2881 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { 2880 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
2882 printk("bio too big device %s (%u > %u)\n", 2881 printk("bio too big device %s (%u > %u)\n",
2883 bdevname(bio->bi_bdev, b), 2882 bdevname(bio->bi_bdev, b),
2884 bio_sectors(bio), 2883 bio_sectors(bio),
2885 q->max_hw_sectors); 2884 q->max_hw_sectors);
2886 goto end_io; 2885 goto end_io;
2887 } 2886 }
2888 2887
2889 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 2888 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
2890 goto end_io; 2889 goto end_io;
2891 2890
2892 /* 2891 /*
2893 * If this device has partitions, remap block n 2892 * If this device has partitions, remap block n
2894 * of partition p to block n+start(p) of the disk. 2893 * of partition p to block n+start(p) of the disk.
2895 */ 2894 */
2896 blk_partition_remap(bio); 2895 blk_partition_remap(bio);
2897 2896
2898 ret = q->make_request_fn(q, bio); 2897 ret = q->make_request_fn(q, bio);
2899 } while (ret); 2898 } while (ret);
2900 } 2899 }
2901 2900
2902 EXPORT_SYMBOL(generic_make_request); 2901 EXPORT_SYMBOL(generic_make_request);
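The comment above notes that stacking drivers may remap bi_bdev/bi_sector, and the do/while loop shows that a non-zero return from ->make_request_fn makes generic_make_request() resubmit the (possibly remapped) bio. As a minimal sketch under that convention, and not part of this patch, a hypothetical remapping driver could look like the following; struct my_dev, my_remap_mr_fn and the queuedata layout are illustrative assumptions only:

#include <linux/blkdev.h>
#include <linux/bio.h>

struct my_dev {				/* hypothetical per-device state */
	struct block_device *backing_bdev;	/* the real device below us */
	sector_t start_sector;			/* offset into that device */
};

static int my_remap_mr_fn(request_queue_t *q, struct bio *bio)
{
	struct my_dev *md = q->queuedata;

	bio->bi_bdev = md->backing_bdev;	/* redirect to the lower device */
	bio->bi_sector += md->start_sector;	/* apply the remap offset */
	return 1;	/* ask generic_make_request() to resubmit the bio */
}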
2903 2902
2904 /** 2903 /**
2905 * submit_bio: submit a bio to the block device layer for I/O 2904 * submit_bio: submit a bio to the block device layer for I/O
2906 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 2905 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
2907 * @bio: The &struct bio which describes the I/O 2906 * @bio: The &struct bio which describes the I/O
2908 * 2907 *
2909 * submit_bio() is very similar in purpose to generic_make_request(), and 2908 * submit_bio() is very similar in purpose to generic_make_request(), and
2910 * uses that function to do most of the work. Both are fairly rough 2909 * uses that function to do most of the work. Both are fairly rough
2911 * interfaces, @bio must be presetup and ready for I/O. 2910 * interfaces, @bio must be presetup and ready for I/O.
2912 * 2911 *
2913 */ 2912 */
2914 void submit_bio(int rw, struct bio *bio) 2913 void submit_bio(int rw, struct bio *bio)
2915 { 2914 {
2916 int count = bio_sectors(bio); 2915 int count = bio_sectors(bio);
2917 2916
2918 BIO_BUG_ON(!bio->bi_size); 2917 BIO_BUG_ON(!bio->bi_size);
2919 BIO_BUG_ON(!bio->bi_io_vec); 2918 BIO_BUG_ON(!bio->bi_io_vec);
2920 bio->bi_rw |= rw; 2919 bio->bi_rw |= rw;
2921 if (rw & WRITE) 2920 if (rw & WRITE)
2922 mod_page_state(pgpgout, count); 2921 mod_page_state(pgpgout, count);
2923 else 2922 else
2924 mod_page_state(pgpgin, count); 2923 mod_page_state(pgpgin, count);
2925 2924
2926 if (unlikely(block_dump)) { 2925 if (unlikely(block_dump)) {
2927 char b[BDEVNAME_SIZE]; 2926 char b[BDEVNAME_SIZE];
2928 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 2927 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
2929 current->comm, current->pid, 2928 current->comm, current->pid,
2930 (rw & WRITE) ? "WRITE" : "READ", 2929 (rw & WRITE) ? "WRITE" : "READ",
2931 (unsigned long long)bio->bi_sector, 2930 (unsigned long long)bio->bi_sector,
2932 bdevname(bio->bi_bdev,b)); 2931 bdevname(bio->bi_bdev,b));
2933 } 2932 }
2934 2933
2935 generic_make_request(bio); 2934 generic_make_request(bio);
2936 } 2935 }
2937 2936
2938 EXPORT_SYMBOL(submit_bio); 2937 EXPORT_SYMBOL(submit_bio);
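A minimal caller sketch, not part of this patch: it shows the bio fields submit_bio()/generic_make_request() expect to be preset (bi_bdev, bi_sector, bi_end_io, bi_private). my_end_io() and my_read_page() are hypothetical names, and error handling is pared down to the essentials:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/mm.h>

/* hypothetical completion callback: returns 1 until the bio is fully done */
static int my_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)
		return 1;		/* partial completion, keep waiting */

	complete((struct completion *) bio->bi_private);
	return 0;
}

/* hypothetical helper: synchronously read one page from @bdev at @sector */
static int my_read_page(struct block_device *bdev, sector_t sector,
			struct page *page)
{
	struct completion done;
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return -ENOMEM;

	init_completion(&done);
	bio->bi_bdev = bdev;			/* device address ... */
	bio->bi_sector = sector;		/* ... and starting sector */
	bio->bi_end_io = my_end_io;		/* completion notification */
	bio->bi_private = &done;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);			/* hands off to generic_make_request() */
	wait_for_completion(&done);
	bio_put(bio);
	return 0;
}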
2939 2938
2940 static void blk_recalc_rq_segments(struct request *rq) 2939 static void blk_recalc_rq_segments(struct request *rq)
2941 { 2940 {
2942 struct bio *bio, *prevbio = NULL; 2941 struct bio *bio, *prevbio = NULL;
2943 int nr_phys_segs, nr_hw_segs; 2942 int nr_phys_segs, nr_hw_segs;
2944 unsigned int phys_size, hw_size; 2943 unsigned int phys_size, hw_size;
2945 request_queue_t *q = rq->q; 2944 request_queue_t *q = rq->q;
2946 2945
2947 if (!rq->bio) 2946 if (!rq->bio)
2948 return; 2947 return;
2949 2948
2950 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 2949 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
2951 rq_for_each_bio(bio, rq) { 2950 rq_for_each_bio(bio, rq) {
2952 /* Force bio hw/phys segs to be recalculated. */ 2951 /* Force bio hw/phys segs to be recalculated. */
2953 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 2952 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
2954 2953
2955 nr_phys_segs += bio_phys_segments(q, bio); 2954 nr_phys_segs += bio_phys_segments(q, bio);
2956 nr_hw_segs += bio_hw_segments(q, bio); 2955 nr_hw_segs += bio_hw_segments(q, bio);
2957 if (prevbio) { 2956 if (prevbio) {
2958 int pseg = phys_size + prevbio->bi_size + bio->bi_size; 2957 int pseg = phys_size + prevbio->bi_size + bio->bi_size;
2959 int hseg = hw_size + prevbio->bi_size + bio->bi_size; 2958 int hseg = hw_size + prevbio->bi_size + bio->bi_size;
2960 2959
2961 if (blk_phys_contig_segment(q, prevbio, bio) && 2960 if (blk_phys_contig_segment(q, prevbio, bio) &&
2962 pseg <= q->max_segment_size) { 2961 pseg <= q->max_segment_size) {
2963 nr_phys_segs--; 2962 nr_phys_segs--;
2964 phys_size += prevbio->bi_size + bio->bi_size; 2963 phys_size += prevbio->bi_size + bio->bi_size;
2965 } else 2964 } else
2966 phys_size = 0; 2965 phys_size = 0;
2967 2966
2968 if (blk_hw_contig_segment(q, prevbio, bio) && 2967 if (blk_hw_contig_segment(q, prevbio, bio) &&
2969 hseg <= q->max_segment_size) { 2968 hseg <= q->max_segment_size) {
2970 nr_hw_segs--; 2969 nr_hw_segs--;
2971 hw_size += prevbio->bi_size + bio->bi_size; 2970 hw_size += prevbio->bi_size + bio->bi_size;
2972 } else 2971 } else
2973 hw_size = 0; 2972 hw_size = 0;
2974 } 2973 }
2975 prevbio = bio; 2974 prevbio = bio;
2976 } 2975 }
2977 2976
2978 rq->nr_phys_segments = nr_phys_segs; 2977 rq->nr_phys_segments = nr_phys_segs;
2979 rq->nr_hw_segments = nr_hw_segs; 2978 rq->nr_hw_segments = nr_hw_segs;
2980 } 2979 }
2981 2980
2982 static void blk_recalc_rq_sectors(struct request *rq, int nsect) 2981 static void blk_recalc_rq_sectors(struct request *rq, int nsect)
2983 { 2982 {
2984 if (blk_fs_request(rq)) { 2983 if (blk_fs_request(rq)) {
2985 rq->hard_sector += nsect; 2984 rq->hard_sector += nsect;
2986 rq->hard_nr_sectors -= nsect; 2985 rq->hard_nr_sectors -= nsect;
2987 2986
2988 /* 2987 /*
2989 * Move the I/O submission pointers ahead if required. 2988 * Move the I/O submission pointers ahead if required.
2990 */ 2989 */
2991 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 2990 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
2992 (rq->sector <= rq->hard_sector)) { 2991 (rq->sector <= rq->hard_sector)) {
2993 rq->sector = rq->hard_sector; 2992 rq->sector = rq->hard_sector;
2994 rq->nr_sectors = rq->hard_nr_sectors; 2993 rq->nr_sectors = rq->hard_nr_sectors;
2995 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 2994 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
2996 rq->current_nr_sectors = rq->hard_cur_sectors; 2995 rq->current_nr_sectors = rq->hard_cur_sectors;
2997 rq->buffer = bio_data(rq->bio); 2996 rq->buffer = bio_data(rq->bio);
2998 } 2997 }
2999 2998
3000 /* 2999 /*
3001 * if total number of sectors is less than the first segment 3000 * if total number of sectors is less than the first segment
3002 * size, something has gone terribly wrong 3001 * size, something has gone terribly wrong
3003 */ 3002 */
3004 if (rq->nr_sectors < rq->current_nr_sectors) { 3003 if (rq->nr_sectors < rq->current_nr_sectors) {
3005 printk("blk: request botched\n"); 3004 printk("blk: request botched\n");
3006 rq->nr_sectors = rq->current_nr_sectors; 3005 rq->nr_sectors = rq->current_nr_sectors;
3007 } 3006 }
3008 } 3007 }
3009 } 3008 }
3010 3009
3011 static int __end_that_request_first(struct request *req, int uptodate, 3010 static int __end_that_request_first(struct request *req, int uptodate,
3012 int nr_bytes) 3011 int nr_bytes)
3013 { 3012 {
3014 int total_bytes, bio_nbytes, error, next_idx = 0; 3013 int total_bytes, bio_nbytes, error, next_idx = 0;
3015 struct bio *bio; 3014 struct bio *bio;
3016 3015
3017 /* 3016 /*
3018 * extend uptodate bool to allow < 0 value to be direct io error 3017 * extend uptodate bool to allow < 0 value to be direct io error
3019 */ 3018 */
3020 error = 0; 3019 error = 0;
3021 if (end_io_error(uptodate)) 3020 if (end_io_error(uptodate))
3022 error = !uptodate ? -EIO : uptodate; 3021 error = !uptodate ? -EIO : uptodate;
3023 3022
3024 /* 3023 /*
3025 * for a REQ_BLOCK_PC request, we want to carry any eventual 3024 * for a REQ_BLOCK_PC request, we want to carry any eventual
3026 * sense key with us all the way through 3025 * sense key with us all the way through
3027 */ 3026 */
3028 if (!blk_pc_request(req)) 3027 if (!blk_pc_request(req))
3029 req->errors = 0; 3028 req->errors = 0;
3030 3029
3031 if (!uptodate) { 3030 if (!uptodate) {
3032 if (blk_fs_request(req) && !(req->flags & REQ_QUIET)) 3031 if (blk_fs_request(req) && !(req->flags & REQ_QUIET))
3033 printk("end_request: I/O error, dev %s, sector %llu\n", 3032 printk("end_request: I/O error, dev %s, sector %llu\n",
3034 req->rq_disk ? req->rq_disk->disk_name : "?", 3033 req->rq_disk ? req->rq_disk->disk_name : "?",
3035 (unsigned long long)req->sector); 3034 (unsigned long long)req->sector);
3036 } 3035 }
3037 3036
3038 if (blk_fs_request(req) && req->rq_disk) { 3037 if (blk_fs_request(req) && req->rq_disk) {
3039 const int rw = rq_data_dir(req); 3038 const int rw = rq_data_dir(req);
3040 3039
3041 __disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3040 __disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3042 } 3041 }
3043 3042
3044 total_bytes = bio_nbytes = 0; 3043 total_bytes = bio_nbytes = 0;
3045 while ((bio = req->bio) != NULL) { 3044 while ((bio = req->bio) != NULL) {
3046 int nbytes; 3045 int nbytes;
3047 3046
3048 if (nr_bytes >= bio->bi_size) { 3047 if (nr_bytes >= bio->bi_size) {
3049 req->bio = bio->bi_next; 3048 req->bio = bio->bi_next;
3050 nbytes = bio->bi_size; 3049 nbytes = bio->bi_size;
3051 bio_endio(bio, nbytes, error); 3050 bio_endio(bio, nbytes, error);
3052 next_idx = 0; 3051 next_idx = 0;
3053 bio_nbytes = 0; 3052 bio_nbytes = 0;
3054 } else { 3053 } else {
3055 int idx = bio->bi_idx + next_idx; 3054 int idx = bio->bi_idx + next_idx;
3056 3055
3057 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3056 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3058 blk_dump_rq_flags(req, "__end_that"); 3057 blk_dump_rq_flags(req, "__end_that");
3059 printk("%s: bio idx %d >= vcnt %d\n", 3058 printk("%s: bio idx %d >= vcnt %d\n",
3060 __FUNCTION__, 3059 __FUNCTION__,
3061 bio->bi_idx, bio->bi_vcnt); 3060 bio->bi_idx, bio->bi_vcnt);
3062 break; 3061 break;
3063 } 3062 }
3064 3063
3065 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3064 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3066 BIO_BUG_ON(nbytes > bio->bi_size); 3065 BIO_BUG_ON(nbytes > bio->bi_size);
3067 3066
3068 /* 3067 /*
3069 * not a complete bvec done 3068 * not a complete bvec done
3070 */ 3069 */
3071 if (unlikely(nbytes > nr_bytes)) { 3070 if (unlikely(nbytes > nr_bytes)) {
3072 bio_nbytes += nr_bytes; 3071 bio_nbytes += nr_bytes;
3073 total_bytes += nr_bytes; 3072 total_bytes += nr_bytes;
3074 break; 3073 break;
3075 } 3074 }
3076 3075
3077 /* 3076 /*
3078 * advance to the next vector 3077 * advance to the next vector
3079 */ 3078 */
3080 next_idx++; 3079 next_idx++;
3081 bio_nbytes += nbytes; 3080 bio_nbytes += nbytes;
3082 } 3081 }
3083 3082
3084 total_bytes += nbytes; 3083 total_bytes += nbytes;
3085 nr_bytes -= nbytes; 3084 nr_bytes -= nbytes;
3086 3085
3087 if ((bio = req->bio)) { 3086 if ((bio = req->bio)) {
3088 /* 3087 /*
3089 * end more in this run, or just return 'not-done' 3088 * end more in this run, or just return 'not-done'
3090 */ 3089 */
3091 if (unlikely(nr_bytes <= 0)) 3090 if (unlikely(nr_bytes <= 0))
3092 break; 3091 break;
3093 } 3092 }
3094 } 3093 }
3095 3094
3096 /* 3095 /*
3097 * completely done 3096 * completely done
3098 */ 3097 */
3099 if (!req->bio) 3098 if (!req->bio)
3100 return 0; 3099 return 0;
3101 3100
3102 /* 3101 /*
3103 * if the request wasn't completed, update state 3102 * if the request wasn't completed, update state
3104 */ 3103 */
3105 if (bio_nbytes) { 3104 if (bio_nbytes) {
3106 bio_endio(bio, bio_nbytes, error); 3105 bio_endio(bio, bio_nbytes, error);
3107 bio->bi_idx += next_idx; 3106 bio->bi_idx += next_idx;
3108 bio_iovec(bio)->bv_offset += nr_bytes; 3107 bio_iovec(bio)->bv_offset += nr_bytes;
3109 bio_iovec(bio)->bv_len -= nr_bytes; 3108 bio_iovec(bio)->bv_len -= nr_bytes;
3110 } 3109 }
3111 3110
3112 blk_recalc_rq_sectors(req, total_bytes >> 9); 3111 blk_recalc_rq_sectors(req, total_bytes >> 9);
3113 blk_recalc_rq_segments(req); 3112 blk_recalc_rq_segments(req);
3114 return 1; 3113 return 1;
3115 } 3114 }
3116 3115
3117 /** 3116 /**
3118 * end_that_request_first - end I/O on a request 3117 * end_that_request_first - end I/O on a request
3119 * @req: the request being processed 3118 * @req: the request being processed
3120 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3119 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3121 * @nr_sectors: number of sectors to end I/O on 3120 * @nr_sectors: number of sectors to end I/O on
3122 * 3121 *
3123 * Description: 3122 * Description:
3124 * Ends I/O on a number of sectors attached to @req, and sets it up 3123 * Ends I/O on a number of sectors attached to @req, and sets it up
3125 * for the next range of segments (if any) in the cluster. 3124 * for the next range of segments (if any) in the cluster.
3126 * 3125 *
3127 * Return: 3126 * Return:
3128 * 0 - we are done with this request, call end_that_request_last() 3127 * 0 - we are done with this request, call end_that_request_last()
3129 * 1 - still buffers pending for this request 3128 * 1 - still buffers pending for this request
3130 **/ 3129 **/
3131 int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3130 int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3132 { 3131 {
3133 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3132 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3134 } 3133 }
3135 3134
3136 EXPORT_SYMBOL(end_that_request_first); 3135 EXPORT_SYMBOL(end_that_request_first);
3137 3136
3138 /** 3137 /**
3139 * end_that_request_chunk - end I/O on a request 3138 * end_that_request_chunk - end I/O on a request
3140 * @req: the request being processed 3139 * @req: the request being processed
3141 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3140 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3142 * @nr_bytes: number of bytes to complete 3141 * @nr_bytes: number of bytes to complete
3143 * 3142 *
3144 * Description: 3143 * Description:
3145 * Ends I/O on a number of bytes attached to @req, and sets it up 3144 * Ends I/O on a number of bytes attached to @req, and sets it up
3146 * for the next range of segments (if any). Like end_that_request_first(), 3145 * for the next range of segments (if any). Like end_that_request_first(),
3147 * but deals with bytes instead of sectors. 3146 * but deals with bytes instead of sectors.
3148 * 3147 *
3149 * Return: 3148 * Return:
3150 * 0 - we are done with this request, call end_that_request_last() 3149 * 0 - we are done with this request, call end_that_request_last()
3151 * 1 - still buffers pending for this request 3150 * 1 - still buffers pending for this request
3152 **/ 3151 **/
3153 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3152 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3154 { 3153 {
3155 return __end_that_request_first(req, uptodate, nr_bytes); 3154 return __end_that_request_first(req, uptodate, nr_bytes);
3156 } 3155 }
3157 3156
3158 EXPORT_SYMBOL(end_that_request_chunk); 3157 EXPORT_SYMBOL(end_that_request_chunk);
3159 3158
3160 /* 3159 /*
3161 * queue lock must be held 3160 * queue lock must be held
3162 */ 3161 */
3163 void end_that_request_last(struct request *req) 3162 void end_that_request_last(struct request *req)
3164 { 3163 {
3165 struct gendisk *disk = req->rq_disk; 3164 struct gendisk *disk = req->rq_disk;
3166 3165
3167 if (unlikely(laptop_mode) && blk_fs_request(req)) 3166 if (unlikely(laptop_mode) && blk_fs_request(req))
3168 laptop_io_completion(); 3167 laptop_io_completion();
3169 3168
3170 if (disk && blk_fs_request(req)) { 3169 if (disk && blk_fs_request(req)) {
3171 unsigned long duration = jiffies - req->start_time; 3170 unsigned long duration = jiffies - req->start_time;
3172 const int rw = rq_data_dir(req); 3171 const int rw = rq_data_dir(req);
3173 3172
3174 __disk_stat_inc(disk, ios[rw]); 3173 __disk_stat_inc(disk, ios[rw]);
3175 __disk_stat_add(disk, ticks[rw], duration); 3174 __disk_stat_add(disk, ticks[rw], duration);
3176 disk_round_stats(disk); 3175 disk_round_stats(disk);
3177 disk->in_flight--; 3176 disk->in_flight--;
3178 } 3177 }
3179 if (req->end_io) 3178 if (req->end_io)
3180 req->end_io(req); 3179 req->end_io(req);
3181 else 3180 else
3182 __blk_put_request(req->q, req); 3181 __blk_put_request(req->q, req);
3183 } 3182 }
3184 3183
3185 EXPORT_SYMBOL(end_that_request_last); 3184 EXPORT_SYMBOL(end_that_request_last);
3186 3185
3187 void end_request(struct request *req, int uptodate) 3186 void end_request(struct request *req, int uptodate)
3188 { 3187 {
3189 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3188 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
3190 add_disk_randomness(req->rq_disk); 3189 add_disk_randomness(req->rq_disk);
3191 blkdev_dequeue_request(req); 3190 blkdev_dequeue_request(req);
3192 end_that_request_last(req); 3191 end_that_request_last(req);
3193 } 3192 }
3194 } 3193 }
3195 3194
3196 EXPORT_SYMBOL(end_request); 3195 EXPORT_SYMBOL(end_request);
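Tying the completion helpers together, a rough driver-side sketch (not part of this patch; my_complete_request() is hypothetical and assumes the caller does not yet hold the queue lock): end_that_request_first() returns 1 while buffers are still pending, and end_that_request_last() must be called with the queue lock held, as the comment above notes.

#include <linux/blkdev.h>
#include <linux/spinlock.h>

/* hypothetical: complete an entire request in one go */
static void my_complete_request(request_queue_t *q, struct request *req,
				int uptodate)
{
	unsigned long flags;

	/* returns 1 while buffers are still pending on this request */
	if (end_that_request_first(req, uptodate, req->nr_sectors))
		return;

	spin_lock_irqsave(q->queue_lock, flags);
	blkdev_dequeue_request(req);	/* take it off the queue ... */
	end_that_request_last(req);	/* ... then finish it, lock held */
	spin_unlock_irqrestore(q->queue_lock, flags);
}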
3197 3196
3198 void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio) 3197 void blk_rq_bio_prep(request_queue_t *q, struct request *rq, struct bio *bio)
3199 { 3198 {
3200 /* first three bits are identical in rq->flags and bio->bi_rw */ 3199 /* first three bits are identical in rq->flags and bio->bi_rw */
3201 rq->flags |= (bio->bi_rw & 7); 3200 rq->flags |= (bio->bi_rw & 7);
3202 3201
3203 rq->nr_phys_segments = bio_phys_segments(q, bio); 3202 rq->nr_phys_segments = bio_phys_segments(q, bio);
3204 rq->nr_hw_segments = bio_hw_segments(q, bio); 3203 rq->nr_hw_segments = bio_hw_segments(q, bio);
3205 rq->current_nr_sectors = bio_cur_sectors(bio); 3204 rq->current_nr_sectors = bio_cur_sectors(bio);
3206 rq->hard_cur_sectors = rq->current_nr_sectors; 3205 rq->hard_cur_sectors = rq->current_nr_sectors;
3207 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3206 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3208 rq->buffer = bio_data(bio); 3207 rq->buffer = bio_data(bio);
3209 3208
3210 rq->bio = rq->biotail = bio; 3209 rq->bio = rq->biotail = bio;
3211 } 3210 }
3212 3211
3213 EXPORT_SYMBOL(blk_rq_bio_prep); 3212 EXPORT_SYMBOL(blk_rq_bio_prep);
3214 3213
3215 int kblockd_schedule_work(struct work_struct *work) 3214 int kblockd_schedule_work(struct work_struct *work)
3216 { 3215 {
3217 return queue_work(kblockd_workqueue, work); 3216 return queue_work(kblockd_workqueue, work);
3218 } 3217 }
3219 3218
3220 EXPORT_SYMBOL(kblockd_schedule_work); 3219 EXPORT_SYMBOL(kblockd_schedule_work);
3221 3220
3222 void kblockd_flush(void) 3221 void kblockd_flush(void)
3223 { 3222 {
3224 flush_workqueue(kblockd_workqueue); 3223 flush_workqueue(kblockd_workqueue);
3225 } 3224 }
3226 EXPORT_SYMBOL(kblockd_flush); 3225 EXPORT_SYMBOL(kblockd_flush);
3227 3226
3228 int __init blk_dev_init(void) 3227 int __init blk_dev_init(void)
3229 { 3228 {
3230 kblockd_workqueue = create_workqueue("kblockd"); 3229 kblockd_workqueue = create_workqueue("kblockd");
3231 if (!kblockd_workqueue) 3230 if (!kblockd_workqueue)
3232 panic("Failed to create kblockd\n"); 3231 panic("Failed to create kblockd\n");
3233 3232
3234 request_cachep = kmem_cache_create("blkdev_requests", 3233 request_cachep = kmem_cache_create("blkdev_requests",
3235 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL); 3234 sizeof(struct request), 0, SLAB_PANIC, NULL, NULL);
3236 3235
3237 requestq_cachep = kmem_cache_create("blkdev_queue", 3236 requestq_cachep = kmem_cache_create("blkdev_queue",
3238 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL); 3237 sizeof(request_queue_t), 0, SLAB_PANIC, NULL, NULL);
3239 3238
3240 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3239 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3241 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL); 3240 sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
3242 3241
3243 blk_max_low_pfn = max_low_pfn; 3242 blk_max_low_pfn = max_low_pfn;
3244 blk_max_pfn = max_pfn; 3243 blk_max_pfn = max_pfn;
3245 3244
3246 return 0; 3245 return 0;
3247 } 3246 }
3248 3247
3249 /* 3248 /*
3250 * IO Context helper functions 3249 * IO Context helper functions
3251 */ 3250 */
3252 void put_io_context(struct io_context *ioc) 3251 void put_io_context(struct io_context *ioc)
3253 { 3252 {
3254 if (ioc == NULL) 3253 if (ioc == NULL)
3255 return; 3254 return;
3256 3255
3257 BUG_ON(atomic_read(&ioc->refcount) == 0); 3256 BUG_ON(atomic_read(&ioc->refcount) == 0);
3258 3257
3259 if (atomic_dec_and_test(&ioc->refcount)) { 3258 if (atomic_dec_and_test(&ioc->refcount)) {
3260 if (ioc->aic && ioc->aic->dtor) 3259 if (ioc->aic && ioc->aic->dtor)
3261 ioc->aic->dtor(ioc->aic); 3260 ioc->aic->dtor(ioc->aic);
3262 if (ioc->cic && ioc->cic->dtor) 3261 if (ioc->cic && ioc->cic->dtor)
3263 ioc->cic->dtor(ioc->cic); 3262 ioc->cic->dtor(ioc->cic);
3264 3263
3265 kmem_cache_free(iocontext_cachep, ioc); 3264 kmem_cache_free(iocontext_cachep, ioc);
3266 } 3265 }
3267 } 3266 }
3268 EXPORT_SYMBOL(put_io_context); 3267 EXPORT_SYMBOL(put_io_context);
3269 3268
3270 /* Called by the exiting task */ 3269 /* Called by the exiting task */
3270 /* Called by the exiting task */ 3269 /* Called by the exiting task */
3271 void exit_io_context(void) 3270 void exit_io_context(void)
3272 { 3271 {
3273 unsigned long flags; 3272 unsigned long flags;
3274 struct io_context *ioc; 3273 struct io_context *ioc;
3275 3274
3276 local_irq_save(flags); 3275 local_irq_save(flags);
3277 task_lock(current); 3276 task_lock(current);
3278 ioc = current->io_context; 3277 ioc = current->io_context;
3279 current->io_context = NULL; 3278 current->io_context = NULL;
3280 ioc->task = NULL; 3279 ioc->task = NULL;
3281 task_unlock(current); 3280 task_unlock(current);
3282 local_irq_restore(flags); 3281 local_irq_restore(flags);
3283 3282
3284 if (ioc->aic && ioc->aic->exit) 3283 if (ioc->aic && ioc->aic->exit)
3285 ioc->aic->exit(ioc->aic); 3284 ioc->aic->exit(ioc->aic);
3286 if (ioc->cic && ioc->cic->exit) 3285 if (ioc->cic && ioc->cic->exit)
3287 ioc->cic->exit(ioc->cic); 3286 ioc->cic->exit(ioc->cic);
3288 3287
3289 put_io_context(ioc); 3288 put_io_context(ioc);
3290 } 3289 }
3291 3290
3292 /* 3291 /*
3293 * If the current task has no IO context then create one and initialise it. 3292 * If the current task has no IO context then create one and initialise it.
3294 * Otherwise, return its existing IO context. 3293 * Otherwise, return its existing IO context.
3295 * 3294 *
3296 * This returned IO context doesn't have a specifically elevated refcount, 3295 * This returned IO context doesn't have a specifically elevated refcount,
3297 * but since the current task itself holds a reference, the context can be 3296 * but since the current task itself holds a reference, the context can be
3298 * used in general code, so long as it stays within `current` context. 3297 * used in general code, so long as it stays within `current` context.
3299 */ 3298 */
3300 struct io_context *current_io_context(gfp_t gfp_flags) 3299 struct io_context *current_io_context(gfp_t gfp_flags)
3301 { 3300 {
3302 struct task_struct *tsk = current; 3301 struct task_struct *tsk = current;
3303 struct io_context *ret; 3302 struct io_context *ret;
3304 3303
3305 ret = tsk->io_context; 3304 ret = tsk->io_context;
3306 if (likely(ret)) 3305 if (likely(ret))
3307 return ret; 3306 return ret;
3308 3307
3309 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); 3308 ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
3310 if (ret) { 3309 if (ret) {
3311 atomic_set(&ret->refcount, 1); 3310 atomic_set(&ret->refcount, 1);
3312 ret->task = current; 3311 ret->task = current;
3313 ret->set_ioprio = NULL; 3312 ret->set_ioprio = NULL;
3314 ret->last_waited = jiffies; /* doesn't matter... */ 3313 ret->last_waited = jiffies; /* doesn't matter... */
3315 ret->nr_batch_requests = 0; /* because this is 0 */ 3314 ret->nr_batch_requests = 0; /* because this is 0 */
3316 ret->aic = NULL; 3315 ret->aic = NULL;
3317 ret->cic = NULL; 3316 ret->cic = NULL;
3318 tsk->io_context = ret; 3317 tsk->io_context = ret;
3319 } 3318 }
3320 3319
3321 return ret; 3320 return ret;
3322 } 3321 }
3323 EXPORT_SYMBOL(current_io_context); 3322 EXPORT_SYMBOL(current_io_context);
3324 3323
3325 /* 3324 /*
3326 * If the current task has no IO context then create one and initialise it. 3325 * If the current task has no IO context then create one and initialise it.
3327 * If it does have a context, take a ref on it. 3326 * If it does have a context, take a ref on it.
3328 * 3327 *
3329 * This is always called in the context of the task which submitted the I/O. 3328 * This is always called in the context of the task which submitted the I/O.
3330 */ 3329 */
3331 struct io_context *get_io_context(gfp_t gfp_flags) 3330 struct io_context *get_io_context(gfp_t gfp_flags)
3332 { 3331 {
3333 struct io_context *ret; 3332 struct io_context *ret;
3334 ret = current_io_context(gfp_flags); 3333 ret = current_io_context(gfp_flags);
3335 if (likely(ret)) 3334 if (likely(ret))
3336 atomic_inc(&ret->refcount); 3335 atomic_inc(&ret->refcount);
3337 return ret; 3336 return ret;
3338 } 3337 }
3339 EXPORT_SYMBOL(get_io_context); 3338 EXPORT_SYMBOL(get_io_context);
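A small usage sketch, not part of this patch: get_io_context() returns the submitting task's io_context with an extra reference (creating it on first use via current_io_context()), and every such reference must be balanced by put_io_context(). my_pin_ioc() is a hypothetical name:

#include <linux/blkdev.h>

/* hypothetical: briefly pin the current task's io_context */
static void my_pin_ioc(void)
{
	struct io_context *ioc = get_io_context(GFP_ATOMIC);

	if (!ioc)
		return;			/* allocation failed, nothing to pin */

	/* ... e.g. stash ioc in a per-request structure for the elevator ... */

	put_io_context(ioc);		/* balance the reference from get_io_context() */
}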
3340 3339
3341 void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3340 void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3342 { 3341 {
3343 struct io_context *src = *psrc; 3342 struct io_context *src = *psrc;
3344 struct io_context *dst = *pdst; 3343 struct io_context *dst = *pdst;
3345 3344
3346 if (src) { 3345 if (src) {
3347 BUG_ON(atomic_read(&src->refcount) == 0); 3346 BUG_ON(atomic_read(&src->refcount) == 0);
3348 atomic_inc(&src->refcount); 3347 atomic_inc(&src->refcount);
3349 put_io_context(dst); 3348 put_io_context(dst);
3350 *pdst = src; 3349 *pdst = src;
3351 } 3350 }
3352 } 3351 }
3353 EXPORT_SYMBOL(copy_io_context); 3352 EXPORT_SYMBOL(copy_io_context);
3354 3353
3355 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3354 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3356 { 3355 {
3357 struct io_context *temp; 3356 struct io_context *temp;
3358 temp = *ioc1; 3357 temp = *ioc1;
3359 *ioc1 = *ioc2; 3358 *ioc1 = *ioc2;
3360 *ioc2 = temp; 3359 *ioc2 = temp;
3361 } 3360 }
3362 EXPORT_SYMBOL(swap_io_context); 3361 EXPORT_SYMBOL(swap_io_context);
3363 3362
3364 /* 3363 /*
3365 * sysfs parts below 3364 * sysfs parts below
3366 */ 3365 */
3367 struct queue_sysfs_entry { 3366 struct queue_sysfs_entry {
3368 struct attribute attr; 3367 struct attribute attr;
3369 ssize_t (*show)(struct request_queue *, char *); 3368 ssize_t (*show)(struct request_queue *, char *);
3370 ssize_t (*store)(struct request_queue *, const char *, size_t); 3369 ssize_t (*store)(struct request_queue *, const char *, size_t);
3371 }; 3370 };
3372 3371
3373 static ssize_t 3372 static ssize_t
3374 queue_var_show(unsigned int var, char *page) 3373 queue_var_show(unsigned int var, char *page)
3375 { 3374 {
3376 return sprintf(page, "%d\n", var); 3375 return sprintf(page, "%d\n", var);
3377 } 3376 }
3378 3377
3379 static ssize_t 3378 static ssize_t
3380 queue_var_store(unsigned long *var, const char *page, size_t count) 3379 queue_var_store(unsigned long *var, const char *page, size_t count)
3381 { 3380 {
3382 char *p = (char *) page; 3381 char *p = (char *) page;
3383 3382
3384 *var = simple_strtoul(p, &p, 10); 3383 *var = simple_strtoul(p, &p, 10);
3385 return count; 3384 return count;
3386 } 3385 }
3387 3386
3388 static ssize_t queue_requests_show(struct request_queue *q, char *page) 3387 static ssize_t queue_requests_show(struct request_queue *q, char *page)
3389 { 3388 {
3390 return queue_var_show(q->nr_requests, (page)); 3389 return queue_var_show(q->nr_requests, (page));
3391 } 3390 }
3392 3391
3393 static ssize_t 3392 static ssize_t
3394 queue_requests_store(struct request_queue *q, const char *page, size_t count) 3393 queue_requests_store(struct request_queue *q, const char *page, size_t count)
3395 { 3394 {
3396 struct request_list *rl = &q->rq; 3395 struct request_list *rl = &q->rq;
3397 3396
3398 int ret = queue_var_store(&q->nr_requests, page, count); 3397 int ret = queue_var_store(&q->nr_requests, page, count);
3399 if (q->nr_requests < BLKDEV_MIN_RQ) 3398 if (q->nr_requests < BLKDEV_MIN_RQ)
3400 q->nr_requests = BLKDEV_MIN_RQ; 3399 q->nr_requests = BLKDEV_MIN_RQ;
3401 blk_queue_congestion_threshold(q); 3400 blk_queue_congestion_threshold(q);
3402 3401
3403 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3402 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3404 set_queue_congested(q, READ); 3403 set_queue_congested(q, READ);
3405 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3404 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3406 clear_queue_congested(q, READ); 3405 clear_queue_congested(q, READ);
3407 3406
3408 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3407 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3409 set_queue_congested(q, WRITE); 3408 set_queue_congested(q, WRITE);
3410 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3409 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3411 clear_queue_congested(q, WRITE); 3410 clear_queue_congested(q, WRITE);
3412 3411
3413 if (rl->count[READ] >= q->nr_requests) { 3412 if (rl->count[READ] >= q->nr_requests) {
3414 blk_set_queue_full(q, READ); 3413 blk_set_queue_full(q, READ);
3415 } else if (rl->count[READ]+1 <= q->nr_requests) { 3414 } else if (rl->count[READ]+1 <= q->nr_requests) {
3416 blk_clear_queue_full(q, READ); 3415 blk_clear_queue_full(q, READ);
3417 wake_up(&rl->wait[READ]); 3416 wake_up(&rl->wait[READ]);
3418 } 3417 }
3419 3418
3420 if (rl->count[WRITE] >= q->nr_requests) { 3419 if (rl->count[WRITE] >= q->nr_requests) {
3421 blk_set_queue_full(q, WRITE); 3420 blk_set_queue_full(q, WRITE);
3422 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3421 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3423 blk_clear_queue_full(q, WRITE); 3422 blk_clear_queue_full(q, WRITE);
3424 wake_up(&rl->wait[WRITE]); 3423 wake_up(&rl->wait[WRITE]);
3425 } 3424 }
3426 return ret; 3425 return ret;
3427 } 3426 }
3428 3427
3429 static ssize_t queue_ra_show(struct request_queue *q, char *page) 3428 static ssize_t queue_ra_show(struct request_queue *q, char *page)
3430 { 3429 {
3431 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3430 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3432 3431
3433 return queue_var_show(ra_kb, (page)); 3432 return queue_var_show(ra_kb, (page));
3434 } 3433 }
3435 3434
3436 static ssize_t 3435 static ssize_t
3437 queue_ra_store(struct request_queue *q, const char *page, size_t count) 3436 queue_ra_store(struct request_queue *q, const char *page, size_t count)
3438 { 3437 {
3439 unsigned long ra_kb; 3438 unsigned long ra_kb;
3440 ssize_t ret = queue_var_store(&ra_kb, page, count); 3439 ssize_t ret = queue_var_store(&ra_kb, page, count);
3441 3440
3442 spin_lock_irq(q->queue_lock); 3441 spin_lock_irq(q->queue_lock);
3443 if (ra_kb > (q->max_sectors >> 1)) 3442 if (ra_kb > (q->max_sectors >> 1))
3444 ra_kb = (q->max_sectors >> 1); 3443 ra_kb = (q->max_sectors >> 1);
3445 3444
3446 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 3445 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3447 spin_unlock_irq(q->queue_lock); 3446 spin_unlock_irq(q->queue_lock);
3448 3447
3449 return ret; 3448 return ret;
3450 } 3449 }
3451 3450
3452 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 3451 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3453 { 3452 {
3454 int max_sectors_kb = q->max_sectors >> 1; 3453 int max_sectors_kb = q->max_sectors >> 1;
3455 3454
3456 return queue_var_show(max_sectors_kb, (page)); 3455 return queue_var_show(max_sectors_kb, (page));
3457 } 3456 }
3458 3457
3459 static ssize_t 3458 static ssize_t
3460 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 3459 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
3461 { 3460 {
3462 unsigned long max_sectors_kb, 3461 unsigned long max_sectors_kb,
3463 max_hw_sectors_kb = q->max_hw_sectors >> 1, 3462 max_hw_sectors_kb = q->max_hw_sectors >> 1,
3464 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 3463 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
3465 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 3464 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
3466 int ra_kb; 3465 int ra_kb;
3467 3466
3468 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 3467 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
3469 return -EINVAL; 3468 return -EINVAL;
3470 /* 3469 /*
3471 * Take the queue lock to update the readahead and max_sectors 3470 * Take the queue lock to update the readahead and max_sectors
3472 * values synchronously: 3471 * values synchronously:
3473 */ 3472 */
3474 spin_lock_irq(q->queue_lock); 3473 spin_lock_irq(q->queue_lock);
3475 /* 3474 /*
3476 * Trim readahead window as well, if necessary: 3475 * Trim readahead window as well, if necessary:
3477 */ 3476 */
3478 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3477 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3479 if (ra_kb > max_sectors_kb) 3478 if (ra_kb > max_sectors_kb)
3480 q->backing_dev_info.ra_pages = 3479 q->backing_dev_info.ra_pages =
3481 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 3480 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
3482 3481
3483 q->max_sectors = max_sectors_kb << 1; 3482 q->max_sectors = max_sectors_kb << 1;
3484 spin_unlock_irq(q->queue_lock); 3483 spin_unlock_irq(q->queue_lock);
3485 3484
3486 return ret; 3485 return ret;
3487 } 3486 }
3488 3487
3489 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 3488 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
3490 { 3489 {
3491 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 3490 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
3492 3491
3493 return queue_var_show(max_hw_sectors_kb, (page)); 3492 return queue_var_show(max_hw_sectors_kb, (page));
3494 } 3493 }
3495 3494
3496 3495
3497 static struct queue_sysfs_entry queue_requests_entry = { 3496 static struct queue_sysfs_entry queue_requests_entry = {
3498 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 3497 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
3499 .show = queue_requests_show, 3498 .show = queue_requests_show,
3500 .store = queue_requests_store, 3499 .store = queue_requests_store,
3501 }; 3500 };
3502 3501
3503 static struct queue_sysfs_entry queue_ra_entry = { 3502 static struct queue_sysfs_entry queue_ra_entry = {
3504 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 3503 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
3505 .show = queue_ra_show, 3504 .show = queue_ra_show,
3506 .store = queue_ra_store, 3505 .store = queue_ra_store,
3507 }; 3506 };
3508 3507
3509 static struct queue_sysfs_entry queue_max_sectors_entry = { 3508 static struct queue_sysfs_entry queue_max_sectors_entry = {
3510 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 3509 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
3511 .show = queue_max_sectors_show, 3510 .show = queue_max_sectors_show,
3512 .store = queue_max_sectors_store, 3511 .store = queue_max_sectors_store,
3513 }; 3512 };
3514 3513
3515 static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 3514 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
3516 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 3515 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
3517 .show = queue_max_hw_sectors_show, 3516 .show = queue_max_hw_sectors_show,
3518 }; 3517 };
3519 3518
3520 static struct queue_sysfs_entry queue_iosched_entry = { 3519 static struct queue_sysfs_entry queue_iosched_entry = {
3521 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 3520 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
3522 .show = elv_iosched_show, 3521 .show = elv_iosched_show,
3523 .store = elv_iosched_store, 3522 .store = elv_iosched_store,
3524 }; 3523 };
3525 3524
3526 static struct attribute *default_attrs[] = { 3525 static struct attribute *default_attrs[] = {
3527 &queue_requests_entry.attr, 3526 &queue_requests_entry.attr,
3528 &queue_ra_entry.attr, 3527 &queue_ra_entry.attr,
3529 &queue_max_hw_sectors_entry.attr, 3528 &queue_max_hw_sectors_entry.attr,
3530 &queue_max_sectors_entry.attr, 3529 &queue_max_sectors_entry.attr,
3531 &queue_iosched_entry.attr, 3530 &queue_iosched_entry.attr,
3532 NULL, 3531 NULL,
3533 }; 3532 };
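As an illustrative sketch, not part of this patch, a new attribute would follow the same pattern: a show (and optionally store) helper plus a queue_sysfs_entry, and an extra pointer in default_attrs[] so it appears under /sys/block/<dev>/queue/. The "hw_sector_size" attribute below is hypothetical here:

/* hypothetical read-only attribute exposing the hardware sector size */
static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
{
	return queue_var_show(q->hardsect_size, page);
}

static struct queue_sysfs_entry queue_hw_sector_size_entry = {
	.attr	= {.name = "hw_sector_size", .mode = S_IRUGO },
	.show	= queue_hw_sector_size_show,
};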
3534 3533
3535 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 3534 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
3536 3535
3537 static ssize_t 3536 static ssize_t
3538 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3537 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3539 { 3538 {
3540 struct queue_sysfs_entry *entry = to_queue(attr); 3539 struct queue_sysfs_entry *entry = to_queue(attr);
3541 struct request_queue *q; 3540 struct request_queue *q;
3542 3541
3543 q = container_of(kobj, struct request_queue, kobj); 3542 q = container_of(kobj, struct request_queue, kobj);
3544 if (!entry->show) 3543 if (!entry->show)
3545 return -EIO; 3544 return -EIO;
3546 3545
3547 return entry->show(q, page); 3546 return entry->show(q, page);
3548 } 3547 }
3549 3548
3550 static ssize_t 3549 static ssize_t
3551 queue_attr_store(struct kobject *kobj, struct attribute *attr, 3550 queue_attr_store(struct kobject *kobj, struct attribute *attr,
3552 const char *page, size_t length) 3551 const char *page, size_t length)
3553 { 3552 {
3554 struct queue_sysfs_entry *entry = to_queue(attr); 3553 struct queue_sysfs_entry *entry = to_queue(attr);
3555 struct request_queue *q; 3554 struct request_queue *q;
3556 3555
3557 q = container_of(kobj, struct request_queue, kobj); 3556 q = container_of(kobj, struct request_queue, kobj);
3558 if (!entry->store) 3557 if (!entry->store)
3559 return -EIO; 3558 return -EIO;
3560 3559
3561 return entry->store(q, page, length); 3560 return entry->store(q, page, length);
3562 } 3561 }
3563 3562
3564 static struct sysfs_ops queue_sysfs_ops = { 3563 static struct sysfs_ops queue_sysfs_ops = {
3565 .show = queue_attr_show, 3564 .show = queue_attr_show,
3566 .store = queue_attr_store, 3565 .store = queue_attr_store,
3567 }; 3566 };
3568 3567
3569 static struct kobj_type queue_ktype = { 3568 static struct kobj_type queue_ktype = {
3570 .sysfs_ops = &queue_sysfs_ops, 3569 .sysfs_ops = &queue_sysfs_ops,
3571 .default_attrs = default_attrs, 3570 .default_attrs = default_attrs,
3572 }; 3571 };
3573 3572
3574 int blk_register_queue(struct gendisk *disk) 3573 int blk_register_queue(struct gendisk *disk)
3575 { 3574 {
3576 int ret; 3575 int ret;
3577 3576
3578 request_queue_t *q = disk->queue; 3577 request_queue_t *q = disk->queue;
3579 3578
3580 if (!q || !q->request_fn) 3579 if (!q || !q->request_fn)
3581 return -ENXIO; 3580 return -ENXIO;
3582 3581
3583 q->kobj.parent = kobject_get(&disk->kobj); 3582 q->kobj.parent = kobject_get(&disk->kobj);
3584 if (!q->kobj.parent) 3583 if (!q->kobj.parent)
3585 return -EBUSY; 3584 return -EBUSY;
3586 3585
3587 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); 3586 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
3588 q->kobj.ktype = &queue_ktype; 3587 q->kobj.ktype = &queue_ktype;
3589 3588
3590 ret = kobject_register(&q->kobj); 3589 ret = kobject_register(&q->kobj);
3591 if (ret < 0) 3590 if (ret < 0)
3592 return ret; 3591 return ret;
3593 3592
3594 ret = elv_register_queue(q); 3593 ret = elv_register_queue(q);
3595 if (ret) { 3594 if (ret) {
3596 kobject_unregister(&q->kobj); 3595 kobject_unregister(&q->kobj);
3597 return ret; 3596 return ret;
3598 } 3597 }
3599 3598
3600 return 0; 3599 return 0;
3601 } 3600 }
3602 3601
3603 void blk_unregister_queue(struct gendisk *disk) 3602 void blk_unregister_queue(struct gendisk *disk)
3604 { 3603 {
3605 request_queue_t *q = disk->queue; 3604 request_queue_t *q = disk->queue;
3606 3605
3607 if (q && q->request_fn) { 3606 if (q && q->request_fn) {
3608 elv_unregister_queue(q); 3607 elv_unregister_queue(q);
3609 3608
3610 kobject_unregister(&q->kobj); 3609 kobject_unregister(&q->kobj);
3611 kobject_put(&disk->kobj); 3610 kobject_put(&disk->kobj);
3612 } 3611 }
3613 } 3612 }
3614 3613
fs/fs-writeback.c
1 /* 1 /*
2 * fs/fs-writeback.c 2 * fs/fs-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * Contains all the functions related to writing back and waiting 6 * Contains all the functions related to writing back and waiting
7 * upon dirty inodes against superblocks, and writing back dirty 7 * upon dirty inodes against superblocks, and writing back dirty
8 * pages against inodes. ie: data writeback. Writeout of the 8 * pages against inodes. ie: data writeback. Writeout of the
9 * inode itself is not handled here. 9 * inode itself is not handled here.
10 * 10 *
11 * 10Apr2002 akpm@zip.com.au 11 * 10Apr2002 akpm@zip.com.au
12 * Split out of fs/inode.c 12 * Split out of fs/inode.c
13 * Additions for address_space-based writeback 13 * Additions for address_space-based writeback
14 */ 14 */
15 15
16 #include <linux/kernel.h> 16 #include <linux/kernel.h>
17 #include <linux/spinlock.h> 17 #include <linux/spinlock.h>
18 #include <linux/sched.h> 18 #include <linux/sched.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/writeback.h> 21 #include <linux/writeback.h>
22 #include <linux/blkdev.h> 22 #include <linux/blkdev.h>
23 #include <linux/backing-dev.h> 23 #include <linux/backing-dev.h>
24 #include <linux/buffer_head.h> 24 #include <linux/buffer_head.h>
25 25
26 extern struct super_block *blockdev_superblock; 26 extern struct super_block *blockdev_superblock;
27 27
28 /** 28 /**
29 * __mark_inode_dirty - internal function 29 * __mark_inode_dirty - internal function
30 * @inode: inode to mark 30 * @inode: inode to mark
31 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) 31 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
32 * Mark an inode as dirty. Callers should use mark_inode_dirty or 32 * Mark an inode as dirty. Callers should use mark_inode_dirty or
33 * mark_inode_dirty_sync. 33 * mark_inode_dirty_sync.
34 * 34 *
35 * Put the inode on the super block's dirty list. 35 * Put the inode on the super block's dirty list.
36 * 36 *
37 * CAREFUL! We mark it dirty unconditionally, but move it onto the 37 * CAREFUL! We mark it dirty unconditionally, but move it onto the
38 * dirty list only if it is hashed or if it refers to a blockdev. 38 * dirty list only if it is hashed or if it refers to a blockdev.
39 * If it was not hashed, it will never be added to the dirty list 39 * If it was not hashed, it will never be added to the dirty list
40 * even if it is later hashed, as it will have been marked dirty already. 40 * even if it is later hashed, as it will have been marked dirty already.
41 * 41 *
42 * In short, make sure you hash any inodes _before_ you start marking 42 * In short, make sure you hash any inodes _before_ you start marking
43 * them dirty. 43 * them dirty.
44 * 44 *
45 * This function *must* be atomic for the I_DIRTY_PAGES case - 45 * This function *must* be atomic for the I_DIRTY_PAGES case -
46 * set_page_dirty() is called under spinlock in several places. 46 * set_page_dirty() is called under spinlock in several places.
47 * 47 *
48 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 48 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
49 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 49 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
50 * the kernel-internal blockdev inode represents the dirtying time of the 50 * the kernel-internal blockdev inode represents the dirtying time of the
51 * blockdev's pages. This is why for I_DIRTY_PAGES we always use 51 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
52 * page->mapping->host, so the page-dirtying time is recorded in the internal 52 * page->mapping->host, so the page-dirtying time is recorded in the internal
53 * blockdev inode. 53 * blockdev inode.
54 */ 54 */
55 void __mark_inode_dirty(struct inode *inode, int flags) 55 void __mark_inode_dirty(struct inode *inode, int flags)
56 { 56 {
57 struct super_block *sb = inode->i_sb; 57 struct super_block *sb = inode->i_sb;
58 58
59 /* 59 /*
60 * Don't do this for I_DIRTY_PAGES - that doesn't actually 60 * Don't do this for I_DIRTY_PAGES - that doesn't actually
61 * dirty the inode itself 61 * dirty the inode itself
62 */ 62 */
63 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 63 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
64 if (sb->s_op->dirty_inode) 64 if (sb->s_op->dirty_inode)
65 sb->s_op->dirty_inode(inode); 65 sb->s_op->dirty_inode(inode);
66 } 66 }
67 67
68 /* 68 /*
69 * make sure that changes are seen by all cpus before we test i_state 69 * make sure that changes are seen by all cpus before we test i_state
70 * -- mikulas 70 * -- mikulas
71 */ 71 */
72 smp_mb(); 72 smp_mb();
73 73
74 /* avoid the locking if we can */ 74 /* avoid the locking if we can */
75 if ((inode->i_state & flags) == flags) 75 if ((inode->i_state & flags) == flags)
76 return; 76 return;
77 77
78 if (unlikely(block_dump)) { 78 if (unlikely(block_dump)) {
79 struct dentry *dentry = NULL; 79 struct dentry *dentry = NULL;
80 const char *name = "?"; 80 const char *name = "?";
81 81
82 if (!list_empty(&inode->i_dentry)) { 82 if (!list_empty(&inode->i_dentry)) {
83 dentry = list_entry(inode->i_dentry.next, 83 dentry = list_entry(inode->i_dentry.next,
84 struct dentry, d_alias); 84 struct dentry, d_alias);
85 if (dentry && dentry->d_name.name) 85 if (dentry && dentry->d_name.name)
86 name = (const char *) dentry->d_name.name; 86 name = (const char *) dentry->d_name.name;
87 } 87 }
88 88
89 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) 89 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
90 printk(KERN_DEBUG 90 printk(KERN_DEBUG
91 "%s(%d): dirtied inode %lu (%s) on %s\n", 91 "%s(%d): dirtied inode %lu (%s) on %s\n",
92 current->comm, current->pid, inode->i_ino, 92 current->comm, current->pid, inode->i_ino,
93 name, inode->i_sb->s_id); 93 name, inode->i_sb->s_id);
94 } 94 }
95 95
96 spin_lock(&inode_lock); 96 spin_lock(&inode_lock);
97 if ((inode->i_state & flags) != flags) { 97 if ((inode->i_state & flags) != flags) {
98 const int was_dirty = inode->i_state & I_DIRTY; 98 const int was_dirty = inode->i_state & I_DIRTY;
99 99
100 inode->i_state |= flags; 100 inode->i_state |= flags;
101 101
102 /* 102 /*
103 * If the inode is locked, just update its dirty state. 103 * If the inode is locked, just update its dirty state.
104 * The unlocker will place the inode on the appropriate 104 * The unlocker will place the inode on the appropriate
105 * superblock list, based upon its state. 105 * superblock list, based upon its state.
106 */ 106 */
107 if (inode->i_state & I_LOCK) 107 if (inode->i_state & I_LOCK)
108 goto out; 108 goto out;
109 109
110 /* 110 /*
111 * Only add valid (hashed) inodes to the superblock's 111 * Only add valid (hashed) inodes to the superblock's
112 * dirty list. Add blockdev inodes as well. 112 * dirty list. Add blockdev inodes as well.
113 */ 113 */
114 if (!S_ISBLK(inode->i_mode)) { 114 if (!S_ISBLK(inode->i_mode)) {
115 if (hlist_unhashed(&inode->i_hash)) 115 if (hlist_unhashed(&inode->i_hash))
116 goto out; 116 goto out;
117 } 117 }
118 if (inode->i_state & (I_FREEING|I_CLEAR)) 118 if (inode->i_state & (I_FREEING|I_CLEAR))
119 goto out; 119 goto out;
120 120
121 /* 121 /*
122 * If the inode was already on s_dirty or s_io, don't 122 * If the inode was already on s_dirty or s_io, don't
123 * reposition it (that would break s_dirty time-ordering). 123 * reposition it (that would break s_dirty time-ordering).
124 */ 124 */
125 if (!was_dirty) { 125 if (!was_dirty) {
126 inode->dirtied_when = jiffies; 126 inode->dirtied_when = jiffies;
127 list_move(&inode->i_list, &sb->s_dirty); 127 list_move(&inode->i_list, &sb->s_dirty);
128 } 128 }
129 } 129 }
130 out: 130 out:
131 spin_unlock(&inode_lock); 131 spin_unlock(&inode_lock);
132 } 132 }
133 133
134 EXPORT_SYMBOL(__mark_inode_dirty); 134 EXPORT_SYMBOL(__mark_inode_dirty);
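For illustration only, and not part of this patch: filesystems normally reach this function through the mark_inode_dirty()/mark_inode_dirty_sync() wrappers after updating an already-hashed inode, as the comment above advises. my_touch_inode() is a hypothetical helper:

#include <linux/fs.h>

/* hypothetical: update timestamps on an already-hashed inode */
static void my_touch_inode(struct inode *inode)
{
	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	mark_inode_dirty(inode);	/* ends up in __mark_inode_dirty(inode, I_DIRTY) */
}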
135 135
136 static int write_inode(struct inode *inode, int sync) 136 static int write_inode(struct inode *inode, int sync)
137 { 137 {
138 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 138 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
139 return inode->i_sb->s_op->write_inode(inode, sync); 139 return inode->i_sb->s_op->write_inode(inode, sync);
140 return 0; 140 return 0;
141 } 141 }
142 142
143 /* 143 /*
144 * Write a single inode's dirty pages and inode data out to disk. 144 * Write a single inode's dirty pages and inode data out to disk.
145 * If `wait' is set, wait on the writeout. 145 * If `wait' is set, wait on the writeout.
146 * 146 *
147 * The whole writeout design is quite complex and fragile. We want to avoid 147 * The whole writeout design is quite complex and fragile. We want to avoid
148 * starvation of particular inodes when others are being redirtied, prevent 148 * starvation of particular inodes when others are being redirtied, prevent
149 * livelocks, etc. 149 * livelocks, etc.
150 * 150 *
151 * Called under inode_lock. 151 * Called under inode_lock.
152 */ 152 */
153 static int 153 static int
154 __sync_single_inode(struct inode *inode, struct writeback_control *wbc) 154 __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
155 { 155 {
156 unsigned dirty; 156 unsigned dirty;
157 struct address_space *mapping = inode->i_mapping; 157 struct address_space *mapping = inode->i_mapping;
158 struct super_block *sb = inode->i_sb; 158 struct super_block *sb = inode->i_sb;
159 int wait = wbc->sync_mode == WB_SYNC_ALL; 159 int wait = wbc->sync_mode == WB_SYNC_ALL;
160 int ret; 160 int ret;
161 161
162 BUG_ON(inode->i_state & I_LOCK); 162 BUG_ON(inode->i_state & I_LOCK);
163 163
164 /* Set I_LOCK, reset I_DIRTY */ 164 /* Set I_LOCK, reset I_DIRTY */
165 dirty = inode->i_state & I_DIRTY; 165 dirty = inode->i_state & I_DIRTY;
166 inode->i_state |= I_LOCK; 166 inode->i_state |= I_LOCK;
167 inode->i_state &= ~I_DIRTY; 167 inode->i_state &= ~I_DIRTY;
168 168
169 spin_unlock(&inode_lock); 169 spin_unlock(&inode_lock);
170 170
171 ret = do_writepages(mapping, wbc); 171 ret = do_writepages(mapping, wbc);
172 172
173 /* Don't write the inode if only I_DIRTY_PAGES was set */ 173 /* Don't write the inode if only I_DIRTY_PAGES was set */
174 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 174 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
175 int err = write_inode(inode, wait); 175 int err = write_inode(inode, wait);
176 if (ret == 0) 176 if (ret == 0)
177 ret = err; 177 ret = err;
178 } 178 }
179 179
180 if (wait) { 180 if (wait) {
181 int err = filemap_fdatawait(mapping); 181 int err = filemap_fdatawait(mapping);
182 if (ret == 0) 182 if (ret == 0)
183 ret = err; 183 ret = err;
184 } 184 }
185 185
186 spin_lock(&inode_lock); 186 spin_lock(&inode_lock);
187 inode->i_state &= ~I_LOCK; 187 inode->i_state &= ~I_LOCK;
188 if (!(inode->i_state & I_FREEING)) { 188 if (!(inode->i_state & I_FREEING)) {
189 if (!(inode->i_state & I_DIRTY) && 189 if (!(inode->i_state & I_DIRTY) &&
190 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 190 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
191 /* 191 /*
192 * We didn't write back all the pages. nfs_writepages() 192 * We didn't write back all the pages. nfs_writepages()
193 * sometimes bales out without doing anything. Redirty 193 * sometimes bales out without doing anything. Redirty
194 * the inode. It is still on sb->s_io. 194 * the inode. It is still on sb->s_io.
195 */ 195 */
196 if (wbc->for_kupdate) { 196 if (wbc->for_kupdate) {
197 /* 197 /*
198 * For the kupdate function we leave the inode 198 * For the kupdate function we leave the inode
199 * at the head of sb_dirty so it will get more 199 * at the head of sb_dirty so it will get more
200 * writeout as soon as the queue becomes 200 * writeout as soon as the queue becomes
201 * uncongested. 201 * uncongested.
202 */ 202 */
203 inode->i_state |= I_DIRTY_PAGES; 203 inode->i_state |= I_DIRTY_PAGES;
204 list_move_tail(&inode->i_list, &sb->s_dirty); 204 list_move_tail(&inode->i_list, &sb->s_dirty);
205 } else { 205 } else {
206 /* 206 /*
207 * Otherwise fully redirty the inode so that 207 * Otherwise fully redirty the inode so that
208 * other inodes on this superblock will get some 208 * other inodes on this superblock will get some
209 * writeout. Otherwise heavy writing to one 209 * writeout. Otherwise heavy writing to one
210 * file would indefinitely suspend writeout of 210 * file would indefinitely suspend writeout of
211 * all the other files. 211 * all the other files.
212 */ 212 */
213 inode->i_state |= I_DIRTY_PAGES; 213 inode->i_state |= I_DIRTY_PAGES;
214 inode->dirtied_when = jiffies; 214 inode->dirtied_when = jiffies;
215 list_move(&inode->i_list, &sb->s_dirty); 215 list_move(&inode->i_list, &sb->s_dirty);
216 } 216 }
217 } else if (inode->i_state & I_DIRTY) { 217 } else if (inode->i_state & I_DIRTY) {
218 /* 218 /*
219 * Someone redirtied the inode while we were writing back 219 * Someone redirtied the inode while we were writing back
220 * the pages. 220 * the pages.
221 */ 221 */
222 list_move(&inode->i_list, &sb->s_dirty); 222 list_move(&inode->i_list, &sb->s_dirty);
223 } else if (atomic_read(&inode->i_count)) { 223 } else if (atomic_read(&inode->i_count)) {
224 /* 224 /*
225 * The inode is clean, inuse 225 * The inode is clean, inuse
226 */ 226 */
227 list_move(&inode->i_list, &inode_in_use); 227 list_move(&inode->i_list, &inode_in_use);
228 } else { 228 } else {
229 /* 229 /*
230 * The inode is clean, unused 230 * The inode is clean, unused
231 */ 231 */
232 list_move(&inode->i_list, &inode_unused); 232 list_move(&inode->i_list, &inode_unused);
233 } 233 }
234 } 234 }
235 wake_up_inode(inode); 235 wake_up_inode(inode);
236 return ret; 236 return ret;
237 } 237 }
238 238
239 /* 239 /*
240 * Write out an inode's dirty pages. Called under inode_lock. Either the 240 * Write out an inode's dirty pages. Called under inode_lock. Either the
241 * caller has ref on the inode (either via __iget or via syscall against an fd) 241 * caller has ref on the inode (either via __iget or via syscall against an fd)
242 * or the inode has I_WILL_FREE set (via generic_forget_inode) 242 * or the inode has I_WILL_FREE set (via generic_forget_inode)
243 */ 243 */
244 static int 244 static int
245 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 245 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
246 { 246 {
247 wait_queue_head_t *wqh; 247 wait_queue_head_t *wqh;
248 248
249 if (!atomic_read(&inode->i_count)) 249 if (!atomic_read(&inode->i_count))
250 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 250 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
251 else 251 else
252 WARN_ON(inode->i_state & I_WILL_FREE); 252 WARN_ON(inode->i_state & I_WILL_FREE);
253 253
254 if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) { 254 if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
255 list_move(&inode->i_list, &inode->i_sb->s_dirty); 255 list_move(&inode->i_list, &inode->i_sb->s_dirty);
256 return 0; 256 return 0;
257 } 257 }
258 258
259 /* 259 /*
260 * It's a data-integrity sync. We must wait. 260 * It's a data-integrity sync. We must wait.
261 */ 261 */
262 if (inode->i_state & I_LOCK) { 262 if (inode->i_state & I_LOCK) {
263 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK); 263 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK);
264 264
265 wqh = bit_waitqueue(&inode->i_state, __I_LOCK); 265 wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
266 do { 266 do {
267 spin_unlock(&inode_lock); 267 spin_unlock(&inode_lock);
268 __wait_on_bit(wqh, &wq, inode_wait, 268 __wait_on_bit(wqh, &wq, inode_wait,
269 TASK_UNINTERRUPTIBLE); 269 TASK_UNINTERRUPTIBLE);
270 spin_lock(&inode_lock); 270 spin_lock(&inode_lock);
271 } while (inode->i_state & I_LOCK); 271 } while (inode->i_state & I_LOCK);
272 } 272 }
273 return __sync_single_inode(inode, wbc); 273 return __sync_single_inode(inode, wbc);
274 } 274 }
275 275
276 /* 276 /*
277 * Write out a superblock's list of dirty inodes. A wait will be performed 277 * Write out a superblock's list of dirty inodes. A wait will be performed
278 * upon no inodes, all inodes or the final one, depending upon sync_mode. 278 * upon no inodes, all inodes or the final one, depending upon sync_mode.
279 * 279 *
280 * If older_than_this is non-NULL, then only write out inodes which 280 * If older_than_this is non-NULL, then only write out inodes which
281 * had their first dirtying at a time earlier than *older_than_this. 281 * had their first dirtying at a time earlier than *older_than_this.
282 * 282 *
283 * If we're a pdflush thread, then implement pdflush collision avoidance 283 * If we're a pdflush thread, then implement pdflush collision avoidance
284 * against the entire list. 284 * against the entire list.
285 * 285 *
286 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so 286 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
287 * that it can be located for waiting on in __writeback_single_inode(). 287 * that it can be located for waiting on in __writeback_single_inode().
288 * 288 *
289 * Called under inode_lock. 289 * Called under inode_lock.
290 * 290 *
291 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 291 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
292 * This function assumes that the blockdev superblock's inodes are backed by 292 * This function assumes that the blockdev superblock's inodes are backed by
293 * a variety of queues, so all inodes are searched. For other superblocks, 293 * a variety of queues, so all inodes are searched. For other superblocks,
294 * assume that all inodes are backed by the same queue. 294 * assume that all inodes are backed by the same queue.
295 * 295 *
296 * FIXME: this linear search could get expensive with many filesystems. But 296 * FIXME: this linear search could get expensive with many filesystems. But
297 * how to fix? We need to go from an address_space to all inodes which share 297 * how to fix? We need to go from an address_space to all inodes which share
298 * a queue with that address_space. (Easy: have a global "dirty superblocks" 298 * a queue with that address_space. (Easy: have a global "dirty superblocks"
299 * list). 299 * list).
300 * 300 *
301 * The inodes to be written are parked on sb->s_io. They are moved back onto 301 * The inodes to be written are parked on sb->s_io. They are moved back onto
302 * sb->s_dirty as they are selected for writing. This way, none can be missed 302 * sb->s_dirty as they are selected for writing. This way, none can be missed
303 * on the writer throttling path, and we get decent balancing between many 303 * on the writer throttling path, and we get decent balancing between many
304 * throttled threads: we don't want them all piling up on __wait_on_inode. 304 * throttled threads: we don't want them all piling up on __wait_on_inode.
305 */ 305 */
306 static void 306 static void
307 sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) 307 sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
308 { 308 {
309 const unsigned long start = jiffies; /* livelock avoidance */ 309 const unsigned long start = jiffies; /* livelock avoidance */
310 310
311 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 311 if (!wbc->for_kupdate || list_empty(&sb->s_io))
312 list_splice_init(&sb->s_dirty, &sb->s_io); 312 list_splice_init(&sb->s_dirty, &sb->s_io);
313 313
314 while (!list_empty(&sb->s_io)) { 314 while (!list_empty(&sb->s_io)) {
315 struct inode *inode = list_entry(sb->s_io.prev, 315 struct inode *inode = list_entry(sb->s_io.prev,
316 struct inode, i_list); 316 struct inode, i_list);
317 struct address_space *mapping = inode->i_mapping; 317 struct address_space *mapping = inode->i_mapping;
318 struct backing_dev_info *bdi = mapping->backing_dev_info; 318 struct backing_dev_info *bdi = mapping->backing_dev_info;
319 long pages_skipped; 319 long pages_skipped;
320 320
321 if (!bdi_cap_writeback_dirty(bdi)) { 321 if (!bdi_cap_writeback_dirty(bdi)) {
322 list_move(&inode->i_list, &sb->s_dirty); 322 list_move(&inode->i_list, &sb->s_dirty);
323 if (sb == blockdev_superblock) { 323 if (sb == blockdev_superblock) {
324 /* 324 /*
325 * Dirty memory-backed blockdev: the ramdisk 325 * Dirty memory-backed blockdev: the ramdisk
326 * driver does this. Skip just this inode 326 * driver does this. Skip just this inode
327 */ 327 */
328 continue; 328 continue;
329 } 329 }
330 /* 330 /*
331 * Dirty memory-backed inode against a filesystem other 331 * Dirty memory-backed inode against a filesystem other
332 * than the kernel-internal bdev filesystem. Skip the 332 * than the kernel-internal bdev filesystem. Skip the
333 * entire superblock. 333 * entire superblock.
334 */ 334 */
335 break; 335 break;
336 } 336 }
337 337
338 if (wbc->nonblocking && bdi_write_congested(bdi)) { 338 if (wbc->nonblocking && bdi_write_congested(bdi)) {
339 wbc->encountered_congestion = 1; 339 wbc->encountered_congestion = 1;
340 if (sb != blockdev_superblock) 340 if (sb != blockdev_superblock)
341 break; /* Skip a congested fs */ 341 break; /* Skip a congested fs */
342 list_move(&inode->i_list, &sb->s_dirty); 342 list_move(&inode->i_list, &sb->s_dirty);
343 continue; /* Skip a congested blockdev */ 343 continue; /* Skip a congested blockdev */
344 } 344 }
345 345
346 if (wbc->bdi && bdi != wbc->bdi) { 346 if (wbc->bdi && bdi != wbc->bdi) {
347 if (sb != blockdev_superblock) 347 if (sb != blockdev_superblock)
348 break; /* fs has the wrong queue */ 348 break; /* fs has the wrong queue */
349 list_move(&inode->i_list, &sb->s_dirty); 349 list_move(&inode->i_list, &sb->s_dirty);
350 continue; /* blockdev has wrong queue */ 350 continue; /* blockdev has wrong queue */
351 } 351 }
352 352
353 /* Was this inode dirtied after sync_sb_inodes was called? */ 353 /* Was this inode dirtied after sync_sb_inodes was called? */
354 if (time_after(inode->dirtied_when, start)) 354 if (time_after(inode->dirtied_when, start))
355 break; 355 break;
356 356
357 /* Was this inode dirtied too recently? */ 357 /* Was this inode dirtied too recently? */
358 if (wbc->older_than_this && time_after(inode->dirtied_when, 358 if (wbc->older_than_this && time_after(inode->dirtied_when,
359 *wbc->older_than_this)) 359 *wbc->older_than_this))
360 break; 360 break;
361 361
362 /* Is another pdflush already flushing this queue? */ 362 /* Is another pdflush already flushing this queue? */
363 if (current_is_pdflush() && !writeback_acquire(bdi)) 363 if (current_is_pdflush() && !writeback_acquire(bdi))
364 break; 364 break;
365 365
366 BUG_ON(inode->i_state & I_FREEING); 366 BUG_ON(inode->i_state & I_FREEING);
367 __iget(inode); 367 __iget(inode);
368 pages_skipped = wbc->pages_skipped; 368 pages_skipped = wbc->pages_skipped;
369 __writeback_single_inode(inode, wbc); 369 __writeback_single_inode(inode, wbc);
370 if (wbc->sync_mode == WB_SYNC_HOLD) { 370 if (wbc->sync_mode == WB_SYNC_HOLD) {
371 inode->dirtied_when = jiffies; 371 inode->dirtied_when = jiffies;
372 list_move(&inode->i_list, &sb->s_dirty); 372 list_move(&inode->i_list, &sb->s_dirty);
373 } 373 }
374 if (current_is_pdflush()) 374 if (current_is_pdflush())
375 writeback_release(bdi); 375 writeback_release(bdi);
376 if (wbc->pages_skipped != pages_skipped) { 376 if (wbc->pages_skipped != pages_skipped) {
377 /* 377 /*
378 * writeback is not making progress due to locked 378 * writeback is not making progress due to locked
379 * buffers. Skip this inode for now. 379 * buffers. Skip this inode for now.
380 */ 380 */
381 list_move(&inode->i_list, &sb->s_dirty); 381 list_move(&inode->i_list, &sb->s_dirty);
382 } 382 }
383 spin_unlock(&inode_lock); 383 spin_unlock(&inode_lock);
384 cond_resched(); 384 cond_resched();
385 iput(inode); 385 iput(inode);
386 spin_lock(&inode_lock); 386 spin_lock(&inode_lock);
387 if (wbc->nr_to_write <= 0) 387 if (wbc->nr_to_write <= 0)
388 break; 388 break;
389 } 389 }
390 return; /* Leave any unwritten inodes on s_io */ 390 return; /* Leave any unwritten inodes on s_io */
391 } 391 }
392 392
393 /* 393 /*
394 * Start writeback of dirty pagecache data against all unlocked inodes. 394 * Start writeback of dirty pagecache data against all unlocked inodes.
395 * 395 *
396 * Note: 396 * Note:
397 * We don't need to grab a reference to superblock here. If it has non-empty 397 * We don't need to grab a reference to superblock here. If it has non-empty
398 * ->s_dirty it hasn't been killed yet and kill_super() won't proceed 398 * ->s_dirty it hasn't been killed yet and kill_super() won't proceed
399 * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are 399 * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
400 * empty. Since __sync_single_inode() regains inode_lock before it finally moves 400 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
401 * the inode from the superblock lists, we are OK. 401 * the inode from the superblock lists, we are OK.
402 * 402 *
403 * If `older_than_this' is non-zero then only flush inodes which have a 403 * If `older_than_this' is non-zero then only flush inodes which have a
404 * flushtime older than *older_than_this. 404 * flushtime older than *older_than_this.
405 * 405 *
406 * If `bdi' is non-zero then we will scan the first inode against each 406 * If `bdi' is non-zero then we will scan the first inode against each
407 * superblock until we find the matching ones. One group will be the dirty 407 * superblock until we find the matching ones. One group will be the dirty
408 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 408 * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
409 * sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not 409 * sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not
410 * super-efficient but we're about to do a ton of I/O... 410 * super-efficient but we're about to do a ton of I/O...
411 */ 411 */
412 void 412 void
413 writeback_inodes(struct writeback_control *wbc) 413 writeback_inodes(struct writeback_control *wbc)
414 { 414 {
415 struct super_block *sb; 415 struct super_block *sb;
416 416
417 might_sleep(); 417 might_sleep();
418 spin_lock(&sb_lock); 418 spin_lock(&sb_lock);
419 restart: 419 restart:
420 sb = sb_entry(super_blocks.prev); 420 sb = sb_entry(super_blocks.prev);
421 for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { 421 for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
422 if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) { 422 if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
423 /* we're making our own get_super here */ 423 /* we're making our own get_super here */
424 sb->s_count++; 424 sb->s_count++;
425 spin_unlock(&sb_lock); 425 spin_unlock(&sb_lock);
426 /* 426 /*
427 * If we can't get the readlock, there's no sense in 427 * If we can't get the readlock, there's no sense in
428 * waiting around, most of the time the FS is going to 428 * waiting around, most of the time the FS is going to
429 * be unmounted by the time it is released. 429 * be unmounted by the time it is released.
430 */ 430 */
431 if (down_read_trylock(&sb->s_umount)) { 431 if (down_read_trylock(&sb->s_umount)) {
432 if (sb->s_root) { 432 if (sb->s_root) {
433 spin_lock(&inode_lock); 433 spin_lock(&inode_lock);
434 sync_sb_inodes(sb, wbc); 434 sync_sb_inodes(sb, wbc);
435 spin_unlock(&inode_lock); 435 spin_unlock(&inode_lock);
436 } 436 }
437 up_read(&sb->s_umount); 437 up_read(&sb->s_umount);
438 } 438 }
439 spin_lock(&sb_lock); 439 spin_lock(&sb_lock);
440 if (__put_super_and_need_restart(sb)) 440 if (__put_super_and_need_restart(sb))
441 goto restart; 441 goto restart;
442 } 442 }
443 if (wbc->nr_to_write <= 0) 443 if (wbc->nr_to_write <= 0)
444 break; 444 break;
445 } 445 }
446 spin_unlock(&sb_lock); 446 spin_unlock(&sb_lock);
447 } 447 }
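[Editor's note] The comments above describe writeback_inodes() from the inside; for orientation, a background-writeout caller would drive it roughly as sketched below. This is a hedged illustration only: the function name, the thirty-second expiry interval and the field values are invented for the example, and WB_SYNC_NONE is assumed to be the non-waiting sync mode used by such callers.

#include <linux/writeback.h>
#include <linux/jiffies.h>

/*
 * Illustrative only: push out inodes that have been dirty for more than
 * thirty seconds, without blocking on congested queues.
 */
static void example_background_writeout(long nr_pages)
{
        unsigned long oldest_jif = jiffies - 30 * HZ;
        struct writeback_control wbc = {
                .bdi             = NULL,          /* no single queue: scan them all */
                .sync_mode       = WB_SYNC_NONE,  /* don't wait on I/O completion */
                .older_than_this = &oldest_jif,   /* kupdate-style aging */
                .nr_to_write     = nr_pages,
                .nonblocking     = 1,             /* skip congested queues */
                .for_kupdate     = 1,             /* reuse any inodes parked on s_io */
        };

        writeback_inodes(&wbc);
        if (wbc.encountered_congestion) {
                /* a real caller would back off and retry later */
        }
}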
448 448
449 /* 449 /*
450 * writeback and wait upon the filesystem's dirty inodes. The caller will 450 * writeback and wait upon the filesystem's dirty inodes. The caller will
451 * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is 451 * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
452 * used to park the written inodes on sb->s_dirty for the wait pass. 452 * used to park the written inodes on sb->s_dirty for the wait pass.
453 * 453 *
454 * A finite limit is set on the number of pages which will be written, 454 * A finite limit is set on the number of pages which will be written,
455 * to prevent infinite livelock of sys_sync(). 455 * to prevent infinite livelock of sys_sync().
456 * 456 *
457 * We add in the number of potentially dirty inodes, because each inode write 457 * We add in the number of potentially dirty inodes, because each inode write
458 * can dirty pagecache in the underlying blockdev. 458 * can dirty pagecache in the underlying blockdev.
459 */ 459 */
460 void sync_inodes_sb(struct super_block *sb, int wait) 460 void sync_inodes_sb(struct super_block *sb, int wait)
461 { 461 {
462 struct writeback_control wbc = { 462 struct writeback_control wbc = {
463 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, 463 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
464 }; 464 };
465 unsigned long nr_dirty = read_page_state(nr_dirty); 465 unsigned long nr_dirty = read_page_state(nr_dirty);
466 unsigned long nr_unstable = read_page_state(nr_unstable); 466 unsigned long nr_unstable = read_page_state(nr_unstable);
467 467
468 wbc.nr_to_write = nr_dirty + nr_unstable + 468 wbc.nr_to_write = nr_dirty + nr_unstable +
469 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 469 (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
470 nr_dirty + nr_unstable; 470 nr_dirty + nr_unstable;
471 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ 471 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
472 spin_lock(&inode_lock); 472 spin_lock(&inode_lock);
473 sync_sb_inodes(sb, &wbc); 473 sync_sb_inodes(sb, &wbc);
474 spin_unlock(&inode_lock); 474 spin_unlock(&inode_lock);
475 } 475 }
476 476
477 /* 477 /*
478 * Rather lame livelock avoidance. 478 * Rather lame livelock avoidance.
479 */ 479 */
480 static void set_sb_syncing(int val) 480 static void set_sb_syncing(int val)
481 { 481 {
482 struct super_block *sb; 482 struct super_block *sb;
483 spin_lock(&sb_lock); 483 spin_lock(&sb_lock);
484 sb = sb_entry(super_blocks.prev); 484 sb = sb_entry(super_blocks.prev);
485 for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { 485 for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
486 sb->s_syncing = val; 486 sb->s_syncing = val;
487 } 487 }
488 spin_unlock(&sb_lock); 488 spin_unlock(&sb_lock);
489 } 489 }
490 490
491 /** 491 /**
492 * sync_inodes - writes all inodes to disk 492 * sync_inodes - writes all inodes to disk
493 * @wait: wait for completion 493 * @wait: wait for completion
494 * 494 *
495 * sync_inodes() goes through each super block's dirty inode list, writes the 495 * sync_inodes() goes through each super block's dirty inode list, writes the
496 * inodes out, waits on the writeout and puts the inodes back on the normal 496 * inodes out, waits on the writeout and puts the inodes back on the normal
497 * list. 497 * list.
498 * 498 *
499 * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle 499 * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle
500 * part of the sync functions is that the blockdev "superblock" is processed 500 * part of the sync functions is that the blockdev "superblock" is processed
501 * last. This is because the write_inode() function of a typical fs will 501 * last. This is because the write_inode() function of a typical fs will
502 * perform no I/O, but will mark buffers in the blockdev mapping as dirty. 502 * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
503 * What we want to do is to perform all that dirtying first, and then write 503 * What we want to do is to perform all that dirtying first, and then write
504 * back all those inode blocks via the blockdev mapping in one sweep. So the 504 * back all those inode blocks via the blockdev mapping in one sweep. So the
505 * additional (somewhat redundant) sync_blockdev() calls here are to make 505 * additional (somewhat redundant) sync_blockdev() calls here are to make
506 * sure that really happens. Because if we call sync_inodes_sb(wait=1) with 506 * sure that really happens. Because if we call sync_inodes_sb(wait=1) with
507 * outstanding dirty inodes, the writeback goes block-at-a-time within the 507 * outstanding dirty inodes, the writeback goes block-at-a-time within the
508 * filesystem's write_inode(). This is extremely slow. 508 * filesystem's write_inode(). This is extremely slow.
509 */ 509 */
510 static void __sync_inodes(int wait) 510 static void __sync_inodes(int wait)
511 { 511 {
512 struct super_block *sb; 512 struct super_block *sb;
513 513
514 spin_lock(&sb_lock); 514 spin_lock(&sb_lock);
515 restart: 515 restart:
516 list_for_each_entry(sb, &super_blocks, s_list) { 516 list_for_each_entry(sb, &super_blocks, s_list) {
517 if (sb->s_syncing) 517 if (sb->s_syncing)
518 continue; 518 continue;
519 sb->s_syncing = 1; 519 sb->s_syncing = 1;
520 sb->s_count++; 520 sb->s_count++;
521 spin_unlock(&sb_lock); 521 spin_unlock(&sb_lock);
522 down_read(&sb->s_umount); 522 down_read(&sb->s_umount);
523 if (sb->s_root) { 523 if (sb->s_root) {
524 sync_inodes_sb(sb, wait); 524 sync_inodes_sb(sb, wait);
525 sync_blockdev(sb->s_bdev); 525 sync_blockdev(sb->s_bdev);
526 } 526 }
527 up_read(&sb->s_umount); 527 up_read(&sb->s_umount);
528 spin_lock(&sb_lock); 528 spin_lock(&sb_lock);
529 if (__put_super_and_need_restart(sb)) 529 if (__put_super_and_need_restart(sb))
530 goto restart; 530 goto restart;
531 } 531 }
532 spin_unlock(&sb_lock); 532 spin_unlock(&sb_lock);
533 } 533 }
534 534
535 void sync_inodes(int wait) 535 void sync_inodes(int wait)
536 { 536 {
537 set_sb_syncing(0); 537 set_sb_syncing(0);
538 __sync_inodes(0); 538 __sync_inodes(0);
539 539
540 if (wait) { 540 if (wait) {
541 set_sb_syncing(0); 541 set_sb_syncing(0);
542 __sync_inodes(1); 542 __sync_inodes(1);
543 } 543 }
544 } 544 }
545 545
546 /** 546 /**
547 * write_inode_now - write an inode to disk 547 * write_inode_now - write an inode to disk
548 * @inode: inode to write to disk 548 * @inode: inode to write to disk
549 * @sync: whether the write should be synchronous or not 549 * @sync: whether the write should be synchronous or not
550 * 550 *
551 * This function commits an inode to disk immediately if it is dirty. This is 551 * This function commits an inode to disk immediately if it is dirty. This is
552 * primarily needed by knfsd. 552 * primarily needed by knfsd.
553 * 553 *
554 * The caller must either have a ref on the inode or must have set I_WILL_FREE. 554 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
555 */ 555 */
556 int write_inode_now(struct inode *inode, int sync) 556 int write_inode_now(struct inode *inode, int sync)
557 { 557 {
558 int ret; 558 int ret;
559 struct writeback_control wbc = { 559 struct writeback_control wbc = {
560 .nr_to_write = LONG_MAX, 560 .nr_to_write = LONG_MAX,
561 .sync_mode = WB_SYNC_ALL, 561 .sync_mode = WB_SYNC_ALL,
562 }; 562 };
563 563
564 if (!mapping_cap_writeback_dirty(inode->i_mapping)) 564 if (!mapping_cap_writeback_dirty(inode->i_mapping))
565 wbc.nr_to_write = 0; 565 wbc.nr_to_write = 0;
566 566
567 might_sleep(); 567 might_sleep();
568 spin_lock(&inode_lock); 568 spin_lock(&inode_lock);
569 ret = __writeback_single_inode(inode, &wbc); 569 ret = __writeback_single_inode(inode, &wbc);
570 spin_unlock(&inode_lock); 570 spin_unlock(&inode_lock);
571 if (sync) 571 if (sync)
572 wait_on_inode(inode); 572 wait_on_inode(inode);
573 return ret; 573 return ret;
574 } 574 }
575 EXPORT_SYMBOL(write_inode_now); 575 EXPORT_SYMBOL(write_inode_now);
576 576
577 /** 577 /**
578 * sync_inode - write an inode and its pages to disk. 578 * sync_inode - write an inode and its pages to disk.
579 * @inode: the inode to sync 579 * @inode: the inode to sync
580 * @wbc: controls the writeback mode 580 * @wbc: controls the writeback mode
581 * 581 *
582 * sync_inode() will write an inode and its pages to disk. It will also 582 * sync_inode() will write an inode and its pages to disk. It will also
583 * correctly update the inode on its superblock's dirty inode lists and will 583 * correctly update the inode on its superblock's dirty inode lists and will
584 * update inode->i_state. 584 * update inode->i_state.
585 * 585 *
586 * The caller must have a ref on the inode. 586 * The caller must have a ref on the inode.
587 */ 587 */
588 int sync_inode(struct inode *inode, struct writeback_control *wbc) 588 int sync_inode(struct inode *inode, struct writeback_control *wbc)
589 { 589 {
590 int ret; 590 int ret;
591 591
592 spin_lock(&inode_lock); 592 spin_lock(&inode_lock);
593 ret = __writeback_single_inode(inode, wbc); 593 ret = __writeback_single_inode(inode, wbc);
594 spin_unlock(&inode_lock); 594 spin_unlock(&inode_lock);
595 return ret; 595 return ret;
596 } 596 }
597 EXPORT_SYMBOL(sync_inode); 597 EXPORT_SYMBOL(sync_inode);
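[Editor's note] As a caller-side sketch of the two exported helpers above: write_inode_now() builds its own all-or-nothing writeback_control, while sync_inode() lets the caller bound the work. The wrapper names, the WB_SYNC_NONE mode and the 16-page budget below are illustrative assumptions, not kernel code.

#include <linux/fs.h>
#include <linux/writeback.h>

/* Flush one inode completely and wait -- what knfsd-style callers want. */
static int example_flush_sync(struct inode *inode)
{
        return write_inode_now(inode, 1);       /* sync=1: wait on the inode */
}

/* Write back a bounded amount for one inode, without waiting. */
static int example_flush_some(struct inode *inode)
{
        struct writeback_control wbc = {
                .sync_mode   = WB_SYNC_NONE,
                .nr_to_write = 16,              /* arbitrary budget */
        };

        /* caller must hold a reference on the inode, per the comment above */
        return sync_inode(inode, &wbc);
}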
598 598
599 /** 599 /**
600 * generic_osync_inode - flush all dirty data for a given inode to disk 600 * generic_osync_inode - flush all dirty data for a given inode to disk
601 * @inode: inode to write 601 * @inode: inode to write
602 * @mapping: the address_space that should be flushed 602 * @mapping: the address_space that should be flushed
603 * @what: what to write and wait upon 603 * @what: what to write and wait upon
604 * 604 *
605 * This can be called by file_write functions for files which have the 605 * This can be called by file_write functions for files which have the
606 * O_SYNC flag set, to flush dirty writes to disk. 606 * O_SYNC flag set, to flush dirty writes to disk.
607 * 607 *
608 * @what is a bitmask, specifying which part of the inode's data should be 608 * @what is a bitmask, specifying which part of the inode's data should be
609 * written and waited upon: 609 * written and waited upon.
610 * 610 *
611 * OSYNC_DATA: i_mapping's dirty data 611 * OSYNC_DATA: i_mapping's dirty data
612 * OSYNC_METADATA: the buffers at i_mapping->private_list 612 * OSYNC_METADATA: the buffers at i_mapping->private_list
613 * OSYNC_INODE: the inode itself 613 * OSYNC_INODE: the inode itself
614 */ 614 */
615 615
616 int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) 616 int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
617 { 617 {
618 int err = 0; 618 int err = 0;
619 int need_write_inode_now = 0; 619 int need_write_inode_now = 0;
620 int err2; 620 int err2;
621 621
622 current->flags |= PF_SYNCWRITE; 622 current->flags |= PF_SYNCWRITE;
623 if (what & OSYNC_DATA) 623 if (what & OSYNC_DATA)
624 err = filemap_fdatawrite(mapping); 624 err = filemap_fdatawrite(mapping);
625 if (what & (OSYNC_METADATA|OSYNC_DATA)) { 625 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
626 err2 = sync_mapping_buffers(mapping); 626 err2 = sync_mapping_buffers(mapping);
627 if (!err) 627 if (!err)
628 err = err2; 628 err = err2;
629 } 629 }
630 if (what & OSYNC_DATA) { 630 if (what & OSYNC_DATA) {
631 err2 = filemap_fdatawait(mapping); 631 err2 = filemap_fdatawait(mapping);
632 if (!err) 632 if (!err)
633 err = err2; 633 err = err2;
634 } 634 }
635 current->flags &= ~PF_SYNCWRITE; 635 current->flags &= ~PF_SYNCWRITE;
636 636
637 spin_lock(&inode_lock); 637 spin_lock(&inode_lock);
638 if ((inode->i_state & I_DIRTY) && 638 if ((inode->i_state & I_DIRTY) &&
639 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) 639 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
640 need_write_inode_now = 1; 640 need_write_inode_now = 1;
641 spin_unlock(&inode_lock); 641 spin_unlock(&inode_lock);
642 642
643 if (need_write_inode_now) { 643 if (need_write_inode_now) {
644 err2 = write_inode_now(inode, 1); 644 err2 = write_inode_now(inode, 1);
645 if (!err) 645 if (!err)
646 err = err2; 646 err = err2;
647 } 647 }
648 else 648 else
649 wait_on_inode(inode); 649 wait_on_inode(inode);
650 650
651 return err; 651 return err;
652 } 652 }
653 653
654 EXPORT_SYMBOL(generic_osync_inode); 654 EXPORT_SYMBOL(generic_osync_inode);
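[Editor's note] A hedged sketch of how an O_SYNC write path might call generic_osync_inode() after dirtying the page cache. The surrounding helper is invented; the OSYNC_METADATA|OSYNC_DATA combination simply follows the bitmask documented in the comment above.

#include <linux/fs.h>
#include <linux/writeback.h>

/* After an O_SYNC write has dirtied pagecache, push data and metadata out. */
static ssize_t example_osync_tail(struct file *file, ssize_t written)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        int err;

        if (written <= 0 || !(file->f_flags & O_SYNC))
                return written;

        err = generic_osync_inode(inode, mapping,
                                  OSYNC_METADATA | OSYNC_DATA);
        return err ? err : written;
}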
655 655
656 /** 656 /**
657 * writeback_acquire: attempt to get exclusive writeback access to a device 657 * writeback_acquire: attempt to get exclusive writeback access to a device
658 * @bdi: the device's backing_dev_info structure 658 * @bdi: the device's backing_dev_info structure
659 * 659 *
660 * It is a waste of resources to have more than one pdflush thread blocked on 660 * It is a waste of resources to have more than one pdflush thread blocked on
661 * a single request queue. Exclusion at the request_queue level is obtained 661 * a single request queue. Exclusion at the request_queue level is obtained
662 * via a flag in the request_queue's backing_dev_info.state. 662 * via a flag in the request_queue's backing_dev_info.state.
663 * 663 *
664 * Non-request_queue-backed address_spaces will share default_backing_dev_info, 664 * Non-request_queue-backed address_spaces will share default_backing_dev_info,
665 * unless they implement their own, which is somewhat inefficient, as this 665 * unless they implement their own, which is somewhat inefficient, as this
666 * may prevent concurrent writeback against multiple devices. 666 * may prevent concurrent writeback against multiple devices.
667 */ 667 */
668 int writeback_acquire(struct backing_dev_info *bdi) 668 int writeback_acquire(struct backing_dev_info *bdi)
669 { 669 {
670 return !test_and_set_bit(BDI_pdflush, &bdi->state); 670 return !test_and_set_bit(BDI_pdflush, &bdi->state);
671 } 671 }
672 672
673 /** 673 /**
674 * writeback_in_progress: determine whether there is writeback in progress 674 * writeback_in_progress: determine whether there is writeback in progress
675 * against a backing device.
676 * @bdi: the device's backing_dev_info structure. 675 * @bdi: the device's backing_dev_info structure.
676 *
677 * Determine whether there is writeback in progress against a backing device.
677 */ 678 */
678 int writeback_in_progress(struct backing_dev_info *bdi) 679 int writeback_in_progress(struct backing_dev_info *bdi)
679 { 680 {
680 return test_bit(BDI_pdflush, &bdi->state); 681 return test_bit(BDI_pdflush, &bdi->state);
681 } 682 }
682 683
683 /** 684 /**
684 * writeback_release: relinquish exclusive writeback access against a device. 685 * writeback_release: relinquish exclusive writeback access against a device.
685 * @bdi: the device's backing_dev_info structure 686 * @bdi: the device's backing_dev_info structure
686 */ 687 */
687 void writeback_release(struct backing_dev_info *bdi) 688 void writeback_release(struct backing_dev_info *bdi)
688 { 689 {
689 BUG_ON(!writeback_in_progress(bdi)); 690 BUG_ON(!writeback_in_progress(bdi));
690 clear_bit(BDI_pdflush, &bdi->state); 691 clear_bit(BDI_pdflush, &bdi->state);
691 } 692 }
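[Editor's note] Taken together, the three helpers above form a simple try-lock on a backing device, keyed off the BDI_pdflush bit. A minimal sketch of the intended calling pattern follows; the function name and the elided writeback work are hypothetical.

#include <linux/backing-dev.h>
#include <linux/writeback.h>

static void example_flush_queue(struct backing_dev_info *bdi)
{
        if (!writeback_acquire(bdi))
                return;                 /* another pdflush thread owns this queue */

        /* ... issue writeback against this queue ... */

        writeback_release(bdi);         /* let the next thread in */
}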
include/linux/kernel.h
1 #ifndef _LINUX_KERNEL_H 1 #ifndef _LINUX_KERNEL_H
2 #define _LINUX_KERNEL_H 2 #define _LINUX_KERNEL_H
3 3
4 /* 4 /*
5 * 'kernel.h' contains some often-used function prototypes etc 5 * 'kernel.h' contains some often-used function prototypes etc
6 */ 6 */
7 7
8 #ifdef __KERNEL__ 8 #ifdef __KERNEL__
9 9
10 #include <stdarg.h> 10 #include <stdarg.h>
11 #include <linux/linkage.h> 11 #include <linux/linkage.h>
12 #include <linux/stddef.h> 12 #include <linux/stddef.h>
13 #include <linux/types.h> 13 #include <linux/types.h>
14 #include <linux/compiler.h> 14 #include <linux/compiler.h>
15 #include <linux/bitops.h> 15 #include <linux/bitops.h>
16 #include <asm/byteorder.h> 16 #include <asm/byteorder.h>
17 #include <asm/bug.h> 17 #include <asm/bug.h>
18 18
19 extern const char linux_banner[]; 19 extern const char linux_banner[];
20 20
21 #define INT_MAX ((int)(~0U>>1)) 21 #define INT_MAX ((int)(~0U>>1))
22 #define INT_MIN (-INT_MAX - 1) 22 #define INT_MIN (-INT_MAX - 1)
23 #define UINT_MAX (~0U) 23 #define UINT_MAX (~0U)
24 #define LONG_MAX ((long)(~0UL>>1)) 24 #define LONG_MAX ((long)(~0UL>>1))
25 #define LONG_MIN (-LONG_MAX - 1) 25 #define LONG_MIN (-LONG_MAX - 1)
26 #define ULONG_MAX (~0UL) 26 #define ULONG_MAX (~0UL)
27 27
28 #define STACK_MAGIC 0xdeadbeef 28 #define STACK_MAGIC 0xdeadbeef
29 29
30 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 30 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
31 #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) 31 #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
32 32
33 #define KERN_EMERG "<0>" /* system is unusable */ 33 #define KERN_EMERG "<0>" /* system is unusable */
34 #define KERN_ALERT "<1>" /* action must be taken immediately */ 34 #define KERN_ALERT "<1>" /* action must be taken immediately */
35 #define KERN_CRIT "<2>" /* critical conditions */ 35 #define KERN_CRIT "<2>" /* critical conditions */
36 #define KERN_ERR "<3>" /* error conditions */ 36 #define KERN_ERR "<3>" /* error conditions */
37 #define KERN_WARNING "<4>" /* warning conditions */ 37 #define KERN_WARNING "<4>" /* warning conditions */
38 #define KERN_NOTICE "<5>" /* normal but significant condition */ 38 #define KERN_NOTICE "<5>" /* normal but significant condition */
39 #define KERN_INFO "<6>" /* informational */ 39 #define KERN_INFO "<6>" /* informational */
40 #define KERN_DEBUG "<7>" /* debug-level messages */ 40 #define KERN_DEBUG "<7>" /* debug-level messages */
41 41
42 extern int console_printk[]; 42 extern int console_printk[];
43 43
44 #define console_loglevel (console_printk[0]) 44 #define console_loglevel (console_printk[0])
45 #define default_message_loglevel (console_printk[1]) 45 #define default_message_loglevel (console_printk[1])
46 #define minimum_console_loglevel (console_printk[2]) 46 #define minimum_console_loglevel (console_printk[2])
47 #define default_console_loglevel (console_printk[3]) 47 #define default_console_loglevel (console_printk[3])
48 48
49 struct completion; 49 struct completion;
50 50
51 /** 51 /**
52 * might_sleep - annotation for functions that can sleep 52 * might_sleep - annotation for functions that can sleep
53 * 53 *
54 * this macro will print a stack trace if it is executed in an atomic 54 * this macro will print a stack trace if it is executed in an atomic
55 * context (spinlock, irq-handler, ...). 55 * context (spinlock, irq-handler, ...).
56 * 56 *
57 * This is a useful debugging help to be able to catch problems early and not 57 * This is a useful debugging help to be able to catch problems early and not
58 * be bitten later when the calling function happens to sleep when it is not 58 * be bitten later when the calling function happens to sleep when it is not
59 * supposed to. 59 * supposed to.
60 */ 60 */
61 #ifdef CONFIG_PREEMPT_VOLUNTARY 61 #ifdef CONFIG_PREEMPT_VOLUNTARY
62 extern int cond_resched(void); 62 extern int cond_resched(void);
63 # define might_resched() cond_resched() 63 # define might_resched() cond_resched()
64 #else 64 #else
65 # define might_resched() do { } while (0) 65 # define might_resched() do { } while (0)
66 #endif 66 #endif
67 67
68 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 68 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
69 void __might_sleep(char *file, int line); 69 void __might_sleep(char *file, int line);
70 # define might_sleep() \ 70 # define might_sleep() \
71 do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) 71 do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
72 #else 72 #else
73 # define might_sleep() do { might_resched(); } while (0) 73 # define might_sleep() do { might_resched(); } while (0)
74 #endif 74 #endif
75 75
76 #define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0) 76 #define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
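[Editor's note] A small usage sketch for the annotation above; the helper and its caller are hypothetical. Placing might_sleep() at the top of a function that may block documents the constraint and, with CONFIG_DEBUG_SPINLOCK_SLEEP, catches atomic-context callers immediately rather than only when the allocation actually sleeps.

#include <linux/kernel.h>
#include <linux/slab.h>

/* May block: callers must not hold spinlocks or be in interrupt context. */
static void *example_grab_buffer(size_t len)
{
        might_sleep();
        return kmalloc(len, GFP_KERNEL);        /* GFP_KERNEL may sleep */
}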
77 77
78 #define abs(x) ({ \ 78 #define abs(x) ({ \
79 int __x = (x); \ 79 int __x = (x); \
80 (__x < 0) ? -__x : __x; \ 80 (__x < 0) ? -__x : __x; \
81 }) 81 })
82 82
83 #define labs(x) ({ \ 83 #define labs(x) ({ \
84 long __x = (x); \ 84 long __x = (x); \
85 (__x < 0) ? -__x : __x; \ 85 (__x < 0) ? -__x : __x; \
86 }) 86 })
87 87
88 extern struct notifier_block *panic_notifier_list; 88 extern struct notifier_block *panic_notifier_list;
89 extern long (*panic_blink)(long time); 89 extern long (*panic_blink)(long time);
90 NORET_TYPE void panic(const char * fmt, ...) 90 NORET_TYPE void panic(const char * fmt, ...)
91 __attribute__ ((NORET_AND format (printf, 1, 2))); 91 __attribute__ ((NORET_AND format (printf, 1, 2)));
92 fastcall NORET_TYPE void do_exit(long error_code) 92 fastcall NORET_TYPE void do_exit(long error_code)
93 ATTRIB_NORET; 93 ATTRIB_NORET;
94 NORET_TYPE void complete_and_exit(struct completion *, long) 94 NORET_TYPE void complete_and_exit(struct completion *, long)
95 ATTRIB_NORET; 95 ATTRIB_NORET;
96 extern unsigned long simple_strtoul(const char *,char **,unsigned int); 96 extern unsigned long simple_strtoul(const char *,char **,unsigned int);
97 extern long simple_strtol(const char *,char **,unsigned int); 97 extern long simple_strtol(const char *,char **,unsigned int);
98 extern unsigned long long simple_strtoull(const char *,char **,unsigned int); 98 extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
99 extern long long simple_strtoll(const char *,char **,unsigned int); 99 extern long long simple_strtoll(const char *,char **,unsigned int);
100 extern int sprintf(char * buf, const char * fmt, ...) 100 extern int sprintf(char * buf, const char * fmt, ...)
101 __attribute__ ((format (printf, 2, 3))); 101 __attribute__ ((format (printf, 2, 3)));
102 extern int vsprintf(char *buf, const char *, va_list) 102 extern int vsprintf(char *buf, const char *, va_list)
103 __attribute__ ((format (printf, 2, 0))); 103 __attribute__ ((format (printf, 2, 0)));
104 extern int snprintf(char * buf, size_t size, const char * fmt, ...) 104 extern int snprintf(char * buf, size_t size, const char * fmt, ...)
105 __attribute__ ((format (printf, 3, 4))); 105 __attribute__ ((format (printf, 3, 4)));
106 extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) 106 extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
107 __attribute__ ((format (printf, 3, 0))); 107 __attribute__ ((format (printf, 3, 0)));
108 extern int scnprintf(char * buf, size_t size, const char * fmt, ...) 108 extern int scnprintf(char * buf, size_t size, const char * fmt, ...)
109 __attribute__ ((format (printf, 3, 4))); 109 __attribute__ ((format (printf, 3, 4)));
110 extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) 110 extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
111 __attribute__ ((format (printf, 3, 0))); 111 __attribute__ ((format (printf, 3, 0)));
112 112
113 extern int sscanf(const char *, const char *, ...) 113 extern int sscanf(const char *, const char *, ...)
114 __attribute__ ((format (scanf, 2, 3))); 114 __attribute__ ((format (scanf, 2, 3)));
115 extern int vsscanf(const char *, const char *, va_list) 115 extern int vsscanf(const char *, const char *, va_list)
116 __attribute__ ((format (scanf, 2, 0))); 116 __attribute__ ((format (scanf, 2, 0)));
117 117
118 extern int get_option(char **str, int *pint); 118 extern int get_option(char **str, int *pint);
119 extern char *get_options(const char *str, int nints, int *ints); 119 extern char *get_options(const char *str, int nints, int *ints);
120 extern unsigned long long memparse(char *ptr, char **retptr); 120 extern unsigned long long memparse(char *ptr, char **retptr);
121 121
122 extern int __kernel_text_address(unsigned long addr); 122 extern int __kernel_text_address(unsigned long addr);
123 extern int kernel_text_address(unsigned long addr); 123 extern int kernel_text_address(unsigned long addr);
124 extern int session_of_pgrp(int pgrp); 124 extern int session_of_pgrp(int pgrp);
125 125
126 #ifdef CONFIG_PRINTK 126 #ifdef CONFIG_PRINTK
127 asmlinkage int vprintk(const char *fmt, va_list args) 127 asmlinkage int vprintk(const char *fmt, va_list args)
128 __attribute__ ((format (printf, 1, 0))); 128 __attribute__ ((format (printf, 1, 0)));
129 asmlinkage int printk(const char * fmt, ...) 129 asmlinkage int printk(const char * fmt, ...)
130 __attribute__ ((format (printf, 1, 2))); 130 __attribute__ ((format (printf, 1, 2)));
131 #else 131 #else
132 static inline int vprintk(const char *s, va_list args) 132 static inline int vprintk(const char *s, va_list args)
133 __attribute__ ((format (printf, 1, 0))); 133 __attribute__ ((format (printf, 1, 0)));
134 static inline int vprintk(const char *s, va_list args) { return 0; } 134 static inline int vprintk(const char *s, va_list args) { return 0; }
135 static inline int printk(const char *s, ...) 135 static inline int printk(const char *s, ...)
136 __attribute__ ((format (printf, 1, 2))); 136 __attribute__ ((format (printf, 1, 2)));
137 static inline int printk(const char *s, ...) { return 0; } 137 static inline int printk(const char *s, ...) { return 0; }
138 #endif 138 #endif
139 139
140 unsigned long int_sqrt(unsigned long); 140 unsigned long int_sqrt(unsigned long);
141 141
142 static inline int __attribute_pure__ long_log2(unsigned long x) 142 static inline int __attribute_pure__ long_log2(unsigned long x)
143 { 143 {
144 int r = 0; 144 int r = 0;
145 for (x >>= 1; x > 0; x >>= 1) 145 for (x >>= 1; x > 0; x >>= 1)
146 r++; 146 r++;
147 return r; 147 return r;
148 } 148 }
149 149
150 static inline unsigned long __attribute_const__ roundup_pow_of_two(unsigned long x) 150 static inline unsigned long __attribute_const__ roundup_pow_of_two(unsigned long x)
151 { 151 {
152 return (1UL << fls(x - 1)); 152 return (1UL << fls(x - 1));
153 } 153 }
154 154
155 extern int printk_ratelimit(void); 155 extern int printk_ratelimit(void);
156 extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst); 156 extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);
157 157
158 static inline void console_silent(void) 158 static inline void console_silent(void)
159 { 159 {
160 console_loglevel = 0; 160 console_loglevel = 0;
161 } 161 }
162 162
163 static inline void console_verbose(void) 163 static inline void console_verbose(void)
164 { 164 {
165 if (console_loglevel) 165 if (console_loglevel)
166 console_loglevel = 15; 166 console_loglevel = 15;
167 } 167 }
168 168
169 extern void bust_spinlocks(int yes); 169 extern void bust_spinlocks(int yes);
170 extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ 170 extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
171 extern int panic_timeout; 171 extern int panic_timeout;
172 extern int panic_on_oops; 172 extern int panic_on_oops;
173 extern int tainted; 173 extern int tainted;
174 extern const char *print_tainted(void); 174 extern const char *print_tainted(void);
175 extern void add_taint(unsigned); 175 extern void add_taint(unsigned);
176 176
177 /* Values used for system_state */ 177 /* Values used for system_state */
178 extern enum system_states { 178 extern enum system_states {
179 SYSTEM_BOOTING, 179 SYSTEM_BOOTING,
180 SYSTEM_RUNNING, 180 SYSTEM_RUNNING,
181 SYSTEM_HALT, 181 SYSTEM_HALT,
182 SYSTEM_POWER_OFF, 182 SYSTEM_POWER_OFF,
183 SYSTEM_RESTART, 183 SYSTEM_RESTART,
184 } system_state; 184 } system_state;
185 185
186 #define TAINT_PROPRIETARY_MODULE (1<<0) 186 #define TAINT_PROPRIETARY_MODULE (1<<0)
187 #define TAINT_FORCED_MODULE (1<<1) 187 #define TAINT_FORCED_MODULE (1<<1)
188 #define TAINT_UNSAFE_SMP (1<<2) 188 #define TAINT_UNSAFE_SMP (1<<2)
189 #define TAINT_FORCED_RMMOD (1<<3) 189 #define TAINT_FORCED_RMMOD (1<<3)
190 #define TAINT_MACHINE_CHECK (1<<4) 190 #define TAINT_MACHINE_CHECK (1<<4)
191 #define TAINT_BAD_PAGE (1<<5) 191 #define TAINT_BAD_PAGE (1<<5)
192 192
193 extern void dump_stack(void); 193 extern void dump_stack(void);
194 194
195 #ifdef DEBUG 195 #ifdef DEBUG
196 #define pr_debug(fmt,arg...) \ 196 #define pr_debug(fmt,arg...) \
197 printk(KERN_DEBUG fmt,##arg) 197 printk(KERN_DEBUG fmt,##arg)
198 #else 198 #else
199 #define pr_debug(fmt,arg...) \ 199 #define pr_debug(fmt,arg...) \
200 do { } while (0) 200 do { } while (0)
201 #endif 201 #endif
202 202
203 #define pr_info(fmt,arg...) \ 203 #define pr_info(fmt,arg...) \
204 printk(KERN_INFO fmt,##arg) 204 printk(KERN_INFO fmt,##arg)
205 205
206 /* 206 /*
207 * Display an IP address in readable format. 207 * Display an IP address in readable format.
208 */ 208 */
209 209
210 #define NIPQUAD(addr) \ 210 #define NIPQUAD(addr) \
211 ((unsigned char *)&addr)[0], \ 211 ((unsigned char *)&addr)[0], \
212 ((unsigned char *)&addr)[1], \ 212 ((unsigned char *)&addr)[1], \
213 ((unsigned char *)&addr)[2], \ 213 ((unsigned char *)&addr)[2], \
214 ((unsigned char *)&addr)[3] 214 ((unsigned char *)&addr)[3]
215 215
216 #define NIP6(addr) \ 216 #define NIP6(addr) \
217 ntohs((addr).s6_addr16[0]), \ 217 ntohs((addr).s6_addr16[0]), \
218 ntohs((addr).s6_addr16[1]), \ 218 ntohs((addr).s6_addr16[1]), \
219 ntohs((addr).s6_addr16[2]), \ 219 ntohs((addr).s6_addr16[2]), \
220 ntohs((addr).s6_addr16[3]), \ 220 ntohs((addr).s6_addr16[3]), \
221 ntohs((addr).s6_addr16[4]), \ 221 ntohs((addr).s6_addr16[4]), \
222 ntohs((addr).s6_addr16[5]), \ 222 ntohs((addr).s6_addr16[5]), \
223 ntohs((addr).s6_addr16[6]), \ 223 ntohs((addr).s6_addr16[6]), \
224 ntohs((addr).s6_addr16[7]) 224 ntohs((addr).s6_addr16[7])
225 225
226 #if defined(__LITTLE_ENDIAN) 226 #if defined(__LITTLE_ENDIAN)
227 #define HIPQUAD(addr) \ 227 #define HIPQUAD(addr) \
228 ((unsigned char *)&addr)[3], \ 228 ((unsigned char *)&addr)[3], \
229 ((unsigned char *)&addr)[2], \ 229 ((unsigned char *)&addr)[2], \
230 ((unsigned char *)&addr)[1], \ 230 ((unsigned char *)&addr)[1], \
231 ((unsigned char *)&addr)[0] 231 ((unsigned char *)&addr)[0]
232 #elif defined(__BIG_ENDIAN) 232 #elif defined(__BIG_ENDIAN)
233 #define HIPQUAD NIPQUAD 233 #define HIPQUAD NIPQUAD
234 #else 234 #else
235 #error "Please fix asm/byteorder.h" 235 #error "Please fix asm/byteorder.h"
236 #endif /* __LITTLE_ENDIAN */ 236 #endif /* __LITTLE_ENDIAN */
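[Editor's note] A hedged example of how these address macros are consumed: each expands to a comma-separated list of values, so the printk() format string must supply one conversion per value (four for NIPQUAD/HIPQUAD, eight for NIP6). The wrapper function and its parameters are invented for illustration.

#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/in6.h>

static void example_print_addrs(__be32 saddr, struct in6_addr *daddr6)
{
        printk(KERN_INFO "from %u.%u.%u.%u\n", NIPQUAD(saddr));
        printk(KERN_INFO "to   %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
               NIP6(*daddr6));
}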
237 237
238 /* 238 /*
239 * min()/max() macros that also do 239 * min()/max() macros that also do
240 * strict type-checking.. See the 240 * strict type-checking.. See the
241 * "unnecessary" pointer comparison. 241 * "unnecessary" pointer comparison.
242 */ 242 */
243 #define min(x,y) ({ \ 243 #define min(x,y) ({ \
244 typeof(x) _x = (x); \ 244 typeof(x) _x = (x); \
245 typeof(y) _y = (y); \ 245 typeof(y) _y = (y); \
246 (void) (&_x == &_y); \ 246 (void) (&_x == &_y); \
247 _x < _y ? _x : _y; }) 247 _x < _y ? _x : _y; })
248 248
249 #define max(x,y) ({ \ 249 #define max(x,y) ({ \
250 typeof(x) _x = (x); \ 250 typeof(x) _x = (x); \
251 typeof(y) _y = (y); \ 251 typeof(y) _y = (y); \
252 (void) (&_x == &_y); \ 252 (void) (&_x == &_y); \
253 _x > _y ? _x : _y; }) 253 _x > _y ? _x : _y; })
254 254
255 /* 255 /*
256 * ..and if you can't take the strict 256 * ..and if you can't take the strict
257 * types, you can specify one yourself. 257 * types, you can specify one yourself.
258 * 258 *
259 * Or not use min/max at all, of course. 259 * Or not use min/max at all, of course.
260 */ 260 */
261 #define min_t(type,x,y) \ 261 #define min_t(type,x,y) \
262 ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) 262 ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
263 #define max_t(type,x,y) \ 263 #define max_t(type,x,y) \
264 ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) 264 ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
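[Editor's note] A short illustration of why both forms exist; the function and variable names are arbitrary. With mixed types, the pointer-comparison trick in min()/max() provokes a compiler warning, so the caller either fixes the types or states one explicitly with min_t()/max_t().

#include <linux/kernel.h>
#include <linux/types.h>

static u64 example_clamp(u32 request, u64 limit)
{
        /*
         * min(request, limit) would warn here: &_x and &_y have different
         * pointer types.  min_t() casts both sides to the named type instead.
         */
        return min_t(u64, request, limit);
}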
265 265
266 266
267 /** 267 /**
268 * container_of - cast a member of a structure out to the containing structure 268 * container_of - cast a member of a structure out to the containing structure
269 *
270 * @ptr: the pointer to the member. 269 * @ptr: the pointer to the member.
271 * @type: the type of the container struct this is embedded in. 270 * @type: the type of the container struct this is embedded in.
272 * @member: the name of the member within the struct. 271 * @member: the name of the member within the struct.
273 * 272 *
274 */ 273 */
275 #define container_of(ptr, type, member) ({ \ 274 #define container_of(ptr, type, member) ({ \
276 const typeof( ((type *)0)->member ) *__mptr = (ptr); \ 275 const typeof( ((type *)0)->member ) *__mptr = (ptr); \
277 (type *)( (char *)__mptr - offsetof(type,member) );}) 276 (type *)( (char *)__mptr - offsetof(type,member) );})
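[Editor's note] The canonical use of container_of() is recovering a structure from an embedded member such as a list_head; the structure and helper below are hypothetical.

#include <linux/kernel.h>
#include <linux/list.h>

struct example_req {
        int                     id;
        struct list_head        queue_node;     /* embedded member */
};

static struct example_req *example_req_from_node(struct list_head *node)
{
        /* Subtract the member's offset to get back to the containing struct */
        return container_of(node, struct example_req, queue_node);
}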
278 277
279 /* 278 /*
280 * Check at compile time that something is of a particular type. 279 * Check at compile time that something is of a particular type.
281 * Always evaluates to 1 so you may use it easily in comparisons. 280 * Always evaluates to 1 so you may use it easily in comparisons.
282 */ 281 */
283 #define typecheck(type,x) \ 282 #define typecheck(type,x) \
284 ({ type __dummy; \ 283 ({ type __dummy; \
285 typeof(x) __dummy2; \ 284 typeof(x) __dummy2; \
286 (void)(&__dummy == &__dummy2); \ 285 (void)(&__dummy == &__dummy2); \
287 1; \ 286 1; \
288 }) 287 })
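[Editor's note] typecheck() is mostly useful inside other macros, where it forces an argument to have an exact type without generating any code. The deadline macro below is a hypothetical example in the style of the jiffies comparison helpers.

#include <linux/kernel.h>
#include <linux/jiffies.h>

/* Reject callers that pass anything but an unsigned long deadline. */
#define example_expired(deadline)                       \
        ({                                              \
                typecheck(unsigned long, deadline);     \
                time_after(jiffies, deadline);          \
        })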
289 288
290 #endif /* __KERNEL__ */ 289 #endif /* __KERNEL__ */
291 290
292 #define SI_LOAD_SHIFT 16 291 #define SI_LOAD_SHIFT 16
293 struct sysinfo { 292 struct sysinfo {
294 long uptime; /* Seconds since boot */ 293 long uptime; /* Seconds since boot */
295 unsigned long loads[3]; /* 1, 5, and 15 minute load averages */ 294 unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
296 unsigned long totalram; /* Total usable main memory size */ 295 unsigned long totalram; /* Total usable main memory size */
297 unsigned long freeram; /* Available memory size */ 296 unsigned long freeram; /* Available memory size */
298 unsigned long sharedram; /* Amount of shared memory */ 297 unsigned long sharedram; /* Amount of shared memory */
299 unsigned long bufferram; /* Memory used by buffers */ 298 unsigned long bufferram; /* Memory used by buffers */
300 unsigned long totalswap; /* Total swap space size */ 299 unsigned long totalswap; /* Total swap space size */
301 unsigned long freeswap; /* swap space still available */ 300 unsigned long freeswap; /* swap space still available */
302 unsigned short procs; /* Number of current processes */ 301 unsigned short procs; /* Number of current processes */
303 unsigned short pad; /* explicit padding for m68k */ 302 unsigned short pad; /* explicit padding for m68k */
304 unsigned long totalhigh; /* Total high memory size */ 303 unsigned long totalhigh; /* Total high memory size */
305 unsigned long freehigh; /* Available high memory size */ 304 unsigned long freehigh; /* Available high memory size */
306 unsigned int mem_unit; /* Memory unit size in bytes */ 305 unsigned int mem_unit; /* Memory unit size in bytes */
307 char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */ 306 char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */
308 }; 307 };
309 308
310 /* Force a compilation error if condition is true */ 309 /* Force a compilation error if condition is true */
311 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 310 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
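[Editor's note] BUILD_BUG_ON() is typically used to pin layout assumptions at compile time; the structure and the 16-byte expectation below are invented for the example.

#include <linux/kernel.h>
#include <linux/types.h>

struct example_wire_hdr {
        u32 magic;
        u32 len;
        u64 cookie;
};

static inline void example_check_layout(void)
{
        /* The negative array size breaks the build if the header ever grows. */
        BUILD_BUG_ON(sizeof(struct example_wire_hdr) != 16);
}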
312 311
313 #ifdef CONFIG_SYSCTL 312 #ifdef CONFIG_SYSCTL
314 extern int randomize_va_space; 313 extern int randomize_va_space;
315 #else 314 #else
316 #define randomize_va_space 1 315 #define randomize_va_space 1
317 #endif 316 #endif
318 317
319 /* Trap pasters of __FUNCTION__ at compile-time */ 318 /* Trap pasters of __FUNCTION__ at compile-time */
320 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 95 319 #if __GNUC__ > 2 || __GNUC_MINOR__ >= 95
321 #define __FUNCTION__ (__func__) 320 #define __FUNCTION__ (__func__)
322 #endif 321 #endif
323 322
324 #endif 323 #endif
325 324
1 /* 1 /*
2 * linux/kernel/sys.c 2 * linux/kernel/sys.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/config.h> 7 #include <linux/config.h>
8 #include <linux/module.h> 8 #include <linux/module.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/utsname.h> 10 #include <linux/utsname.h>
11 #include <linux/mman.h> 11 #include <linux/mman.h>
12 #include <linux/smp_lock.h> 12 #include <linux/smp_lock.h>
13 #include <linux/notifier.h> 13 #include <linux/notifier.h>
14 #include <linux/reboot.h> 14 #include <linux/reboot.h>
15 #include <linux/prctl.h> 15 #include <linux/prctl.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/highuid.h> 17 #include <linux/highuid.h>
18 #include <linux/fs.h> 18 #include <linux/fs.h>
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/kexec.h> 20 #include <linux/kexec.h>
21 #include <linux/workqueue.h> 21 #include <linux/workqueue.h>
22 #include <linux/device.h> 22 #include <linux/device.h>
23 #include <linux/key.h> 23 #include <linux/key.h>
24 #include <linux/times.h> 24 #include <linux/times.h>
25 #include <linux/posix-timers.h> 25 #include <linux/posix-timers.h>
26 #include <linux/security.h> 26 #include <linux/security.h>
27 #include <linux/dcookies.h> 27 #include <linux/dcookies.h>
28 #include <linux/suspend.h> 28 #include <linux/suspend.h>
29 #include <linux/tty.h> 29 #include <linux/tty.h>
30 #include <linux/signal.h> 30 #include <linux/signal.h>
31 #include <linux/cn_proc.h> 31 #include <linux/cn_proc.h>
32 32
33 #include <linux/compat.h> 33 #include <linux/compat.h>
34 #include <linux/syscalls.h> 34 #include <linux/syscalls.h>
35 35
36 #include <asm/uaccess.h> 36 #include <asm/uaccess.h>
37 #include <asm/io.h> 37 #include <asm/io.h>
38 #include <asm/unistd.h> 38 #include <asm/unistd.h>
39 39
40 #ifndef SET_UNALIGN_CTL 40 #ifndef SET_UNALIGN_CTL
41 # define SET_UNALIGN_CTL(a,b) (-EINVAL) 41 # define SET_UNALIGN_CTL(a,b) (-EINVAL)
42 #endif 42 #endif
43 #ifndef GET_UNALIGN_CTL 43 #ifndef GET_UNALIGN_CTL
44 # define GET_UNALIGN_CTL(a,b) (-EINVAL) 44 # define GET_UNALIGN_CTL(a,b) (-EINVAL)
45 #endif 45 #endif
46 #ifndef SET_FPEMU_CTL 46 #ifndef SET_FPEMU_CTL
47 # define SET_FPEMU_CTL(a,b) (-EINVAL) 47 # define SET_FPEMU_CTL(a,b) (-EINVAL)
48 #endif 48 #endif
49 #ifndef GET_FPEMU_CTL 49 #ifndef GET_FPEMU_CTL
50 # define GET_FPEMU_CTL(a,b) (-EINVAL) 50 # define GET_FPEMU_CTL(a,b) (-EINVAL)
51 #endif 51 #endif
52 #ifndef SET_FPEXC_CTL 52 #ifndef SET_FPEXC_CTL
53 # define SET_FPEXC_CTL(a,b) (-EINVAL) 53 # define SET_FPEXC_CTL(a,b) (-EINVAL)
54 #endif 54 #endif
55 #ifndef GET_FPEXC_CTL 55 #ifndef GET_FPEXC_CTL
56 # define GET_FPEXC_CTL(a,b) (-EINVAL) 56 # define GET_FPEXC_CTL(a,b) (-EINVAL)
57 #endif 57 #endif
58 58
59 /* 59 /*
60 * this is where the system-wide overflow UID and GID are defined, for 60 * this is where the system-wide overflow UID and GID are defined, for
61 * architectures that now have 32-bit UID/GID but didn't in the past 61 * architectures that now have 32-bit UID/GID but didn't in the past
62 */ 62 */
63 63
64 int overflowuid = DEFAULT_OVERFLOWUID; 64 int overflowuid = DEFAULT_OVERFLOWUID;
65 int overflowgid = DEFAULT_OVERFLOWGID; 65 int overflowgid = DEFAULT_OVERFLOWGID;
66 66
67 #ifdef CONFIG_UID16 67 #ifdef CONFIG_UID16
68 EXPORT_SYMBOL(overflowuid); 68 EXPORT_SYMBOL(overflowuid);
69 EXPORT_SYMBOL(overflowgid); 69 EXPORT_SYMBOL(overflowgid);
70 #endif 70 #endif
71 71
72 /* 72 /*
73 * the same as above, but for filesystems which can only store a 16-bit 73 * the same as above, but for filesystems which can only store a 16-bit
74 * UID and GID. As such, this is needed on all architectures 74 * UID and GID. As such, this is needed on all architectures
75 */ 75 */
76 76
77 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 77 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
78 int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; 78 int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
79 79
80 EXPORT_SYMBOL(fs_overflowuid); 80 EXPORT_SYMBOL(fs_overflowuid);
81 EXPORT_SYMBOL(fs_overflowgid); 81 EXPORT_SYMBOL(fs_overflowgid);
82 82
83 /* 83 /*
84 * this indicates whether you can reboot with ctrl-alt-del: the default is yes 84 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
85 */ 85 */
86 86
87 int C_A_D = 1; 87 int C_A_D = 1;
88 int cad_pid = 1; 88 int cad_pid = 1;
89 89
90 /* 90 /*
91 * Notifier list for kernel code which wants to be called 91 * Notifier list for kernel code which wants to be called
92 * at shutdown. This is used to stop any idling DMA operations 92 * at shutdown. This is used to stop any idling DMA operations
93 * and the like. 93 * and the like.
94 */ 94 */
95 95
96 static struct notifier_block *reboot_notifier_list; 96 static struct notifier_block *reboot_notifier_list;
97 static DEFINE_RWLOCK(notifier_lock); 97 static DEFINE_RWLOCK(notifier_lock);
98 98
99 /** 99 /**
100 * notifier_chain_register - Add notifier to a notifier chain 100 * notifier_chain_register - Add notifier to a notifier chain
101 * @list: Pointer to root list pointer 101 * @list: Pointer to root list pointer
102 * @n: New entry in notifier chain 102 * @n: New entry in notifier chain
103 * 103 *
104 * Adds a notifier to a notifier chain. 104 * Adds a notifier to a notifier chain.
105 * 105 *
106 * Currently always returns zero. 106 * Currently always returns zero.
107 */ 107 */
108 108
109 int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) 109 int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
110 { 110 {
111 write_lock(&notifier_lock); 111 write_lock(&notifier_lock);
112 while(*list) 112 while(*list)
113 { 113 {
114 if(n->priority > (*list)->priority) 114 if(n->priority > (*list)->priority)
115 break; 115 break;
116 list= &((*list)->next); 116 list= &((*list)->next);
117 } 117 }
118 n->next = *list; 118 n->next = *list;
119 *list=n; 119 *list=n;
120 write_unlock(&notifier_lock); 120 write_unlock(&notifier_lock);
121 return 0; 121 return 0;
122 } 122 }
123 123
124 EXPORT_SYMBOL(notifier_chain_register); 124 EXPORT_SYMBOL(notifier_chain_register);
125 125
126 /** 126 /**
127 * notifier_chain_unregister - Remove notifier from a notifier chain 127 * notifier_chain_unregister - Remove notifier from a notifier chain
128 * @nl: Pointer to root list pointer 128 * @nl: Pointer to root list pointer
129 * @n: New entry in notifier chain 129 * @n: New entry in notifier chain
130 * 130 *
131 * Removes a notifier from a notifier chain. 131 * Removes a notifier from a notifier chain.
132 * 132 *
133 * Returns zero on success, or %-ENOENT on failure. 133 * Returns zero on success, or %-ENOENT on failure.
134 */ 134 */
135 135
136 int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) 136 int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
137 { 137 {
138 write_lock(&notifier_lock); 138 write_lock(&notifier_lock);
139 while((*nl)!=NULL) 139 while((*nl)!=NULL)
140 { 140 {
141 if((*nl)==n) 141 if((*nl)==n)
142 { 142 {
143 *nl=n->next; 143 *nl=n->next;
144 write_unlock(&notifier_lock); 144 write_unlock(&notifier_lock);
145 return 0; 145 return 0;
146 } 146 }
147 nl=&((*nl)->next); 147 nl=&((*nl)->next);
148 } 148 }
149 write_unlock(&notifier_lock); 149 write_unlock(&notifier_lock);
150 return -ENOENT; 150 return -ENOENT;
151 } 151 }
152 152
153 EXPORT_SYMBOL(notifier_chain_unregister); 153 EXPORT_SYMBOL(notifier_chain_unregister);
154 154
155 /** 155 /**
156 * notifier_call_chain - Call functions in a notifier chain 156 * notifier_call_chain - Call functions in a notifier chain
157 * @n: Pointer to root pointer of notifier chain 157 * @n: Pointer to root pointer of notifier chain
158 * @val: Value passed unmodified to notifier function 158 * @val: Value passed unmodified to notifier function
159 * @v: Pointer passed unmodified to notifier function 159 * @v: Pointer passed unmodified to notifier function
160 * 160 *
161 * Calls each function in a notifier chain in turn. 161 * Calls each function in a notifier chain in turn.
162 * 162 *
163 * If the return value of the notifier can be and'd 163 * If the return value of the notifier can be and'd
164 * with %NOTIFY_STOP_MASK, then notifier_call_chain 164 * with %NOTIFY_STOP_MASK, then notifier_call_chain
165 * will return immediately, with the return value of 165 * will return immediately, with the return value of
166 * the notifier function which halted execution. 166 * the notifier function which halted execution.
167 * Otherwise, the return value is the return value 167 * Otherwise, the return value is the return value
168 * of the last notifier function called. 168 * of the last notifier function called.
169 */ 169 */
170 170
171 int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) 171 int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
172 { 172 {
173 int ret=NOTIFY_DONE; 173 int ret=NOTIFY_DONE;
174 struct notifier_block *nb = *n; 174 struct notifier_block *nb = *n;
175 175
176 while(nb) 176 while(nb)
177 { 177 {
178 ret=nb->notifier_call(nb,val,v); 178 ret=nb->notifier_call(nb,val,v);
179 if(ret&NOTIFY_STOP_MASK) 179 if(ret&NOTIFY_STOP_MASK)
180 { 180 {
181 return ret; 181 return ret;
182 } 182 }
183 nb=nb->next; 183 nb=nb->next;
184 } 184 }
185 return ret; 185 return ret;
186 } 186 }
187 187
188 EXPORT_SYMBOL(notifier_call_chain); 188 EXPORT_SYMBOL(notifier_call_chain);
189 189
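/*
 * Illustrative sketch (not part of this patch): how the three chain
 * primitives documented above are typically used together.  The chain
 * head my_chain, the handler and the event value are hypothetical;
 * NOTIFY_DONE and NOTIFY_STOP_MASK come from <linux/notifier.h>.
 */
static int my_handler(struct notifier_block *self, unsigned long event,
                      void *data)
{
        /* Returning NOTIFY_DONE lets the walk continue; a return value
         * containing NOTIFY_STOP_MASK makes notifier_call_chain() stop
         * and return that value immediately. */
        return NOTIFY_DONE;
}

static struct notifier_block my_notifier = {
        .notifier_call  = my_handler,
        .priority       = 0,    /* higher-priority entries are called first */
};

static struct notifier_block *my_chain;         /* hypothetical chain head */

static void my_chain_demo(void)
{
        notifier_chain_register(&my_chain, &my_notifier);
        notifier_call_chain(&my_chain, 0 /* event code */, NULL);
        notifier_chain_unregister(&my_chain, &my_notifier);
}
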
190 /** 190 /**
191 * register_reboot_notifier - Register function to be called at reboot time 191 * register_reboot_notifier - Register function to be called at reboot time
192 * @nb: Info about notifier function to be called 192 * @nb: Info about notifier function to be called
193 * 193 *
194 * Registers a function with the list of functions 194 * Registers a function with the list of functions
195 * to be called at reboot time. 195 * to be called at reboot time.
196 * 196 *
197 * Currently always returns zero, as notifier_chain_register 197 * Currently always returns zero, as notifier_chain_register
198 * always returns zero. 198 * always returns zero.
199 */ 199 */
200 200
201 int register_reboot_notifier(struct notifier_block * nb) 201 int register_reboot_notifier(struct notifier_block * nb)
202 { 202 {
203 return notifier_chain_register(&reboot_notifier_list, nb); 203 return notifier_chain_register(&reboot_notifier_list, nb);
204 } 204 }
205 205
206 EXPORT_SYMBOL(register_reboot_notifier); 206 EXPORT_SYMBOL(register_reboot_notifier);
207 207
208 /** 208 /**
209 * unregister_reboot_notifier - Unregister previously registered reboot notifier 209 * unregister_reboot_notifier - Unregister previously registered reboot notifier
210 * @nb: Hook to be unregistered 210 * @nb: Hook to be unregistered
211 * 211 *
212 * Unregisters a previously registered reboot 212 * Unregisters a previously registered reboot
213 * notifier function. 213 * notifier function.
214 * 214 *
215 * Returns zero on success, or %-ENOENT on failure. 215 * Returns zero on success, or %-ENOENT on failure.
216 */ 216 */
217 217
218 int unregister_reboot_notifier(struct notifier_block * nb) 218 int unregister_reboot_notifier(struct notifier_block * nb)
219 { 219 {
220 return notifier_chain_unregister(&reboot_notifier_list, nb); 220 return notifier_chain_unregister(&reboot_notifier_list, nb);
221 } 221 }
222 222
223 EXPORT_SYMBOL(unregister_reboot_notifier); 223 EXPORT_SYMBOL(unregister_reboot_notifier);
224 224
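/*
 * Illustrative sketch (not part of this patch): a module hooking the
 * reboot path with the helpers above.  The my_* names and the
 * hardware-quiesce step are hypothetical; SYS_RESTART, SYS_HALT and
 * SYS_POWER_OFF come from <linux/notifier.h>, and the register/unregister
 * declarations from <linux/reboot.h>.
 */
static int my_reboot_event(struct notifier_block *nb, unsigned long code,
                           void *cmd)
{
        if (code == SYS_RESTART || code == SYS_HALT || code == SYS_POWER_OFF) {
                /* quiesce hypothetical hardware before the machine goes down */
        }
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot_event,
};

static int __init my_module_init(void)
{
        return register_reboot_notifier(&my_reboot_nb); /* currently always 0 */
}

static void __exit my_module_exit(void)
{
        unregister_reboot_notifier(&my_reboot_nb);
}
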
225 static int set_one_prio(struct task_struct *p, int niceval, int error) 225 static int set_one_prio(struct task_struct *p, int niceval, int error)
226 { 226 {
227 int no_nice; 227 int no_nice;
228 228
229 if (p->uid != current->euid && 229 if (p->uid != current->euid &&
230 p->euid != current->euid && !capable(CAP_SYS_NICE)) { 230 p->euid != current->euid && !capable(CAP_SYS_NICE)) {
231 error = -EPERM; 231 error = -EPERM;
232 goto out; 232 goto out;
233 } 233 }
234 if (niceval < task_nice(p) && !can_nice(p, niceval)) { 234 if (niceval < task_nice(p) && !can_nice(p, niceval)) {
235 error = -EACCES; 235 error = -EACCES;
236 goto out; 236 goto out;
237 } 237 }
238 no_nice = security_task_setnice(p, niceval); 238 no_nice = security_task_setnice(p, niceval);
239 if (no_nice) { 239 if (no_nice) {
240 error = no_nice; 240 error = no_nice;
241 goto out; 241 goto out;
242 } 242 }
243 if (error == -ESRCH) 243 if (error == -ESRCH)
244 error = 0; 244 error = 0;
245 set_user_nice(p, niceval); 245 set_user_nice(p, niceval);
246 out: 246 out:
247 return error; 247 return error;
248 } 248 }
249 249
250 asmlinkage long sys_setpriority(int which, int who, int niceval) 250 asmlinkage long sys_setpriority(int which, int who, int niceval)
251 { 251 {
252 struct task_struct *g, *p; 252 struct task_struct *g, *p;
253 struct user_struct *user; 253 struct user_struct *user;
254 int error = -EINVAL; 254 int error = -EINVAL;
255 255
256 if (which > 2 || which < 0) 256 if (which > 2 || which < 0)
257 goto out; 257 goto out;
258 258
259 /* normalize: avoid signed division (rounding problems) */ 259 /* normalize: avoid signed division (rounding problems) */
260 error = -ESRCH; 260 error = -ESRCH;
261 if (niceval < -20) 261 if (niceval < -20)
262 niceval = -20; 262 niceval = -20;
263 if (niceval > 19) 263 if (niceval > 19)
264 niceval = 19; 264 niceval = 19;
265 265
266 read_lock(&tasklist_lock); 266 read_lock(&tasklist_lock);
267 switch (which) { 267 switch (which) {
268 case PRIO_PROCESS: 268 case PRIO_PROCESS:
269 if (!who) 269 if (!who)
270 who = current->pid; 270 who = current->pid;
271 p = find_task_by_pid(who); 271 p = find_task_by_pid(who);
272 if (p) 272 if (p)
273 error = set_one_prio(p, niceval, error); 273 error = set_one_prio(p, niceval, error);
274 break; 274 break;
275 case PRIO_PGRP: 275 case PRIO_PGRP:
276 if (!who) 276 if (!who)
277 who = process_group(current); 277 who = process_group(current);
278 do_each_task_pid(who, PIDTYPE_PGID, p) { 278 do_each_task_pid(who, PIDTYPE_PGID, p) {
279 error = set_one_prio(p, niceval, error); 279 error = set_one_prio(p, niceval, error);
280 } while_each_task_pid(who, PIDTYPE_PGID, p); 280 } while_each_task_pid(who, PIDTYPE_PGID, p);
281 break; 281 break;
282 case PRIO_USER: 282 case PRIO_USER:
283 user = current->user; 283 user = current->user;
284 if (!who) 284 if (!who)
285 who = current->uid; 285 who = current->uid;
286 else 286 else
287 if ((who != current->uid) && !(user = find_user(who))) 287 if ((who != current->uid) && !(user = find_user(who)))
288 goto out_unlock; /* No processes for this user */ 288 goto out_unlock; /* No processes for this user */
289 289
290 do_each_thread(g, p) 290 do_each_thread(g, p)
291 if (p->uid == who) 291 if (p->uid == who)
292 error = set_one_prio(p, niceval, error); 292 error = set_one_prio(p, niceval, error);
293 while_each_thread(g, p); 293 while_each_thread(g, p);
294 if (who != current->uid) 294 if (who != current->uid)
295 free_uid(user); /* For find_user() */ 295 free_uid(user); /* For find_user() */
296 break; 296 break;
297 } 297 }
298 out_unlock: 298 out_unlock:
299 read_unlock(&tasklist_lock); 299 read_unlock(&tasklist_lock);
300 out: 300 out:
301 return error; 301 return error;
302 } 302 }
303 303
304 /* 304 /*
305 * Ugh. To avoid negative return values, "getpriority()" will 305 * Ugh. To avoid negative return values, "getpriority()" will
306 * not return the normal nice-value, but a negated value that 306 * not return the normal nice-value, but a negated value that
307 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 307 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
308 * to stay compatible. 308 * to stay compatible.
309 */ 309 */
310 asmlinkage long sys_getpriority(int which, int who) 310 asmlinkage long sys_getpriority(int which, int who)
311 { 311 {
312 struct task_struct *g, *p; 312 struct task_struct *g, *p;
313 struct user_struct *user; 313 struct user_struct *user;
314 long niceval, retval = -ESRCH; 314 long niceval, retval = -ESRCH;
315 315
316 if (which > 2 || which < 0) 316 if (which > 2 || which < 0)
317 return -EINVAL; 317 return -EINVAL;
318 318
319 read_lock(&tasklist_lock); 319 read_lock(&tasklist_lock);
320 switch (which) { 320 switch (which) {
321 case PRIO_PROCESS: 321 case PRIO_PROCESS:
322 if (!who) 322 if (!who)
323 who = current->pid; 323 who = current->pid;
324 p = find_task_by_pid(who); 324 p = find_task_by_pid(who);
325 if (p) { 325 if (p) {
326 niceval = 20 - task_nice(p); 326 niceval = 20 - task_nice(p);
327 if (niceval > retval) 327 if (niceval > retval)
328 retval = niceval; 328 retval = niceval;
329 } 329 }
330 break; 330 break;
331 case PRIO_PGRP: 331 case PRIO_PGRP:
332 if (!who) 332 if (!who)
333 who = process_group(current); 333 who = process_group(current);
334 do_each_task_pid(who, PIDTYPE_PGID, p) { 334 do_each_task_pid(who, PIDTYPE_PGID, p) {
335 niceval = 20 - task_nice(p); 335 niceval = 20 - task_nice(p);
336 if (niceval > retval) 336 if (niceval > retval)
337 retval = niceval; 337 retval = niceval;
338 } while_each_task_pid(who, PIDTYPE_PGID, p); 338 } while_each_task_pid(who, PIDTYPE_PGID, p);
339 break; 339 break;
340 case PRIO_USER: 340 case PRIO_USER:
341 user = current->user; 341 user = current->user;
342 if (!who) 342 if (!who)
343 who = current->uid; 343 who = current->uid;
344 else 344 else
345 if ((who != current->uid) && !(user = find_user(who))) 345 if ((who != current->uid) && !(user = find_user(who)))
346 goto out_unlock; /* No processes for this user */ 346 goto out_unlock; /* No processes for this user */
347 347
348 do_each_thread(g, p) 348 do_each_thread(g, p)
349 if (p->uid == who) { 349 if (p->uid == who) {
350 niceval = 20 - task_nice(p); 350 niceval = 20 - task_nice(p);
351 if (niceval > retval) 351 if (niceval > retval)
352 retval = niceval; 352 retval = niceval;
353 } 353 }
354 while_each_thread(g, p); 354 while_each_thread(g, p);
355 if (who != current->uid) 355 if (who != current->uid)
356 free_uid(user); /* for find_user() */ 356 free_uid(user); /* for find_user() */
357 break; 357 break;
358 } 358 }
359 out_unlock: 359 out_unlock:
360 read_unlock(&tasklist_lock); 360 read_unlock(&tasklist_lock);
361 361
362 return retval; 362 return retval;
363 } 363 }
364 364
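/*
 * Illustrative sketch (not part of this patch): the offset return
 * convention described above, as seen from user space.  The raw syscall
 * yields 20 - nice (so 40..1), and the C library is generally expected to
 * convert that back to the usual -20..19 range; error handling via errno
 * is omitted here.
 */
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>

static void show_priority_encoding(void)
{
        long raw = syscall(SYS_getpriority, PRIO_PROCESS, 0);  /* 40..1 */
        int nice_val = getpriority(PRIO_PROCESS, 0);           /* -20..19 */

        /* For nice -20, raw is 40; for nice 19, raw is 1.
         * In other words, nice_val == 20 - raw. */
        (void)raw;
        (void)nice_val;
}
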
365 /** 365 /**
366 * emergency_restart - reboot the system 366 * emergency_restart - reboot the system
367 * 367 *
368 * Without shutting down any hardware or taking any locks 368 * Without shutting down any hardware or taking any locks
369 * reboot the system. This is called when we know we are in 369 * reboot the system. This is called when we know we are in
370 * trouble so this is our best effort to reboot. This is 370 * trouble so this is our best effort to reboot. This is
371 * safe to call in interrupt context. 371 * safe to call in interrupt context.
372 */ 372 */
373 void emergency_restart(void) 373 void emergency_restart(void)
374 { 374 {
375 machine_emergency_restart(); 375 machine_emergency_restart();
376 } 376 }
377 EXPORT_SYMBOL_GPL(emergency_restart); 377 EXPORT_SYMBOL_GPL(emergency_restart);
378 378
379 void kernel_restart_prepare(char *cmd) 379 void kernel_restart_prepare(char *cmd)
380 { 380 {
381 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 381 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
382 system_state = SYSTEM_RESTART; 382 system_state = SYSTEM_RESTART;
383 device_shutdown(); 383 device_shutdown();
384 } 384 }
385 385
386 /** 386 /**
387 * kernel_restart - reboot the system 387 * kernel_restart - reboot the system
388 * @cmd: pointer to buffer containing command to execute for restart 388 * @cmd: pointer to buffer containing command to execute for restart
389 * or NULL 389 * or %NULL
390 * 390 *
391 * Shutdown everything and perform a clean reboot. 391 * Shutdown everything and perform a clean reboot.
392 * This is not safe to call in interrupt context. 392 * This is not safe to call in interrupt context.
393 */ 393 */
394 void kernel_restart(char *cmd) 394 void kernel_restart(char *cmd)
395 { 395 {
396 kernel_restart_prepare(cmd); 396 kernel_restart_prepare(cmd);
397 if (!cmd) { 397 if (!cmd) {
398 printk(KERN_EMERG "Restarting system.\n"); 398 printk(KERN_EMERG "Restarting system.\n");
399 } else { 399 } else {
400 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 400 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
401 } 401 }
402 printk(".\n"); 402 printk(".\n");
403 machine_restart(cmd); 403 machine_restart(cmd);
404 } 404 }
405 EXPORT_SYMBOL_GPL(kernel_restart); 405 EXPORT_SYMBOL_GPL(kernel_restart);
406 406
407 /** 407 /**
408 * kernel_kexec - reboot the system 408 * kernel_kexec - reboot the system
409 * 409 *
410 * Move into place and start executing a preloaded standalone 410 * Move into place and start executing a preloaded standalone
411 * executable. If nothing was preloaded return an error. 411 * executable. If nothing was preloaded return an error.
412 */ 412 */
413 void kernel_kexec(void) 413 void kernel_kexec(void)
414 { 414 {
415 #ifdef CONFIG_KEXEC 415 #ifdef CONFIG_KEXEC
416 struct kimage *image; 416 struct kimage *image;
417 image = xchg(&kexec_image, 0); 417 image = xchg(&kexec_image, 0);
418 if (!image) { 418 if (!image) {
419 return; 419 return;
420 } 420 }
421 kernel_restart_prepare(NULL); 421 kernel_restart_prepare(NULL);
422 printk(KERN_EMERG "Starting new kernel\n"); 422 printk(KERN_EMERG "Starting new kernel\n");
423 machine_shutdown(); 423 machine_shutdown();
424 machine_kexec(image); 424 machine_kexec(image);
425 #endif 425 #endif
426 } 426 }
427 EXPORT_SYMBOL_GPL(kernel_kexec); 427 EXPORT_SYMBOL_GPL(kernel_kexec);
428 428
429 /** 429 /**
430 * kernel_halt - halt the system 430 * kernel_halt - halt the system
431 * 431 *
432 * Shutdown everything and perform a clean system halt. 432 * Shutdown everything and perform a clean system halt.
433 */ 433 */
434 void kernel_halt_prepare(void) 434 void kernel_halt_prepare(void)
435 { 435 {
436 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 436 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
437 system_state = SYSTEM_HALT; 437 system_state = SYSTEM_HALT;
438 device_shutdown(); 438 device_shutdown();
439 } 439 }
440 void kernel_halt(void) 440 void kernel_halt(void)
441 { 441 {
442 kernel_halt_prepare(); 442 kernel_halt_prepare();
443 printk(KERN_EMERG "System halted.\n"); 443 printk(KERN_EMERG "System halted.\n");
444 machine_halt(); 444 machine_halt();
445 } 445 }
446 EXPORT_SYMBOL_GPL(kernel_halt); 446 EXPORT_SYMBOL_GPL(kernel_halt);
447 447
448 /** 448 /**
449 * kernel_power_off - power_off the system 449 * kernel_power_off - power_off the system
450 * 450 *
451 * Shutdown everything and perform a clean system power_off. 451 * Shutdown everything and perform a clean system power_off.
452 */ 452 */
453 void kernel_power_off_prepare(void) 453 void kernel_power_off_prepare(void)
454 { 454 {
455 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 455 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
456 system_state = SYSTEM_POWER_OFF; 456 system_state = SYSTEM_POWER_OFF;
457 device_shutdown(); 457 device_shutdown();
458 } 458 }
459 void kernel_power_off(void) 459 void kernel_power_off(void)
460 { 460 {
461 kernel_power_off_prepare(); 461 kernel_power_off_prepare();
462 printk(KERN_EMERG "Power down.\n"); 462 printk(KERN_EMERG "Power down.\n");
463 machine_power_off(); 463 machine_power_off();
464 } 464 }
465 EXPORT_SYMBOL_GPL(kernel_power_off); 465 EXPORT_SYMBOL_GPL(kernel_power_off);
466 466
467 /* 467 /*
468 * Reboot system call: for obvious reasons only root may call it, 468 * Reboot system call: for obvious reasons only root may call it,
469 * and even root needs to set up some magic numbers in the registers 469 * and even root needs to set up some magic numbers in the registers
470 * so that some mistake won't make this reboot the whole machine. 470 * so that some mistake won't make this reboot the whole machine.
471 * You can also set the meaning of the ctrl-alt-del-key here. 471 * You can also set the meaning of the ctrl-alt-del-key here.
472 * 472 *
473 * reboot doesn't sync: do that yourself before calling this. 473 * reboot doesn't sync: do that yourself before calling this.
474 */ 474 */
475 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) 475 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)
476 { 476 {
477 char buffer[256]; 477 char buffer[256];
478 478
479 /* We only trust the superuser with rebooting the system. */ 479 /* We only trust the superuser with rebooting the system. */
480 if (!capable(CAP_SYS_BOOT)) 480 if (!capable(CAP_SYS_BOOT))
481 return -EPERM; 481 return -EPERM;
482 482
483 /* For safety, we require "magic" arguments. */ 483 /* For safety, we require "magic" arguments. */
484 if (magic1 != LINUX_REBOOT_MAGIC1 || 484 if (magic1 != LINUX_REBOOT_MAGIC1 ||
485 (magic2 != LINUX_REBOOT_MAGIC2 && 485 (magic2 != LINUX_REBOOT_MAGIC2 &&
486 magic2 != LINUX_REBOOT_MAGIC2A && 486 magic2 != LINUX_REBOOT_MAGIC2A &&
487 magic2 != LINUX_REBOOT_MAGIC2B && 487 magic2 != LINUX_REBOOT_MAGIC2B &&
488 magic2 != LINUX_REBOOT_MAGIC2C)) 488 magic2 != LINUX_REBOOT_MAGIC2C))
489 return -EINVAL; 489 return -EINVAL;
490 490
491 lock_kernel(); 491 lock_kernel();
492 switch (cmd) { 492 switch (cmd) {
493 case LINUX_REBOOT_CMD_RESTART: 493 case LINUX_REBOOT_CMD_RESTART:
494 kernel_restart(NULL); 494 kernel_restart(NULL);
495 break; 495 break;
496 496
497 case LINUX_REBOOT_CMD_CAD_ON: 497 case LINUX_REBOOT_CMD_CAD_ON:
498 C_A_D = 1; 498 C_A_D = 1;
499 break; 499 break;
500 500
501 case LINUX_REBOOT_CMD_CAD_OFF: 501 case LINUX_REBOOT_CMD_CAD_OFF:
502 C_A_D = 0; 502 C_A_D = 0;
503 break; 503 break;
504 504
505 case LINUX_REBOOT_CMD_HALT: 505 case LINUX_REBOOT_CMD_HALT:
506 kernel_halt(); 506 kernel_halt();
507 unlock_kernel(); 507 unlock_kernel();
508 do_exit(0); 508 do_exit(0);
509 break; 509 break;
510 510
511 case LINUX_REBOOT_CMD_POWER_OFF: 511 case LINUX_REBOOT_CMD_POWER_OFF:
512 kernel_power_off(); 512 kernel_power_off();
513 unlock_kernel(); 513 unlock_kernel();
514 do_exit(0); 514 do_exit(0);
515 break; 515 break;
516 516
517 case LINUX_REBOOT_CMD_RESTART2: 517 case LINUX_REBOOT_CMD_RESTART2:
518 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 518 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
519 unlock_kernel(); 519 unlock_kernel();
520 return -EFAULT; 520 return -EFAULT;
521 } 521 }
522 buffer[sizeof(buffer) - 1] = '\0'; 522 buffer[sizeof(buffer) - 1] = '\0';
523 523
524 kernel_restart(buffer); 524 kernel_restart(buffer);
525 break; 525 break;
526 526
527 case LINUX_REBOOT_CMD_KEXEC: 527 case LINUX_REBOOT_CMD_KEXEC:
528 kernel_kexec(); 528 kernel_kexec();
529 unlock_kernel(); 529 unlock_kernel();
530 return -EINVAL; 530 return -EINVAL;
531 531
532 #ifdef CONFIG_SOFTWARE_SUSPEND 532 #ifdef CONFIG_SOFTWARE_SUSPEND
533 case LINUX_REBOOT_CMD_SW_SUSPEND: 533 case LINUX_REBOOT_CMD_SW_SUSPEND:
534 { 534 {
535 int ret = software_suspend(); 535 int ret = software_suspend();
536 unlock_kernel(); 536 unlock_kernel();
537 return ret; 537 return ret;
538 } 538 }
539 #endif 539 #endif
540 540
541 default: 541 default:
542 unlock_kernel(); 542 unlock_kernel();
543 return -EINVAL; 543 return -EINVAL;
544 } 544 }
545 unlock_kernel(); 545 unlock_kernel();
546 return 0; 546 return 0;
547 } 547 }
548 548
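/*
 * Illustrative sketch (not part of this patch): the magic-number contract
 * of sys_reboot() as seen from user space.  The caller needs CAP_SYS_BOOT
 * and, as the comment above notes, should sync filesystems first; the
 * LINUX_REBOOT_* constants come from <linux/reboot.h>.
 */
#include <linux/reboot.h>
#include <sys/syscall.h>
#include <unistd.h>

static int request_restart(const char *cmd)
{
        if (cmd)
                return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
                               LINUX_REBOOT_MAGIC2,
                               LINUX_REBOOT_CMD_RESTART2, cmd);
        return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                       LINUX_REBOOT_CMD_RESTART, NULL);
}
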
549 static void deferred_cad(void *dummy) 549 static void deferred_cad(void *dummy)
550 { 550 {
551 kernel_restart(NULL); 551 kernel_restart(NULL);
552 } 552 }
553 553
554 /* 554 /*
555 * This function gets called by ctrl-alt-del - ie the keyboard interrupt. 555 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
556 * As it's called within an interrupt, it may NOT sync: the only choice 556 * As it's called within an interrupt, it may NOT sync: the only choice
557 * is whether to reboot at once, or just ignore the ctrl-alt-del. 557 * is whether to reboot at once, or just ignore the ctrl-alt-del.
558 */ 558 */
559 void ctrl_alt_del(void) 559 void ctrl_alt_del(void)
560 { 560 {
561 static DECLARE_WORK(cad_work, deferred_cad, NULL); 561 static DECLARE_WORK(cad_work, deferred_cad, NULL);
562 562
563 if (C_A_D) 563 if (C_A_D)
564 schedule_work(&cad_work); 564 schedule_work(&cad_work);
565 else 565 else
566 kill_proc(cad_pid, SIGINT, 1); 566 kill_proc(cad_pid, SIGINT, 1);
567 } 567 }
568 568
569 569
570 /* 570 /*
571 * Unprivileged users may change the real gid to the effective gid 571 * Unprivileged users may change the real gid to the effective gid
572 * or vice versa. (BSD-style) 572 * or vice versa. (BSD-style)
573 * 573 *
574 * If you set the real gid at all, or set the effective gid to a value not 574 * If you set the real gid at all, or set the effective gid to a value not
575 * equal to the real gid, then the saved gid is set to the new effective gid. 575 * equal to the real gid, then the saved gid is set to the new effective gid.
576 * 576 *
577 * This makes it possible for a setgid program to completely drop its 577 * This makes it possible for a setgid program to completely drop its
578 * privileges, which is often a useful assertion to make when you are doing 578 * privileges, which is often a useful assertion to make when you are doing
579 * a security audit over a program. 579 * a security audit over a program.
580 * 580 *
581 * The general idea is that a program which uses just setregid() will be 581 * The general idea is that a program which uses just setregid() will be
582 * 100% compatible with BSD. A program which uses just setgid() will be 582 * 100% compatible with BSD. A program which uses just setgid() will be
583 * 100% compatible with POSIX with saved IDs. 583 * 100% compatible with POSIX with saved IDs.
584 * 584 *
585 * SMP: There are not races, the GIDs are checked only by filesystem 585 * SMP: There are not races, the GIDs are checked only by filesystem
586 * operations (as far as semantic preservation is concerned). 586 * operations (as far as semantic preservation is concerned).
587 */ 587 */
588 asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 588 asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
589 { 589 {
590 int old_rgid = current->gid; 590 int old_rgid = current->gid;
591 int old_egid = current->egid; 591 int old_egid = current->egid;
592 int new_rgid = old_rgid; 592 int new_rgid = old_rgid;
593 int new_egid = old_egid; 593 int new_egid = old_egid;
594 int retval; 594 int retval;
595 595
596 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); 596 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
597 if (retval) 597 if (retval)
598 return retval; 598 return retval;
599 599
600 if (rgid != (gid_t) -1) { 600 if (rgid != (gid_t) -1) {
601 if ((old_rgid == rgid) || 601 if ((old_rgid == rgid) ||
602 (current->egid==rgid) || 602 (current->egid==rgid) ||
603 capable(CAP_SETGID)) 603 capable(CAP_SETGID))
604 new_rgid = rgid; 604 new_rgid = rgid;
605 else 605 else
606 return -EPERM; 606 return -EPERM;
607 } 607 }
608 if (egid != (gid_t) -1) { 608 if (egid != (gid_t) -1) {
609 if ((old_rgid == egid) || 609 if ((old_rgid == egid) ||
610 (current->egid == egid) || 610 (current->egid == egid) ||
611 (current->sgid == egid) || 611 (current->sgid == egid) ||
612 capable(CAP_SETGID)) 612 capable(CAP_SETGID))
613 new_egid = egid; 613 new_egid = egid;
614 else { 614 else {
615 return -EPERM; 615 return -EPERM;
616 } 616 }
617 } 617 }
618 if (new_egid != old_egid) 618 if (new_egid != old_egid)
619 { 619 {
620 current->mm->dumpable = suid_dumpable; 620 current->mm->dumpable = suid_dumpable;
621 smp_wmb(); 621 smp_wmb();
622 } 622 }
623 if (rgid != (gid_t) -1 || 623 if (rgid != (gid_t) -1 ||
624 (egid != (gid_t) -1 && egid != old_rgid)) 624 (egid != (gid_t) -1 && egid != old_rgid))
625 current->sgid = new_egid; 625 current->sgid = new_egid;
626 current->fsgid = new_egid; 626 current->fsgid = new_egid;
627 current->egid = new_egid; 627 current->egid = new_egid;
628 current->gid = new_rgid; 628 current->gid = new_rgid;
629 key_fsgid_changed(current); 629 key_fsgid_changed(current);
630 proc_id_connector(current, PROC_EVENT_GID); 630 proc_id_connector(current, PROC_EVENT_GID);
631 return 0; 631 return 0;
632 } 632 }
633 633
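/*
 * Illustrative sketch (not part of this patch): a setgid program dropping
 * its group privileges completely, relying on the rule spelled out above
 * (setting the real gid makes the saved gid follow the new effective gid).
 */
#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>

static void drop_group_privileges(void)
{
        gid_t rgid = getgid();          /* the invoking user's real gid */

        /* real, effective and (by the rule above) saved gid all become rgid */
        if (setregid(rgid, rgid) == -1)
                abort();                /* refuse to run half-privileged */
}
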
634 /* 634 /*
635 * setgid() is implemented like SysV w/ SAVED_IDS 635 * setgid() is implemented like SysV w/ SAVED_IDS
636 * 636 *
637 * SMP: Same implicit races as above. 637 * SMP: Same implicit races as above.
638 */ 638 */
639 asmlinkage long sys_setgid(gid_t gid) 639 asmlinkage long sys_setgid(gid_t gid)
640 { 640 {
641 int old_egid = current->egid; 641 int old_egid = current->egid;
642 int retval; 642 int retval;
643 643
644 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); 644 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
645 if (retval) 645 if (retval)
646 return retval; 646 return retval;
647 647
648 if (capable(CAP_SETGID)) 648 if (capable(CAP_SETGID))
649 { 649 {
650 if(old_egid != gid) 650 if(old_egid != gid)
651 { 651 {
652 current->mm->dumpable = suid_dumpable; 652 current->mm->dumpable = suid_dumpable;
653 smp_wmb(); 653 smp_wmb();
654 } 654 }
655 current->gid = current->egid = current->sgid = current->fsgid = gid; 655 current->gid = current->egid = current->sgid = current->fsgid = gid;
656 } 656 }
657 else if ((gid == current->gid) || (gid == current->sgid)) 657 else if ((gid == current->gid) || (gid == current->sgid))
658 { 658 {
659 if(old_egid != gid) 659 if(old_egid != gid)
660 { 660 {
661 current->mm->dumpable = suid_dumpable; 661 current->mm->dumpable = suid_dumpable;
662 smp_wmb(); 662 smp_wmb();
663 } 663 }
664 current->egid = current->fsgid = gid; 664 current->egid = current->fsgid = gid;
665 } 665 }
666 else 666 else
667 return -EPERM; 667 return -EPERM;
668 668
669 key_fsgid_changed(current); 669 key_fsgid_changed(current);
670 proc_id_connector(current, PROC_EVENT_GID); 670 proc_id_connector(current, PROC_EVENT_GID);
671 return 0; 671 return 0;
672 } 672 }
673 673
674 static int set_user(uid_t new_ruid, int dumpclear) 674 static int set_user(uid_t new_ruid, int dumpclear)
675 { 675 {
676 struct user_struct *new_user; 676 struct user_struct *new_user;
677 677
678 new_user = alloc_uid(new_ruid); 678 new_user = alloc_uid(new_ruid);
679 if (!new_user) 679 if (!new_user)
680 return -EAGAIN; 680 return -EAGAIN;
681 681
682 if (atomic_read(&new_user->processes) >= 682 if (atomic_read(&new_user->processes) >=
683 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 683 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
684 new_user != &root_user) { 684 new_user != &root_user) {
685 free_uid(new_user); 685 free_uid(new_user);
686 return -EAGAIN; 686 return -EAGAIN;
687 } 687 }
688 688
689 switch_uid(new_user); 689 switch_uid(new_user);
690 690
691 if(dumpclear) 691 if(dumpclear)
692 { 692 {
693 current->mm->dumpable = suid_dumpable; 693 current->mm->dumpable = suid_dumpable;
694 smp_wmb(); 694 smp_wmb();
695 } 695 }
696 current->uid = new_ruid; 696 current->uid = new_ruid;
697 return 0; 697 return 0;
698 } 698 }
699 699
700 /* 700 /*
701 * Unprivileged users may change the real uid to the effective uid 701 * Unprivileged users may change the real uid to the effective uid
702 * or vice versa. (BSD-style) 702 * or vice versa. (BSD-style)
703 * 703 *
704 * If you set the real uid at all, or set the effective uid to a value not 704 * If you set the real uid at all, or set the effective uid to a value not
705 * equal to the real uid, then the saved uid is set to the new effective uid. 705 * equal to the real uid, then the saved uid is set to the new effective uid.
706 * 706 *
707 * This makes it possible for a setuid program to completely drop its 707 * This makes it possible for a setuid program to completely drop its
708 * privileges, which is often a useful assertion to make when you are doing 708 * privileges, which is often a useful assertion to make when you are doing
709 * a security audit over a program. 709 * a security audit over a program.
710 * 710 *
711 * The general idea is that a program which uses just setreuid() will be 711 * The general idea is that a program which uses just setreuid() will be
712 * 100% compatible with BSD. A program which uses just setuid() will be 712 * 100% compatible with BSD. A program which uses just setuid() will be
713 * 100% compatible with POSIX with saved IDs. 713 * 100% compatible with POSIX with saved IDs.
714 */ 714 */
715 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 715 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
716 { 716 {
717 int old_ruid, old_euid, old_suid, new_ruid, new_euid; 717 int old_ruid, old_euid, old_suid, new_ruid, new_euid;
718 int retval; 718 int retval;
719 719
720 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); 720 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
721 if (retval) 721 if (retval)
722 return retval; 722 return retval;
723 723
724 new_ruid = old_ruid = current->uid; 724 new_ruid = old_ruid = current->uid;
725 new_euid = old_euid = current->euid; 725 new_euid = old_euid = current->euid;
726 old_suid = current->suid; 726 old_suid = current->suid;
727 727
728 if (ruid != (uid_t) -1) { 728 if (ruid != (uid_t) -1) {
729 new_ruid = ruid; 729 new_ruid = ruid;
730 if ((old_ruid != ruid) && 730 if ((old_ruid != ruid) &&
731 (current->euid != ruid) && 731 (current->euid != ruid) &&
732 !capable(CAP_SETUID)) 732 !capable(CAP_SETUID))
733 return -EPERM; 733 return -EPERM;
734 } 734 }
735 735
736 if (euid != (uid_t) -1) { 736 if (euid != (uid_t) -1) {
737 new_euid = euid; 737 new_euid = euid;
738 if ((old_ruid != euid) && 738 if ((old_ruid != euid) &&
739 (current->euid != euid) && 739 (current->euid != euid) &&
740 (current->suid != euid) && 740 (current->suid != euid) &&
741 !capable(CAP_SETUID)) 741 !capable(CAP_SETUID))
742 return -EPERM; 742 return -EPERM;
743 } 743 }
744 744
745 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) 745 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
746 return -EAGAIN; 746 return -EAGAIN;
747 747
748 if (new_euid != old_euid) 748 if (new_euid != old_euid)
749 { 749 {
750 current->mm->dumpable = suid_dumpable; 750 current->mm->dumpable = suid_dumpable;
751 smp_wmb(); 751 smp_wmb();
752 } 752 }
753 current->fsuid = current->euid = new_euid; 753 current->fsuid = current->euid = new_euid;
754 if (ruid != (uid_t) -1 || 754 if (ruid != (uid_t) -1 ||
755 (euid != (uid_t) -1 && euid != old_ruid)) 755 (euid != (uid_t) -1 && euid != old_ruid))
756 current->suid = current->euid; 756 current->suid = current->euid;
757 current->fsuid = current->euid; 757 current->fsuid = current->euid;
758 758
759 key_fsuid_changed(current); 759 key_fsuid_changed(current);
760 proc_id_connector(current, PROC_EVENT_UID); 760 proc_id_connector(current, PROC_EVENT_UID);
761 761
762 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); 762 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
763 } 763 }
764 764
765 765
766 766
767 /* 767 /*
768 * setuid() is implemented like SysV with SAVED_IDS 768 * setuid() is implemented like SysV with SAVED_IDS
769 * 769 *
770 * Note that SAVED_ID's is deficient in that a setuid root program 770 * Note that SAVED_ID's is deficient in that a setuid root program
771 * like sendmail, for example, cannot set its uid to be a normal 771 * like sendmail, for example, cannot set its uid to be a normal
772 * user and then switch back, because if you're root, setuid() sets 772 * user and then switch back, because if you're root, setuid() sets
773 * the saved uid too. If you don't like this, blame the bright people 773 * the saved uid too. If you don't like this, blame the bright people
774 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 774 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
775 * will allow a root program to temporarily drop privileges and be able to 775 * will allow a root program to temporarily drop privileges and be able to
776 * regain them by swapping the real and effective uid. 776 * regain them by swapping the real and effective uid.
777 */ 777 */
778 asmlinkage long sys_setuid(uid_t uid) 778 asmlinkage long sys_setuid(uid_t uid)
779 { 779 {
780 int old_euid = current->euid; 780 int old_euid = current->euid;
781 int old_ruid, old_suid, new_ruid, new_suid; 781 int old_ruid, old_suid, new_ruid, new_suid;
782 int retval; 782 int retval;
783 783
784 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 784 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
785 if (retval) 785 if (retval)
786 return retval; 786 return retval;
787 787
788 old_ruid = new_ruid = current->uid; 788 old_ruid = new_ruid = current->uid;
789 old_suid = current->suid; 789 old_suid = current->suid;
790 new_suid = old_suid; 790 new_suid = old_suid;
791 791
792 if (capable(CAP_SETUID)) { 792 if (capable(CAP_SETUID)) {
793 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) 793 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
794 return -EAGAIN; 794 return -EAGAIN;
795 new_suid = uid; 795 new_suid = uid;
796 } else if ((uid != current->uid) && (uid != new_suid)) 796 } else if ((uid != current->uid) && (uid != new_suid))
797 return -EPERM; 797 return -EPERM;
798 798
799 if (old_euid != uid) 799 if (old_euid != uid)
800 { 800 {
801 current->mm->dumpable = suid_dumpable; 801 current->mm->dumpable = suid_dumpable;
802 smp_wmb(); 802 smp_wmb();
803 } 803 }
804 current->fsuid = current->euid = uid; 804 current->fsuid = current->euid = uid;
805 current->suid = new_suid; 805 current->suid = new_suid;
806 806
807 key_fsuid_changed(current); 807 key_fsuid_changed(current);
808 proc_id_connector(current, PROC_EVENT_UID); 808 proc_id_connector(current, PROC_EVENT_UID);
809 809
810 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); 810 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
811 } 811 }
812 812
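/*
 * Illustrative sketch (not part of this patch): the difference the comment
 * above describes.  A setuid-root program can park its privileges with a
 * BSD-style setreuid() swap and take them back later, whereas setuid()
 * would also overwrite the saved uid and make the drop permanent.
 */
#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>

static void temporarily_drop_root(void)
{
        uid_t ruid = getuid();          /* the invoking user */
        uid_t euid = geteuid();         /* 0 for a setuid-root binary */

        if (setreuid(euid, ruid) == -1) /* park root in the real uid */
                abort();

        /* ... do work with the invoking user's effective uid ... */

        if (setreuid(ruid, euid) == -1) /* swap back, regaining root */
                abort();
}
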
813 813
814 /* 814 /*
815 * This function implements a generic ability to update ruid, euid, 815 * This function implements a generic ability to update ruid, euid,
816 * and suid. This allows you to implement the 4.4 compatible seteuid(). 816 * and suid. This allows you to implement the 4.4 compatible seteuid().
817 */ 817 */
818 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 818 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
819 { 819 {
820 int old_ruid = current->uid; 820 int old_ruid = current->uid;
821 int old_euid = current->euid; 821 int old_euid = current->euid;
822 int old_suid = current->suid; 822 int old_suid = current->suid;
823 int retval; 823 int retval;
824 824
825 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); 825 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
826 if (retval) 826 if (retval)
827 return retval; 827 return retval;
828 828
829 if (!capable(CAP_SETUID)) { 829 if (!capable(CAP_SETUID)) {
830 if ((ruid != (uid_t) -1) && (ruid != current->uid) && 830 if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
831 (ruid != current->euid) && (ruid != current->suid)) 831 (ruid != current->euid) && (ruid != current->suid))
832 return -EPERM; 832 return -EPERM;
833 if ((euid != (uid_t) -1) && (euid != current->uid) && 833 if ((euid != (uid_t) -1) && (euid != current->uid) &&
834 (euid != current->euid) && (euid != current->suid)) 834 (euid != current->euid) && (euid != current->suid))
835 return -EPERM; 835 return -EPERM;
836 if ((suid != (uid_t) -1) && (suid != current->uid) && 836 if ((suid != (uid_t) -1) && (suid != current->uid) &&
837 (suid != current->euid) && (suid != current->suid)) 837 (suid != current->euid) && (suid != current->suid))
838 return -EPERM; 838 return -EPERM;
839 } 839 }
840 if (ruid != (uid_t) -1) { 840 if (ruid != (uid_t) -1) {
841 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) 841 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
842 return -EAGAIN; 842 return -EAGAIN;
843 } 843 }
844 if (euid != (uid_t) -1) { 844 if (euid != (uid_t) -1) {
845 if (euid != current->euid) 845 if (euid != current->euid)
846 { 846 {
847 current->mm->dumpable = suid_dumpable; 847 current->mm->dumpable = suid_dumpable;
848 smp_wmb(); 848 smp_wmb();
849 } 849 }
850 current->euid = euid; 850 current->euid = euid;
851 } 851 }
852 current->fsuid = current->euid; 852 current->fsuid = current->euid;
853 if (suid != (uid_t) -1) 853 if (suid != (uid_t) -1)
854 current->suid = suid; 854 current->suid = suid;
855 855
856 key_fsuid_changed(current); 856 key_fsuid_changed(current);
857 proc_id_connector(current, PROC_EVENT_UID); 857 proc_id_connector(current, PROC_EVENT_UID);
858 858
859 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); 859 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
860 } 860 }
861 861
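/*
 * Illustrative sketch (not part of this patch): the "4.4 compatible
 * seteuid()" mentioned above, expressed via setresuid() by passing -1 for
 * the ids that should stay untouched.  glibc exposes setresuid() when
 * _GNU_SOURCE is defined.
 */
#define _GNU_SOURCE
#include <sys/types.h>
#include <unistd.h>

static int my_seteuid(uid_t euid)
{
        /* only the effective uid changes; real and saved uids are left alone */
        return setresuid((uid_t)-1, euid, (uid_t)-1);
}
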
862 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 862 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
863 { 863 {
864 int retval; 864 int retval;
865 865
866 if (!(retval = put_user(current->uid, ruid)) && 866 if (!(retval = put_user(current->uid, ruid)) &&
867 !(retval = put_user(current->euid, euid))) 867 !(retval = put_user(current->euid, euid)))
868 retval = put_user(current->suid, suid); 868 retval = put_user(current->suid, suid);
869 869
870 return retval; 870 return retval;
871 } 871 }
872 872
873 /* 873 /*
874 * Same as above, but for rgid, egid, sgid. 874 * Same as above, but for rgid, egid, sgid.
875 */ 875 */
876 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 876 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
877 { 877 {
878 int retval; 878 int retval;
879 879
880 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); 880 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
881 if (retval) 881 if (retval)
882 return retval; 882 return retval;
883 883
884 if (!capable(CAP_SETGID)) { 884 if (!capable(CAP_SETGID)) {
885 if ((rgid != (gid_t) -1) && (rgid != current->gid) && 885 if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
886 (rgid != current->egid) && (rgid != current->sgid)) 886 (rgid != current->egid) && (rgid != current->sgid))
887 return -EPERM; 887 return -EPERM;
888 if ((egid != (gid_t) -1) && (egid != current->gid) && 888 if ((egid != (gid_t) -1) && (egid != current->gid) &&
889 (egid != current->egid) && (egid != current->sgid)) 889 (egid != current->egid) && (egid != current->sgid))
890 return -EPERM; 890 return -EPERM;
891 if ((sgid != (gid_t) -1) && (sgid != current->gid) && 891 if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
892 (sgid != current->egid) && (sgid != current->sgid)) 892 (sgid != current->egid) && (sgid != current->sgid))
893 return -EPERM; 893 return -EPERM;
894 } 894 }
895 if (egid != (gid_t) -1) { 895 if (egid != (gid_t) -1) {
896 if (egid != current->egid) 896 if (egid != current->egid)
897 { 897 {
898 current->mm->dumpable = suid_dumpable; 898 current->mm->dumpable = suid_dumpable;
899 smp_wmb(); 899 smp_wmb();
900 } 900 }
901 current->egid = egid; 901 current->egid = egid;
902 } 902 }
903 current->fsgid = current->egid; 903 current->fsgid = current->egid;
904 if (rgid != (gid_t) -1) 904 if (rgid != (gid_t) -1)
905 current->gid = rgid; 905 current->gid = rgid;
906 if (sgid != (gid_t) -1) 906 if (sgid != (gid_t) -1)
907 current->sgid = sgid; 907 current->sgid = sgid;
908 908
909 key_fsgid_changed(current); 909 key_fsgid_changed(current);
910 proc_id_connector(current, PROC_EVENT_GID); 910 proc_id_connector(current, PROC_EVENT_GID);
911 return 0; 911 return 0;
912 } 912 }
913 913
914 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 914 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
915 { 915 {
916 int retval; 916 int retval;
917 917
918 if (!(retval = put_user(current->gid, rgid)) && 918 if (!(retval = put_user(current->gid, rgid)) &&
919 !(retval = put_user(current->egid, egid))) 919 !(retval = put_user(current->egid, egid)))
920 retval = put_user(current->sgid, sgid); 920 retval = put_user(current->sgid, sgid);
921 921
922 return retval; 922 return retval;
923 } 923 }
924 924
925 925
926 /* 926 /*
927 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This 927 * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This
928 * is used for "access()" and for the NFS daemon (letting nfsd stay at 928 * is used for "access()" and for the NFS daemon (letting nfsd stay at
929 * whatever uid it wants to). It normally shadows "euid", except when 929 * whatever uid it wants to). It normally shadows "euid", except when
930 * explicitly set by setfsuid() or for access.. 930 * explicitly set by setfsuid() or for access..
931 */ 931 */
932 asmlinkage long sys_setfsuid(uid_t uid) 932 asmlinkage long sys_setfsuid(uid_t uid)
933 { 933 {
934 int old_fsuid; 934 int old_fsuid;
935 935
936 old_fsuid = current->fsuid; 936 old_fsuid = current->fsuid;
937 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) 937 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
938 return old_fsuid; 938 return old_fsuid;
939 939
940 if (uid == current->uid || uid == current->euid || 940 if (uid == current->uid || uid == current->euid ||
941 uid == current->suid || uid == current->fsuid || 941 uid == current->suid || uid == current->fsuid ||
942 capable(CAP_SETUID)) 942 capable(CAP_SETUID))
943 { 943 {
944 if (uid != old_fsuid) 944 if (uid != old_fsuid)
945 { 945 {
946 current->mm->dumpable = suid_dumpable; 946 current->mm->dumpable = suid_dumpable;
947 smp_wmb(); 947 smp_wmb();
948 } 948 }
949 current->fsuid = uid; 949 current->fsuid = uid;
950 } 950 }
951 951
952 key_fsuid_changed(current); 952 key_fsuid_changed(current);
953 proc_id_connector(current, PROC_EVENT_UID); 953 proc_id_connector(current, PROC_EVENT_UID);
954 954
955 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); 955 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
956 956
957 return old_fsuid; 957 return old_fsuid;
958 } 958 }
959 959
960 /* 960 /*
961 * Samma på svenska.. 961 * Samma på svenska..
962 */ 962 */
963 asmlinkage long sys_setfsgid(gid_t gid) 963 asmlinkage long sys_setfsgid(gid_t gid)
964 { 964 {
965 int old_fsgid; 965 int old_fsgid;
966 966
967 old_fsgid = current->fsgid; 967 old_fsgid = current->fsgid;
968 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) 968 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
969 return old_fsgid; 969 return old_fsgid;
970 970
971 if (gid == current->gid || gid == current->egid || 971 if (gid == current->gid || gid == current->egid ||
972 gid == current->sgid || gid == current->fsgid || 972 gid == current->sgid || gid == current->fsgid ||
973 capable(CAP_SETGID)) 973 capable(CAP_SETGID))
974 { 974 {
975 if (gid != old_fsgid) 975 if (gid != old_fsgid)
976 { 976 {
977 current->mm->dumpable = suid_dumpable; 977 current->mm->dumpable = suid_dumpable;
978 smp_wmb(); 978 smp_wmb();
979 } 979 }
980 current->fsgid = gid; 980 current->fsgid = gid;
981 key_fsgid_changed(current); 981 key_fsgid_changed(current);
982 proc_id_connector(current, PROC_EVENT_GID); 982 proc_id_connector(current, PROC_EVENT_GID);
983 } 983 }
984 return old_fsgid; 984 return old_fsgid;
985 } 985 }
986 986
987 asmlinkage long sys_times(struct tms __user * tbuf) 987 asmlinkage long sys_times(struct tms __user * tbuf)
988 { 988 {
989 /* 989 /*
990 * In the SMP world we might just be unlucky and have one of 990 * In the SMP world we might just be unlucky and have one of
991 * the times increment as we use it. Since the value is an 991 * the times increment as we use it. Since the value is an
992 * atomically safe type this is just fine. Conceptually its 992 * atomically safe type this is just fine. Conceptually its
993 * as if the syscall took an instant longer to occur. 993 * as if the syscall took an instant longer to occur.
994 */ 994 */
995 if (tbuf) { 995 if (tbuf) {
996 struct tms tmp; 996 struct tms tmp;
997 cputime_t utime, stime, cutime, cstime; 997 cputime_t utime, stime, cutime, cstime;
998 998
999 #ifdef CONFIG_SMP 999 #ifdef CONFIG_SMP
1000 if (thread_group_empty(current)) { 1000 if (thread_group_empty(current)) {
1001 /* 1001 /*
1002 * Single thread case without the use of any locks. 1002 * Single thread case without the use of any locks.
1003 * 1003 *
1004 * We may race with release_task if two threads are 1004 * We may race with release_task if two threads are
1005 * executing. However, release task first adds up the 1005 * executing. However, release task first adds up the
1006 * counters (__exit_signal) before removing the task 1006 * counters (__exit_signal) before removing the task
1007 * from the process tasklist (__unhash_process). 1007 * from the process tasklist (__unhash_process).
1008 * __exit_signal also acquires and releases the 1008 * __exit_signal also acquires and releases the
1009 * siglock which results in the proper memory ordering 1009 * siglock which results in the proper memory ordering
1010 * so that the list modifications are always visible 1010 * so that the list modifications are always visible
1011 * after the counters have been updated. 1011 * after the counters have been updated.
1012 * 1012 *
1013 * If the counters have been updated by the second thread 1013 * If the counters have been updated by the second thread
1014 * but the thread has not yet been removed from the list 1014 * but the thread has not yet been removed from the list
1015 * then the other branch will be executing which will 1015 * then the other branch will be executing which will
1016 * block on tasklist_lock until the exit handling of the 1016 * block on tasklist_lock until the exit handling of the
1017 * other task is finished. 1017 * other task is finished.
1018 * 1018 *
1019 * This also implies that the sighand->siglock cannot 1019 * This also implies that the sighand->siglock cannot
1020 * be held by another processor. So we can also 1020 * be held by another processor. So we can also
1021 * skip acquiring that lock. 1021 * skip acquiring that lock.
1022 */ 1022 */
1023 utime = cputime_add(current->signal->utime, current->utime); 1023 utime = cputime_add(current->signal->utime, current->utime);
1024 stime = cputime_add(current->signal->utime, current->stime); 1024 stime = cputime_add(current->signal->utime, current->stime);
1025 cutime = current->signal->cutime; 1025 cutime = current->signal->cutime;
1026 cstime = current->signal->cstime; 1026 cstime = current->signal->cstime;
1027 } else 1027 } else
1028 #endif 1028 #endif
1029 { 1029 {
1030 1030
1031 /* Process with multiple threads */ 1031 /* Process with multiple threads */
1032 struct task_struct *tsk = current; 1032 struct task_struct *tsk = current;
1033 struct task_struct *t; 1033 struct task_struct *t;
1034 1034
1035 read_lock(&tasklist_lock); 1035 read_lock(&tasklist_lock);
1036 utime = tsk->signal->utime; 1036 utime = tsk->signal->utime;
1037 stime = tsk->signal->stime; 1037 stime = tsk->signal->stime;
1038 t = tsk; 1038 t = tsk;
1039 do { 1039 do {
1040 utime = cputime_add(utime, t->utime); 1040 utime = cputime_add(utime, t->utime);
1041 stime = cputime_add(stime, t->stime); 1041 stime = cputime_add(stime, t->stime);
1042 t = next_thread(t); 1042 t = next_thread(t);
1043 } while (t != tsk); 1043 } while (t != tsk);
1044 1044
1045 /* 1045 /*
1046 * While we have tasklist_lock read-locked, no dying thread 1046 * While we have tasklist_lock read-locked, no dying thread
1047 * can be updating current->signal->[us]time. Instead, 1047 * can be updating current->signal->[us]time. Instead,
1048 * we got their counts included in the live thread loop. 1048 * we got their counts included in the live thread loop.
1049 * However, another thread can come in right now and 1049 * However, another thread can come in right now and
1050 * do a wait call that updates current->signal->c[us]time. 1050 * do a wait call that updates current->signal->c[us]time.
1051 * To make sure we always see that pair updated atomically, 1051 * To make sure we always see that pair updated atomically,
1052 * we take the siglock around fetching them. 1052 * we take the siglock around fetching them.
1053 */ 1053 */
1054 spin_lock_irq(&tsk->sighand->siglock); 1054 spin_lock_irq(&tsk->sighand->siglock);
1055 cutime = tsk->signal->cutime; 1055 cutime = tsk->signal->cutime;
1056 cstime = tsk->signal->cstime; 1056 cstime = tsk->signal->cstime;
1057 spin_unlock_irq(&tsk->sighand->siglock); 1057 spin_unlock_irq(&tsk->sighand->siglock);
1058 read_unlock(&tasklist_lock); 1058 read_unlock(&tasklist_lock);
1059 } 1059 }
1060 tmp.tms_utime = cputime_to_clock_t(utime); 1060 tmp.tms_utime = cputime_to_clock_t(utime);
1061 tmp.tms_stime = cputime_to_clock_t(stime); 1061 tmp.tms_stime = cputime_to_clock_t(stime);
1062 tmp.tms_cutime = cputime_to_clock_t(cutime); 1062 tmp.tms_cutime = cputime_to_clock_t(cutime);
1063 tmp.tms_cstime = cputime_to_clock_t(cstime); 1063 tmp.tms_cstime = cputime_to_clock_t(cstime);
1064 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 1064 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
1065 return -EFAULT; 1065 return -EFAULT;
1066 } 1066 }
1067 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 1067 return (long) jiffies_64_to_clock_t(get_jiffies_64());
1068 } 1068 }
1069 1069
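/*
 * Illustrative sketch (not part of this patch): consuming sys_times() from
 * user space.  The return value and the struct tms fields are in clock
 * ticks (elapsed ticks since an arbitrary point, boot in this
 * implementation), so they are scaled by sysconf(_SC_CLK_TCK).
 */
#include <sys/times.h>
#include <stdio.h>
#include <unistd.h>

static void show_cpu_times(void)
{
        struct tms t;
        long ticks_per_sec = sysconf(_SC_CLK_TCK);
        clock_t since_boot = times(&t);

        printf("user %.2fs system %.2fs\n",
               (double)t.tms_utime / ticks_per_sec,
               (double)t.tms_stime / ticks_per_sec);
        (void)since_boot;
}
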
1070 /* 1070 /*
1071 * This needs some heavy checking ... 1071 * This needs some heavy checking ...
1072 * I just haven't the stomach for it. I also don't fully 1072 * I just haven't the stomach for it. I also don't fully
1073 * understand sessions/pgrp etc. Let somebody who does explain it. 1073 * understand sessions/pgrp etc. Let somebody who does explain it.
1074 * 1074 *
1075 * OK, I think I have the protection semantics right.... this is really 1075 * OK, I think I have the protection semantics right.... this is really
1076 * only important on a multi-user system anyway, to make sure one user 1076 * only important on a multi-user system anyway, to make sure one user
1077 * can't send a signal to a process owned by another. -TYT, 12/12/91 1077 * can't send a signal to a process owned by another. -TYT, 12/12/91
1078 * 1078 *
1079 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 1079 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
1080 * LBT 04.03.94 1080 * LBT 04.03.94
1081 */ 1081 */
1082 1082
1083 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 1083 asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1084 { 1084 {
1085 struct task_struct *p; 1085 struct task_struct *p;
1086 int err = -EINVAL; 1086 int err = -EINVAL;
1087 1087
1088 if (!pid) 1088 if (!pid)
1089 pid = current->pid; 1089 pid = current->pid;
1090 if (!pgid) 1090 if (!pgid)
1091 pgid = pid; 1091 pgid = pid;
1092 if (pgid < 0) 1092 if (pgid < 0)
1093 return -EINVAL; 1093 return -EINVAL;
1094 1094
1095 /* From this point forward we keep holding onto the tasklist lock 1095 /* From this point forward we keep holding onto the tasklist lock
1096 * so that our parent does not change from under us. -DaveM 1096 * so that our parent does not change from under us. -DaveM
1097 */ 1097 */
1098 write_lock_irq(&tasklist_lock); 1098 write_lock_irq(&tasklist_lock);
1099 1099
1100 err = -ESRCH; 1100 err = -ESRCH;
1101 p = find_task_by_pid(pid); 1101 p = find_task_by_pid(pid);
1102 if (!p) 1102 if (!p)
1103 goto out; 1103 goto out;
1104 1104
1105 err = -EINVAL; 1105 err = -EINVAL;
1106 if (!thread_group_leader(p)) 1106 if (!thread_group_leader(p))
1107 goto out; 1107 goto out;
1108 1108
1109 if (p->parent == current || p->real_parent == current) { 1109 if (p->parent == current || p->real_parent == current) {
1110 err = -EPERM; 1110 err = -EPERM;
1111 if (p->signal->session != current->signal->session) 1111 if (p->signal->session != current->signal->session)
1112 goto out; 1112 goto out;
1113 err = -EACCES; 1113 err = -EACCES;
1114 if (p->did_exec) 1114 if (p->did_exec)
1115 goto out; 1115 goto out;
1116 } else { 1116 } else {
1117 err = -ESRCH; 1117 err = -ESRCH;
1118 if (p != current) 1118 if (p != current)
1119 goto out; 1119 goto out;
1120 } 1120 }
1121 1121
1122 err = -EPERM; 1122 err = -EPERM;
1123 if (p->signal->leader) 1123 if (p->signal->leader)
1124 goto out; 1124 goto out;
1125 1125
1126 if (pgid != pid) { 1126 if (pgid != pid) {
1127 struct task_struct *p; 1127 struct task_struct *p;
1128 1128
1129 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1129 do_each_task_pid(pgid, PIDTYPE_PGID, p) {
1130 if (p->signal->session == current->signal->session) 1130 if (p->signal->session == current->signal->session)
1131 goto ok_pgid; 1131 goto ok_pgid;
1132 } while_each_task_pid(pgid, PIDTYPE_PGID, p); 1132 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1133 goto out; 1133 goto out;
1134 } 1134 }
1135 1135
1136 ok_pgid: 1136 ok_pgid:
1137 err = security_task_setpgid(p, pgid); 1137 err = security_task_setpgid(p, pgid);
1138 if (err) 1138 if (err)
1139 goto out; 1139 goto out;
1140 1140
1141 if (process_group(p) != pgid) { 1141 if (process_group(p) != pgid) {
1142 detach_pid(p, PIDTYPE_PGID); 1142 detach_pid(p, PIDTYPE_PGID);
1143 p->signal->pgrp = pgid; 1143 p->signal->pgrp = pgid;
1144 attach_pid(p, PIDTYPE_PGID, pgid); 1144 attach_pid(p, PIDTYPE_PGID, pgid);
1145 } 1145 }
1146 1146
1147 err = 0; 1147 err = 0;
1148 out: 1148 out:
1149 /* All paths lead to here, thus we are safe. -DaveM */ 1149 /* All paths lead to here, thus we are safe. -DaveM */
1150 write_unlock_irq(&tasklist_lock); 1150 write_unlock_irq(&tasklist_lock);
1151 return err; 1151 return err;
1152 } 1152 }
1153 1153
1154 asmlinkage long sys_getpgid(pid_t pid) 1154 asmlinkage long sys_getpgid(pid_t pid)
1155 { 1155 {
1156 if (!pid) { 1156 if (!pid) {
1157 return process_group(current); 1157 return process_group(current);
1158 } else { 1158 } else {
1159 int retval; 1159 int retval;
1160 struct task_struct *p; 1160 struct task_struct *p;
1161 1161
1162 read_lock(&tasklist_lock); 1162 read_lock(&tasklist_lock);
1163 p = find_task_by_pid(pid); 1163 p = find_task_by_pid(pid);
1164 1164
1165 retval = -ESRCH; 1165 retval = -ESRCH;
1166 if (p) { 1166 if (p) {
1167 retval = security_task_getpgid(p); 1167 retval = security_task_getpgid(p);
1168 if (!retval) 1168 if (!retval)
1169 retval = process_group(p); 1169 retval = process_group(p);
1170 } 1170 }
1171 read_unlock(&tasklist_lock); 1171 read_unlock(&tasklist_lock);
1172 return retval; 1172 return retval;
1173 } 1173 }
1174 } 1174 }
1175 1175
1176 #ifdef __ARCH_WANT_SYS_GETPGRP 1176 #ifdef __ARCH_WANT_SYS_GETPGRP
1177 1177
1178 asmlinkage long sys_getpgrp(void) 1178 asmlinkage long sys_getpgrp(void)
1179 { 1179 {
1180 /* SMP - assuming writes are word atomic this is fine */ 1180 /* SMP - assuming writes are word atomic this is fine */
1181 return process_group(current); 1181 return process_group(current);
1182 } 1182 }
1183 1183
1184 #endif 1184 #endif
1185 1185
1186 asmlinkage long sys_getsid(pid_t pid) 1186 asmlinkage long sys_getsid(pid_t pid)
1187 { 1187 {
1188 if (!pid) { 1188 if (!pid) {
1189 return current->signal->session; 1189 return current->signal->session;
1190 } else { 1190 } else {
1191 int retval; 1191 int retval;
1192 struct task_struct *p; 1192 struct task_struct *p;
1193 1193
1194 read_lock(&tasklist_lock); 1194 read_lock(&tasklist_lock);
1195 p = find_task_by_pid(pid); 1195 p = find_task_by_pid(pid);
1196 1196
1197 retval = -ESRCH; 1197 retval = -ESRCH;
1198 if(p) { 1198 if(p) {
1199 retval = security_task_getsid(p); 1199 retval = security_task_getsid(p);
1200 if (!retval) 1200 if (!retval)
1201 retval = p->signal->session; 1201 retval = p->signal->session;
1202 } 1202 }
1203 read_unlock(&tasklist_lock); 1203 read_unlock(&tasklist_lock);
1204 return retval; 1204 return retval;
1205 } 1205 }
1206 } 1206 }
1207 1207
1208 asmlinkage long sys_setsid(void) 1208 asmlinkage long sys_setsid(void)
1209 { 1209 {
1210 struct pid *pid; 1210 struct pid *pid;
1211 int err = -EPERM; 1211 int err = -EPERM;
1212 1212
1213 if (!thread_group_leader(current)) 1213 if (!thread_group_leader(current))
1214 return -EINVAL; 1214 return -EINVAL;
1215 1215
1216 down(&tty_sem); 1216 down(&tty_sem);
1217 write_lock_irq(&tasklist_lock); 1217 write_lock_irq(&tasklist_lock);
1218 1218
1219 pid = find_pid(PIDTYPE_PGID, current->pid); 1219 pid = find_pid(PIDTYPE_PGID, current->pid);
1220 if (pid) 1220 if (pid)
1221 goto out; 1221 goto out;
1222 1222
1223 current->signal->leader = 1; 1223 current->signal->leader = 1;
1224 __set_special_pids(current->pid, current->pid); 1224 __set_special_pids(current->pid, current->pid);
1225 current->signal->tty = NULL; 1225 current->signal->tty = NULL;
1226 current->signal->tty_old_pgrp = 0; 1226 current->signal->tty_old_pgrp = 0;
1227 err = process_group(current); 1227 err = process_group(current);
1228 out: 1228 out:
1229 write_unlock_irq(&tasklist_lock); 1229 write_unlock_irq(&tasklist_lock);
1230 up(&tty_sem); 1230 up(&tty_sem);
1231 return err; 1231 return err;
1232 } 1232 }
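
An illustrative user-space sketch (not part of this patch) of the classic use of sys_setsid(): fork first so the child is not a process-group leader, then setsid() makes it session and group leader with no controlling terminal — which is exactly the case the find_pid(PIDTYPE_PGID, ...) check above rejects for a caller that already leads a group.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid < 0) {
                perror("fork");
                exit(1);
        }
        if (pid > 0)            /* parent exits; the child is not a group leader */
                exit(0);

        if (setsid() < 0) {     /* child starts a new session, detaches from the tty */
                perror("setsid");
                exit(1);
        }
        printf("new session: sid=%d pgid=%d\n", (int)getsid(0), (int)getpgid(0));
        return 0;
}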
1233 1233
1234 /* 1234 /*
1235 * Supplementary group IDs 1235 * Supplementary group IDs
1236 */ 1236 */
1237 1237
1238 /* init to 2 - one for init_task, one to ensure it is never freed */ 1238 /* init to 2 - one for init_task, one to ensure it is never freed */
1239 struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; 1239 struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1240 1240
1241 struct group_info *groups_alloc(int gidsetsize) 1241 struct group_info *groups_alloc(int gidsetsize)
1242 { 1242 {
1243 struct group_info *group_info; 1243 struct group_info *group_info;
1244 int nblocks; 1244 int nblocks;
1245 int i; 1245 int i;
1246 1246
1247 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; 1247 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1248 /* Make sure we always allocate at least one indirect block pointer */ 1248 /* Make sure we always allocate at least one indirect block pointer */
1249 nblocks = nblocks ? : 1; 1249 nblocks = nblocks ? : 1;
1250 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); 1250 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1251 if (!group_info) 1251 if (!group_info)
1252 return NULL; 1252 return NULL;
1253 group_info->ngroups = gidsetsize; 1253 group_info->ngroups = gidsetsize;
1254 group_info->nblocks = nblocks; 1254 group_info->nblocks = nblocks;
1255 atomic_set(&group_info->usage, 1); 1255 atomic_set(&group_info->usage, 1);
1256 1256
1257 if (gidsetsize <= NGROUPS_SMALL) { 1257 if (gidsetsize <= NGROUPS_SMALL) {
1258 group_info->blocks[0] = group_info->small_block; 1258 group_info->blocks[0] = group_info->small_block;
1259 } else { 1259 } else {
1260 for (i = 0; i < nblocks; i++) { 1260 for (i = 0; i < nblocks; i++) {
1261 gid_t *b; 1261 gid_t *b;
1262 b = (void *)__get_free_page(GFP_USER); 1262 b = (void *)__get_free_page(GFP_USER);
1263 if (!b) 1263 if (!b)
1264 goto out_undo_partial_alloc; 1264 goto out_undo_partial_alloc;
1265 group_info->blocks[i] = b; 1265 group_info->blocks[i] = b;
1266 } 1266 }
1267 } 1267 }
1268 return group_info; 1268 return group_info;
1269 1269
1270 out_undo_partial_alloc: 1270 out_undo_partial_alloc:
1271 while (--i >= 0) { 1271 while (--i >= 0) {
1272 free_page((unsigned long)group_info->blocks[i]); 1272 free_page((unsigned long)group_info->blocks[i]);
1273 } 1273 }
1274 kfree(group_info); 1274 kfree(group_info);
1275 return NULL; 1275 return NULL;
1276 } 1276 }
1277 1277
1278 EXPORT_SYMBOL(groups_alloc); 1278 EXPORT_SYMBOL(groups_alloc);
1279 1279
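The sizing logic in groups_alloc() is a plain ceiling division plus the "small set stays inline" special case. A stand-alone sketch of the same arithmetic, with assumed constants (NGROUPS_SMALL is typically 32; NGROUPS_PER_BLOCK is PAGE_SIZE/sizeof(gid_t), e.g. 1024 with 4 KB pages and 32-bit gid_t):

#include <stdio.h>

#define NGROUPS_SMALL      32    /* assumed: sets this small live in small_block */
#define NGROUPS_PER_BLOCK  1024  /* assumed: one page worth of gid_t entries */

int main(void)
{
        int sizes[] = { 0, 1, 32, 33, 1024, 1025 };
        int i;

        for (i = 0; i < 6; i++) {
                int gidsetsize = sizes[i];
                int nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;

                nblocks = nblocks ? nblocks : 1;  /* mirrors the kernel's "nblocks ? : 1" */
                printf("%5d groups -> %d block(s), %s storage\n",
                       gidsetsize, nblocks,
                       gidsetsize <= NGROUPS_SMALL ? "inline" : "per-page");
        }
        return 0;
}
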
1280 void groups_free(struct group_info *group_info) 1280 void groups_free(struct group_info *group_info)
1281 { 1281 {
1282 if (group_info->blocks[0] != group_info->small_block) { 1282 if (group_info->blocks[0] != group_info->small_block) {
1283 int i; 1283 int i;
1284 for (i = 0; i < group_info->nblocks; i++) 1284 for (i = 0; i < group_info->nblocks; i++)
1285 free_page((unsigned long)group_info->blocks[i]); 1285 free_page((unsigned long)group_info->blocks[i]);
1286 } 1286 }
1287 kfree(group_info); 1287 kfree(group_info);
1288 } 1288 }
1289 1289
1290 EXPORT_SYMBOL(groups_free); 1290 EXPORT_SYMBOL(groups_free);
1291 1291
1292 /* export the group_info to a user-space array */ 1292 /* export the group_info to a user-space array */
1293 static int groups_to_user(gid_t __user *grouplist, 1293 static int groups_to_user(gid_t __user *grouplist,
1294 struct group_info *group_info) 1294 struct group_info *group_info)
1295 { 1295 {
1296 int i; 1296 int i;
1297 int count = group_info->ngroups; 1297 int count = group_info->ngroups;
1298 1298
1299 for (i = 0; i < group_info->nblocks; i++) { 1299 for (i = 0; i < group_info->nblocks; i++) {
1300 int cp_count = min(NGROUPS_PER_BLOCK, count); 1300 int cp_count = min(NGROUPS_PER_BLOCK, count);
1301 int off = i * NGROUPS_PER_BLOCK; 1301 int off = i * NGROUPS_PER_BLOCK;
1302 int len = cp_count * sizeof(*grouplist); 1302 int len = cp_count * sizeof(*grouplist);
1303 1303
1304 if (copy_to_user(grouplist+off, group_info->blocks[i], len)) 1304 if (copy_to_user(grouplist+off, group_info->blocks[i], len))
1305 return -EFAULT; 1305 return -EFAULT;
1306 1306
1307 count -= cp_count; 1307 count -= cp_count;
1308 } 1308 }
1309 return 0; 1309 return 0;
1310 } 1310 }
1311 1311
1312 /* fill a group_info from a user-space array - it must be allocated already */ 1312 /* fill a group_info from a user-space array - it must be allocated already */
1313 static int groups_from_user(struct group_info *group_info, 1313 static int groups_from_user(struct group_info *group_info,
1314 gid_t __user *grouplist) 1314 gid_t __user *grouplist)
1315 { 1315 {
1316 int i; 1316 int i;
1317 int count = group_info->ngroups; 1317 int count = group_info->ngroups;
1318 1318
1319 for (i = 0; i < group_info->nblocks; i++) { 1319 for (i = 0; i < group_info->nblocks; i++) {
1320 int cp_count = min(NGROUPS_PER_BLOCK, count); 1320 int cp_count = min(NGROUPS_PER_BLOCK, count);
1321 int off = i * NGROUPS_PER_BLOCK; 1321 int off = i * NGROUPS_PER_BLOCK;
1322 int len = cp_count * sizeof(*grouplist); 1322 int len = cp_count * sizeof(*grouplist);
1323 1323
1324 if (copy_from_user(group_info->blocks[i], grouplist+off, len)) 1324 if (copy_from_user(group_info->blocks[i], grouplist+off, len))
1325 return -EFAULT; 1325 return -EFAULT;
1326 1326
1327 count -= cp_count; 1327 count -= cp_count;
1328 } 1328 }
1329 return 0; 1329 return 0;
1330 } 1330 }
1331 1331
1332 /* a simple Shell sort */ 1332 /* a simple Shell sort */
1333 static void groups_sort(struct group_info *group_info) 1333 static void groups_sort(struct group_info *group_info)
1334 { 1334 {
1335 int base, max, stride; 1335 int base, max, stride;
1336 int gidsetsize = group_info->ngroups; 1336 int gidsetsize = group_info->ngroups;
1337 1337
1338 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) 1338 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1339 ; /* nothing */ 1339 ; /* nothing */
1340 stride /= 3; 1340 stride /= 3;
1341 1341
1342 while (stride) { 1342 while (stride) {
1343 max = gidsetsize - stride; 1343 max = gidsetsize - stride;
1344 for (base = 0; base < max; base++) { 1344 for (base = 0; base < max; base++) {
1345 int left = base; 1345 int left = base;
1346 int right = left + stride; 1346 int right = left + stride;
1347 gid_t tmp = GROUP_AT(group_info, right); 1347 gid_t tmp = GROUP_AT(group_info, right);
1348 1348
1349 while (left >= 0 && GROUP_AT(group_info, left) > tmp) { 1349 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1350 GROUP_AT(group_info, right) = 1350 GROUP_AT(group_info, right) =
1351 GROUP_AT(group_info, left); 1351 GROUP_AT(group_info, left);
1352 right = left; 1352 right = left;
1353 left -= stride; 1353 left -= stride;
1354 } 1354 }
1355 GROUP_AT(group_info, right) = tmp; 1355 GROUP_AT(group_info, right) = tmp;
1356 } 1356 }
1357 stride /= 3; 1357 stride /= 3;
1358 } 1358 }
1359 } 1359 }
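
For illustration only (not part of this patch), the same Shell sort written as a stand-alone user-space function over a plain gid_t array; the gap sequence 1, 4, 13, 40, ... is the 3*stride + 1 progression used above, and the final stride == 1 pass is ordinary insertion sort.

#include <stdio.h>
#include <sys/types.h>

static void shell_sort_gids(gid_t *g, int n)
{
        int base, max, stride;

        for (stride = 1; stride < n; stride = 3 * stride + 1)
                ;       /* nothing: find the largest useful gap */
        stride /= 3;

        while (stride) {
                max = n - stride;
                for (base = 0; base < max; base++) {
                        int left = base;
                        int right = left + stride;
                        gid_t tmp = g[right];

                        while (left >= 0 && g[left] > tmp) {
                                g[right] = g[left];     /* shift larger entry up */
                                right = left;
                                left -= stride;
                        }
                        g[right] = tmp;                 /* drop the saved value in */
                }
                stride /= 3;
        }
}

int main(void)
{
        gid_t g[] = { 100, 4, 27, 4, 0, 65534 };
        int i, n = sizeof(g) / sizeof(g[0]);

        shell_sort_gids(g, n);
        for (i = 0; i < n; i++)
                printf("%u ", (unsigned)g[i]);
        printf("\n");
        return 0;
}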
1360 1360
1361 /* a simple bsearch */ 1361 /* a simple bsearch */
1362 int groups_search(struct group_info *group_info, gid_t grp) 1362 int groups_search(struct group_info *group_info, gid_t grp)
1363 { 1363 {
1364 int left, right; 1364 int left, right;
1365 1365
1366 if (!group_info) 1366 if (!group_info)
1367 return 0; 1367 return 0;
1368 1368
1369 left = 0; 1369 left = 0;
1370 right = group_info->ngroups; 1370 right = group_info->ngroups;
1371 while (left < right) { 1371 while (left < right) {
1372 int mid = (left+right)/2; 1372 int mid = (left+right)/2;
1373 int cmp = grp - GROUP_AT(group_info, mid); 1373 int cmp = grp - GROUP_AT(group_info, mid);
1374 if (cmp > 0) 1374 if (cmp > 0)
1375 left = mid + 1; 1375 left = mid + 1;
1376 else if (cmp < 0) 1376 else if (cmp < 0)
1377 right = mid; 1377 right = mid;
1378 else 1378 else
1379 return 1; 1379 return 1;
1380 } 1380 }
1381 return 0; 1381 return 0;
1382 } 1382 }
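
A quick user-space check of the same binary search (illustrative only; it assumes the array is already sorted, which set_current_groups() below guarantees by calling groups_sort() first). The sketch uses explicit comparisons instead of the signed subtraction above.

#include <stdio.h>
#include <sys/types.h>

/* return 1 if grp occurs in the sorted array g[0..n-1], else 0 */
static int gid_search(const gid_t *g, int n, gid_t grp)
{
        int left = 0, right = n;

        while (left < right) {
                int mid = (left + right) / 2;

                if (grp > g[mid])
                        left = mid + 1;
                else if (grp < g[mid])
                        right = mid;
                else
                        return 1;
        }
        return 0;
}

int main(void)
{
        gid_t g[] = { 0, 4, 20, 24, 27, 100 };  /* already sorted */
        int n = sizeof(g) / sizeof(g[0]);

        printf("20 -> %d, 21 -> %d\n", gid_search(g, n, 20), gid_search(g, n, 21));
        return 0;
}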
1383 1383
1384 /* validate and set current->group_info */ 1384 /* validate and set current->group_info */
1385 int set_current_groups(struct group_info *group_info) 1385 int set_current_groups(struct group_info *group_info)
1386 { 1386 {
1387 int retval; 1387 int retval;
1388 struct group_info *old_info; 1388 struct group_info *old_info;
1389 1389
1390 retval = security_task_setgroups(group_info); 1390 retval = security_task_setgroups(group_info);
1391 if (retval) 1391 if (retval)
1392 return retval; 1392 return retval;
1393 1393
1394 groups_sort(group_info); 1394 groups_sort(group_info);
1395 get_group_info(group_info); 1395 get_group_info(group_info);
1396 1396
1397 task_lock(current); 1397 task_lock(current);
1398 old_info = current->group_info; 1398 old_info = current->group_info;
1399 current->group_info = group_info; 1399 current->group_info = group_info;
1400 task_unlock(current); 1400 task_unlock(current);
1401 1401
1402 put_group_info(old_info); 1402 put_group_info(old_info);
1403 1403
1404 return 0; 1404 return 0;
1405 } 1405 }
1406 1406
1407 EXPORT_SYMBOL(set_current_groups); 1407 EXPORT_SYMBOL(set_current_groups);
1408 1408
1409 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1409 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1410 { 1410 {
1411 int i = 0; 1411 int i = 0;
1412 1412
1413 /* 1413 /*
1414 * SMP: Nobody else can change our grouplist. Thus we are 1414 * SMP: Nobody else can change our grouplist. Thus we are
1415 * safe. 1415 * safe.
1416 */ 1416 */
1417 1417
1418 if (gidsetsize < 0) 1418 if (gidsetsize < 0)
1419 return -EINVAL; 1419 return -EINVAL;
1420 1420
1421 /* no need to grab task_lock here; it cannot change */ 1421 /* no need to grab task_lock here; it cannot change */
1422 get_group_info(current->group_info); 1422 get_group_info(current->group_info);
1423 i = current->group_info->ngroups; 1423 i = current->group_info->ngroups;
1424 if (gidsetsize) { 1424 if (gidsetsize) {
1425 if (i > gidsetsize) { 1425 if (i > gidsetsize) {
1426 i = -EINVAL; 1426 i = -EINVAL;
1427 goto out; 1427 goto out;
1428 } 1428 }
1429 if (groups_to_user(grouplist, current->group_info)) { 1429 if (groups_to_user(grouplist, current->group_info)) {
1430 i = -EFAULT; 1430 i = -EFAULT;
1431 goto out; 1431 goto out;
1432 } 1432 }
1433 } 1433 }
1434 out: 1434 out:
1435 put_group_info(current->group_info); 1435 put_group_info(current->group_info);
1436 return i; 1436 return i;
1437 } 1437 }
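
The gidsetsize == 0 case above is what makes the usual two-call user-space pattern work: ask for the count first, then fetch the list into a buffer of that size (a minimal sketch, not part of this patch).

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        int i, n = getgroups(0, NULL);          /* size 0: just return the count */
        gid_t *list;

        if (n < 0) {
                perror("getgroups");
                return 1;
        }
        list = malloc(n * sizeof(gid_t));
        if (!list)
                return 1;
        if (getgroups(n, list) < 0) {           /* a too-small buffer gives EINVAL */
                perror("getgroups");
                free(list);
                return 1;
        }
        for (i = 0; i < n; i++)
                printf("%u ", (unsigned)list[i]);
        printf("\n");
        free(list);
        return 0;
}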
1438 1438
1439 /* 1439 /*
1440 * SMP: Our groups are copy-on-write. We can set them safely 1440 * SMP: Our groups are copy-on-write. We can set them safely
1441 * without another task interfering. 1441 * without another task interfering.
1442 */ 1442 */
1443 1443
1444 asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) 1444 asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1445 { 1445 {
1446 struct group_info *group_info; 1446 struct group_info *group_info;
1447 int retval; 1447 int retval;
1448 1448
1449 if (!capable(CAP_SETGID)) 1449 if (!capable(CAP_SETGID))
1450 return -EPERM; 1450 return -EPERM;
1451 if ((unsigned)gidsetsize > NGROUPS_MAX) 1451 if ((unsigned)gidsetsize > NGROUPS_MAX)
1452 return -EINVAL; 1452 return -EINVAL;
1453 1453
1454 group_info = groups_alloc(gidsetsize); 1454 group_info = groups_alloc(gidsetsize);
1455 if (!group_info) 1455 if (!group_info)
1456 return -ENOMEM; 1456 return -ENOMEM;
1457 retval = groups_from_user(group_info, grouplist); 1457 retval = groups_from_user(group_info, grouplist);
1458 if (retval) { 1458 if (retval) {
1459 put_group_info(group_info); 1459 put_group_info(group_info);
1460 return retval; 1460 return retval;
1461 } 1461 }
1462 1462
1463 retval = set_current_groups(group_info); 1463 retval = set_current_groups(group_info);
1464 put_group_info(group_info); 1464 put_group_info(group_info);
1465 1465
1466 return retval; 1466 return retval;
1467 } 1467 }
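
Because sys_setgroups() requires CAP_SETGID, it only works while a process is still privileged; the conventional user-space order when dropping root is to clear the supplementary groups before giving up the gid and then the uid. A hedged sketch (not part of this patch; the 65534 nobody/nogroup ids are just example values):

#include <stdio.h>
#include <unistd.h>
#include <grp.h>
#include <sys/types.h>

/* drop from root to the given uid/gid; the order matters */
static int drop_privileges(uid_t uid, gid_t gid)
{
        if (setgroups(0, NULL) < 0)     /* 1. clear supplementary groups */
                return -1;
        if (setgid(gid) < 0)            /* 2. give up the group id */
                return -1;
        if (setuid(uid) < 0)            /* 3. finally give up the user id */
                return -1;
        return 0;
}

int main(void)
{
        if (drop_privileges(65534, 65534) < 0) {
                perror("drop_privileges");
                return 1;
        }
        printf("now uid=%d gid=%d\n", (int)getuid(), (int)getgid());
        return 0;
}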
1468 1468
1469 /* 1469 /*
1470 * Check whether we're fsgid/egid or in the supplemental group.. 1470 * Check whether we're fsgid/egid or in the supplemental group..
1471 */ 1471 */
1472 int in_group_p(gid_t grp) 1472 int in_group_p(gid_t grp)
1473 { 1473 {
1474 int retval = 1; 1474 int retval = 1;
1475 if (grp != current->fsgid) { 1475 if (grp != current->fsgid) {
1476 get_group_info(current->group_info); 1476 get_group_info(current->group_info);
1477 retval = groups_search(current->group_info, grp); 1477 retval = groups_search(current->group_info, grp);
1478 put_group_info(current->group_info); 1478 put_group_info(current->group_info);
1479 } 1479 }
1480 return retval; 1480 return retval;
1481 } 1481 }
1482 1482
1483 EXPORT_SYMBOL(in_group_p); 1483 EXPORT_SYMBOL(in_group_p);
1484 1484
1485 int in_egroup_p(gid_t grp) 1485 int in_egroup_p(gid_t grp)
1486 { 1486 {
1487 int retval = 1; 1487 int retval = 1;
1488 if (grp != current->egid) { 1488 if (grp != current->egid) {
1489 get_group_info(current->group_info); 1489 get_group_info(current->group_info);
1490 retval = groups_search(current->group_info, grp); 1490 retval = groups_search(current->group_info, grp);
1491 put_group_info(current->group_info); 1491 put_group_info(current->group_info);
1492 } 1492 }
1493 return retval; 1493 return retval;
1494 } 1494 }
1495 1495
1496 EXPORT_SYMBOL(in_egroup_p); 1496 EXPORT_SYMBOL(in_egroup_p);
1497 1497
1498 DECLARE_RWSEM(uts_sem); 1498 DECLARE_RWSEM(uts_sem);
1499 1499
1500 EXPORT_SYMBOL(uts_sem); 1500 EXPORT_SYMBOL(uts_sem);
1501 1501
1502 asmlinkage long sys_newuname(struct new_utsname __user * name) 1502 asmlinkage long sys_newuname(struct new_utsname __user * name)
1503 { 1503 {
1504 int errno = 0; 1504 int errno = 0;
1505 1505
1506 down_read(&uts_sem); 1506 down_read(&uts_sem);
1507 if (copy_to_user(name,&system_utsname,sizeof *name)) 1507 if (copy_to_user(name,&system_utsname,sizeof *name))
1508 errno = -EFAULT; 1508 errno = -EFAULT;
1509 up_read(&uts_sem); 1509 up_read(&uts_sem);
1510 return errno; 1510 return errno;
1511 } 1511 }
1512 1512
1513 asmlinkage long sys_sethostname(char __user *name, int len) 1513 asmlinkage long sys_sethostname(char __user *name, int len)
1514 { 1514 {
1515 int errno; 1515 int errno;
1516 char tmp[__NEW_UTS_LEN]; 1516 char tmp[__NEW_UTS_LEN];
1517 1517
1518 if (!capable(CAP_SYS_ADMIN)) 1518 if (!capable(CAP_SYS_ADMIN))
1519 return -EPERM; 1519 return -EPERM;
1520 if (len < 0 || len > __NEW_UTS_LEN) 1520 if (len < 0 || len > __NEW_UTS_LEN)
1521 return -EINVAL; 1521 return -EINVAL;
1522 down_write(&uts_sem); 1522 down_write(&uts_sem);
1523 errno = -EFAULT; 1523 errno = -EFAULT;
1524 if (!copy_from_user(tmp, name, len)) { 1524 if (!copy_from_user(tmp, name, len)) {
1525 memcpy(system_utsname.nodename, tmp, len); 1525 memcpy(system_utsname.nodename, tmp, len);
1526 system_utsname.nodename[len] = 0; 1526 system_utsname.nodename[len] = 0;
1527 errno = 0; 1527 errno = 0;
1528 } 1528 }
1529 up_write(&uts_sem); 1529 up_write(&uts_sem);
1530 return errno; 1530 return errno;
1531 } 1531 }
1532 1532
1533 #ifdef __ARCH_WANT_SYS_GETHOSTNAME 1533 #ifdef __ARCH_WANT_SYS_GETHOSTNAME
1534 1534
1535 asmlinkage long sys_gethostname(char __user *name, int len) 1535 asmlinkage long sys_gethostname(char __user *name, int len)
1536 { 1536 {
1537 int i, errno; 1537 int i, errno;
1538 1538
1539 if (len < 0) 1539 if (len < 0)
1540 return -EINVAL; 1540 return -EINVAL;
1541 down_read(&uts_sem); 1541 down_read(&uts_sem);
1542 i = 1 + strlen(system_utsname.nodename); 1542 i = 1 + strlen(system_utsname.nodename);
1543 if (i > len) 1543 if (i > len)
1544 i = len; 1544 i = len;
1545 errno = 0; 1545 errno = 0;
1546 if (copy_to_user(name, system_utsname.nodename, i)) 1546 if (copy_to_user(name, system_utsname.nodename, i))
1547 errno = -EFAULT; 1547 errno = -EFAULT;
1548 up_read(&uts_sem); 1548 up_read(&uts_sem);
1549 return errno; 1549 return errno;
1550 } 1550 }
1551 1551
1552 #endif 1552 #endif
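
The user-space view of the pair above (illustrative only): reading the nodename is unprivileged, while changing it needs CAP_SYS_ADMIN, matching the capable() check in sys_sethostname().

#define _GNU_SOURCE             /* for the sethostname() prototype */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        char name[256];

        if (gethostname(name, sizeof(name)) < 0) {
                perror("gethostname");
                return 1;
        }
        printf("current hostname: %s\n", name);

        /* needs CAP_SYS_ADMIN; expected to fail as an ordinary user */
        if (sethostname("example-host", strlen("example-host")) < 0)
                perror("sethostname");
        return 0;
}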
1553 1553
1554 /* 1554 /*
1555 * Only setdomainname; getdomainname can be implemented by calling 1555 * Only setdomainname; getdomainname can be implemented by calling
1556 * uname() 1556 * uname()
1557 */ 1557 */
1558 asmlinkage long sys_setdomainname(char __user *name, int len) 1558 asmlinkage long sys_setdomainname(char __user *name, int len)
1559 { 1559 {
1560 int errno; 1560 int errno;
1561 char tmp[__NEW_UTS_LEN]; 1561 char tmp[__NEW_UTS_LEN];
1562 1562
1563 if (!capable(CAP_SYS_ADMIN)) 1563 if (!capable(CAP_SYS_ADMIN))
1564 return -EPERM; 1564 return -EPERM;
1565 if (len < 0 || len > __NEW_UTS_LEN) 1565 if (len < 0 || len > __NEW_UTS_LEN)
1566 return -EINVAL; 1566 return -EINVAL;
1567 1567
1568 down_write(&uts_sem); 1568 down_write(&uts_sem);
1569 errno = -EFAULT; 1569 errno = -EFAULT;
1570 if (!copy_from_user(tmp, name, len)) { 1570 if (!copy_from_user(tmp, name, len)) {
1571 memcpy(system_utsname.domainname, tmp, len); 1571 memcpy(system_utsname.domainname, tmp, len);
1572 system_utsname.domainname[len] = 0; 1572 system_utsname.domainname[len] = 0;
1573 errno = 0; 1573 errno = 0;
1574 } 1574 }
1575 up_write(&uts_sem); 1575 up_write(&uts_sem);
1576 return errno; 1576 return errno;
1577 } 1577 }
1578 1578
1579 asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) 1579 asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
1580 { 1580 {
1581 if (resource >= RLIM_NLIMITS) 1581 if (resource >= RLIM_NLIMITS)
1582 return -EINVAL; 1582 return -EINVAL;
1583 else { 1583 else {
1584 struct rlimit value; 1584 struct rlimit value;
1585 task_lock(current->group_leader); 1585 task_lock(current->group_leader);
1586 value = current->signal->rlim[resource]; 1586 value = current->signal->rlim[resource];
1587 task_unlock(current->group_leader); 1587 task_unlock(current->group_leader);
1588 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; 1588 return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
1589 } 1589 }
1590 } 1590 }
1591 1591
1592 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT 1592 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
1593 1593
1594 /* 1594 /*
1595 * Back compatibility for getrlimit. Needed for some apps. 1595 * Back compatibility for getrlimit. Needed for some apps.
1596 */ 1596 */
1597 1597
1598 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) 1598 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim)
1599 { 1599 {
1600 struct rlimit x; 1600 struct rlimit x;
1601 if (resource >= RLIM_NLIMITS) 1601 if (resource >= RLIM_NLIMITS)
1602 return -EINVAL; 1602 return -EINVAL;
1603 1603
1604 task_lock(current->group_leader); 1604 task_lock(current->group_leader);
1605 x = current->signal->rlim[resource]; 1605 x = current->signal->rlim[resource];
1606 task_unlock(current->group_leader); 1606 task_unlock(current->group_leader);
1607 if(x.rlim_cur > 0x7FFFFFFF) 1607 if(x.rlim_cur > 0x7FFFFFFF)
1608 x.rlim_cur = 0x7FFFFFFF; 1608 x.rlim_cur = 0x7FFFFFFF;
1609 if(x.rlim_max > 0x7FFFFFFF) 1609 if(x.rlim_max > 0x7FFFFFFF)
1610 x.rlim_max = 0x7FFFFFFF; 1610 x.rlim_max = 0x7FFFFFFF;
1611 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1611 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
1612 } 1612 }
1613 1613
1614 #endif 1614 #endif
1615 1615
1616 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1616 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1617 { 1617 {
1618 struct rlimit new_rlim, *old_rlim; 1618 struct rlimit new_rlim, *old_rlim;
1619 int retval; 1619 int retval;
1620 1620
1621 if (resource >= RLIM_NLIMITS) 1621 if (resource >= RLIM_NLIMITS)
1622 return -EINVAL; 1622 return -EINVAL;
1623 if(copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1623 if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1624 return -EFAULT; 1624 return -EFAULT;
1625 if (new_rlim.rlim_cur > new_rlim.rlim_max) 1625 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1626 return -EINVAL; 1626 return -EINVAL;
1627 old_rlim = current->signal->rlim + resource; 1627 old_rlim = current->signal->rlim + resource;
1628 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1628 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1629 !capable(CAP_SYS_RESOURCE)) 1629 !capable(CAP_SYS_RESOURCE))
1630 return -EPERM; 1630 return -EPERM;
1631 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) 1631 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
1632 return -EPERM; 1632 return -EPERM;
1633 1633
1634 retval = security_task_setrlimit(resource, &new_rlim); 1634 retval = security_task_setrlimit(resource, &new_rlim);
1635 if (retval) 1635 if (retval)
1636 return retval; 1636 return retval;
1637 1637
1638 task_lock(current->group_leader); 1638 task_lock(current->group_leader);
1639 *old_rlim = new_rlim; 1639 *old_rlim = new_rlim;
1640 task_unlock(current->group_leader); 1640 task_unlock(current->group_leader);
1641 1641
1642 if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY && 1642 if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY &&
1643 (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 1643 (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
1644 new_rlim.rlim_cur <= cputime_to_secs( 1644 new_rlim.rlim_cur <= cputime_to_secs(
1645 current->signal->it_prof_expires))) { 1645 current->signal->it_prof_expires))) {
1646 cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); 1646 cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
1647 read_lock(&tasklist_lock); 1647 read_lock(&tasklist_lock);
1648 spin_lock_irq(&current->sighand->siglock); 1648 spin_lock_irq(&current->sighand->siglock);
1649 set_process_cpu_timer(current, CPUCLOCK_PROF, 1649 set_process_cpu_timer(current, CPUCLOCK_PROF,
1650 &cputime, NULL); 1650 &cputime, NULL);
1651 spin_unlock_irq(&current->sighand->siglock); 1651 spin_unlock_irq(&current->sighand->siglock);
1652 read_unlock(&tasklist_lock); 1652 read_unlock(&tasklist_lock);
1653 } 1653 }
1654 1654
1655 return 0; 1655 return 0;
1656 } 1656 }
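
A common user-space pattern against these two calls (a minimal sketch, not part of this patch): raise the soft RLIMIT_NOFILE value up to the hard limit, which sys_setrlimit() allows without CAP_SYS_RESOURCE because rlim_max is not being increased.

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_NOFILE, &rl) < 0) {
                perror("getrlimit");
                return 1;
        }
        printf("open files: soft=%lu hard=%lu\n",
               (unsigned long)rl.rlim_cur, (unsigned long)rl.rlim_max);

        rl.rlim_cur = rl.rlim_max;      /* soft limit may rise to the hard limit unprivileged */
        if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
                perror("setrlimit");
                return 1;
        }
        return 0;
}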
1657 1657
1658 /* 1658 /*
1659 * It would make sense to put struct rusage in the task_struct, 1659 * It would make sense to put struct rusage in the task_struct,
1660 * except that would make the task_struct be *really big*. After 1660 * except that would make the task_struct be *really big*. After
1661 * task_struct gets moved into malloc'ed memory, it would 1661 * task_struct gets moved into malloc'ed memory, it would
1662 * make sense to do this. It will make moving the rest of the information 1662 * make sense to do this. It will make moving the rest of the information
1663 * a lot simpler! (Which we're not doing right now because we're not 1663 * a lot simpler! (Which we're not doing right now because we're not
1664 * measuring them yet). 1664 * measuring them yet).
1665 * 1665 *
1666 * This expects to be called with tasklist_lock read-locked or better, 1666 * This expects to be called with tasklist_lock read-locked or better,
1667 * and the siglock not locked. It may momentarily take the siglock. 1667 * and the siglock not locked. It may momentarily take the siglock.
1668 * 1668 *
1669 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have 1669 * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
1670 * races with threads incrementing their own counters. But since word 1670 * races with threads incrementing their own counters. But since word
1671 * reads are atomic, we either get new values or old values and we don't 1671 * reads are atomic, we either get new values or old values and we don't
1672 * care which for the sums. We always take the siglock to protect reading 1672 * care which for the sums. We always take the siglock to protect reading
1673 * the c* fields from p->signal from races with exit.c updating those 1673 * the c* fields from p->signal from races with exit.c updating those
1674 * fields when reaping, so a sample either gets all the additions of a 1674 * fields when reaping, so a sample either gets all the additions of a
1675 * given child after it's reaped, or none so this sample is before reaping. 1675 * given child after it's reaped, or none so this sample is before reaping.
1676 */ 1676 */
1677 1677
1678 static void k_getrusage(struct task_struct *p, int who, struct rusage *r) 1678 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1679 { 1679 {
1680 struct task_struct *t; 1680 struct task_struct *t;
1681 unsigned long flags; 1681 unsigned long flags;
1682 cputime_t utime, stime; 1682 cputime_t utime, stime;
1683 1683
1684 memset((char *) r, 0, sizeof *r); 1684 memset((char *) r, 0, sizeof *r);
1685 1685
1686 if (unlikely(!p->signal)) 1686 if (unlikely(!p->signal))
1687 return; 1687 return;
1688 1688
1689 switch (who) { 1689 switch (who) {
1690 case RUSAGE_CHILDREN: 1690 case RUSAGE_CHILDREN:
1691 spin_lock_irqsave(&p->sighand->siglock, flags); 1691 spin_lock_irqsave(&p->sighand->siglock, flags);
1692 utime = p->signal->cutime; 1692 utime = p->signal->cutime;
1693 stime = p->signal->cstime; 1693 stime = p->signal->cstime;
1694 r->ru_nvcsw = p->signal->cnvcsw; 1694 r->ru_nvcsw = p->signal->cnvcsw;
1695 r->ru_nivcsw = p->signal->cnivcsw; 1695 r->ru_nivcsw = p->signal->cnivcsw;
1696 r->ru_minflt = p->signal->cmin_flt; 1696 r->ru_minflt = p->signal->cmin_flt;
1697 r->ru_majflt = p->signal->cmaj_flt; 1697 r->ru_majflt = p->signal->cmaj_flt;
1698 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1698 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1699 cputime_to_timeval(utime, &r->ru_utime); 1699 cputime_to_timeval(utime, &r->ru_utime);
1700 cputime_to_timeval(stime, &r->ru_stime); 1700 cputime_to_timeval(stime, &r->ru_stime);
1701 break; 1701 break;
1702 case RUSAGE_SELF: 1702 case RUSAGE_SELF:
1703 spin_lock_irqsave(&p->sighand->siglock, flags); 1703 spin_lock_irqsave(&p->sighand->siglock, flags);
1704 utime = stime = cputime_zero; 1704 utime = stime = cputime_zero;
1705 goto sum_group; 1705 goto sum_group;
1706 case RUSAGE_BOTH: 1706 case RUSAGE_BOTH:
1707 spin_lock_irqsave(&p->sighand->siglock, flags); 1707 spin_lock_irqsave(&p->sighand->siglock, flags);
1708 utime = p->signal->cutime; 1708 utime = p->signal->cutime;
1709 stime = p->signal->cstime; 1709 stime = p->signal->cstime;
1710 r->ru_nvcsw = p->signal->cnvcsw; 1710 r->ru_nvcsw = p->signal->cnvcsw;
1711 r->ru_nivcsw = p->signal->cnivcsw; 1711 r->ru_nivcsw = p->signal->cnivcsw;
1712 r->ru_minflt = p->signal->cmin_flt; 1712 r->ru_minflt = p->signal->cmin_flt;
1713 r->ru_majflt = p->signal->cmaj_flt; 1713 r->ru_majflt = p->signal->cmaj_flt;
1714 sum_group: 1714 sum_group:
1715 utime = cputime_add(utime, p->signal->utime); 1715 utime = cputime_add(utime, p->signal->utime);
1716 stime = cputime_add(stime, p->signal->stime); 1716 stime = cputime_add(stime, p->signal->stime);
1717 r->ru_nvcsw += p->signal->nvcsw; 1717 r->ru_nvcsw += p->signal->nvcsw;
1718 r->ru_nivcsw += p->signal->nivcsw; 1718 r->ru_nivcsw += p->signal->nivcsw;
1719 r->ru_minflt += p->signal->min_flt; 1719 r->ru_minflt += p->signal->min_flt;
1720 r->ru_majflt += p->signal->maj_flt; 1720 r->ru_majflt += p->signal->maj_flt;
1721 t = p; 1721 t = p;
1722 do { 1722 do {
1723 utime = cputime_add(utime, t->utime); 1723 utime = cputime_add(utime, t->utime);
1724 stime = cputime_add(stime, t->stime); 1724 stime = cputime_add(stime, t->stime);
1725 r->ru_nvcsw += t->nvcsw; 1725 r->ru_nvcsw += t->nvcsw;
1726 r->ru_nivcsw += t->nivcsw; 1726 r->ru_nivcsw += t->nivcsw;
1727 r->ru_minflt += t->min_flt; 1727 r->ru_minflt += t->min_flt;
1728 r->ru_majflt += t->maj_flt; 1728 r->ru_majflt += t->maj_flt;
1729 t = next_thread(t); 1729 t = next_thread(t);
1730 } while (t != p); 1730 } while (t != p);
1731 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1731 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1732 cputime_to_timeval(utime, &r->ru_utime); 1732 cputime_to_timeval(utime, &r->ru_utime);
1733 cputime_to_timeval(stime, &r->ru_stime); 1733 cputime_to_timeval(stime, &r->ru_stime);
1734 break; 1734 break;
1735 default: 1735 default:
1736 BUG(); 1736 BUG();
1737 } 1737 }
1738 } 1738 }
1739 1739
1740 int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1740 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1741 { 1741 {
1742 struct rusage r; 1742 struct rusage r;
1743 read_lock(&tasklist_lock); 1743 read_lock(&tasklist_lock);
1744 k_getrusage(p, who, &r); 1744 k_getrusage(p, who, &r);
1745 read_unlock(&tasklist_lock); 1745 read_unlock(&tasklist_lock);
1746 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1746 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1747 } 1747 }
1748 1748
1749 asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1749 asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1750 { 1750 {
1751 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) 1751 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN)
1752 return -EINVAL; 1752 return -EINVAL;
1753 return getrusage(current, who, ru); 1753 return getrusage(current, who, ru);
1754 } 1754 }
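
From user space only RUSAGE_SELF and RUSAGE_CHILDREN are accepted (RUSAGE_BOTH stays kernel-internal); a minimal sketch, keeping to fields this code actually fills in:

#include <stdio.h>
#include <sys/time.h>
#include <sys/resource.h>

static void show(const char *what, int who)
{
        struct rusage ru;

        if (getrusage(who, &ru) < 0) {
                perror("getrusage");
                return;
        }
        printf("%s: user %ld.%06lds sys %ld.%06lds minflt %ld majflt %ld\n",
               what,
               (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
               (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec,
               ru.ru_minflt, ru.ru_majflt);
}

int main(void)
{
        show("self    ", RUSAGE_SELF);
        show("children", RUSAGE_CHILDREN);
        return 0;
}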
1755 1755
1756 asmlinkage long sys_umask(int mask) 1756 asmlinkage long sys_umask(int mask)
1757 { 1757 {
1758 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1758 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1759 return mask; 1759 return mask;
1760 } 1760 }
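
The xchg() above is why umask() doubles as its own getter: setting a new mask returns the previous one, so reading it non-destructively takes two calls (user-space sketch, not part of this patch).

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
        mode_t old = umask(022);        /* install a new mask, get the old one back */

        umask(old);                     /* restore it immediately */
        printf("current umask: %03o\n", (unsigned)old);
        return 0;
}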
1761 1761
1762 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1762 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1763 unsigned long arg4, unsigned long arg5) 1763 unsigned long arg4, unsigned long arg5)
1764 { 1764 {
1765 long error; 1765 long error;
1766 1766
1767 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1767 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1768 if (error) 1768 if (error)
1769 return error; 1769 return error;
1770 1770
1771 switch (option) { 1771 switch (option) {
1772 case PR_SET_PDEATHSIG: 1772 case PR_SET_PDEATHSIG:
1773 if (!valid_signal(arg2)) { 1773 if (!valid_signal(arg2)) {
1774 error = -EINVAL; 1774 error = -EINVAL;
1775 break; 1775 break;
1776 } 1776 }
1777 current->pdeath_signal = arg2; 1777 current->pdeath_signal = arg2;
1778 break; 1778 break;
1779 case PR_GET_PDEATHSIG: 1779 case PR_GET_PDEATHSIG:
1780 error = put_user(current->pdeath_signal, (int __user *)arg2); 1780 error = put_user(current->pdeath_signal, (int __user *)arg2);
1781 break; 1781 break;
1782 case PR_GET_DUMPABLE: 1782 case PR_GET_DUMPABLE:
1783 error = current->mm->dumpable; 1783 error = current->mm->dumpable;
1784 break; 1784 break;
1785 case PR_SET_DUMPABLE: 1785 case PR_SET_DUMPABLE:
1786 if (arg2 < 0 || arg2 > 2) { 1786 if (arg2 < 0 || arg2 > 2) {
1787 error = -EINVAL; 1787 error = -EINVAL;
1788 break; 1788 break;
1789 } 1789 }
1790 current->mm->dumpable = arg2; 1790 current->mm->dumpable = arg2;
1791 break; 1791 break;
1792 1792
1793 case PR_SET_UNALIGN: 1793 case PR_SET_UNALIGN:
1794 error = SET_UNALIGN_CTL(current, arg2); 1794 error = SET_UNALIGN_CTL(current, arg2);
1795 break; 1795 break;
1796 case PR_GET_UNALIGN: 1796 case PR_GET_UNALIGN:
1797 error = GET_UNALIGN_CTL(current, arg2); 1797 error = GET_UNALIGN_CTL(current, arg2);
1798 break; 1798 break;
1799 case PR_SET_FPEMU: 1799 case PR_SET_FPEMU:
1800 error = SET_FPEMU_CTL(current, arg2); 1800 error = SET_FPEMU_CTL(current, arg2);
1801 break; 1801 break;
1802 case PR_GET_FPEMU: 1802 case PR_GET_FPEMU:
1803 error = GET_FPEMU_CTL(current, arg2); 1803 error = GET_FPEMU_CTL(current, arg2);
1804 break; 1804 break;
1805 case PR_SET_FPEXC: 1805 case PR_SET_FPEXC:
1806 error = SET_FPEXC_CTL(current, arg2); 1806 error = SET_FPEXC_CTL(current, arg2);
1807 break; 1807 break;
1808 case PR_GET_FPEXC: 1808 case PR_GET_FPEXC:
1809 error = GET_FPEXC_CTL(current, arg2); 1809 error = GET_FPEXC_CTL(current, arg2);
1810 break; 1810 break;
1811 case PR_GET_TIMING: 1811 case PR_GET_TIMING:
1812 error = PR_TIMING_STATISTICAL; 1812 error = PR_TIMING_STATISTICAL;
1813 break; 1813 break;
1814 case PR_SET_TIMING: 1814 case PR_SET_TIMING:
1815 if (arg2 == PR_TIMING_STATISTICAL) 1815 if (arg2 == PR_TIMING_STATISTICAL)
1816 error = 0; 1816 error = 0;
1817 else 1817 else
1818 error = -EINVAL; 1818 error = -EINVAL;
1819 break; 1819 break;
1820 1820
1821 case PR_GET_KEEPCAPS: 1821 case PR_GET_KEEPCAPS:
1822 if (current->keep_capabilities) 1822 if (current->keep_capabilities)
1823 error = 1; 1823 error = 1;
1824 break; 1824 break;
1825 case PR_SET_KEEPCAPS: 1825 case PR_SET_KEEPCAPS:
1826 if (arg2 != 0 && arg2 != 1) { 1826 if (arg2 != 0 && arg2 != 1) {
1827 error = -EINVAL; 1827 error = -EINVAL;
1828 break; 1828 break;
1829 } 1829 }
1830 current->keep_capabilities = arg2; 1830 current->keep_capabilities = arg2;
1831 break; 1831 break;
1832 case PR_SET_NAME: { 1832 case PR_SET_NAME: {
1833 struct task_struct *me = current; 1833 struct task_struct *me = current;
1834 unsigned char ncomm[sizeof(me->comm)]; 1834 unsigned char ncomm[sizeof(me->comm)];
1835 1835
1836 ncomm[sizeof(me->comm)-1] = 0; 1836 ncomm[sizeof(me->comm)-1] = 0;
1837 if (strncpy_from_user(ncomm, (char __user *)arg2, 1837 if (strncpy_from_user(ncomm, (char __user *)arg2,
1838 sizeof(me->comm)-1) < 0) 1838 sizeof(me->comm)-1) < 0)
1839 return -EFAULT; 1839 return -EFAULT;
1840 set_task_comm(me, ncomm); 1840 set_task_comm(me, ncomm);
1841 return 0; 1841 return 0;
1842 } 1842 }
1843 case PR_GET_NAME: { 1843 case PR_GET_NAME: {
1844 struct task_struct *me = current; 1844 struct task_struct *me = current;
1845 unsigned char tcomm[sizeof(me->comm)]; 1845 unsigned char tcomm[sizeof(me->comm)];
1846 1846
1847 get_task_comm(tcomm, me); 1847 get_task_comm(tcomm, me);
1848 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm))) 1848 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
1849 return -EFAULT; 1849 return -EFAULT;
1850 return 0; 1850 return 0;
1851 } 1851 }
1852 default: 1852 default:
1853 error = -EINVAL; 1853 error = -EINVAL;
1854 break; 1854 break;
1855 } 1855 }
1856 return error; 1856 return error;
1857 } 1857 }
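
The PR_SET_NAME/PR_GET_NAME cases above map directly to user-space prctl() calls; the comm buffer is sizeof(current->comm) bytes (16, including the NUL), so longer names are silently truncated. An illustrative sketch, not part of this patch:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
        char name[16];                  /* TASK_COMM_LEN, including the NUL */

        if (prctl(PR_SET_NAME, "demo-task", 0, 0, 0) < 0) {
                perror("prctl(PR_SET_NAME)");
                return 1;
        }
        if (prctl(PR_GET_NAME, name, 0, 0, 0) < 0) {
                perror("prctl(PR_GET_NAME)");
                return 1;
        }
        printf("task name: %s\n", name);
        return 0;
}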
1858 1858