Commit e9dd2b6837e26fe202708cce5ea4bb4ee3e3482e
Merge branch 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block: (39 commits)
  cfq-iosched: Fix a gcc 4.5 warning and put some comments
  block: Turn bvec_k{un,}map_irq() into static inline functions
  block: fix accounting bug on cross partition merges
  block: Make the integrity mapped property a bio flag
  block: Fix double free in blk_integrity_unregister
  block: Ensure physical block size is unsigned int
  blkio-throttle: Fix possible multiplication overflow in iops calculations
  blkio-throttle: limit max iops value to UINT_MAX
  blkio-throttle: There is no need to convert jiffies to milli seconds
  blkio-throttle: Fix link failure failure on i386
  blkio: Recalculate the throttled bio dispatch time upon throttle limit change
  blkio: Add root group to td->tg_list
  blkio: deletion of a cgroup was causes oops
  blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n
  block: set the bounce_pfn to the actual DMA limit rather than to max memory
  block: revert bad fix for memory hotplug causing bounces
  Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK
  block: set the bounce_pfn to the actual DMA limit rather than to max memory
  block: Prevent hang_check firing during long I/O
  cfq: improve fsync performance for small files
  ...

Fix up trivial conflicts due to __rcu sparse annotation in include/linux/genhd.h
43 changed files:
- Documentation/cgroups/blkio-controller.txt
- block/Kconfig
- block/Makefile
- block/blk-cgroup.c
- block/blk-cgroup.h
- block/blk-core.c
- block/blk-exec.c
- block/blk-integrity.c
- block/blk-map.c
- block/blk-merge.c
- block/blk-settings.c
- block/blk-sysfs.c
- block/blk-throttle.c
- block/blk.h
- block/cfq-iosched.c
- block/cfq.h
- block/genhd.c
- block/ioctl.c
- drivers/block/drbd/drbd_receiver.c
- drivers/md/dm-snap.c
- drivers/md/dm-table.c
- drivers/s390/scsi/zfcp_scsi.c
- drivers/scsi/hosts.c
- drivers/scsi/scsi_lib.c
- drivers/scsi/scsi_sysfs.c
- drivers/scsi/sd_dif.c
- drivers/scsi/sg.c
- fs/jbd/commit.c
- fs/jbd2/commit.c
- fs/partitions/check.c
- fs/partitions/check.h
- fs/partitions/efi.c
- include/linux/bio.h
- include/linux/blk_types.h
- include/linux/blkdev.h
- include/linux/elevator.h
- include/linux/genhd.h
- include/linux/kernel.h
- include/linux/sched.h
- include/scsi/scsi.h
- include/scsi/scsi_host.h
- init/Kconfig
- init/do_mounts.c
Documentation/cgroups/blkio-controller.txt
... | ... | @@ -8,12 +8,17 @@ |
8 | 8 | Plan is to use the same cgroup based management interface for blkio controller |
9 | 9 | and based on user options switch IO policies in the background. |
10 | 10 | |
11 | -In the first phase, this patchset implements proportional weight time based | |
12 | -division of disk policy. It is implemented in CFQ. Hence this policy takes | |
13 | -effect only on leaf nodes when CFQ is being used. | |
11 | +Currently two IO control policies are implemented. First one is proportional | |
12 | +weight time based division of disk policy. It is implemented in CFQ. Hence | |
13 | +this policy takes effect only on leaf nodes when CFQ is being used. The second | |
14 | +one is throttling policy which can be used to specify upper IO rate limits | |
15 | +on devices. This policy is implemented in generic block layer and can be | |
16 | +used on leaf nodes as well as higher level logical devices like device mapper. | |
14 | 17 | |
15 | 18 | HOWTO |
16 | 19 | ===== |
20 | +Proportional Weight division of bandwidth | |
21 | +----------------------------------------- | |
17 | 22 | You can do a very simple testing of running two dd threads in two different |
18 | 23 | cgroups. Here is what you can do. |
19 | 24 | |
... | ... | @@ -55,6 +60,35 @@ |
55 | 60 | group dispatched to the disk. We provide fairness in terms of disk time, so |
56 | 61 | ideally io.disk_time of cgroups should be in proportion to the weight. |
57 | 62 | |
63 | +Throttling/Upper Limit policy | |
64 | +----------------------------- | |
65 | +- Enable Block IO controller | |
66 | + CONFIG_BLK_CGROUP=y | |
67 | + | |
68 | +- Enable throttling in block layer | |
69 | + CONFIG_BLK_DEV_THROTTLING=y | |
70 | + | |
71 | +- Mount blkio controller | |
72 | + mount -t cgroup -o blkio none /cgroup/blkio | |
73 | + | |
74 | +- Specify a bandwidth rate on particular device for root group. The format | |
75 | + for policy is "<major>:<minor> <bytes_per_second>". | 
76 | + | |
77 | + echo "8:16 1048576" > /cgroup/blkio/blkio.read_bps_device | |
78 | + | |
79 | + Above will put a limit of 1MB/second on reads happening for root group | |
80 | + on device having major/minor number 8:16. | |
81 | + | |
82 | +- Run dd to read a file and see if rate is throttled to 1MB/s or not. | |
83 | + | |
84 | + # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 | |
85 | + # iflag=direct | |
86 | + 1024+0 records in | |
87 | + 1024+0 records out | |
88 | + 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s | |
89 | + | |
90 | + Limits for writes can be put using blkio.write_bps_device file. | |
91 | + | |
58 | 92 | Various user visible config options |
59 | 93 | =================================== |
60 | 94 | CONFIG_BLK_CGROUP |
61 | 95 | |
... | ... | @@ -68,8 +102,13 @@ |
68 | 102 | - Enables group scheduling in CFQ. Currently only 1 level of group |
69 | 103 | creation is allowed. |
70 | 104 | |
105 | +CONFIG_BLK_DEV_THROTTLING | |
106 | + - Enable block device throttling support in block layer. | |
107 | + | |
71 | 108 | Details of cgroup files |
72 | 109 | ======================= |
110 | +Proportional weight policy files | |
111 | +-------------------------------- | |
73 | 112 | - blkio.weight |
74 | 113 | - Specifies per cgroup weight. This is default weight of the group |
75 | 114 | on all the devices until and unless overridden by per device rule. |
... | ... | @@ -210,6 +249,67 @@ |
210 | 249 | and minor number of the device and third field specifies the number |
211 | 250 | of times a group was dequeued from a particular device. |
212 | 251 | |
252 | +Throttling/Upper limit policy files | |
253 | +----------------------------------- | |
254 | +- blkio.throttle.read_bps_device | |
255 | + - Specifies upper limit on READ rate from the device. IO rate is | |
256 | + specified in bytes per second. Rules are per device. Following is | 
257 | + the format. | |
258 | + | |
259 | + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.read_bps_device | |
260 | + | |
261 | +- blkio.throttle.write_bps_device | |
262 | + - Specifies upper limit on WRITE rate to the device. IO rate is | |
263 | + specified in bytes per second. Rules are per device. Following is | 
264 | + the format. | |
265 | + | |
266 | + echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.write_bps_device | |
267 | + | |
268 | +- blkio.throttle.read_iops_device | |
269 | + - Specifies upper limit on READ rate from the device. IO rate is | |
270 | + specified in IO per second. Rules are per device. Following is | 
271 | + the format. | |
272 | + | |
273 | + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.read_iops_device | |
274 | + | |
275 | +- blkio.throttle.write_iops_device | |
276 | + - Specifies upper limit on WRITE rate to the device. IO rate is | |
277 | + specified in IO per second. Rules are per device. Following is | 
278 | + the format. | |
279 | + | |
280 | + echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.write_iops_device | |
281 | + | |
282 | +Note: If both BW and IOPS rules are specified for a device, then IO is | |
283 | + subjected to both the constraints. | 
284 | + | |
285 | +- blkio.throttle.io_serviced | |
286 | + - Number of IOs (bio) completed to/from the disk by the group (as | |
287 | + seen by throttling policy). These are further divided by the type | |
288 | + of operation - read or write, sync or async. First two fields specify | |
289 | + the major and minor number of the device, third field specifies the | |
290 | + operation type and the fourth field specifies the number of IOs. | |
291 | + | |
292 | + blkio.io_serviced does accounting as seen by CFQ and counts are in | |
293 | + number of requests (struct request). On the other hand, | |
294 | + blkio.throttle.io_serviced counts number of IO in terms of number | |
295 | + of bios as seen by throttling policy. These bios can later be | |
296 | + merged by elevator and total number of requests completed can be | |
297 | + lesser. | |
298 | + | |
299 | +- blkio.throttle.io_service_bytes | |
300 | + - Number of bytes transferred to/from the disk by the group. These | |
301 | + are further divided by the type of operation - read or write, sync | |
302 | + or async. First two fields specify the major and minor number of the | |
303 | + device, third field specifies the operation type and the fourth field | |
304 | + specifies the number of bytes. | |
305 | + | |
306 | + These numbers should roughly be the same as blkio.io_service_bytes as | 
307 | + updated by CFQ. The difference between the two is that | 
308 | + blkio.io_service_bytes will not be updated if CFQ is not operating | |
309 | + on request queue. | |
310 | + | |
311 | +Common files among various policies | |
312 | +----------------------------------- | |
213 | 313 | - blkio.reset_stats |
214 | 314 | - Writing an int to this file will result in resetting all the stats |
215 | 315 | for that cgroup. |
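
The throttle rules above are plain strings written into the blkio cgroup files; the file names come from the cftype table added to block/blk-cgroup.c below. A minimal C sketch of setting the same 1 MB/s read limit from a program follows; the /cgroup/blkio mount point and the 8:16 device numbers are taken from the example above and are assumptions about the local setup.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Sketch: apply a 1 MB/s read limit for device 8:16 in the root blkio
	 * cgroup, mirroring the echo example in the HOWTO above.
	 */
	int main(void)
	{
		const char *path = "/cgroup/blkio/blkio.throttle.read_bps_device";
		const char *rule = "8:16 1048576\n";	/* "<major>:<minor> <bytes_per_second>" */
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, rule, strlen(rule)) != (ssize_t)strlen(rule)) {
			perror("write");
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}

Writing "8:16 0" to the same file removes the rule, which is the convention blkio_delete_rule_command() implements in blk-cgroup.c.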
block/Kconfig
... | ... | @@ -77,6 +77,18 @@ |
77 | 77 | T10/SCSI Data Integrity Field or the T13/ATA External Path |
78 | 78 | Protection. If in doubt, say N. |
79 | 79 | |
80 | +config BLK_DEV_THROTTLING | |
81 | + bool "Block layer bio throttling support" | |
82 | + depends on BLK_CGROUP=y && EXPERIMENTAL | |
83 | + default n | |
84 | + ---help--- | |
85 | + Block layer bio throttling support. It can be used to limit | |
86 | + the IO rate to a device. IO rate policies are per cgroup and | |
87 | + one needs to mount and use blkio cgroup controller for creating | |
88 | + cgroups and specifying per device IO rate policies. | |
89 | + | |
90 | + See Documentation/cgroups/blkio-controller.txt for more information. | |
91 | + | |
80 | 92 | endif # BLOCK |
81 | 93 | |
82 | 94 | config BLOCK_COMPAT |
block/Makefile
... | ... | @@ -9,6 +9,7 @@ |
9 | 9 | |
10 | 10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
11 | 11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
12 | +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o | |
12 | 13 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
13 | 14 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
14 | 15 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
block/blk-cgroup.c
... | ... | @@ -37,6 +37,12 @@ |
37 | 37 | static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); |
38 | 38 | static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); |
39 | 39 | |
40 | +/* for encoding cft->private value on file */ | |
41 | +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) | |
42 | +/* What policy owns the file, proportional or throttle */ | |
43 | +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) | |
44 | +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) | |
45 | + | |
40 | 46 | struct cgroup_subsys blkio_subsys = { |
41 | 47 | .name = "blkio", |
42 | 48 | .create = blkiocg_create, |
... | ... | @@ -59,6 +65,27 @@ |
59 | 65 | list_add(&pn->node, &blkcg->policy_list); |
60 | 66 | } |
61 | 67 | |
68 | +static inline bool cftype_blkg_same_policy(struct cftype *cft, | |
69 | + struct blkio_group *blkg) | |
70 | +{ | |
71 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
72 | + | |
73 | + if (blkg->plid == plid) | |
74 | + return 1; | |
75 | + | |
76 | + return 0; | |
77 | +} | |
78 | + | |
79 | +/* Determines if policy node matches cgroup file being accessed */ | |
80 | +static inline bool pn_matches_cftype(struct cftype *cft, | |
81 | + struct blkio_policy_node *pn) | |
82 | +{ | |
83 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
84 | + int fileid = BLKIOFILE_ATTR(cft->private); | |
85 | + | |
86 | + return (plid == pn->plid && fileid == pn->fileid); | |
87 | +} | |
88 | + | |
62 | 89 | /* Must be called with blkcg->lock held */ |
63 | 90 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) |
64 | 91 | { |
65 | 92 | |
... | ... | @@ -67,12 +94,13 @@ |
67 | 94 | |
68 | 95 | /* Must be called with blkcg->lock held */ |
69 | 96 | static struct blkio_policy_node * |
70 | -blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) | |
97 | +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, | |
98 | + enum blkio_policy_id plid, int fileid) | |
71 | 99 | { |
72 | 100 | struct blkio_policy_node *pn; |
73 | 101 | |
74 | 102 | list_for_each_entry(pn, &blkcg->policy_list, node) { |
75 | - if (pn->dev == dev) | |
103 | + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) | |
76 | 104 | return pn; |
77 | 105 | } |
78 | 106 | |
... | ... | @@ -86,6 +114,67 @@ |
86 | 114 | } |
87 | 115 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); |
88 | 116 | |
117 | +static inline void | |
118 | +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) | |
119 | +{ | |
120 | + struct blkio_policy_type *blkiop; | |
121 | + | |
122 | + list_for_each_entry(blkiop, &blkio_list, list) { | |
123 | + /* If this policy does not own the blkg, do not send updates */ | |
124 | + if (blkiop->plid != blkg->plid) | |
125 | + continue; | |
126 | + if (blkiop->ops.blkio_update_group_weight_fn) | |
127 | + blkiop->ops.blkio_update_group_weight_fn(blkg->key, | |
128 | + blkg, weight); | |
129 | + } | |
130 | +} | |
131 | + | |
132 | +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, | |
133 | + int fileid) | |
134 | +{ | |
135 | + struct blkio_policy_type *blkiop; | |
136 | + | |
137 | + list_for_each_entry(blkiop, &blkio_list, list) { | |
138 | + | |
139 | + /* If this policy does not own the blkg, do not send updates */ | |
140 | + if (blkiop->plid != blkg->plid) | |
141 | + continue; | |
142 | + | |
143 | + if (fileid == BLKIO_THROTL_read_bps_device | |
144 | + && blkiop->ops.blkio_update_group_read_bps_fn) | |
145 | + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, | |
146 | + blkg, bps); | |
147 | + | |
148 | + if (fileid == BLKIO_THROTL_write_bps_device | |
149 | + && blkiop->ops.blkio_update_group_write_bps_fn) | |
150 | + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, | |
151 | + blkg, bps); | |
152 | + } | |
153 | +} | |
154 | + | |
155 | +static inline void blkio_update_group_iops(struct blkio_group *blkg, | |
156 | + unsigned int iops, int fileid) | |
157 | +{ | |
158 | + struct blkio_policy_type *blkiop; | |
159 | + | |
160 | + list_for_each_entry(blkiop, &blkio_list, list) { | |
161 | + | |
162 | + /* If this policy does not own the blkg, do not send updates */ | |
163 | + if (blkiop->plid != blkg->plid) | |
164 | + continue; | |
165 | + | |
166 | + if (fileid == BLKIO_THROTL_read_iops_device | |
167 | + && blkiop->ops.blkio_update_group_read_iops_fn) | |
168 | + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, | |
169 | + blkg, iops); | |
170 | + | |
171 | + if (fileid == BLKIO_THROTL_write_iops_device | |
172 | + && blkiop->ops.blkio_update_group_write_iops_fn) | |
173 | + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, | |
174 | + blkg,iops); | |
175 | + } | |
176 | +} | |
177 | + | |
89 | 178 | /* |
90 | 179 | * Add to the appropriate stat variable depending on the request type. |
91 | 180 | * This should be called with the blkg->stats_lock held. |
... | ... | @@ -341,7 +430,8 @@ |
341 | 430 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); |
342 | 431 | |
343 | 432 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
344 | - struct blkio_group *blkg, void *key, dev_t dev) | |
433 | + struct blkio_group *blkg, void *key, dev_t dev, | |
434 | + enum blkio_policy_id plid) | |
345 | 435 | { |
346 | 436 | unsigned long flags; |
347 | 437 | |
... | ... | @@ -350,6 +440,7 @@ |
350 | 440 | rcu_assign_pointer(blkg->key, key); |
351 | 441 | blkg->blkcg_id = css_id(&blkcg->css); |
352 | 442 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
443 | + blkg->plid = plid; | |
353 | 444 | spin_unlock_irqrestore(&blkcg->lock, flags); |
354 | 445 | /* Need to take css reference ? */ |
355 | 446 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); |
356 | 447 | |
... | ... | @@ -408,52 +499,7 @@ |
408 | 499 | } |
409 | 500 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); |
410 | 501 | |
411 | -#define SHOW_FUNCTION(__VAR) \ | |
412 | -static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | |
413 | - struct cftype *cftype) \ | |
414 | -{ \ | |
415 | - struct blkio_cgroup *blkcg; \ | |
416 | - \ | |
417 | - blkcg = cgroup_to_blkio_cgroup(cgroup); \ | |
418 | - return (u64)blkcg->__VAR; \ | |
419 | -} | |
420 | - | |
421 | -SHOW_FUNCTION(weight); | |
422 | -#undef SHOW_FUNCTION | |
423 | - | |
424 | 502 | static int |
425 | -blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |
426 | -{ | |
427 | - struct blkio_cgroup *blkcg; | |
428 | - struct blkio_group *blkg; | |
429 | - struct hlist_node *n; | |
430 | - struct blkio_policy_type *blkiop; | |
431 | - struct blkio_policy_node *pn; | |
432 | - | |
433 | - if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | |
434 | - return -EINVAL; | |
435 | - | |
436 | - blkcg = cgroup_to_blkio_cgroup(cgroup); | |
437 | - spin_lock(&blkio_list_lock); | |
438 | - spin_lock_irq(&blkcg->lock); | |
439 | - blkcg->weight = (unsigned int)val; | |
440 | - | |
441 | - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | |
442 | - pn = blkio_policy_search_node(blkcg, blkg->dev); | |
443 | - | |
444 | - if (pn) | |
445 | - continue; | |
446 | - | |
447 | - list_for_each_entry(blkiop, &blkio_list, list) | |
448 | - blkiop->ops.blkio_update_group_weight_fn(blkg, | |
449 | - blkcg->weight); | |
450 | - } | |
451 | - spin_unlock_irq(&blkcg->lock); | |
452 | - spin_unlock(&blkio_list_lock); | |
453 | - return 0; | |
454 | -} | |
455 | - | |
456 | -static int | |
457 | 503 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) |
458 | 504 | { |
459 | 505 | struct blkio_cgroup *blkcg; |
... | ... | @@ -593,52 +639,6 @@ |
593 | 639 | return disk_total; |
594 | 640 | } |
595 | 641 | |
596 | -#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ | |
597 | -static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | |
598 | - struct cftype *cftype, struct cgroup_map_cb *cb) \ | |
599 | -{ \ | |
600 | - struct blkio_cgroup *blkcg; \ | |
601 | - struct blkio_group *blkg; \ | |
602 | - struct hlist_node *n; \ | |
603 | - uint64_t cgroup_total = 0; \ | |
604 | - \ | |
605 | - if (!cgroup_lock_live_group(cgroup)) \ | |
606 | - return -ENODEV; \ | |
607 | - \ | |
608 | - blkcg = cgroup_to_blkio_cgroup(cgroup); \ | |
609 | - rcu_read_lock(); \ | |
610 | - hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ | |
611 | - if (blkg->dev) { \ | |
612 | - spin_lock_irq(&blkg->stats_lock); \ | |
613 | - cgroup_total += blkio_get_stat(blkg, cb, \ | |
614 | - blkg->dev, type); \ | |
615 | - spin_unlock_irq(&blkg->stats_lock); \ | |
616 | - } \ | |
617 | - } \ | |
618 | - if (show_total) \ | |
619 | - cb->fill(cb, "Total", cgroup_total); \ | |
620 | - rcu_read_unlock(); \ | |
621 | - cgroup_unlock(); \ | |
622 | - return 0; \ | |
623 | -} | |
624 | - | |
625 | -SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); | |
626 | -SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); | |
627 | -SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); | |
628 | -SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); | |
629 | -SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); | |
630 | -SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); | |
631 | -SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); | |
632 | -SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); | |
633 | -#ifdef CONFIG_DEBUG_BLK_CGROUP | |
634 | -SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); | |
635 | -SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); | |
636 | -SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); | |
637 | -SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); | |
638 | -SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); | |
639 | -#endif | |
640 | -#undef SHOW_FUNCTION_PER_GROUP | |
641 | - | |
642 | 642 | static int blkio_check_dev_num(dev_t dev) |
643 | 643 | { |
644 | 644 | int part = 0; |
645 | 645 | |
... | ... | @@ -652,13 +652,14 @@ |
652 | 652 | } |
653 | 653 | |
654 | 654 | static int blkio_policy_parse_and_set(char *buf, |
655 | - struct blkio_policy_node *newpn) | |
655 | + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) | |
656 | 656 | { |
657 | 657 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; |
658 | 658 | int ret; |
659 | 659 | unsigned long major, minor, temp; |
660 | 660 | int i = 0; |
661 | 661 | dev_t dev; |
662 | + u64 bps, iops; | |
662 | 663 | |
663 | 664 | memset(s, 0, sizeof(s)); |
664 | 665 | |
665 | 666 | |
666 | 667 | |
... | ... | @@ -705,13 +706,48 @@ |
705 | 706 | if (s[1] == NULL) |
706 | 707 | return -EINVAL; |
707 | 708 | |
708 | - ret = strict_strtoul(s[1], 10, &temp); | |
709 | - if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || | |
710 | - temp > BLKIO_WEIGHT_MAX) | |
711 | - return -EINVAL; | |
709 | + switch (plid) { | |
710 | + case BLKIO_POLICY_PROP: | |
711 | + ret = strict_strtoul(s[1], 10, &temp); | |
712 | + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || | |
713 | + temp > BLKIO_WEIGHT_MAX) | |
714 | + return -EINVAL; | |
712 | 715 | |
713 | - newpn->weight = temp; | |
716 | + newpn->plid = plid; | |
717 | + newpn->fileid = fileid; | |
718 | + newpn->val.weight = temp; | |
719 | + break; | |
720 | + case BLKIO_POLICY_THROTL: | |
721 | + switch(fileid) { | |
722 | + case BLKIO_THROTL_read_bps_device: | |
723 | + case BLKIO_THROTL_write_bps_device: | |
724 | + ret = strict_strtoull(s[1], 10, &bps); | |
725 | + if (ret) | |
726 | + return -EINVAL; | |
714 | 727 | |
728 | + newpn->plid = plid; | |
729 | + newpn->fileid = fileid; | |
730 | + newpn->val.bps = bps; | |
731 | + break; | |
732 | + case BLKIO_THROTL_read_iops_device: | |
733 | + case BLKIO_THROTL_write_iops_device: | |
734 | + ret = strict_strtoull(s[1], 10, &iops); | |
735 | + if (ret) | |
736 | + return -EINVAL; | |
737 | + | |
738 | + if (iops > THROTL_IOPS_MAX) | |
739 | + return -EINVAL; | |
740 | + | |
741 | + newpn->plid = plid; | |
742 | + newpn->fileid = fileid; | |
743 | + newpn->val.iops = (unsigned int)iops; | |
744 | + break; | |
745 | + } | |
746 | + break; | |
747 | + default: | |
748 | + BUG(); | |
749 | + } | |
750 | + | |
715 | 751 | return 0; |
716 | 752 | } |
717 | 753 | |
718 | 754 | |
719 | 755 | |
720 | 756 | |
721 | 757 | |
722 | 758 | |
723 | 759 | |
... | ... | @@ -720,26 +756,180 @@ |
720 | 756 | { |
721 | 757 | struct blkio_policy_node *pn; |
722 | 758 | |
723 | - pn = blkio_policy_search_node(blkcg, dev); | |
759 | + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, | |
760 | + BLKIO_PROP_weight_device); | |
724 | 761 | if (pn) |
725 | - return pn->weight; | |
762 | + return pn->val.weight; | |
726 | 763 | else |
727 | 764 | return blkcg->weight; |
728 | 765 | } |
729 | 766 | EXPORT_SYMBOL_GPL(blkcg_get_weight); |
730 | 767 | |
768 | +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) | |
769 | +{ | |
770 | + struct blkio_policy_node *pn; | |
731 | 771 | |
732 | -static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | |
733 | - const char *buffer) | |
772 | + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | |
773 | + BLKIO_THROTL_read_bps_device); | |
774 | + if (pn) | |
775 | + return pn->val.bps; | |
776 | + else | |
777 | + return -1; | |
778 | +} | |
779 | + | |
780 | +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) | |
734 | 781 | { |
782 | + struct blkio_policy_node *pn; | |
783 | + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | |
784 | + BLKIO_THROTL_write_bps_device); | |
785 | + if (pn) | |
786 | + return pn->val.bps; | |
787 | + else | |
788 | + return -1; | |
789 | +} | |
790 | + | |
791 | +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) | |
792 | +{ | |
793 | + struct blkio_policy_node *pn; | |
794 | + | |
795 | + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | |
796 | + BLKIO_THROTL_read_iops_device); | |
797 | + if (pn) | |
798 | + return pn->val.iops; | |
799 | + else | |
800 | + return -1; | |
801 | +} | |
802 | + | |
803 | +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) | |
804 | +{ | |
805 | + struct blkio_policy_node *pn; | |
806 | + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | |
807 | + BLKIO_THROTL_write_iops_device); | |
808 | + if (pn) | |
809 | + return pn->val.iops; | |
810 | + else | |
811 | + return -1; | |
812 | +} | |
813 | + | |
814 | +/* Checks whether user asked for deleting a policy rule */ | |
815 | +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) | |
816 | +{ | |
817 | + switch(pn->plid) { | |
818 | + case BLKIO_POLICY_PROP: | |
819 | + if (pn->val.weight == 0) | |
820 | + return 1; | |
821 | + break; | |
822 | + case BLKIO_POLICY_THROTL: | |
823 | + switch(pn->fileid) { | |
824 | + case BLKIO_THROTL_read_bps_device: | |
825 | + case BLKIO_THROTL_write_bps_device: | |
826 | + if (pn->val.bps == 0) | |
827 | + return 1; | |
828 | + break; | |
829 | + case BLKIO_THROTL_read_iops_device: | |
830 | + case BLKIO_THROTL_write_iops_device: | |
831 | + if (pn->val.iops == 0) | |
832 | + return 1; | |
833 | + } | |
834 | + break; | |
835 | + default: | |
836 | + BUG(); | |
837 | + } | |
838 | + | |
839 | + return 0; | |
840 | +} | |
841 | + | |
842 | +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, | |
843 | + struct blkio_policy_node *newpn) | |
844 | +{ | |
845 | + switch(oldpn->plid) { | |
846 | + case BLKIO_POLICY_PROP: | |
847 | + oldpn->val.weight = newpn->val.weight; | |
848 | + break; | |
849 | + case BLKIO_POLICY_THROTL: | |
850 | + switch(newpn->fileid) { | |
851 | + case BLKIO_THROTL_read_bps_device: | |
852 | + case BLKIO_THROTL_write_bps_device: | |
853 | + oldpn->val.bps = newpn->val.bps; | |
854 | + break; | |
855 | + case BLKIO_THROTL_read_iops_device: | |
856 | + case BLKIO_THROTL_write_iops_device: | |
857 | + oldpn->val.iops = newpn->val.iops; | |
858 | + } | |
859 | + break; | |
860 | + default: | |
861 | + BUG(); | |
862 | + } | |
863 | +} | |
864 | + | |
865 | +/* | |
866 | + * Some rules/values in blkg have changed. Propagate those to respective | 
867 | + * policies. | |
868 | + */ | |
869 | +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, | |
870 | + struct blkio_group *blkg, struct blkio_policy_node *pn) | |
871 | +{ | |
872 | + unsigned int weight, iops; | |
873 | + u64 bps; | |
874 | + | |
875 | + switch(pn->plid) { | |
876 | + case BLKIO_POLICY_PROP: | |
877 | + weight = pn->val.weight ? pn->val.weight : | |
878 | + blkcg->weight; | |
879 | + blkio_update_group_weight(blkg, weight); | |
880 | + break; | |
881 | + case BLKIO_POLICY_THROTL: | |
882 | + switch(pn->fileid) { | |
883 | + case BLKIO_THROTL_read_bps_device: | |
884 | + case BLKIO_THROTL_write_bps_device: | |
885 | + bps = pn->val.bps ? pn->val.bps : (-1); | |
886 | + blkio_update_group_bps(blkg, bps, pn->fileid); | |
887 | + break; | |
888 | + case BLKIO_THROTL_read_iops_device: | |
889 | + case BLKIO_THROTL_write_iops_device: | |
890 | + iops = pn->val.iops ? pn->val.iops : (-1); | |
891 | + blkio_update_group_iops(blkg, iops, pn->fileid); | |
892 | + break; | |
893 | + } | |
894 | + break; | |
895 | + default: | |
896 | + BUG(); | |
897 | + } | |
898 | +} | |
899 | + | |
900 | +/* | |
901 | + * A policy node rule has been updated. Propagate this update to all the | 
902 | + * block groups which might be affected by this update. | |
903 | + */ | |
904 | +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, | |
905 | + struct blkio_policy_node *pn) | |
906 | +{ | |
907 | + struct blkio_group *blkg; | |
908 | + struct hlist_node *n; | |
909 | + | |
910 | + spin_lock(&blkio_list_lock); | |
911 | + spin_lock_irq(&blkcg->lock); | |
912 | + | |
913 | + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | |
914 | + if (pn->dev != blkg->dev || pn->plid != blkg->plid) | |
915 | + continue; | |
916 | + blkio_update_blkg_policy(blkcg, blkg, pn); | |
917 | + } | |
918 | + | |
919 | + spin_unlock_irq(&blkcg->lock); | |
920 | + spin_unlock(&blkio_list_lock); | |
921 | +} | |
922 | + | |
923 | +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, | |
924 | + const char *buffer) | |
925 | +{ | |
735 | 926 | int ret = 0; |
736 | 927 | char *buf; |
737 | 928 | struct blkio_policy_node *newpn, *pn; |
738 | 929 | struct blkio_cgroup *blkcg; |
739 | - struct blkio_group *blkg; | |
740 | 930 | int keep_newpn = 0; |
741 | - struct hlist_node *n; | |
742 | - struct blkio_policy_type *blkiop; | |
931 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
932 | + int fileid = BLKIOFILE_ATTR(cft->private); | |
743 | 933 | |
744 | 934 | buf = kstrdup(buffer, GFP_KERNEL); |
745 | 935 | if (!buf) |
... | ... | @@ -751,7 +941,7 @@ |
751 | 941 | goto free_buf; |
752 | 942 | } |
753 | 943 | |
754 | - ret = blkio_policy_parse_and_set(buf, newpn); | |
944 | + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); | |
755 | 945 | if (ret) |
756 | 946 | goto free_newpn; |
757 | 947 | |
758 | 948 | |
... | ... | @@ -759,9 +949,9 @@ |
759 | 949 | |
760 | 950 | spin_lock_irq(&blkcg->lock); |
761 | 951 | |
762 | - pn = blkio_policy_search_node(blkcg, newpn->dev); | |
952 | + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); | |
763 | 953 | if (!pn) { |
764 | - if (newpn->weight != 0) { | |
954 | + if (!blkio_delete_rule_command(newpn)) { | |
765 | 955 | blkio_policy_insert_node(blkcg, newpn); |
766 | 956 | keep_newpn = 1; |
767 | 957 | } |
768 | 958 | |
769 | 959 | |
770 | 960 | |
... | ... | @@ -769,34 +959,18 @@ |
769 | 959 | goto update_io_group; |
770 | 960 | } |
771 | 961 | |
772 | - if (newpn->weight == 0) { | |
773 | - /* weight == 0 means deleteing a specific weight */ | |
962 | + if (blkio_delete_rule_command(newpn)) { | |
774 | 963 | blkio_policy_delete_node(pn); |
775 | 964 | spin_unlock_irq(&blkcg->lock); |
776 | 965 | goto update_io_group; |
777 | 966 | } |
778 | 967 | spin_unlock_irq(&blkcg->lock); |
779 | 968 | |
780 | - pn->weight = newpn->weight; | |
969 | + blkio_update_policy_rule(pn, newpn); | |
781 | 970 | |
782 | 971 | update_io_group: |
783 | - /* update weight for each cfqg */ | |
784 | - spin_lock(&blkio_list_lock); | |
785 | - spin_lock_irq(&blkcg->lock); | |
972 | + blkio_update_policy_node_blkg(blkcg, newpn); | |
786 | 973 | |
787 | - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | |
788 | - if (newpn->dev == blkg->dev) { | |
789 | - list_for_each_entry(blkiop, &blkio_list, list) | |
790 | - blkiop->ops.blkio_update_group_weight_fn(blkg, | |
791 | - newpn->weight ? | |
792 | - newpn->weight : | |
793 | - blkcg->weight); | |
794 | - } | |
795 | - } | |
796 | - | |
797 | - spin_unlock_irq(&blkcg->lock); | |
798 | - spin_unlock(&blkio_list_lock); | |
799 | - | |
800 | 974 | free_newpn: |
801 | 975 | if (!keep_newpn) |
802 | 976 | kfree(newpn); |
803 | 977 | |
804 | 978 | |
805 | 979 | |
806 | 980 | |
807 | 981 | |
808 | 982 | |
809 | 983 | |
810 | 984 | |
811 | 985 | |
812 | 986 | |
813 | 987 | |
814 | 988 | |
815 | 989 | |
816 | 990 | |
817 | 991 | |
818 | 992 | |
819 | 993 | |
820 | 994 | |
821 | 995 | |
822 | 996 | |
823 | 997 | |
824 | 998 | |
... | ... | @@ -805,95 +979,408 @@ |
805 | 979 | return ret; |
806 | 980 | } |
807 | 981 | |
808 | -static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | |
809 | - struct seq_file *m) | |
982 | +static void | |
983 | +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) | |
810 | 984 | { |
811 | - struct blkio_cgroup *blkcg; | |
985 | + switch(pn->plid) { | |
986 | + case BLKIO_POLICY_PROP: | |
987 | + if (pn->fileid == BLKIO_PROP_weight_device) | |
988 | + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | |
989 | + MINOR(pn->dev), pn->val.weight); | |
990 | + break; | |
991 | + case BLKIO_POLICY_THROTL: | |
992 | + switch(pn->fileid) { | |
993 | + case BLKIO_THROTL_read_bps_device: | |
994 | + case BLKIO_THROTL_write_bps_device: | |
995 | + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), | |
996 | + MINOR(pn->dev), pn->val.bps); | |
997 | + break; | |
998 | + case BLKIO_THROTL_read_iops_device: | |
999 | + case BLKIO_THROTL_write_iops_device: | |
1000 | + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | |
1001 | + MINOR(pn->dev), pn->val.iops); | |
1002 | + break; | |
1003 | + } | |
1004 | + break; | |
1005 | + default: | |
1006 | + BUG(); | |
1007 | + } | |
1008 | +} | |
1009 | + | |
1010 | +/* cgroup files which read their data from policy nodes end up here */ | |
1011 | +static void blkio_read_policy_node_files(struct cftype *cft, | |
1012 | + struct blkio_cgroup *blkcg, struct seq_file *m) | |
1013 | +{ | |
812 | 1014 | struct blkio_policy_node *pn; |
813 | 1015 | |
814 | - seq_printf(m, "dev\tweight\n"); | |
815 | - | |
816 | - blkcg = cgroup_to_blkio_cgroup(cgrp); | |
817 | 1016 | if (!list_empty(&blkcg->policy_list)) { |
818 | 1017 | spin_lock_irq(&blkcg->lock); |
819 | 1018 | list_for_each_entry(pn, &blkcg->policy_list, node) { |
820 | - seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | |
821 | - MINOR(pn->dev), pn->weight); | |
1019 | + if (!pn_matches_cftype(cft, pn)) | |
1020 | + continue; | |
1021 | + blkio_print_policy_node(m, pn); | |
822 | 1022 | } |
823 | 1023 | spin_unlock_irq(&blkcg->lock); |
824 | 1024 | } |
1025 | +} | |
825 | 1026 | |
1027 | +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, | |
1028 | + struct seq_file *m) | |
1029 | +{ | |
1030 | + struct blkio_cgroup *blkcg; | |
1031 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
1032 | + int name = BLKIOFILE_ATTR(cft->private); | |
1033 | + | |
1034 | + blkcg = cgroup_to_blkio_cgroup(cgrp); | |
1035 | + | |
1036 | + switch(plid) { | |
1037 | + case BLKIO_POLICY_PROP: | |
1038 | + switch(name) { | |
1039 | + case BLKIO_PROP_weight_device: | |
1040 | + blkio_read_policy_node_files(cft, blkcg, m); | |
1041 | + return 0; | |
1042 | + default: | |
1043 | + BUG(); | |
1044 | + } | |
1045 | + break; | |
1046 | + case BLKIO_POLICY_THROTL: | |
1047 | + switch(name){ | |
1048 | + case BLKIO_THROTL_read_bps_device: | |
1049 | + case BLKIO_THROTL_write_bps_device: | |
1050 | + case BLKIO_THROTL_read_iops_device: | |
1051 | + case BLKIO_THROTL_write_iops_device: | |
1052 | + blkio_read_policy_node_files(cft, blkcg, m); | |
1053 | + return 0; | |
1054 | + default: | |
1055 | + BUG(); | |
1056 | + } | |
1057 | + break; | |
1058 | + default: | |
1059 | + BUG(); | |
1060 | + } | |
1061 | + | |
826 | 1062 | return 0; |
827 | 1063 | } |
828 | 1064 | |
1065 | +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, | |
1066 | + struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type, | |
1067 | + bool show_total) | |
1068 | +{ | |
1069 | + struct blkio_group *blkg; | |
1070 | + struct hlist_node *n; | |
1071 | + uint64_t cgroup_total = 0; | |
1072 | + | |
1073 | + rcu_read_lock(); | |
1074 | + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { | |
1075 | + if (blkg->dev) { | |
1076 | + if (!cftype_blkg_same_policy(cft, blkg)) | |
1077 | + continue; | |
1078 | + spin_lock_irq(&blkg->stats_lock); | |
1079 | + cgroup_total += blkio_get_stat(blkg, cb, blkg->dev, | |
1080 | + type); | |
1081 | + spin_unlock_irq(&blkg->stats_lock); | |
1082 | + } | |
1083 | + } | |
1084 | + if (show_total) | |
1085 | + cb->fill(cb, "Total", cgroup_total); | |
1086 | + rcu_read_unlock(); | |
1087 | + return 0; | |
1088 | +} | |
1089 | + | |
1090 | +/* All map kind of cgroup file get serviced by this function */ | |
1091 | +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | |
1092 | + struct cgroup_map_cb *cb) | |
1093 | +{ | |
1094 | + struct blkio_cgroup *blkcg; | |
1095 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
1096 | + int name = BLKIOFILE_ATTR(cft->private); | |
1097 | + | |
1098 | + blkcg = cgroup_to_blkio_cgroup(cgrp); | |
1099 | + | |
1100 | + switch(plid) { | |
1101 | + case BLKIO_POLICY_PROP: | |
1102 | + switch(name) { | |
1103 | + case BLKIO_PROP_time: | |
1104 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1105 | + BLKIO_STAT_TIME, 0); | |
1106 | + case BLKIO_PROP_sectors: | |
1107 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1108 | + BLKIO_STAT_SECTORS, 0); | |
1109 | + case BLKIO_PROP_io_service_bytes: | |
1110 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1111 | + BLKIO_STAT_SERVICE_BYTES, 1); | |
1112 | + case BLKIO_PROP_io_serviced: | |
1113 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1114 | + BLKIO_STAT_SERVICED, 1); | |
1115 | + case BLKIO_PROP_io_service_time: | |
1116 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1117 | + BLKIO_STAT_SERVICE_TIME, 1); | |
1118 | + case BLKIO_PROP_io_wait_time: | |
1119 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1120 | + BLKIO_STAT_WAIT_TIME, 1); | |
1121 | + case BLKIO_PROP_io_merged: | |
1122 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1123 | + BLKIO_STAT_MERGED, 1); | |
1124 | + case BLKIO_PROP_io_queued: | |
1125 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1126 | + BLKIO_STAT_QUEUED, 1); | |
1127 | +#ifdef CONFIG_DEBUG_BLK_CGROUP | |
1128 | + case BLKIO_PROP_dequeue: | |
1129 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1130 | + BLKIO_STAT_DEQUEUE, 0); | |
1131 | + case BLKIO_PROP_avg_queue_size: | |
1132 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1133 | + BLKIO_STAT_AVG_QUEUE_SIZE, 0); | |
1134 | + case BLKIO_PROP_group_wait_time: | |
1135 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1136 | + BLKIO_STAT_GROUP_WAIT_TIME, 0); | |
1137 | + case BLKIO_PROP_idle_time: | |
1138 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1139 | + BLKIO_STAT_IDLE_TIME, 0); | |
1140 | + case BLKIO_PROP_empty_time: | |
1141 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1142 | + BLKIO_STAT_EMPTY_TIME, 0); | |
1143 | +#endif | |
1144 | + default: | |
1145 | + BUG(); | |
1146 | + } | |
1147 | + break; | |
1148 | + case BLKIO_POLICY_THROTL: | |
1149 | + switch(name){ | |
1150 | + case BLKIO_THROTL_io_service_bytes: | |
1151 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1152 | + BLKIO_STAT_SERVICE_BYTES, 1); | |
1153 | + case BLKIO_THROTL_io_serviced: | |
1154 | + return blkio_read_blkg_stats(blkcg, cft, cb, | |
1155 | + BLKIO_STAT_SERVICED, 1); | |
1156 | + default: | |
1157 | + BUG(); | |
1158 | + } | |
1159 | + break; | |
1160 | + default: | |
1161 | + BUG(); | |
1162 | + } | |
1163 | + | |
1164 | + return 0; | |
1165 | +} | |
1166 | + | |
1167 | +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) | |
1168 | +{ | |
1169 | + struct blkio_group *blkg; | |
1170 | + struct hlist_node *n; | |
1171 | + struct blkio_policy_node *pn; | |
1172 | + | |
1173 | + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | |
1174 | + return -EINVAL; | |
1175 | + | |
1176 | + spin_lock(&blkio_list_lock); | |
1177 | + spin_lock_irq(&blkcg->lock); | |
1178 | + blkcg->weight = (unsigned int)val; | |
1179 | + | |
1180 | + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | |
1181 | + pn = blkio_policy_search_node(blkcg, blkg->dev, | |
1182 | + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); | |
1183 | + if (pn) | |
1184 | + continue; | |
1185 | + | |
1186 | + blkio_update_group_weight(blkg, blkcg->weight); | |
1187 | + } | |
1188 | + spin_unlock_irq(&blkcg->lock); | |
1189 | + spin_unlock(&blkio_list_lock); | |
1190 | + return 0; | |
1191 | +} | |
1192 | + | |
1193 | +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { | |
1194 | + struct blkio_cgroup *blkcg; | |
1195 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
1196 | + int name = BLKIOFILE_ATTR(cft->private); | |
1197 | + | |
1198 | + blkcg = cgroup_to_blkio_cgroup(cgrp); | |
1199 | + | |
1200 | + switch(plid) { | |
1201 | + case BLKIO_POLICY_PROP: | |
1202 | + switch(name) { | |
1203 | + case BLKIO_PROP_weight: | |
1204 | + return (u64)blkcg->weight; | |
1205 | + } | |
1206 | + break; | |
1207 | + default: | |
1208 | + BUG(); | |
1209 | + } | |
1210 | + return 0; | |
1211 | +} | |
1212 | + | |
1213 | +static int | |
1214 | +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | |
1215 | +{ | |
1216 | + struct blkio_cgroup *blkcg; | |
1217 | + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | |
1218 | + int name = BLKIOFILE_ATTR(cft->private); | |
1219 | + | |
1220 | + blkcg = cgroup_to_blkio_cgroup(cgrp); | |
1221 | + | |
1222 | + switch(plid) { | |
1223 | + case BLKIO_POLICY_PROP: | |
1224 | + switch(name) { | |
1225 | + case BLKIO_PROP_weight: | |
1226 | + return blkio_weight_write(blkcg, val); | |
1227 | + } | |
1228 | + break; | |
1229 | + default: | |
1230 | + BUG(); | |
1231 | + } | |
1232 | + | |
1233 | + return 0; | |
1234 | +} | |
1235 | + | |
829 | 1236 | struct cftype blkio_files[] = { |
830 | 1237 | { |
831 | 1238 | .name = "weight_device", |
832 | - .read_seq_string = blkiocg_weight_device_read, | |
833 | - .write_string = blkiocg_weight_device_write, | |
1239 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1240 | + BLKIO_PROP_weight_device), | |
1241 | + .read_seq_string = blkiocg_file_read, | |
1242 | + .write_string = blkiocg_file_write, | |
834 | 1243 | .max_write_len = 256, |
835 | 1244 | }, |
836 | 1245 | { |
837 | 1246 | .name = "weight", |
838 | - .read_u64 = blkiocg_weight_read, | |
839 | - .write_u64 = blkiocg_weight_write, | |
1247 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1248 | + BLKIO_PROP_weight), | |
1249 | + .read_u64 = blkiocg_file_read_u64, | |
1250 | + .write_u64 = blkiocg_file_write_u64, | |
840 | 1251 | }, |
841 | 1252 | { |
842 | 1253 | .name = "time", |
843 | - .read_map = blkiocg_time_read, | |
1254 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1255 | + BLKIO_PROP_time), | |
1256 | + .read_map = blkiocg_file_read_map, | |
844 | 1257 | }, |
845 | 1258 | { |
846 | 1259 | .name = "sectors", |
847 | - .read_map = blkiocg_sectors_read, | |
1260 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1261 | + BLKIO_PROP_sectors), | |
1262 | + .read_map = blkiocg_file_read_map, | |
848 | 1263 | }, |
849 | 1264 | { |
850 | 1265 | .name = "io_service_bytes", |
851 | - .read_map = blkiocg_io_service_bytes_read, | |
1266 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1267 | + BLKIO_PROP_io_service_bytes), | |
1268 | + .read_map = blkiocg_file_read_map, | |
852 | 1269 | }, |
853 | 1270 | { |
854 | 1271 | .name = "io_serviced", |
855 | - .read_map = blkiocg_io_serviced_read, | |
1272 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1273 | + BLKIO_PROP_io_serviced), | |
1274 | + .read_map = blkiocg_file_read_map, | |
856 | 1275 | }, |
857 | 1276 | { |
858 | 1277 | .name = "io_service_time", |
859 | - .read_map = blkiocg_io_service_time_read, | |
1278 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1279 | + BLKIO_PROP_io_service_time), | |
1280 | + .read_map = blkiocg_file_read_map, | |
860 | 1281 | }, |
861 | 1282 | { |
862 | 1283 | .name = "io_wait_time", |
863 | - .read_map = blkiocg_io_wait_time_read, | |
1284 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1285 | + BLKIO_PROP_io_wait_time), | |
1286 | + .read_map = blkiocg_file_read_map, | |
864 | 1287 | }, |
865 | 1288 | { |
866 | 1289 | .name = "io_merged", |
867 | - .read_map = blkiocg_io_merged_read, | |
1290 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1291 | + BLKIO_PROP_io_merged), | |
1292 | + .read_map = blkiocg_file_read_map, | |
868 | 1293 | }, |
869 | 1294 | { |
870 | 1295 | .name = "io_queued", |
871 | - .read_map = blkiocg_io_queued_read, | |
1296 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1297 | + BLKIO_PROP_io_queued), | |
1298 | + .read_map = blkiocg_file_read_map, | |
872 | 1299 | }, |
873 | 1300 | { |
874 | 1301 | .name = "reset_stats", |
875 | 1302 | .write_u64 = blkiocg_reset_stats, |
876 | 1303 | }, |
1304 | +#ifdef CONFIG_BLK_DEV_THROTTLING | |
1305 | + { | |
1306 | + .name = "throttle.read_bps_device", | |
1307 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | |
1308 | + BLKIO_THROTL_read_bps_device), | |
1309 | + .read_seq_string = blkiocg_file_read, | |
1310 | + .write_string = blkiocg_file_write, | |
1311 | + .max_write_len = 256, | |
1312 | + }, | |
1313 | + | |
1314 | + { | |
1315 | + .name = "throttle.write_bps_device", | |
1316 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | |
1317 | + BLKIO_THROTL_write_bps_device), | |
1318 | + .read_seq_string = blkiocg_file_read, | |
1319 | + .write_string = blkiocg_file_write, | |
1320 | + .max_write_len = 256, | |
1321 | + }, | |
1322 | + | |
1323 | + { | |
1324 | + .name = "throttle.read_iops_device", | |
1325 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | |
1326 | + BLKIO_THROTL_read_iops_device), | |
1327 | + .read_seq_string = blkiocg_file_read, | |
1328 | + .write_string = blkiocg_file_write, | |
1329 | + .max_write_len = 256, | |
1330 | + }, | |
1331 | + | |
1332 | + { | |
1333 | + .name = "throttle.write_iops_device", | |
1334 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | |
1335 | + BLKIO_THROTL_write_iops_device), | |
1336 | + .read_seq_string = blkiocg_file_read, | |
1337 | + .write_string = blkiocg_file_write, | |
1338 | + .max_write_len = 256, | |
1339 | + }, | |
1340 | + { | |
1341 | + .name = "throttle.io_service_bytes", | |
1342 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | |
1343 | + BLKIO_THROTL_io_service_bytes), | |
1344 | + .read_map = blkiocg_file_read_map, | |
1345 | + }, | |
1346 | + { | |
1347 | + .name = "throttle.io_serviced", | |
1348 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | |
1349 | + BLKIO_THROTL_io_serviced), | |
1350 | + .read_map = blkiocg_file_read_map, | |
1351 | + }, | |
1352 | +#endif /* CONFIG_BLK_DEV_THROTTLING */ | |
1353 | + | |
877 | 1354 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
878 | 1355 | { |
879 | 1356 | .name = "avg_queue_size", |
880 | - .read_map = blkiocg_avg_queue_size_read, | |
1357 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1358 | + BLKIO_PROP_avg_queue_size), | |
1359 | + .read_map = blkiocg_file_read_map, | |
881 | 1360 | }, |
882 | 1361 | { |
883 | 1362 | .name = "group_wait_time", |
884 | - .read_map = blkiocg_group_wait_time_read, | |
1363 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1364 | + BLKIO_PROP_group_wait_time), | |
1365 | + .read_map = blkiocg_file_read_map, | |
885 | 1366 | }, |
886 | 1367 | { |
887 | 1368 | .name = "idle_time", |
888 | - .read_map = blkiocg_idle_time_read, | |
1369 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1370 | + BLKIO_PROP_idle_time), | |
1371 | + .read_map = blkiocg_file_read_map, | |
889 | 1372 | }, |
890 | 1373 | { |
891 | 1374 | .name = "empty_time", |
892 | - .read_map = blkiocg_empty_time_read, | |
1375 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1376 | + BLKIO_PROP_empty_time), | |
1377 | + .read_map = blkiocg_file_read_map, | |
893 | 1378 | }, |
894 | 1379 | { |
895 | 1380 | .name = "dequeue", |
896 | - .read_map = blkiocg_dequeue_read, | |
1381 | + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | |
1382 | + BLKIO_PROP_dequeue), | |
1383 | + .read_map = blkiocg_file_read_map, | |
897 | 1384 | }, |
898 | 1385 | #endif |
899 | 1386 | }; |
900 | 1387 | |
901 | 1388 | |
... | ... | @@ -932,13 +1419,14 @@ |
932 | 1419 | /* |
933 | 1420 | * This blkio_group is being unlinked as associated cgroup is |
934 | 1421 | * going away. Let all the IO controlling policies know about |
935 | - * this event. Currently this is static call to one io | |
936 | - * controlling policy. Once we have more policies in place, we | |
937 | - * need some dynamic registration of callback function. | |
1422 | + * this event. | |
938 | 1423 | */ |
939 | 1424 | spin_lock(&blkio_list_lock); |
940 | - list_for_each_entry(blkiop, &blkio_list, list) | |
1425 | + list_for_each_entry(blkiop, &blkio_list, list) { | |
1426 | + if (blkiop->plid != blkg->plid) | |
1427 | + continue; | |
941 | 1428 | blkiop->ops.blkio_unlink_group_fn(key, blkg); |
1429 | + } | |
942 | 1430 | spin_unlock(&blkio_list_lock); |
943 | 1431 | } while (1); |
944 | 1432 |
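
All of these cgroup files now funnel into a few common handlers (blkiocg_file_read(), blkiocg_file_write(), blkiocg_file_read_map()), which tell the files apart by the value packed into cft->private. A stand-alone sketch of that encoding, reusing the BLKIOFILE_* macros and trimmed copies of the enums from the hunks above and in blk-cgroup.h:

	#include <assert.h>

	/* Same packing as blk-cgroup.c: policy id in the high 16 bits, file id in the low 16. */
	#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
	#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
	#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

	enum blkio_policy_id { BLKIO_POLICY_PROP = 0, BLKIO_POLICY_THROTL };
	enum blkcg_file_name_throtl { BLKIO_THROTL_read_bps_device = 0 /* trimmed */ };

	int main(void)
	{
		int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
					     BLKIO_THROTL_read_bps_device);

		/* The common handlers recover both halves exactly like this. */
		assert(BLKIOFILE_POLICY(priv) == BLKIO_POLICY_THROTL);
		assert(BLKIOFILE_ATTR(priv) == BLKIO_THROTL_read_bps_device);
		return 0;
	}

Packing both identifiers into the single private word keeps struct cftype unchanged while letting proportional-weight and throttle files share their read/write code paths.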
block/blk-cgroup.h
... | ... | @@ -15,6 +15,14 @@ |
15 | 15 | |
16 | 16 | #include <linux/cgroup.h> |
17 | 17 | |
18 | +enum blkio_policy_id { | |
19 | + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ | |
20 | + BLKIO_POLICY_THROTL, /* Throttling */ | |
21 | +}; | |
22 | + | |
23 | +/* Max limits for throttle policy */ | |
24 | +#define THROTL_IOPS_MAX UINT_MAX | |
25 | + | |
18 | 26 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
19 | 27 | |
20 | 28 | #ifndef CONFIG_BLK_CGROUP |
... | ... | @@ -65,6 +73,35 @@ |
65 | 73 | BLKG_empty, |
66 | 74 | }; |
67 | 75 | |
76 | +/* cgroup files owned by proportional weight policy */ | |
77 | +enum blkcg_file_name_prop { | |
78 | + BLKIO_PROP_weight = 1, | |
79 | + BLKIO_PROP_weight_device, | |
80 | + BLKIO_PROP_io_service_bytes, | |
81 | + BLKIO_PROP_io_serviced, | |
82 | + BLKIO_PROP_time, | |
83 | + BLKIO_PROP_sectors, | |
84 | + BLKIO_PROP_io_service_time, | |
85 | + BLKIO_PROP_io_wait_time, | |
86 | + BLKIO_PROP_io_merged, | |
87 | + BLKIO_PROP_io_queued, | |
88 | + BLKIO_PROP_avg_queue_size, | |
89 | + BLKIO_PROP_group_wait_time, | |
90 | + BLKIO_PROP_idle_time, | |
91 | + BLKIO_PROP_empty_time, | |
92 | + BLKIO_PROP_dequeue, | |
93 | +}; | |
94 | + | |
95 | +/* cgroup files owned by throttle policy */ | |
96 | +enum blkcg_file_name_throtl { | |
97 | + BLKIO_THROTL_read_bps_device, | |
98 | + BLKIO_THROTL_write_bps_device, | |
99 | + BLKIO_THROTL_read_iops_device, | |
100 | + BLKIO_THROTL_write_iops_device, | |
101 | + BLKIO_THROTL_io_service_bytes, | |
102 | + BLKIO_THROTL_io_serviced, | |
103 | +}; | |
104 | + | |
68 | 105 | struct blkio_cgroup { |
69 | 106 | struct cgroup_subsys_state css; |
70 | 107 | unsigned int weight; |
... | ... | @@ -112,6 +149,8 @@ |
112 | 149 | char path[128]; |
113 | 150 | /* The device MKDEV(major, minor), this group has been created for */ |
114 | 151 | dev_t dev; |
152 | + /* policy which owns this blk group */ | |
153 | + enum blkio_policy_id plid; | |
115 | 154 | |
116 | 155 | /* Need to serialize the stats in the case of reset/update */ |
117 | 156 | spinlock_t stats_lock; |
118 | 157 | |
119 | 158 | |
120 | 159 | |
121 | 160 | |
122 | 161 | |
... | ... | @@ -121,24 +160,60 @@ |
121 | 160 | struct blkio_policy_node { |
122 | 161 | struct list_head node; |
123 | 162 | dev_t dev; |
124 | - unsigned int weight; | |
163 | + /* This node belongs to max bw policy or proportional weight policy */ | 
164 | + enum blkio_policy_id plid; | |
165 | + /* cgroup file to which this rule belongs to */ | |
166 | + int fileid; | |
167 | + | |
168 | + union { | |
169 | + unsigned int weight; | |
170 | + /* | |
171 | + * Rate read/write in terms of bytes per second | 
172 | + * Whether this rate represents read or write is determined | |
173 | + * by file type "fileid". | |
174 | + */ | |
175 | + u64 bps; | |
176 | + unsigned int iops; | |
177 | + } val; | |
125 | 178 | }; |
126 | 179 | |
127 | 180 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, |
128 | 181 | dev_t dev); |
182 | +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, | |
183 | + dev_t dev); | |
184 | +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, | |
185 | + dev_t dev); | |
186 | +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, | |
187 | + dev_t dev); | |
188 | +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, | |
189 | + dev_t dev); | |
129 | 190 | |
130 | 191 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); |
131 | -typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, | |
132 | - unsigned int weight); | |
133 | 192 | |
193 | +typedef void (blkio_update_group_weight_fn) (void *key, | |
194 | + struct blkio_group *blkg, unsigned int weight); | |
195 | +typedef void (blkio_update_group_read_bps_fn) (void * key, | |
196 | + struct blkio_group *blkg, u64 read_bps); | |
197 | +typedef void (blkio_update_group_write_bps_fn) (void *key, | |
198 | + struct blkio_group *blkg, u64 write_bps); | |
199 | +typedef void (blkio_update_group_read_iops_fn) (void *key, | |
200 | + struct blkio_group *blkg, unsigned int read_iops); | |
201 | +typedef void (blkio_update_group_write_iops_fn) (void *key, | |
202 | + struct blkio_group *blkg, unsigned int write_iops); | |
203 | + | |
134 | 204 | struct blkio_policy_ops { |
135 | 205 | blkio_unlink_group_fn *blkio_unlink_group_fn; |
136 | 206 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; |
207 | + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; | |
208 | + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; | |
209 | + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; | |
210 | + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; | |
137 | 211 | }; |
138 | 212 | |
139 | 213 | struct blkio_policy_type { |
140 | 214 | struct list_head list; |
141 | 215 | struct blkio_policy_ops ops; |
216 | + enum blkio_policy_id plid; | |
142 | 217 | }; |
143 | 218 | |
144 | 219 | /* Blkio controller policy registration */ |
... | ... | @@ -212,7 +287,8 @@ |
212 | 287 | extern struct blkio_cgroup blkio_root_cgroup; |
213 | 288 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); |
214 | 289 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
215 | - struct blkio_group *blkg, void *key, dev_t dev); | |
290 | + struct blkio_group *blkg, void *key, dev_t dev, | |
291 | + enum blkio_policy_id plid); | |
216 | 292 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); |
217 | 293 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
218 | 294 | void *key); |
... | ... | @@ -234,7 +310,8 @@ |
234 | 310 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } |
235 | 311 | |
236 | 312 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
237 | - struct blkio_group *blkg, void *key, dev_t dev) {} | |
313 | + struct blkio_group *blkg, void *key, dev_t dev, | |
314 | + enum blkio_policy_id plid) {} | |
238 | 315 | |
239 | 316 | static inline int |
240 | 317 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } |
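
The header now carries everything a policy needs to describe itself: a plid plus update callbacks for weight, bps and iops. A hedged sketch of how a throttling-style policy might fill in the extended blkio_policy_type; the my_* names are placeholders invented for the example, not the real blk-throttle.c symbols, and the snippet assumes blk-cgroup.h is included:

	static void my_unlink_group(void *key, struct blkio_group *blkg) { /* tear down group */ }
	static void my_update_read_bps(void *key, struct blkio_group *blkg, u64 bps) { /* ... */ }
	static void my_update_write_bps(void *key, struct blkio_group *blkg, u64 bps) { /* ... */ }
	static void my_update_read_iops(void *key, struct blkio_group *blkg, unsigned int iops) { /* ... */ }
	static void my_update_write_iops(void *key, struct blkio_group *blkg, unsigned int iops) { /* ... */ }

	static struct blkio_policy_type my_throtl_policy = {
		.ops = {
			.blkio_unlink_group_fn		  = my_unlink_group,
			.blkio_update_group_read_bps_fn	  = my_update_read_bps,
			.blkio_update_group_write_bps_fn  = my_update_write_bps,
			.blkio_update_group_read_iops_fn  = my_update_read_iops,
			.blkio_update_group_write_iops_fn = my_update_write_iops,
		},
		.plid = BLKIO_POLICY_THROTL,	/* compared against blkg->plid before any callback runs */
	};

Because blkio_update_group_bps()/iops() in blk-cgroup.c skip any policy whose plid does not match blkg->plid, a proportional-weight group never receives throttle updates and vice versa.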
block/blk-core.c
... | ... | @@ -64,13 +64,15 @@ |
64 | 64 | return; |
65 | 65 | |
66 | 66 | cpu = part_stat_lock(); |
67 | - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | |
68 | 67 | |
69 | - if (!new_io) | |
68 | + if (!new_io) { | |
69 | + part = rq->part; | |
70 | 70 | part_stat_inc(cpu, part, merges[rw]); |
71 | - else { | |
71 | + } else { | |
72 | + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | |
72 | 73 | part_round_stats(cpu, part); |
73 | 74 | part_inc_in_flight(part, rw); |
75 | + rq->part = part; | |
74 | 76 | } |
75 | 77 | |
76 | 78 | part_stat_unlock(); |
... | ... | @@ -128,6 +130,7 @@ |
128 | 130 | rq->ref_count = 1; |
129 | 131 | rq->start_time = jiffies; |
130 | 132 | set_start_time_ns(rq); |
133 | + rq->part = NULL; | |
131 | 134 | } |
132 | 135 | EXPORT_SYMBOL(blk_rq_init); |
133 | 136 | |
... | ... | @@ -382,6 +385,7 @@ |
382 | 385 | del_timer_sync(&q->unplug_timer); |
383 | 386 | del_timer_sync(&q->timeout); |
384 | 387 | cancel_work_sync(&q->unplug_work); |
388 | + throtl_shutdown_timer_wq(q); | |
385 | 389 | } |
386 | 390 | EXPORT_SYMBOL(blk_sync_queue); |
387 | 391 | |
... | ... | @@ -459,6 +463,8 @@ |
459 | 463 | if (q->elevator) |
460 | 464 | elevator_exit(q->elevator); |
461 | 465 | |
466 | + blk_throtl_exit(q); | |
467 | + | |
462 | 468 | blk_put_queue(q); |
463 | 469 | } |
464 | 470 | EXPORT_SYMBOL(blk_cleanup_queue); |
... | ... | @@ -515,6 +521,11 @@ |
515 | 521 | return NULL; |
516 | 522 | } |
517 | 523 | |
524 | + if (blk_throtl_init(q)) { | |
525 | + kmem_cache_free(blk_requestq_cachep, q); | |
526 | + return NULL; | |
527 | + } | |
528 | + | |
518 | 529 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, |
519 | 530 | laptop_mode_timer_fn, (unsigned long) q); |
520 | 531 | init_timer(&q->unplug_timer); |
521 | 532 | |
... | ... | @@ -796,11 +807,16 @@ |
796 | 807 | rl->starved[is_sync] = 0; |
797 | 808 | |
798 | 809 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
799 | - if (priv) | |
810 | + if (priv) { | |
800 | 811 | rl->elvpriv++; |
801 | 812 | |
802 | - if (blk_queue_io_stat(q)) | |
803 | - rw_flags |= REQ_IO_STAT; | |
813 | + /* | |
814 | + * Don't do stats for non-priv requests | |
815 | + */ | |
816 | + if (blk_queue_io_stat(q)) | |
817 | + rw_flags |= REQ_IO_STAT; | |
818 | + } | |
819 | + | |
804 | 820 | spin_unlock_irq(q->queue_lock); |
805 | 821 | |
806 | 822 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); |
... | ... | @@ -1522,6 +1538,15 @@ |
1522 | 1538 | goto end_io; |
1523 | 1539 | } |
1524 | 1540 | |
1541 | + blk_throtl_bio(q, &bio); | |
1542 | + | |
1543 | + /* | |
 1544 | + * If bio is NULL, the bio has been throttled and will be submitted | |
1545 | + * later. | |
1546 | + */ | |
1547 | + if (!bio) | |
1548 | + break; | |
1549 | + | |
1525 | 1550 | trace_block_bio_queue(q, bio); |
1526 | 1551 | |
1527 | 1552 | ret = q->make_request_fn(q, bio); |
1528 | 1553 | |
... | ... | @@ -1612,11 +1637,12 @@ |
1612 | 1637 | |
1613 | 1638 | if (unlikely(block_dump)) { |
1614 | 1639 | char b[BDEVNAME_SIZE]; |
1615 | - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", | |
1640 | + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", | |
1616 | 1641 | current->comm, task_pid_nr(current), |
1617 | 1642 | (rw & WRITE) ? "WRITE" : "READ", |
1618 | 1643 | (unsigned long long)bio->bi_sector, |
1619 | - bdevname(bio->bi_bdev, b)); | |
1644 | + bdevname(bio->bi_bdev, b), | |
1645 | + count); | |
1620 | 1646 | } |
1621 | 1647 | } |
1622 | 1648 | |
... | ... | @@ -1759,7 +1785,7 @@ |
1759 | 1785 | int cpu; |
1760 | 1786 | |
1761 | 1787 | cpu = part_stat_lock(); |
1762 | - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | |
1788 | + part = req->part; | |
1763 | 1789 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); |
1764 | 1790 | part_stat_unlock(); |
1765 | 1791 | } |
... | ... | @@ -1779,7 +1805,7 @@ |
1779 | 1805 | int cpu; |
1780 | 1806 | |
1781 | 1807 | cpu = part_stat_lock(); |
1782 | - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | |
1808 | + part = req->part; | |
1783 | 1809 | |
1784 | 1810 | part_stat_inc(cpu, part, ios[rw]); |
1785 | 1811 | part_stat_add(cpu, part, ticks[rw], duration); |
... | ... | @@ -2578,6 +2604,13 @@ |
2578 | 2604 | return queue_work(kblockd_workqueue, work); |
2579 | 2605 | } |
2580 | 2606 | EXPORT_SYMBOL(kblockd_schedule_work); |
2607 | + | |
2608 | +int kblockd_schedule_delayed_work(struct request_queue *q, | |
2609 | + struct delayed_work *dwork, unsigned long delay) | |
2610 | +{ | |
2611 | + return queue_delayed_work(kblockd_workqueue, dwork, delay); | |
2612 | +} | |
2613 | +EXPORT_SYMBOL(kblockd_schedule_delayed_work); | |
2581 | 2614 | |
2582 | 2615 | int __init blk_dev_init(void) |
2583 | 2616 | { |
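Two things stand out in the blk-core.c hunks: generic_make_request() now hands each bio to blk_throtl_bio() before the elevator sees it, and partition accounting caches rq->part when a new I/O is first accounted so later merge and completion accounting hits the same partition even if the merged request now starts on a different one. A small userspace model of that caching idea (made-up types, not the kernel structures):

```c
/* Userspace model of the cross-partition accounting fix: resolve the
 * partition once when a new I/O is accounted and cache it on the request,
 * instead of re-mapping the (possibly moved) start sector on every merge.
 * Made-up types for illustration. */
#include <stdio.h>

struct part { const char *name; unsigned long ios, merges; };

struct req {
    unsigned long long sector;
    struct part *part;              /* cached at first accounting */
};

static struct part parts[2] = { { "sda1", 0, 0 }, { "sda2", 0, 0 } };

/* Crude sector map: first 1000 sectors belong to sda1, the rest to sda2. */
static struct part *map_sector(unsigned long long sector)
{
    return sector < 1000 ? &parts[0] : &parts[1];
}

static void account(struct req *rq, int new_io)
{
    if (!new_io) {
        rq->part->merges++;         /* reuse the cached partition */
    } else {
        rq->part = map_sector(rq->sector);
        rq->part->ios++;
    }
}

int main(void)
{
    struct req rq = { .sector = 990, .part = NULL };

    account(&rq, 1);                /* new I/O near the partition boundary */
    rq.sector = 1005;               /* a merge moved the start sector */
    account(&rq, 0);                /* still charged to sda1, not sda2 */
    printf("%s: ios=%lu merges=%lu  %s: ios=%lu merges=%lu\n",
           parts[0].name, parts[0].ios, parts[0].merges,
           parts[1].name, parts[1].ios, parts[1].merges);
    return 0;
}
```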
block/blk-exec.c
... | ... | @@ -80,6 +80,7 @@ |
80 | 80 | DECLARE_COMPLETION_ONSTACK(wait); |
81 | 81 | char sense[SCSI_SENSE_BUFFERSIZE]; |
82 | 82 | int err = 0; |
83 | + unsigned long hang_check; | |
83 | 84 | |
84 | 85 | /* |
85 | 86 | * we need an extra reference to the request, so we can look at |
... | ... | @@ -95,7 +96,13 @@ |
95 | 96 | |
96 | 97 | rq->end_io_data = &wait; |
97 | 98 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); |
98 | - wait_for_completion(&wait); | |
99 | + | |
100 | + /* Prevent hang_check timer from firing at us during very long I/O */ | |
101 | + hang_check = sysctl_hung_task_timeout_secs; | |
102 | + if (hang_check) | |
103 | + while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); | |
104 | + else | |
105 | + wait_for_completion(&wait); | |
99 | 106 | |
100 | 107 | if (rq->errors) |
101 | 108 | err = -EIO; |
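The blk_execute_rq() change waits for the completion in slices of sysctl_hung_task_timeout_secs * HZ/2 jiffies, so the hung-task watchdog never sees the caller blocked for a full timeout period during a long but healthy I/O. A userspace analog of that chunked wait, using POSIX semaphores purely for illustration:

```c
/* Userspace analog of the chunked wait: poll the completion with a timeout
 * of roughly half the watchdog window so a checker looking for tasks blocked
 * longer than hang_check seconds never fires on a long but healthy request.
 * Demo only; the timings are invented. */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static sem_t done;

static void *slow_io(void *arg)
{
    (void)arg;
    sleep(3);                        /* pretend the "I/O" takes 3 seconds */
    sem_post(&done);
    return NULL;
}

int main(void)
{
    pthread_t t;
    unsigned long hang_check = 2;    /* watchdog window, in seconds */
    struct timespec ts;

    sem_init(&done, 0, 0);
    pthread_create(&t, NULL, slow_io, NULL);

    if (hang_check) {
        do {                         /* wake up before the watchdog would */
            clock_gettime(CLOCK_REALTIME, &ts);
            ts.tv_sec += hang_check / 2 ? hang_check / 2 : 1;
        } while (sem_timedwait(&done, &ts) != 0);
    } else {
        sem_wait(&done);
    }

    puts("request completed");
    pthread_join(t, NULL);
    return 0;
}
```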
block/blk-integrity.c
... | ... | @@ -32,24 +32,37 @@ |
32 | 32 | |
33 | 33 | /** |
34 | 34 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements |
35 | - * @rq: request with integrity metadata attached | |
35 | + * @q: request queue | |
36 | + * @bio: bio with integrity metadata attached | |
36 | 37 | * |
37 | 38 | * Description: Returns the number of elements required in a |
38 | - * scatterlist corresponding to the integrity metadata in a request. | |
39 | + * scatterlist corresponding to the integrity metadata in a bio. | |
39 | 40 | */ |
40 | -int blk_rq_count_integrity_sg(struct request *rq) | |
41 | +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) | |
41 | 42 | { |
42 | - struct bio_vec *iv, *ivprv; | |
43 | - struct req_iterator iter; | |
44 | - unsigned int segments; | |
43 | + struct bio_vec *iv, *ivprv = NULL; | |
44 | + unsigned int segments = 0; | |
45 | + unsigned int seg_size = 0; | |
46 | + unsigned int i = 0; | |
45 | 47 | |
46 | - ivprv = NULL; | |
47 | - segments = 0; | |
48 | + bio_for_each_integrity_vec(iv, bio, i) { | |
48 | 49 | |
49 | - rq_for_each_integrity_segment(iv, rq, iter) { | |
50 | + if (ivprv) { | |
51 | + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | |
52 | + goto new_segment; | |
50 | 53 | |
51 | - if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | |
54 | + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) | |
55 | + goto new_segment; | |
56 | + | |
57 | + if (seg_size + iv->bv_len > queue_max_segment_size(q)) | |
58 | + goto new_segment; | |
59 | + | |
60 | + seg_size += iv->bv_len; | |
61 | + } else { | |
62 | +new_segment: | |
52 | 63 | segments++; |
64 | + seg_size = iv->bv_len; | |
65 | + } | |
53 | 66 | |
54 | 67 | ivprv = iv; |
55 | 68 | } |
56 | 69 | |
 57 | 70 | return segments; |
 58 | 71 | } |
 59 | 72 | EXPORT_SYMBOL(blk_rq_count_integrity_sg); |
60 | 73 | |
... | ... | @@ -60,30 +73,34 @@ |
60 | 73 | |
61 | 74 | /** |
62 | 75 | * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist |
63 | - * @rq: request with integrity metadata attached | |
76 | + * @q: request queue | |
77 | + * @bio: bio with integrity metadata attached | |
64 | 78 | * @sglist: target scatterlist |
65 | 79 | * |
66 | 80 | * Description: Map the integrity vectors in request into a |
67 | 81 | * scatterlist. The scatterlist must be big enough to hold all |
68 | 82 | * elements. I.e. sized using blk_rq_count_integrity_sg(). |
69 | 83 | */ |
70 | -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) | |
84 | +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, | |
85 | + struct scatterlist *sglist) | |
71 | 86 | { |
72 | - struct bio_vec *iv, *ivprv; | |
73 | - struct req_iterator iter; | |
74 | - struct scatterlist *sg; | |
75 | - unsigned int segments; | |
87 | + struct bio_vec *iv, *ivprv = NULL; | |
88 | + struct scatterlist *sg = NULL; | |
89 | + unsigned int segments = 0; | |
90 | + unsigned int i = 0; | |
76 | 91 | |
77 | - ivprv = NULL; | |
78 | - sg = NULL; | |
79 | - segments = 0; | |
92 | + bio_for_each_integrity_vec(iv, bio, i) { | |
80 | 93 | |
81 | - rq_for_each_integrity_segment(iv, rq, iter) { | |
82 | - | |
83 | 94 | if (ivprv) { |
84 | 95 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) |
85 | 96 | goto new_segment; |
86 | 97 | |
98 | + if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) | |
99 | + goto new_segment; | |
100 | + | |
101 | + if (sg->length + iv->bv_len > queue_max_segment_size(q)) | |
102 | + goto new_segment; | |
103 | + | |
87 | 104 | sg->length += iv->bv_len; |
88 | 105 | } else { |
89 | 106 | new_segment: |
... | ... | @@ -162,6 +179,40 @@ |
162 | 179 | } |
163 | 180 | EXPORT_SYMBOL(blk_integrity_compare); |
164 | 181 | |
182 | +int blk_integrity_merge_rq(struct request_queue *q, struct request *req, | |
183 | + struct request *next) | |
184 | +{ | |
185 | + if (blk_integrity_rq(req) != blk_integrity_rq(next)) | |
186 | + return -1; | |
187 | + | |
188 | + if (req->nr_integrity_segments + next->nr_integrity_segments > | |
189 | + q->limits.max_integrity_segments) | |
190 | + return -1; | |
191 | + | |
192 | + return 0; | |
193 | +} | |
194 | +EXPORT_SYMBOL(blk_integrity_merge_rq); | |
195 | + | |
196 | +int blk_integrity_merge_bio(struct request_queue *q, struct request *req, | |
197 | + struct bio *bio) | |
198 | +{ | |
199 | + int nr_integrity_segs; | |
200 | + struct bio *next = bio->bi_next; | |
201 | + | |
202 | + bio->bi_next = NULL; | |
203 | + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); | |
204 | + bio->bi_next = next; | |
205 | + | |
206 | + if (req->nr_integrity_segments + nr_integrity_segs > | |
207 | + q->limits.max_integrity_segments) | |
208 | + return -1; | |
209 | + | |
210 | + req->nr_integrity_segments += nr_integrity_segs; | |
211 | + | |
212 | + return 0; | |
213 | +} | |
214 | +EXPORT_SYMBOL(blk_integrity_merge_bio); | |
215 | + | |
165 | 216 | struct integrity_sysfs_entry { |
166 | 217 | struct attribute attr; |
167 | 218 | ssize_t (*show)(struct blk_integrity *, char *); |
... | ... | @@ -381,7 +432,6 @@ |
381 | 432 | kobject_uevent(&bi->kobj, KOBJ_REMOVE); |
382 | 433 | kobject_del(&bi->kobj); |
383 | 434 | kobject_put(&bi->kobj); |
384 | - kmem_cache_free(integrity_cachep, bi); | |
385 | 435 | disk->integrity = NULL; |
386 | 436 | } |
387 | 437 | EXPORT_SYMBOL(blk_integrity_unregister); |
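The reworked counting and mapping helpers walk the integrity biovecs of a single bio and only coalesce neighbouring vectors while they remain physically mergeable, stay within the segment boundary mask, and keep the running segment under queue_max_segment_size(). A simplified, runnable sketch of that counting rule (fake vectors; the boundary-mask check is omitted):

```c
/* Simplified sketch of the segment-counting rule: adjacent vectors are
 * folded into one scatterlist segment only while they are physically
 * contiguous and the running segment stays under the max segment size.
 * Fake vectors; the seg_boundary_mask check is omitted. */
#include <stdio.h>

struct vec { unsigned long addr; unsigned int len; };

static unsigned int count_segments(const struct vec *v, int nr,
                                   unsigned int max_seg_size)
{
    unsigned int segments = 0, seg_size = 0;
    int i;

    for (i = 0; i < nr; i++) {
        int contiguous = i > 0 &&
            v[i - 1].addr + v[i - 1].len == v[i].addr;

        if (contiguous && seg_size + v[i].len <= max_seg_size) {
            seg_size += v[i].len;    /* extend the current segment */
        } else {
            segments++;              /* start a new segment */
            seg_size = v[i].len;
        }
    }
    return segments;
}

int main(void)
{
    struct vec v[] = {
        { 0x1000, 512 }, { 0x1200, 512 },   /* contiguous -> one segment */
        { 0x8000, 512 },                    /* gap -> new segment */
    };

    printf("segments: %u\n", count_segments(v, 3, 4096));   /* prints 2 */
    return 0;
}
```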
block/blk-map.c
... | ... | @@ -54,7 +54,7 @@ |
54 | 54 | * direct dma. else, set up kernel bounce buffers |
55 | 55 | */ |
56 | 56 | uaddr = (unsigned long) ubuf; |
57 | - if (blk_rq_aligned(q, ubuf, len) && !map_data) | |
57 | + if (blk_rq_aligned(q, uaddr, len) && !map_data) | |
58 | 58 | bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); |
59 | 59 | else |
60 | 60 | bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); |
... | ... | @@ -288,6 +288,7 @@ |
288 | 288 | unsigned int len, gfp_t gfp_mask) |
289 | 289 | { |
290 | 290 | int reading = rq_data_dir(rq) == READ; |
291 | + unsigned long addr = (unsigned long) kbuf; | |
291 | 292 | int do_copy = 0; |
292 | 293 | struct bio *bio; |
293 | 294 | int ret; |
... | ... | @@ -297,7 +298,7 @@ |
297 | 298 | if (!len || !kbuf) |
298 | 299 | return -EINVAL; |
299 | 300 | |
300 | - do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); | |
301 | + do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); | |
301 | 302 | if (do_copy) |
302 | 303 | bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); |
303 | 304 | else |
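Both blk-map.c fixes pass the numeric address (uaddr/addr) to blk_rq_aligned() rather than the pointer, since the helper operates on an unsigned long. The underlying test is just a mask check on the address and length; a simplified stand-in (not the real inline from blkdev.h) looks like:

```c
/* Simplified stand-in for the alignment test: the buffer address and the
 * length must both satisfy the queue's DMA alignment mask before the buffer
 * may be mapped directly rather than bounced. */
#include <stdio.h>

static int rq_aligned(unsigned long addr, unsigned long len,
                      unsigned long dma_alignment_mask)
{
    return ((addr | len) & dma_alignment_mask) == 0;
}

int main(void)
{
    unsigned long mask = 511;          /* e.g. 512-byte DMA alignment */
    unsigned long addr = 0x10000;      /* pretend user buffer address */

    printf("aligned buffer, aligned length: %d\n", rq_aligned(addr, 1024, mask));
    printf("aligned buffer, odd length:     %d\n", rq_aligned(addr, 513, mask));
    printf("offset buffer:                  %d\n", rq_aligned(addr + 1, 1024, mask));
    return 0;
}
```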
block/blk-merge.c
... | ... | @@ -205,19 +205,24 @@ |
205 | 205 | { |
206 | 206 | int nr_phys_segs = bio_phys_segments(q, bio); |
207 | 207 | |
208 | - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { | |
209 | - req->cmd_flags |= REQ_NOMERGE; | |
210 | - if (req == q->last_merge) | |
211 | - q->last_merge = NULL; | |
212 | - return 0; | |
213 | - } | |
208 | + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) | |
209 | + goto no_merge; | |
214 | 210 | |
211 | + if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) | |
212 | + goto no_merge; | |
213 | + | |
215 | 214 | /* |
216 | 215 | * This will form the start of a new hw segment. Bump both |
217 | 216 | * counters. |
218 | 217 | */ |
219 | 218 | req->nr_phys_segments += nr_phys_segs; |
220 | 219 | return 1; |
220 | + | |
221 | +no_merge: | |
222 | + req->cmd_flags |= REQ_NOMERGE; | |
223 | + if (req == q->last_merge) | |
224 | + q->last_merge = NULL; | |
225 | + return 0; | |
221 | 226 | } |
222 | 227 | |
223 | 228 | int ll_back_merge_fn(struct request_queue *q, struct request *req, |
... | ... | @@ -301,6 +306,9 @@ |
301 | 306 | if (total_phys_segments > queue_max_segments(q)) |
302 | 307 | return 0; |
303 | 308 | |
309 | + if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) | |
310 | + return 0; | |
311 | + | |
304 | 312 | /* Merge is OK... */ |
305 | 313 | req->nr_phys_segments = total_phys_segments; |
306 | 314 | return 1; |
... | ... | @@ -343,7 +351,7 @@ |
343 | 351 | int cpu; |
344 | 352 | |
345 | 353 | cpu = part_stat_lock(); |
346 | - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | |
354 | + part = req->part; | |
347 | 355 | |
348 | 356 | part_round_stats(cpu, part); |
349 | 357 | part_dec_in_flight(part, rq_data_dir(req)); |
... | ... | @@ -382,9 +390,6 @@ |
382 | 390 | if (rq_data_dir(req) != rq_data_dir(next) |
383 | 391 | || req->rq_disk != next->rq_disk |
384 | 392 | || next->special) |
385 | - return 0; | |
386 | - | |
387 | - if (blk_integrity_rq(req) != blk_integrity_rq(next)) | |
388 | 393 | return 0; |
389 | 394 | |
390 | 395 | /* |
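The merge paths now refuse any merge that would push the request past either queue_max_segments() or the new integrity-segment limit, funnelling all rejections through the no_merge label. The admission rule itself is simple bookkeeping; a hypothetical userspace rendition:

```c
/* Hypothetical rendition of the merge admission rule: the combined physical
 * and integrity segment counts must stay within the queue limits, otherwise
 * the merge is rejected (the kernel additionally flags the request
 * REQ_NOMERGE). */
#include <stdio.h>

struct limits { unsigned int max_segments, max_integrity_segments; };
struct req_counts { unsigned int nr_phys_segments, nr_integrity_segments; };

static int can_merge(const struct limits *l, const struct req_counts *req,
                     unsigned int new_phys, unsigned int new_integrity)
{
    if (req->nr_phys_segments + new_phys > l->max_segments)
        return 0;
    if (req->nr_integrity_segments + new_integrity > l->max_integrity_segments)
        return 0;
    return 1;
}

int main(void)
{
    struct limits l = { .max_segments = 128, .max_integrity_segments = 1 };
    struct req_counts req = { .nr_phys_segments = 4, .nr_integrity_segments = 1 };

    /* One more integrity segment would exceed the limit, so no merge. */
    printf("merge allowed: %d\n", can_merge(&l, &req, 1, 1));
    return 0;
}
```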
block/blk-settings.c
... | ... | @@ -111,6 +111,7 @@ |
111 | 111 | void blk_set_default_limits(struct queue_limits *lim) |
112 | 112 | { |
113 | 113 | lim->max_segments = BLK_MAX_SEGMENTS; |
114 | + lim->max_integrity_segments = 0; | |
114 | 115 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; |
115 | 116 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; |
116 | 117 | lim->max_sectors = BLK_DEF_MAX_SECTORS; |
... | ... | @@ -213,7 +214,7 @@ |
213 | 214 | */ |
214 | 215 | if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) |
215 | 216 | dma = 1; |
216 | - q->limits.bounce_pfn = max_low_pfn; | |
217 | + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); | |
217 | 218 | #else |
218 | 219 | if (b_pfn < blk_max_low_pfn) |
219 | 220 | dma = 1; |
... | ... | @@ -343,7 +344,7 @@ |
343 | 344 | * hardware can operate on without reverting to read-modify-write |
344 | 345 | * operations. |
345 | 346 | */ |
346 | -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) | |
347 | +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) | |
347 | 348 | { |
348 | 349 | q->limits.physical_block_size = size; |
349 | 350 | |
... | ... | @@ -455,11 +456,6 @@ |
455 | 456 | } |
456 | 457 | EXPORT_SYMBOL(blk_queue_io_opt); |
457 | 458 | |
458 | -/* | |
459 | - * Returns the minimum that is _not_ zero, unless both are zero. | |
460 | - */ | |
461 | -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | |
462 | - | |
463 | 459 | /** |
464 | 460 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers |
465 | 461 | * @t: the stacking driver (top) |
... | ... | @@ -514,6 +510,8 @@ |
514 | 510 | b->seg_boundary_mask); |
515 | 511 | |
516 | 512 | t->max_segments = min_not_zero(t->max_segments, b->max_segments); |
513 | + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, | |
514 | + b->max_integrity_segments); | |
517 | 515 | |
518 | 516 | t->max_segment_size = min_not_zero(t->max_segment_size, |
519 | 517 | b->max_segment_size); |
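blk_queue_physical_block_size() now takes an unsigned int, and the local min_not_zero() macro is dropped here in favour of a shared definition elsewhere in this series; blk_stack_limits() uses it to combine the new max_integrity_segments across stacked devices, treating zero as "no limit set". A tiny demo of those semantics:

```c
/* Demo of the min_not_zero() semantics used when stacking limits: zero means
 * "no limit set", so the stacked value is the smaller of the non-zero ones. */
#include <stdio.h>

#define min_not_zero(l, r) \
    ((l) == 0 ? (r) : ((r) == 0 ? (l) : ((l) < (r) ? (l) : (r))))

int main(void)
{
    unsigned int top = 0;        /* stacking driver sets no limit of its own */
    unsigned int bottom = 4;     /* underlying device: 4 integrity segments */

    printf("stacked limit = %u\n", min_not_zero(top, bottom));   /* -> 4 */
    printf("both limited  = %u\n", min_not_zero(8u, 4u));        /* -> 4 */
    return 0;
}
```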
block/blk-sysfs.c
... | ... | @@ -112,6 +112,11 @@ |
112 | 112 | return queue_var_show(queue_max_segments(q), (page)); |
113 | 113 | } |
114 | 114 | |
115 | +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) | |
116 | +{ | |
117 | + return queue_var_show(q->limits.max_integrity_segments, (page)); | |
118 | +} | |
119 | + | |
115 | 120 | static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) |
116 | 121 | { |
117 | 122 | if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) |
... | ... | @@ -288,6 +293,11 @@ |
288 | 293 | .show = queue_max_segments_show, |
289 | 294 | }; |
290 | 295 | |
296 | +static struct queue_sysfs_entry queue_max_integrity_segments_entry = { | |
297 | + .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, | |
298 | + .show = queue_max_integrity_segments_show, | |
299 | +}; | |
300 | + | |
291 | 301 | static struct queue_sysfs_entry queue_max_segment_size_entry = { |
292 | 302 | .attr = {.name = "max_segment_size", .mode = S_IRUGO }, |
293 | 303 | .show = queue_max_segment_size_show, |
... | ... | @@ -375,6 +385,7 @@ |
375 | 385 | &queue_max_hw_sectors_entry.attr, |
376 | 386 | &queue_max_sectors_entry.attr, |
377 | 387 | &queue_max_segments_entry.attr, |
388 | + &queue_max_integrity_segments_entry.attr, | |
378 | 389 | &queue_max_segment_size_entry.attr, |
379 | 390 | &queue_iosched_entry.attr, |
380 | 391 | &queue_hw_sector_size_entry.attr, |
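The new attribute is read-only and simply reports q->limits.max_integrity_segments. If one wanted to poke at it from userspace, a minimal reader might look like this (the /sys path and the device name sda are assumptions about the running system):

```c
/* Minimal sysfs reader; the path assumes /sys is mounted and a disk named
 * sda exists on the running system. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/block/sda/queue/max_integrity_segments", "r");
    char buf[32];

    if (!f) {
        perror("max_integrity_segments");
        return 1;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("max_integrity_segments: %s", buf);
    fclose(f);
    return 0;
}
```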
block/blk-throttle.c
1 | +/* | |
2 | + * Interface for controlling IO bandwidth on a request queue | |
3 | + * | |
4 | + * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> | |
5 | + */ | |
6 | + | |
7 | +#include <linux/module.h> | |
8 | +#include <linux/slab.h> | |
9 | +#include <linux/blkdev.h> | |
10 | +#include <linux/bio.h> | |
11 | +#include <linux/blktrace_api.h> | |
12 | +#include "blk-cgroup.h" | |
13 | + | |
14 | +/* Max dispatch from a group in 1 round */ | |
15 | +static int throtl_grp_quantum = 8; | |
16 | + | |
17 | +/* Total max dispatch from all groups in one round */ | |
18 | +static int throtl_quantum = 32; | |
19 | + | |
20 | +/* Throttling is performed over 100ms slice and after that slice is renewed */ | |
21 | +static unsigned long throtl_slice = HZ/10; /* 100 ms */ | |
22 | + | |
23 | +struct throtl_rb_root { | |
24 | + struct rb_root rb; | |
25 | + struct rb_node *left; | |
26 | + unsigned int count; | |
27 | + unsigned long min_disptime; | |
28 | +}; | |
29 | + | |
30 | +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ | |
31 | + .count = 0, .min_disptime = 0} | |
32 | + | |
33 | +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | |
34 | + | |
35 | +struct throtl_grp { | |
 36 | + /* List of throtl groups on the request queue */ | |
37 | + struct hlist_node tg_node; | |
38 | + | |
39 | + /* active throtl group service_tree member */ | |
40 | + struct rb_node rb_node; | |
41 | + | |
42 | + /* | |
43 | + * Dispatch time in jiffies. This is the estimated time when group | |
44 | + * will unthrottle and is ready to dispatch more bio. It is used as | |
45 | + * key to sort active groups in service tree. | |
46 | + */ | |
47 | + unsigned long disptime; | |
48 | + | |
49 | + struct blkio_group blkg; | |
50 | + atomic_t ref; | |
51 | + unsigned int flags; | |
52 | + | |
53 | + /* Two lists for READ and WRITE */ | |
54 | + struct bio_list bio_lists[2]; | |
55 | + | |
56 | + /* Number of queued bios on READ and WRITE lists */ | |
57 | + unsigned int nr_queued[2]; | |
58 | + | |
59 | + /* bytes per second rate limits */ | |
60 | + uint64_t bps[2]; | |
61 | + | |
62 | + /* IOPS limits */ | |
63 | + unsigned int iops[2]; | |
64 | + | |
 65 | + /* Number of bytes dispatched in current slice */ | |
66 | + uint64_t bytes_disp[2]; | |
67 | + /* Number of bio's dispatched in current slice */ | |
68 | + unsigned int io_disp[2]; | |
69 | + | |
70 | + /* When did we start a new slice */ | |
71 | + unsigned long slice_start[2]; | |
72 | + unsigned long slice_end[2]; | |
73 | + | |
74 | + /* Some throttle limits got updated for the group */ | |
75 | + bool limits_changed; | |
76 | +}; | |
77 | + | |
78 | +struct throtl_data | |
79 | +{ | |
80 | + /* List of throtl groups */ | |
81 | + struct hlist_head tg_list; | |
82 | + | |
83 | + /* service tree for active throtl groups */ | |
84 | + struct throtl_rb_root tg_service_tree; | |
85 | + | |
86 | + struct throtl_grp root_tg; | |
87 | + struct request_queue *queue; | |
88 | + | |
89 | + /* Total Number of queued bios on READ and WRITE lists */ | |
90 | + unsigned int nr_queued[2]; | |
91 | + | |
92 | + /* | |
93 | + * number of total undestroyed groups | |
94 | + */ | |
95 | + unsigned int nr_undestroyed_grps; | |
96 | + | |
97 | + /* Work for dispatching throttled bios */ | |
98 | + struct delayed_work throtl_work; | |
99 | + | |
100 | + atomic_t limits_changed; | |
101 | +}; | |
102 | + | |
103 | +enum tg_state_flags { | |
104 | + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | |
105 | +}; | |
106 | + | |
107 | +#define THROTL_TG_FNS(name) \ | |
108 | +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ | |
109 | +{ \ | |
110 | + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ | |
111 | +} \ | |
112 | +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ | |
113 | +{ \ | |
114 | + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ | |
115 | +} \ | |
116 | +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | |
117 | +{ \ | |
118 | + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ | |
119 | +} | |
120 | + | |
121 | +THROTL_TG_FNS(on_rr); | |
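THROTL_TG_FNS() stamps out mark/clear/test helpers for each per-group flag bit, the same trick CFQ uses for its queue flags. A standalone demo of the generation pattern, using hypothetical names:

```c
/* Standalone demo of the flag-helper generation pattern; hypothetical
 * struct and flag names. */
#include <stdio.h>

struct grp { unsigned int flags; };

enum grp_state_flags { GRP_FLAG_on_rr = 0 };

#define GRP_FNS(name)                                          \
static inline void grp_mark_##name(struct grp *g)              \
{                                                              \
    g->flags |= (1 << GRP_FLAG_##name);                        \
}                                                              \
static inline void grp_clear_##name(struct grp *g)             \
{                                                              \
    g->flags &= ~(1 << GRP_FLAG_##name);                       \
}                                                              \
static inline int grp_##name(const struct grp *g)              \
{                                                              \
    return ((g)->flags & (1 << GRP_FLAG_##name)) != 0;         \
}

GRP_FNS(on_rr)

int main(void)
{
    struct grp g = { 0 };

    grp_mark_on_rr(&g);
    printf("on_rr=%d\n", grp_on_rr(&g));    /* 1 */
    grp_clear_on_rr(&g);
    printf("on_rr=%d\n", grp_on_rr(&g));    /* 0 */
    return 0;
}
```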
122 | + | |
123 | +#define throtl_log_tg(td, tg, fmt, args...) \ | |
124 | + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ | |
125 | + blkg_path(&(tg)->blkg), ##args); \ | |
126 | + | |
127 | +#define throtl_log(td, fmt, args...) \ | |
128 | + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | |
129 | + | |
130 | +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) | |
131 | +{ | |
132 | + if (blkg) | |
133 | + return container_of(blkg, struct throtl_grp, blkg); | |
134 | + | |
135 | + return NULL; | |
136 | +} | |
137 | + | |
138 | +static inline int total_nr_queued(struct throtl_data *td) | |
139 | +{ | |
140 | + return (td->nr_queued[0] + td->nr_queued[1]); | |
141 | +} | |
142 | + | |
143 | +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) | |
144 | +{ | |
145 | + atomic_inc(&tg->ref); | |
146 | + return tg; | |
147 | +} | |
148 | + | |
149 | +static void throtl_put_tg(struct throtl_grp *tg) | |
150 | +{ | |
151 | + BUG_ON(atomic_read(&tg->ref) <= 0); | |
152 | + if (!atomic_dec_and_test(&tg->ref)) | |
153 | + return; | |
154 | + kfree(tg); | |
155 | +} | |
156 | + | |
157 | +static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | |
158 | + struct cgroup *cgroup) | |
159 | +{ | |
160 | + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); | |
161 | + struct throtl_grp *tg = NULL; | |
162 | + void *key = td; | |
163 | + struct backing_dev_info *bdi = &td->queue->backing_dev_info; | |
164 | + unsigned int major, minor; | |
165 | + | |
166 | + /* | |
167 | + * TODO: Speed up blkiocg_lookup_group() by maintaining a radix | |
168 | + * tree of blkg (instead of traversing through hash list all | |
169 | + * the time. | |
170 | + */ | |
171 | + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | |
172 | + | |
173 | + /* Fill in device details for root group */ | |
174 | + if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | |
175 | + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | |
176 | + tg->blkg.dev = MKDEV(major, minor); | |
177 | + goto done; | |
178 | + } | |
179 | + | |
180 | + if (tg) | |
181 | + goto done; | |
182 | + | |
183 | + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | |
184 | + if (!tg) | |
185 | + goto done; | |
186 | + | |
187 | + INIT_HLIST_NODE(&tg->tg_node); | |
188 | + RB_CLEAR_NODE(&tg->rb_node); | |
189 | + bio_list_init(&tg->bio_lists[0]); | |
190 | + bio_list_init(&tg->bio_lists[1]); | |
191 | + | |
192 | + /* | |
 193 | + * Take the initial reference that will be released on destroy. | |
 194 | + * This can be thought of as a joint reference by cgroup and | |
195 | + * request queue which will be dropped by either request queue | |
196 | + * exit or cgroup deletion path depending on who is exiting first. | |
197 | + */ | |
198 | + atomic_set(&tg->ref, 1); | |
199 | + | |
200 | + /* Add group onto cgroup list */ | |
201 | + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | |
202 | + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | |
203 | + MKDEV(major, minor), BLKIO_POLICY_THROTL); | |
204 | + | |
205 | + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | |
206 | + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | |
207 | + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | |
208 | + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | |
209 | + | |
210 | + hlist_add_head(&tg->tg_node, &td->tg_list); | |
211 | + td->nr_undestroyed_grps++; | |
212 | +done: | |
213 | + return tg; | |
214 | +} | |
215 | + | |
216 | +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | |
217 | +{ | |
218 | + struct cgroup *cgroup; | |
219 | + struct throtl_grp *tg = NULL; | |
220 | + | |
221 | + rcu_read_lock(); | |
222 | + cgroup = task_cgroup(current, blkio_subsys_id); | |
223 | + tg = throtl_find_alloc_tg(td, cgroup); | |
224 | + if (!tg) | |
225 | + tg = &td->root_tg; | |
226 | + rcu_read_unlock(); | |
227 | + return tg; | |
228 | +} | |
229 | + | |
230 | +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) | |
231 | +{ | |
232 | + /* Service tree is empty */ | |
233 | + if (!root->count) | |
234 | + return NULL; | |
235 | + | |
236 | + if (!root->left) | |
237 | + root->left = rb_first(&root->rb); | |
238 | + | |
239 | + if (root->left) | |
240 | + return rb_entry_tg(root->left); | |
241 | + | |
242 | + return NULL; | |
243 | +} | |
244 | + | |
245 | +static void rb_erase_init(struct rb_node *n, struct rb_root *root) | |
246 | +{ | |
247 | + rb_erase(n, root); | |
248 | + RB_CLEAR_NODE(n); | |
249 | +} | |
250 | + | |
251 | +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) | |
252 | +{ | |
253 | + if (root->left == n) | |
254 | + root->left = NULL; | |
255 | + rb_erase_init(n, &root->rb); | |
256 | + --root->count; | |
257 | +} | |
258 | + | |
259 | +static void update_min_dispatch_time(struct throtl_rb_root *st) | |
260 | +{ | |
261 | + struct throtl_grp *tg; | |
262 | + | |
263 | + tg = throtl_rb_first(st); | |
264 | + if (!tg) | |
265 | + return; | |
266 | + | |
267 | + st->min_disptime = tg->disptime; | |
268 | +} | |
269 | + | |
270 | +static void | |
271 | +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) | |
272 | +{ | |
273 | + struct rb_node **node = &st->rb.rb_node; | |
274 | + struct rb_node *parent = NULL; | |
275 | + struct throtl_grp *__tg; | |
276 | + unsigned long key = tg->disptime; | |
277 | + int left = 1; | |
278 | + | |
279 | + while (*node != NULL) { | |
280 | + parent = *node; | |
281 | + __tg = rb_entry_tg(parent); | |
282 | + | |
283 | + if (time_before(key, __tg->disptime)) | |
284 | + node = &parent->rb_left; | |
285 | + else { | |
286 | + node = &parent->rb_right; | |
287 | + left = 0; | |
288 | + } | |
289 | + } | |
290 | + | |
291 | + if (left) | |
292 | + st->left = &tg->rb_node; | |
293 | + | |
294 | + rb_link_node(&tg->rb_node, parent, node); | |
295 | + rb_insert_color(&tg->rb_node, &st->rb); | |
296 | +} | |
297 | + | |
298 | +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
299 | +{ | |
300 | + struct throtl_rb_root *st = &td->tg_service_tree; | |
301 | + | |
302 | + tg_service_tree_add(st, tg); | |
303 | + throtl_mark_tg_on_rr(tg); | |
304 | + st->count++; | |
305 | +} | |
306 | + | |
307 | +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
308 | +{ | |
309 | + if (!throtl_tg_on_rr(tg)) | |
310 | + __throtl_enqueue_tg(td, tg); | |
311 | +} | |
312 | + | |
313 | +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
314 | +{ | |
315 | + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); | |
316 | + throtl_clear_tg_on_rr(tg); | |
317 | +} | |
318 | + | |
319 | +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | |
320 | +{ | |
321 | + if (throtl_tg_on_rr(tg)) | |
322 | + __throtl_dequeue_tg(td, tg); | |
323 | +} | |
324 | + | |
325 | +static void throtl_schedule_next_dispatch(struct throtl_data *td) | |
326 | +{ | |
327 | + struct throtl_rb_root *st = &td->tg_service_tree; | |
328 | + | |
329 | + /* | |
330 | + * If there are more bios pending, schedule more work. | |
331 | + */ | |
332 | + if (!total_nr_queued(td)) | |
333 | + return; | |
334 | + | |
335 | + BUG_ON(!st->count); | |
336 | + | |
337 | + update_min_dispatch_time(st); | |
338 | + | |
339 | + if (time_before_eq(st->min_disptime, jiffies)) | |
340 | + throtl_schedule_delayed_work(td->queue, 0); | |
341 | + else | |
342 | + throtl_schedule_delayed_work(td->queue, | |
343 | + (st->min_disptime - jiffies)); | |
344 | +} | |
345 | + | |
346 | +static inline void | |
347 | +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |
348 | +{ | |
349 | + tg->bytes_disp[rw] = 0; | |
350 | + tg->io_disp[rw] = 0; | |
351 | + tg->slice_start[rw] = jiffies; | |
352 | + tg->slice_end[rw] = jiffies + throtl_slice; | |
353 | + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", | |
354 | + rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
355 | + tg->slice_end[rw], jiffies); | |
356 | +} | |
357 | + | |
358 | +static inline void throtl_extend_slice(struct throtl_data *td, | |
359 | + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | |
360 | +{ | |
361 | + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | |
362 | + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", | |
363 | + rw == READ ? 'R' : 'W', tg->slice_start[rw], | |
364 | + tg->slice_end[rw], jiffies); | |
365 | +} | |
366 | + | |
367 | +/* Determine if previously allocated or extended slice is complete or not */ | |
368 | +static bool | |
369 | +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |
370 | +{ | |
371 | + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) | |
372 | + return 0; | |
373 | + | |
374 | + return 1; | |
375 | +} | |
376 | + | |
377 | +/* Trim the used slices and adjust slice start accordingly */ | |
378 | +static inline void | |
379 | +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |
380 | +{ | |
381 | + unsigned long nr_slices, time_elapsed, io_trim; | |
382 | + u64 bytes_trim, tmp; | |
383 | + | |
384 | + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); | |
385 | + | |
386 | + /* | |
 387 | + * If bps is unlimited (-1), then the time slice doesn't get | |
 388 | + * renewed. Don't try to trim the slice if the slice is used. A new | |
389 | + * slice will start when appropriate. | |
390 | + */ | |
391 | + if (throtl_slice_used(td, tg, rw)) | |
392 | + return; | |
393 | + | |
394 | + time_elapsed = jiffies - tg->slice_start[rw]; | |
395 | + | |
396 | + nr_slices = time_elapsed / throtl_slice; | |
397 | + | |
398 | + if (!nr_slices) | |
399 | + return; | |
400 | + tmp = tg->bps[rw] * throtl_slice * nr_slices; | |
401 | + do_div(tmp, HZ); | |
402 | + bytes_trim = tmp; | |
403 | + | |
404 | + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; | |
405 | + | |
406 | + if (!bytes_trim && !io_trim) | |
407 | + return; | |
408 | + | |
409 | + if (tg->bytes_disp[rw] >= bytes_trim) | |
410 | + tg->bytes_disp[rw] -= bytes_trim; | |
411 | + else | |
412 | + tg->bytes_disp[rw] = 0; | |
413 | + | |
414 | + if (tg->io_disp[rw] >= io_trim) | |
415 | + tg->io_disp[rw] -= io_trim; | |
416 | + else | |
417 | + tg->io_disp[rw] = 0; | |
418 | + | |
419 | + tg->slice_start[rw] += nr_slices * throtl_slice; | |
420 | + | |
421 | + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" | |
422 | + " start=%lu end=%lu jiffies=%lu", | |
423 | + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, | |
424 | + tg->slice_start[rw], tg->slice_end[rw], jiffies); | |
425 | +} | |
426 | + | |
427 | +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, | |
428 | + struct bio *bio, unsigned long *wait) | |
429 | +{ | |
430 | + bool rw = bio_data_dir(bio); | |
431 | + unsigned int io_allowed; | |
432 | + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; | |
433 | + u64 tmp; | |
434 | + | |
435 | + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; | |
436 | + | |
437 | + /* Slice has just started. Consider one slice interval */ | |
438 | + if (!jiffy_elapsed) | |
439 | + jiffy_elapsed_rnd = throtl_slice; | |
440 | + | |
441 | + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | |
442 | + | |
443 | + /* | |
 444 | + * jiffy_elapsed_rnd should not be a big value: the minimum iops is | |
 445 | + * 1, so at most jiffy_elapsed_rnd is the equivalent of 1 second; we | |
 446 | + * will allow dispatch after 1 second and after that the slice should | |
 447 | + * have been trimmed. | |
448 | + */ | |
449 | + | |
450 | + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; | |
451 | + do_div(tmp, HZ); | |
452 | + | |
453 | + if (tmp > UINT_MAX) | |
454 | + io_allowed = UINT_MAX; | |
455 | + else | |
456 | + io_allowed = tmp; | |
457 | + | |
458 | + if (tg->io_disp[rw] + 1 <= io_allowed) { | |
459 | + if (wait) | |
460 | + *wait = 0; | |
461 | + return 1; | |
462 | + } | |
463 | + | |
464 | + /* Calc approx time to dispatch */ | |
465 | + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; | |
466 | + | |
467 | + if (jiffy_wait > jiffy_elapsed) | |
468 | + jiffy_wait = jiffy_wait - jiffy_elapsed; | |
469 | + else | |
470 | + jiffy_wait = 1; | |
471 | + | |
472 | + if (wait) | |
473 | + *wait = jiffy_wait; | |
474 | + return 0; | |
475 | +} | |
476 | + | |
477 | +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | |
478 | + struct bio *bio, unsigned long *wait) | |
479 | +{ | |
480 | + bool rw = bio_data_dir(bio); | |
481 | + u64 bytes_allowed, extra_bytes, tmp; | |
482 | + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; | |
483 | + | |
484 | + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; | |
485 | + | |
486 | + /* Slice has just started. Consider one slice interval */ | |
487 | + if (!jiffy_elapsed) | |
488 | + jiffy_elapsed_rnd = throtl_slice; | |
489 | + | |
490 | + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | |
491 | + | |
492 | + tmp = tg->bps[rw] * jiffy_elapsed_rnd; | |
493 | + do_div(tmp, HZ); | |
494 | + bytes_allowed = tmp; | |
495 | + | |
496 | + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { | |
497 | + if (wait) | |
498 | + *wait = 0; | |
499 | + return 1; | |
500 | + } | |
501 | + | |
502 | + /* Calc approx time to dispatch */ | |
503 | + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; | |
504 | + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); | |
505 | + | |
506 | + if (!jiffy_wait) | |
507 | + jiffy_wait = 1; | |
508 | + | |
509 | + /* | |
510 | + * This wait time is without taking into consideration the rounding | |
511 | + * up we did. Add that time also. | |
512 | + */ | |
513 | + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); | |
514 | + if (wait) | |
515 | + *wait = jiffy_wait; | |
516 | + return 0; | |
517 | +} | |
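Both limit checks work in jiffies: the allowed budget is limit * elapsed / HZ, and when the bio does not fit, the overshoot is converted back into a wait time that the caller uses as the group's dispatch time. A userspace rendition of the bytes-per-second arithmetic (HZ and all inputs are invented for the example):

```c
/* Userspace rendition of the bytes-per-second budget check above; HZ and
 * all inputs are invented for the example. */
#include <stdio.h>

#define HZ 100ULL                            /* assumed tick rate */

int main(void)
{
    unsigned long long bps = 1024 * 1024;            /* 1 MB/s limit */
    unsigned long long jiffy_elapsed_rnd = HZ / 10;  /* one 100ms slice */
    unsigned long long bytes_disp = 64 * 1024;       /* already dispatched */
    unsigned long long bio_size = 256 * 1024;        /* next queued bio */

    unsigned long long bytes_allowed = bps * jiffy_elapsed_rnd / HZ;

    if (bytes_disp + bio_size <= bytes_allowed) {
        puts("dispatch now");
    } else {
        unsigned long long extra = bytes_disp + bio_size - bytes_allowed;
        unsigned long long jiffy_wait = extra * HZ / bps;

        if (!jiffy_wait)
            jiffy_wait = 1;
        printf("wait %llu jiffies (~%llu ms)\n",
               jiffy_wait, jiffy_wait * 1000 / HZ);
    }
    return 0;
}
```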
518 | + | |
519 | +/* | |
520 | + * Returns whether one can dispatch a bio or not. Also returns approx number | |
 521 | + * of jiffies to wait before this bio is within the IO rate and can be dispatched. | |
522 | + */ | |
523 | +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |
524 | + struct bio *bio, unsigned long *wait) | |
525 | +{ | |
526 | + bool rw = bio_data_dir(bio); | |
527 | + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; | |
528 | + | |
529 | + /* | |
 530 | + * Currently the whole state machine of the group depends on the first | |
 531 | + * bio queued in the group bio list. So one should not be calling | |
532 | + * this function with a different bio if there are other bios | |
533 | + * queued. | |
534 | + */ | |
535 | + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); | |
536 | + | |
537 | + /* If tg->bps = -1, then BW is unlimited */ | |
538 | + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { | |
539 | + if (wait) | |
540 | + *wait = 0; | |
541 | + return 1; | |
542 | + } | |
543 | + | |
544 | + /* | |
545 | + * If previous slice expired, start a new one otherwise renew/extend | |
546 | + * existing slice to make sure it is at least throtl_slice interval | |
547 | + * long since now. | |
548 | + */ | |
549 | + if (throtl_slice_used(td, tg, rw)) | |
550 | + throtl_start_new_slice(td, tg, rw); | |
551 | + else { | |
552 | + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) | |
553 | + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); | |
554 | + } | |
555 | + | |
556 | + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) | |
557 | + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { | |
558 | + if (wait) | |
559 | + *wait = 0; | |
560 | + return 1; | |
561 | + } | |
562 | + | |
563 | + max_wait = max(bps_wait, iops_wait); | |
564 | + | |
565 | + if (wait) | |
566 | + *wait = max_wait; | |
567 | + | |
568 | + if (time_before(tg->slice_end[rw], jiffies + max_wait)) | |
569 | + throtl_extend_slice(td, tg, rw, jiffies + max_wait); | |
570 | + | |
571 | + return 0; | |
572 | +} | |
573 | + | |
574 | +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | |
575 | +{ | |
576 | + bool rw = bio_data_dir(bio); | |
577 | + bool sync = bio->bi_rw & REQ_SYNC; | |
578 | + | |
579 | + /* Charge the bio to the group */ | |
580 | + tg->bytes_disp[rw] += bio->bi_size; | |
581 | + tg->io_disp[rw]++; | |
582 | + | |
583 | + /* | |
584 | + * TODO: This will take blkg->stats_lock. Figure out a way | |
585 | + * to avoid this cost. | |
586 | + */ | |
587 | + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); | |
588 | +} | |
589 | + | |
590 | +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | |
591 | + struct bio *bio) | |
592 | +{ | |
593 | + bool rw = bio_data_dir(bio); | |
594 | + | |
595 | + bio_list_add(&tg->bio_lists[rw], bio); | |
596 | + /* Take a bio reference on tg */ | |
597 | + throtl_ref_get_tg(tg); | |
598 | + tg->nr_queued[rw]++; | |
599 | + td->nr_queued[rw]++; | |
600 | + throtl_enqueue_tg(td, tg); | |
601 | +} | |
602 | + | |
603 | +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) | |
604 | +{ | |
605 | + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; | |
606 | + struct bio *bio; | |
607 | + | |
608 | + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) | |
609 | + tg_may_dispatch(td, tg, bio, &read_wait); | |
610 | + | |
611 | + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | |
612 | + tg_may_dispatch(td, tg, bio, &write_wait); | |
613 | + | |
614 | + min_wait = min(read_wait, write_wait); | |
615 | + disptime = jiffies + min_wait; | |
616 | + | |
617 | + /* Update dispatch time */ | |
618 | + throtl_dequeue_tg(td, tg); | |
619 | + tg->disptime = disptime; | |
620 | + throtl_enqueue_tg(td, tg); | |
621 | +} | |
622 | + | |
623 | +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | |
624 | + bool rw, struct bio_list *bl) | |
625 | +{ | |
626 | + struct bio *bio; | |
627 | + | |
628 | + bio = bio_list_pop(&tg->bio_lists[rw]); | |
629 | + tg->nr_queued[rw]--; | |
630 | + /* Drop bio reference on tg */ | |
631 | + throtl_put_tg(tg); | |
632 | + | |
633 | + BUG_ON(td->nr_queued[rw] <= 0); | |
634 | + td->nr_queued[rw]--; | |
635 | + | |
636 | + throtl_charge_bio(tg, bio); | |
637 | + bio_list_add(bl, bio); | |
638 | + bio->bi_rw |= REQ_THROTTLED; | |
639 | + | |
640 | + throtl_trim_slice(td, tg, rw); | |
641 | +} | |
642 | + | |
643 | +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | |
644 | + struct bio_list *bl) | |
645 | +{ | |
646 | + unsigned int nr_reads = 0, nr_writes = 0; | |
647 | + unsigned int max_nr_reads = throtl_grp_quantum*3/4; | |
648 | + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; | |
649 | + struct bio *bio; | |
650 | + | |
651 | + /* Try to dispatch 75% READS and 25% WRITES */ | |
652 | + | |
653 | + while ((bio = bio_list_peek(&tg->bio_lists[READ])) | |
654 | + && tg_may_dispatch(td, tg, bio, NULL)) { | |
655 | + | |
656 | + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | |
657 | + nr_reads++; | |
658 | + | |
659 | + if (nr_reads >= max_nr_reads) | |
660 | + break; | |
661 | + } | |
662 | + | |
663 | + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) | |
664 | + && tg_may_dispatch(td, tg, bio, NULL)) { | |
665 | + | |
666 | + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | |
667 | + nr_writes++; | |
668 | + | |
669 | + if (nr_writes >= max_nr_writes) | |
670 | + break; | |
671 | + } | |
672 | + | |
673 | + return nr_reads + nr_writes; | |
674 | +} | |
675 | + | |
676 | +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |
677 | +{ | |
678 | + unsigned int nr_disp = 0; | |
679 | + struct throtl_grp *tg; | |
680 | + struct throtl_rb_root *st = &td->tg_service_tree; | |
681 | + | |
682 | + while (1) { | |
683 | + tg = throtl_rb_first(st); | |
684 | + | |
685 | + if (!tg) | |
686 | + break; | |
687 | + | |
688 | + if (time_before(jiffies, tg->disptime)) | |
689 | + break; | |
690 | + | |
691 | + throtl_dequeue_tg(td, tg); | |
692 | + | |
693 | + nr_disp += throtl_dispatch_tg(td, tg, bl); | |
694 | + | |
695 | + if (tg->nr_queued[0] || tg->nr_queued[1]) { | |
696 | + tg_update_disptime(td, tg); | |
697 | + throtl_enqueue_tg(td, tg); | |
698 | + } | |
699 | + | |
700 | + if (nr_disp >= throtl_quantum) | |
701 | + break; | |
702 | + } | |
703 | + | |
704 | + return nr_disp; | |
705 | +} | |
706 | + | |
707 | +static void throtl_process_limit_change(struct throtl_data *td) | |
708 | +{ | |
709 | + struct throtl_grp *tg; | |
710 | + struct hlist_node *pos, *n; | |
711 | + | |
712 | + /* | |
 713 | + * Make sure the atomic_inc() effects from the | |
 714 | + * throtl_update_blkio_group_read_bps() group of functions are | |
 715 | + * visible. | |
 716 | + * Is this required, or was the smp_mb__after_atomic_inc() after | |
 717 | + * the atomic_inc() sufficient? | |
718 | + */ | |
719 | + smp_rmb(); | |
720 | + if (!atomic_read(&td->limits_changed)) | |
721 | + return; | |
722 | + | |
723 | + throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed)); | |
724 | + | |
725 | + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | |
726 | + /* | |
 727 | + * Do I need an smp_rmb() here to make sure the tg->limits_changed | |
 728 | + * update is visible? I am relying on the smp_rmb() at the | |
 729 | + * beginning of the function and not putting a new one here. | |
730 | + */ | |
731 | + | |
732 | + if (throtl_tg_on_rr(tg) && tg->limits_changed) { | |
733 | + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | |
734 | + " riops=%u wiops=%u", tg->bps[READ], | |
735 | + tg->bps[WRITE], tg->iops[READ], | |
736 | + tg->iops[WRITE]); | |
737 | + tg_update_disptime(td, tg); | |
738 | + tg->limits_changed = false; | |
739 | + } | |
740 | + } | |
741 | + | |
742 | + smp_mb__before_atomic_dec(); | |
743 | + atomic_dec(&td->limits_changed); | |
744 | + smp_mb__after_atomic_dec(); | |
745 | +} | |
746 | + | |
747 | +/* Dispatch throttled bios. Should be called without queue lock held. */ | |
748 | +static int throtl_dispatch(struct request_queue *q) | |
749 | +{ | |
750 | + struct throtl_data *td = q->td; | |
751 | + unsigned int nr_disp = 0; | |
752 | + struct bio_list bio_list_on_stack; | |
753 | + struct bio *bio; | |
754 | + | |
755 | + spin_lock_irq(q->queue_lock); | |
756 | + | |
757 | + throtl_process_limit_change(td); | |
758 | + | |
759 | + if (!total_nr_queued(td)) | |
760 | + goto out; | |
761 | + | |
762 | + bio_list_init(&bio_list_on_stack); | |
763 | + | |
764 | + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", | |
765 | + total_nr_queued(td), td->nr_queued[READ], | |
766 | + td->nr_queued[WRITE]); | |
767 | + | |
768 | + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); | |
769 | + | |
770 | + if (nr_disp) | |
771 | + throtl_log(td, "bios disp=%u", nr_disp); | |
772 | + | |
773 | + throtl_schedule_next_dispatch(td); | |
774 | +out: | |
775 | + spin_unlock_irq(q->queue_lock); | |
776 | + | |
777 | + /* | |
 778 | + * If we dispatched some requests, unplug the queue to ensure | |
 779 | + * immediate dispatch. | |
780 | + */ | |
781 | + if (nr_disp) { | |
782 | + while((bio = bio_list_pop(&bio_list_on_stack))) | |
783 | + generic_make_request(bio); | |
784 | + blk_unplug(q); | |
785 | + } | |
786 | + return nr_disp; | |
787 | +} | |
788 | + | |
789 | +void blk_throtl_work(struct work_struct *work) | |
790 | +{ | |
791 | + struct throtl_data *td = container_of(work, struct throtl_data, | |
792 | + throtl_work.work); | |
793 | + struct request_queue *q = td->queue; | |
794 | + | |
795 | + throtl_dispatch(q); | |
796 | +} | |
797 | + | |
798 | +/* Call with queue lock held */ | |
799 | +void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) | |
800 | +{ | |
801 | + | |
802 | + struct throtl_data *td = q->td; | |
803 | + struct delayed_work *dwork = &td->throtl_work; | |
804 | + | |
805 | + if (total_nr_queued(td) > 0) { | |
806 | + /* | |
807 | + * We might have a work scheduled to be executed in future. | |
808 | + * Cancel that and schedule a new one. | |
809 | + */ | |
810 | + __cancel_delayed_work(dwork); | |
811 | + kblockd_schedule_delayed_work(q, dwork, delay); | |
812 | + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", | |
813 | + delay, jiffies); | |
814 | + } | |
815 | +} | |
816 | +EXPORT_SYMBOL(throtl_schedule_delayed_work); | |
817 | + | |
818 | +static void | |
819 | +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) | |
820 | +{ | |
821 | + /* Something wrong if we are trying to remove same group twice */ | |
822 | + BUG_ON(hlist_unhashed(&tg->tg_node)); | |
823 | + | |
824 | + hlist_del_init(&tg->tg_node); | |
825 | + | |
826 | + /* | |
827 | + * Put the reference taken at the time of creation so that when all | |
828 | + * queues are gone, group can be destroyed. | |
829 | + */ | |
830 | + throtl_put_tg(tg); | |
831 | + td->nr_undestroyed_grps--; | |
832 | +} | |
833 | + | |
834 | +static void throtl_release_tgs(struct throtl_data *td) | |
835 | +{ | |
836 | + struct hlist_node *pos, *n; | |
837 | + struct throtl_grp *tg; | |
838 | + | |
839 | + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | |
840 | + /* | |
 841 | + * If the cgroup removal path got to the blkio_group first and removed | |
 842 | + * it from the cgroup list, then it will take care of destroying | |
 843 | + * the throtl_grp as well. | |
844 | + */ | |
845 | + if (!blkiocg_del_blkio_group(&tg->blkg)) | |
846 | + throtl_destroy_tg(td, tg); | |
847 | + } | |
848 | +} | |
849 | + | |
850 | +static void throtl_td_free(struct throtl_data *td) | |
851 | +{ | |
852 | + kfree(td); | |
853 | +} | |
854 | + | |
855 | +/* | |
856 | + * Blk cgroup controller notification saying that blkio_group object is being | |
857 | + * delinked as associated cgroup object is going away. That also means that | |
858 | + * no new IO will come in this group. So get rid of this group as soon as | |
859 | + * any pending IO in the group is finished. | |
860 | + * | |
861 | + * This function is called under rcu_read_lock(). key is the rcu protected | |
 862 | + * pointer. That means "key" is a valid throtl_data pointer as long as we | |
 863 | + * hold the rcu read lock. | |
864 | + * | |
865 | + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | |
 866 | + * it should not be NULL as even if the queue was going away, the cgroup deletion | |
867 | + * path got to it first. | |
868 | + */ | |
869 | +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | |
870 | +{ | |
871 | + unsigned long flags; | |
872 | + struct throtl_data *td = key; | |
873 | + | |
874 | + spin_lock_irqsave(td->queue->queue_lock, flags); | |
875 | + throtl_destroy_tg(td, tg_of_blkg(blkg)); | |
876 | + spin_unlock_irqrestore(td->queue->queue_lock, flags); | |
877 | +} | |
878 | + | |
879 | +/* | |
880 | + * For all update functions, key should be a valid pointer because these | |
881 | + * update functions are called under blkcg_lock, that means, blkg is | |
 882 | + * valid and in turn key is valid. The queue exit path cannot race because | |
 883 | + * of blkcg_lock. | |
884 | + * | |
885 | + * Can not take queue lock in update functions as queue lock under blkcg_lock | |
886 | + * is not allowed. Under other paths we take blkcg_lock under queue_lock. | |
887 | + */ | |
888 | +static void throtl_update_blkio_group_read_bps(void *key, | |
889 | + struct blkio_group *blkg, u64 read_bps) | |
890 | +{ | |
891 | + struct throtl_data *td = key; | |
892 | + | |
893 | + tg_of_blkg(blkg)->bps[READ] = read_bps; | |
894 | + /* Make sure read_bps is updated before setting limits_changed */ | |
895 | + smp_wmb(); | |
896 | + tg_of_blkg(blkg)->limits_changed = true; | |
897 | + | |
898 | + /* Make sure tg->limits_changed is updated before td->limits_changed */ | |
899 | + smp_mb__before_atomic_inc(); | |
900 | + atomic_inc(&td->limits_changed); | |
901 | + smp_mb__after_atomic_inc(); | |
902 | + | |
903 | + /* Schedule a work now to process the limit change */ | |
904 | + throtl_schedule_delayed_work(td->queue, 0); | |
905 | +} | |
906 | + | |
907 | +static void throtl_update_blkio_group_write_bps(void *key, | |
908 | + struct blkio_group *blkg, u64 write_bps) | |
909 | +{ | |
910 | + struct throtl_data *td = key; | |
911 | + | |
912 | + tg_of_blkg(blkg)->bps[WRITE] = write_bps; | |
913 | + smp_wmb(); | |
914 | + tg_of_blkg(blkg)->limits_changed = true; | |
915 | + smp_mb__before_atomic_inc(); | |
916 | + atomic_inc(&td->limits_changed); | |
917 | + smp_mb__after_atomic_inc(); | |
918 | + throtl_schedule_delayed_work(td->queue, 0); | |
919 | +} | |
920 | + | |
921 | +static void throtl_update_blkio_group_read_iops(void *key, | |
922 | + struct blkio_group *blkg, unsigned int read_iops) | |
923 | +{ | |
924 | + struct throtl_data *td = key; | |
925 | + | |
926 | + tg_of_blkg(blkg)->iops[READ] = read_iops; | |
927 | + smp_wmb(); | |
928 | + tg_of_blkg(blkg)->limits_changed = true; | |
929 | + smp_mb__before_atomic_inc(); | |
930 | + atomic_inc(&td->limits_changed); | |
931 | + smp_mb__after_atomic_inc(); | |
932 | + throtl_schedule_delayed_work(td->queue, 0); | |
933 | +} | |
934 | + | |
935 | +static void throtl_update_blkio_group_write_iops(void *key, | |
936 | + struct blkio_group *blkg, unsigned int write_iops) | |
937 | +{ | |
938 | + struct throtl_data *td = key; | |
939 | + | |
940 | + tg_of_blkg(blkg)->iops[WRITE] = write_iops; | |
941 | + smp_wmb(); | |
942 | + tg_of_blkg(blkg)->limits_changed = true; | |
943 | + smp_mb__before_atomic_inc(); | |
944 | + atomic_inc(&td->limits_changed); | |
945 | + smp_mb__after_atomic_inc(); | |
946 | + throtl_schedule_delayed_work(td->queue, 0); | |
947 | +} | |
948 | + | |
949 | +void throtl_shutdown_timer_wq(struct request_queue *q) | |
950 | +{ | |
951 | + struct throtl_data *td = q->td; | |
952 | + | |
953 | + cancel_delayed_work_sync(&td->throtl_work); | |
954 | +} | |
955 | + | |
956 | +static struct blkio_policy_type blkio_policy_throtl = { | |
957 | + .ops = { | |
958 | + .blkio_unlink_group_fn = throtl_unlink_blkio_group, | |
959 | + .blkio_update_group_read_bps_fn = | |
960 | + throtl_update_blkio_group_read_bps, | |
961 | + .blkio_update_group_write_bps_fn = | |
962 | + throtl_update_blkio_group_write_bps, | |
963 | + .blkio_update_group_read_iops_fn = | |
964 | + throtl_update_blkio_group_read_iops, | |
965 | + .blkio_update_group_write_iops_fn = | |
966 | + throtl_update_blkio_group_write_iops, | |
967 | + }, | |
968 | + .plid = BLKIO_POLICY_THROTL, | |
969 | +}; | |
970 | + | |
971 | +int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |
972 | +{ | |
973 | + struct throtl_data *td = q->td; | |
974 | + struct throtl_grp *tg; | |
975 | + struct bio *bio = *biop; | |
976 | + bool rw = bio_data_dir(bio), update_disptime = true; | |
977 | + | |
978 | + if (bio->bi_rw & REQ_THROTTLED) { | |
979 | + bio->bi_rw &= ~REQ_THROTTLED; | |
980 | + return 0; | |
981 | + } | |
982 | + | |
983 | + spin_lock_irq(q->queue_lock); | |
984 | + tg = throtl_get_tg(td); | |
985 | + | |
986 | + if (tg->nr_queued[rw]) { | |
987 | + /* | |
988 | + * There is already another bio queued in same dir. No | |
989 | + * need to update dispatch time. | |
990 | + * Still update the disptime if rate limits on this group | |
991 | + * were changed. | |
992 | + */ | |
993 | + if (!tg->limits_changed) | |
994 | + update_disptime = false; | |
995 | + else | |
996 | + tg->limits_changed = false; | |
997 | + | |
998 | + goto queue_bio; | |
999 | + } | |
1000 | + | |
 1001 | + /* Bio is within the rate limit of the group */ | |
1002 | + if (tg_may_dispatch(td, tg, bio, NULL)) { | |
1003 | + throtl_charge_bio(tg, bio); | |
1004 | + goto out; | |
1005 | + } | |
1006 | + | |
1007 | +queue_bio: | |
1008 | + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" | |
1009 | + " iodisp=%u iops=%u queued=%d/%d", | |
1010 | + rw == READ ? 'R' : 'W', | |
1011 | + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], | |
1012 | + tg->io_disp[rw], tg->iops[rw], | |
1013 | + tg->nr_queued[READ], tg->nr_queued[WRITE]); | |
1014 | + | |
1015 | + throtl_add_bio_tg(q->td, tg, bio); | |
1016 | + *biop = NULL; | |
1017 | + | |
1018 | + if (update_disptime) { | |
1019 | + tg_update_disptime(td, tg); | |
1020 | + throtl_schedule_next_dispatch(td); | |
1021 | + } | |
1022 | + | |
1023 | +out: | |
1024 | + spin_unlock_irq(q->queue_lock); | |
1025 | + return 0; | |
1026 | +} | |
1027 | + | |
1028 | +int blk_throtl_init(struct request_queue *q) | |
1029 | +{ | |
1030 | + struct throtl_data *td; | |
1031 | + struct throtl_grp *tg; | |
1032 | + | |
1033 | + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | |
1034 | + if (!td) | |
1035 | + return -ENOMEM; | |
1036 | + | |
1037 | + INIT_HLIST_HEAD(&td->tg_list); | |
1038 | + td->tg_service_tree = THROTL_RB_ROOT; | |
1039 | + atomic_set(&td->limits_changed, 0); | |
1040 | + | |
1041 | + /* Init root group */ | |
1042 | + tg = &td->root_tg; | |
1043 | + INIT_HLIST_NODE(&tg->tg_node); | |
1044 | + RB_CLEAR_NODE(&tg->rb_node); | |
1045 | + bio_list_init(&tg->bio_lists[0]); | |
1046 | + bio_list_init(&tg->bio_lists[1]); | |
1047 | + | |
1048 | + /* Practically unlimited BW */ | |
1049 | + tg->bps[0] = tg->bps[1] = -1; | |
1050 | + tg->iops[0] = tg->iops[1] = -1; | |
1051 | + | |
1052 | + /* | |
1053 | + * Set root group reference to 2. One reference will be dropped when | |
1054 | + * all groups on tg_list are being deleted during queue exit. Other | |
1055 | + * reference will remain there as we don't want to delete this group | |
1056 | + * as it is statically allocated and gets destroyed when throtl_data | |
1057 | + * goes away. | |
1058 | + */ | |
1059 | + atomic_set(&tg->ref, 2); | |
1060 | + hlist_add_head(&tg->tg_node, &td->tg_list); | |
1061 | + td->nr_undestroyed_grps++; | |
1062 | + | |
1063 | + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | |
1064 | + | |
1065 | + rcu_read_lock(); | |
1066 | + blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td, | |
1067 | + 0, BLKIO_POLICY_THROTL); | |
1068 | + rcu_read_unlock(); | |
1069 | + | |
1070 | + /* Attach throtl data to request queue */ | |
1071 | + td->queue = q; | |
1072 | + q->td = td; | |
1073 | + return 0; | |
1074 | +} | |
1075 | + | |
1076 | +void blk_throtl_exit(struct request_queue *q) | |
1077 | +{ | |
1078 | + struct throtl_data *td = q->td; | |
1079 | + bool wait = false; | |
1080 | + | |
1081 | + BUG_ON(!td); | |
1082 | + | |
1083 | + throtl_shutdown_timer_wq(q); | |
1084 | + | |
1085 | + spin_lock_irq(q->queue_lock); | |
1086 | + throtl_release_tgs(td); | |
1087 | + | |
1088 | + /* If there are other groups */ | |
1089 | + if (td->nr_undestroyed_grps > 0) | |
1090 | + wait = true; | |
1091 | + | |
1092 | + spin_unlock_irq(q->queue_lock); | |
1093 | + | |
1094 | + /* | |
1095 | + * Wait for tg->blkg->key accessors to exit their grace periods. | |
1096 | + * Do this wait only if there are other undestroyed groups out | |
1097 | + * there (other than root group). This can happen if cgroup deletion | |
1098 | + * path claimed the responsibility of cleaning up a group before | |
 1099 | + * queue cleanup code gets to the group. | |
1100 | + * | |
1101 | + * Do not call synchronize_rcu() unconditionally as there are drivers | |
1102 | + * which create/delete request queue hundreds of times during scan/boot | |
1103 | + * and synchronize_rcu() can take significant time and slow down boot. | |
1104 | + */ | |
1105 | + if (wait) | |
1106 | + synchronize_rcu(); | |
1107 | + | |
1108 | + /* | |
 1109 | + * Just being safe: if somebody updated the limits through the cgroup | |
 1110 | + * after the previous flush and another work item got queued, cancel | |
1111 | + * it. | |
1112 | + */ | |
1113 | + throtl_shutdown_timer_wq(q); | |
1114 | + throtl_td_free(td); | |
1115 | +} | |
1116 | + | |
1117 | +static int __init throtl_init(void) | |
1118 | +{ | |
1119 | + blkio_policy_register(&blkio_policy_throtl); | |
1120 | + return 0; | |
1121 | +} | |
1122 | + | |
1123 | +module_init(throtl_init); |
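Once the policy is registered, per-device limits arrive through the blkio cgroup files added elsewhere in this series and land in the throtl_update_blkio_group_* callbacks above. A minimal configuration writer, assuming the blkio controller is mounted at /cgroup/blkio and the target disk is 8:16:

```c
/* Minimal limit writer; the mount point /cgroup/blkio, the file name and
 * the 8:16 device numbers are assumptions about the running system. */
#include <stdio.h>

int main(void)
{
    const char *path = "/cgroup/blkio/blkio.throttle.read_bps_device";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror("open throttle file");
        return 1;
    }
    /* Limit reads on device 8:16 to 1 MB/s. */
    fprintf(f, "8:16 1048576\n");
    if (fclose(f) != 0) {
        perror("write throttle limit");
        return 1;
    }
    return 0;
}
```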
block/blk.h
... | ... | @@ -110,10 +110,6 @@ |
110 | 110 | |
111 | 111 | int blk_dev_init(void); |
112 | 112 | |
113 | -void elv_quiesce_start(struct request_queue *q); | |
114 | -void elv_quiesce_end(struct request_queue *q); | |
115 | - | |
116 | - | |
117 | 113 | /* |
118 | 114 | * Return the threshold (number of used requests) at which the queue is |
119 | 115 | * considered to be congested. It include a little hysteresis to keep the |
... | ... | @@ -131,14 +127,6 @@ |
131 | 127 | { |
132 | 128 | return q->nr_congestion_off; |
133 | 129 | } |
134 | - | |
135 | -#if defined(CONFIG_BLK_DEV_INTEGRITY) | |
136 | - | |
137 | -#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ | |
138 | - __rq_for_each_bio(_iter.bio, _rq) \ | |
139 | - bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) | |
140 | - | |
141 | -#endif /* BLK_DEV_INTEGRITY */ | |
142 | 130 | |
143 | 131 | static inline int blk_cpu_to_group(int cpu) |
144 | 132 | { |
block/cfq-iosched.c
... | ... | @@ -160,6 +160,7 @@ |
160 | 160 | BE_WORKLOAD = 0, |
161 | 161 | RT_WORKLOAD = 1, |
162 | 162 | IDLE_WORKLOAD = 2, |
163 | + CFQ_PRIO_NR, | |
163 | 164 | }; |
164 | 165 | |
165 | 166 | /* |
166 | 167 | |
... | ... | @@ -184,10 +185,19 @@ |
184 | 185 | /* number of cfqq currently on this group */ |
185 | 186 | int nr_cfqq; |
186 | 187 | |
187 | - /* Per group busy queus average. Useful for workload slice calc. */ | |
188 | - unsigned int busy_queues_avg[2]; | |
189 | 188 | /* |
190 | - * rr lists of queues with requests, onle rr for each priority class. | |
189 | + * Per group busy queues average. Useful for workload slice calc. We | 
190 | + * create the array for each prio class but at run time it is used | |
191 | + * only for the RT and BE classes; the slot for the IDLE class remains unused. | 
192 | + * This is primarily done to avoid confusion and a gcc warning. | |
193 | + */ | |
194 | + unsigned int busy_queues_avg[CFQ_PRIO_NR]; | |
195 | + /* | |
196 | + * rr lists of queues with requests. We maintain service trees for | |
197 | + * RT and BE classes. These trees are subdivided into subclasses | 
198 | + * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE | |
199 | + * class there is no subclassification and all the cfq queues go on | |
200 | + * a single tree service_tree_idle. | |
191 | 201 | * Counts are embedded in the cfq_rb_root |
192 | 202 | */ |
193 | 203 | struct cfq_rb_root service_trees[2][3]; |
... | ... | @@ -221,7 +231,6 @@ |
221 | 231 | enum wl_type_t serving_type; |
222 | 232 | unsigned long workload_expires; |
223 | 233 | struct cfq_group *serving_group; |
224 | - bool noidle_tree_requires_idle; | |
225 | 234 | |
226 | 235 | /* |
227 | 236 | * Each priority tree is sorted by next_request position. These |
... | ... | @@ -977,8 +986,8 @@ |
977 | 986 | return NULL; |
978 | 987 | } |
979 | 988 | |
980 | -void | |
981 | -cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) | |
989 | +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, | |
990 | + unsigned int weight) | |
982 | 991 | { |
983 | 992 | cfqg_of_blkg(blkg)->weight = weight; |
984 | 993 | } |
... | ... | @@ -2180,7 +2189,6 @@ |
2180 | 2189 | slice = max_t(unsigned, slice, CFQ_MIN_TT); |
2181 | 2190 | cfq_log(cfqd, "workload slice:%d", slice); |
2182 | 2191 | cfqd->workload_expires = jiffies + slice; |
2183 | - cfqd->noidle_tree_requires_idle = false; | |
2184 | 2192 | } |
2185 | 2193 | |
2186 | 2194 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) |
... | ... | @@ -3177,7 +3185,9 @@ |
3177 | 3185 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) |
3178 | 3186 | cfq_mark_cfqq_deep(cfqq); |
3179 | 3187 | |
3180 | - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || | |
3188 | + if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) | |
3189 | + enable_idle = 0; | |
3190 | + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || | |
3181 | 3191 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) |
3182 | 3192 | enable_idle = 0; |
3183 | 3193 | else if (sample_valid(cic->ttime_samples)) { |
... | ... | @@ -3494,17 +3504,7 @@ |
3494 | 3504 | cfq_slice_expired(cfqd, 1); |
3495 | 3505 | else if (sync && cfqq_empty && |
3496 | 3506 | !cfq_close_cooperator(cfqd, cfqq)) { |
3497 | - cfqd->noidle_tree_requires_idle |= | |
3498 | - !(rq->cmd_flags & REQ_NOIDLE); | |
3499 | - /* | |
3500 | - * Idling is enabled for SYNC_WORKLOAD. | |
3501 | - * SYNC_NOIDLE_WORKLOAD idles at the end of the tree | |
3502 | - * only if we processed at least one !REQ_NOIDLE request | |
3503 | - */ | |
3504 | - if (cfqd->serving_type == SYNC_WORKLOAD | |
3505 | - || cfqd->noidle_tree_requires_idle | |
3506 | - || cfqq->cfqg->nr_cfqq == 1) | |
3507 | - cfq_arm_slice_timer(cfqd); | |
3507 | + cfq_arm_slice_timer(cfqd); | |
3508 | 3508 | } |
3509 | 3509 | } |
3510 | 3510 | |
... | ... | @@ -4090,6 +4090,7 @@ |
4090 | 4090 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, |
4091 | 4091 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, |
4092 | 4092 | }, |
4093 | + .plid = BLKIO_POLICY_PROP, | |
4093 | 4094 | }; |
4094 | 4095 | #else |
4095 | 4096 | static struct blkio_policy_type blkio_policy_cfq; |
block/cfq.h
... | ... | @@ -69,7 +69,7 @@ |
69 | 69 | |
70 | 70 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
71 | 71 | struct blkio_group *blkg, void *key, dev_t dev) { |
72 | - blkiocg_add_blkio_group(blkcg, blkg, key, dev); | |
72 | + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); | |
73 | 73 | } |
74 | 74 | |
75 | 75 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) |
block/genhd.c
... | ... | @@ -541,13 +541,15 @@ |
541 | 541 | disk->major = MAJOR(devt); |
542 | 542 | disk->first_minor = MINOR(devt); |
543 | 543 | |
544 | + /* Register BDI before referencing it from bdev */ | |
545 | + bdi = &disk->queue->backing_dev_info; | |
546 | + bdi_register_dev(bdi, disk_devt(disk)); | |
547 | + | |
544 | 548 | blk_register_region(disk_devt(disk), disk->minors, NULL, |
545 | 549 | exact_match, exact_lock, disk); |
546 | 550 | register_disk(disk); |
547 | 551 | blk_register_queue(disk); |
548 | 552 | |
549 | - bdi = &disk->queue->backing_dev_info; | |
550 | - bdi_register_dev(bdi, disk_devt(disk)); | |
551 | 553 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, |
552 | 554 | "bdi"); |
553 | 555 | WARN_ON(retval); |
... | ... | @@ -642,6 +644,7 @@ |
642 | 644 | struct hd_struct *part; |
643 | 645 | char name_buf[BDEVNAME_SIZE]; |
644 | 646 | char devt_buf[BDEVT_SIZE]; |
647 | + u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; | |
645 | 648 | |
646 | 649 | /* |
647 | 650 | * Don't show empty devices or things that have been |
648 | 651 | |
... | ... | @@ -660,10 +663,14 @@ |
660 | 663 | while ((part = disk_part_iter_next(&piter))) { |
661 | 664 | bool is_part0 = part == &disk->part0; |
662 | 665 | |
663 | - printk("%s%s %10llu %s", is_part0 ? "" : " ", | |
666 | + uuid[0] = 0; | |
667 | + if (part->info) | |
668 | + part_unpack_uuid(part->info->uuid, uuid); | |
669 | + | |
670 | + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", | |
664 | 671 | bdevt_str(part_devt(part), devt_buf), |
665 | 672 | (unsigned long long)part->nr_sects >> 1, |
666 | - disk_name(disk, part->partno, name_buf)); | |
673 | + disk_name(disk, part->partno, name_buf), uuid); | |
667 | 674 | if (is_part0) { |
668 | 675 | if (disk->driverfs_dev != NULL && |
669 | 676 | disk->driverfs_dev->driver != NULL) |
670 | 677 | |
... | ... | @@ -925,8 +932,15 @@ |
925 | 932 | { |
926 | 933 | struct disk_part_tbl *ptbl = |
927 | 934 | container_of(head, struct disk_part_tbl, rcu_head); |
935 | + struct gendisk *disk = ptbl->disk; | |
936 | + struct request_queue *q = disk->queue; | |
937 | + unsigned long flags; | |
928 | 938 | |
929 | 939 | kfree(ptbl); |
940 | + | |
941 | + spin_lock_irqsave(q->queue_lock, flags); | |
942 | + elv_quiesce_end(q); | |
943 | + spin_unlock_irqrestore(q->queue_lock, flags); | |
930 | 944 | } |
931 | 945 | |
932 | 946 | /** |
933 | 947 | |
... | ... | @@ -944,11 +958,17 @@ |
944 | 958 | struct disk_part_tbl *new_ptbl) |
945 | 959 | { |
946 | 960 | struct disk_part_tbl *old_ptbl = disk->part_tbl; |
961 | + struct request_queue *q = disk->queue; | |
947 | 962 | |
948 | 963 | rcu_assign_pointer(disk->part_tbl, new_ptbl); |
949 | 964 | |
950 | 965 | if (old_ptbl) { |
951 | 966 | rcu_assign_pointer(old_ptbl->last_lookup, NULL); |
967 | + | |
968 | + spin_lock_irq(q->queue_lock); | |
969 | + elv_quiesce_start(q); | |
970 | + spin_unlock_irq(q->queue_lock); | |
971 | + | |
952 | 972 | call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); |
953 | 973 | } |
954 | 974 | } |
... | ... | @@ -989,6 +1009,7 @@ |
989 | 1009 | return -ENOMEM; |
990 | 1010 | |
991 | 1011 | new_ptbl->len = target; |
1012 | + new_ptbl->disk = disk; | |
992 | 1013 | |
993 | 1014 | for (i = 0; i < len; i++) |
994 | 1015 | rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); |
... | ... | @@ -1004,6 +1025,7 @@ |
1004 | 1025 | kfree(disk->random); |
1005 | 1026 | disk_replace_part_tbl(disk, NULL); |
1006 | 1027 | free_part_stats(&disk->part0); |
1028 | + free_part_info(&disk->part0); | |
1007 | 1029 | kfree(disk); |
1008 | 1030 | } |
1009 | 1031 | struct class block_class = { |
block/ioctl.c
drivers/block/drbd/drbd_receiver.c
... | ... | @@ -2972,7 +2972,6 @@ |
2972 | 2972 | * we still need to figure out whether we accept that. */ |
2973 | 2973 | mdev->p_size = p_size; |
2974 | 2974 | |
2975 | -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | |
2976 | 2975 | if (get_ldev(mdev)) { |
2977 | 2976 | warn_if_differ_considerably(mdev, "lower level device sizes", |
2978 | 2977 | p_size, drbd_get_max_capacity(mdev->ldev)); |
drivers/md/dm-snap.c
... | ... | @@ -706,8 +706,6 @@ |
706 | 706 | return 0; |
707 | 707 | } |
708 | 708 | |
709 | -#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r))) | |
710 | - | |
711 | 709 | /* |
712 | 710 | * Return a minimum chunk size of all snapshots that have the specified origin. |
713 | 711 | * Return zero if the origin has no snapshots. |
drivers/md/dm-table.c
... | ... | @@ -486,11 +486,6 @@ |
486 | 486 | return 0; |
487 | 487 | } |
488 | 488 | |
489 | -/* | |
490 | - * Returns the minimum that is _not_ zero, unless both are zero. | |
491 | - */ | |
492 | -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | |
493 | - | |
494 | 489 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, |
495 | 490 | sector_t start, sector_t len, void *data) |
496 | 491 | { |
drivers/s390/scsi/zfcp_scsi.c
... | ... | @@ -681,6 +681,7 @@ |
681 | 681 | adapter->adapter_features & FSF_FEATURE_DIX_PROT_TCPIP) { |
682 | 682 | mask |= SHOST_DIX_TYPE1_PROTECTION; |
683 | 683 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); |
684 | + shost->sg_prot_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; | |
684 | 685 | shost->sg_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; |
685 | 686 | shost->max_sectors = ZFCP_QDIO_MAX_SBALES_PER_REQ * 8 / 2; |
686 | 687 | } |
drivers/scsi/hosts.c
... | ... | @@ -376,6 +376,7 @@ |
376 | 376 | shost->this_id = sht->this_id; |
377 | 377 | shost->can_queue = sht->can_queue; |
378 | 378 | shost->sg_tablesize = sht->sg_tablesize; |
379 | + shost->sg_prot_tablesize = sht->sg_prot_tablesize; | |
379 | 380 | shost->cmd_per_lun = sht->cmd_per_lun; |
380 | 381 | shost->unchecked_isa_dma = sht->unchecked_isa_dma; |
381 | 382 | shost->use_clustering = sht->use_clustering; |
drivers/scsi/scsi_lib.c
... | ... | @@ -968,11 +968,13 @@ |
968 | 968 | */ |
969 | 969 | int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) |
970 | 970 | { |
971 | - int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); | |
971 | + struct request *rq = cmd->request; | |
972 | + | |
973 | + int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask); | |
972 | 974 | if (error) |
973 | 975 | goto err_exit; |
974 | 976 | |
975 | - if (blk_bidi_rq(cmd->request)) { | |
977 | + if (blk_bidi_rq(rq)) { | |
976 | 978 | struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( |
977 | 979 | scsi_sdb_cache, GFP_ATOMIC); |
978 | 980 | if (!bidi_sdb) { |
979 | 981 | |
980 | 982 | |
981 | 983 | |
982 | 984 | |
... | ... | @@ -980,28 +982,28 @@ |
980 | 982 | goto err_exit; |
981 | 983 | } |
982 | 984 | |
983 | - cmd->request->next_rq->special = bidi_sdb; | |
984 | - error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, | |
985 | - GFP_ATOMIC); | |
985 | + rq->next_rq->special = bidi_sdb; | |
986 | + error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC); | |
986 | 987 | if (error) |
987 | 988 | goto err_exit; |
988 | 989 | } |
989 | 990 | |
990 | - if (blk_integrity_rq(cmd->request)) { | |
991 | + if (blk_integrity_rq(rq)) { | |
991 | 992 | struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; |
992 | 993 | int ivecs, count; |
993 | 994 | |
994 | 995 | BUG_ON(prot_sdb == NULL); |
995 | - ivecs = blk_rq_count_integrity_sg(cmd->request); | |
996 | + ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); | |
996 | 997 | |
997 | 998 | if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) { |
998 | 999 | error = BLKPREP_DEFER; |
999 | 1000 | goto err_exit; |
1000 | 1001 | } |
1001 | 1002 | |
1002 | - count = blk_rq_map_integrity_sg(cmd->request, | |
1003 | + count = blk_rq_map_integrity_sg(rq->q, rq->bio, | |
1003 | 1004 | prot_sdb->table.sgl); |
1004 | 1005 | BUG_ON(unlikely(count > ivecs)); |
1006 | + BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q))); | |
1005 | 1007 | |
1006 | 1008 | cmd->prot_sdb = prot_sdb; |
1007 | 1009 | cmd->prot_sdb->table.nents = count; |
... | ... | @@ -1624,6 +1626,14 @@ |
1624 | 1626 | */ |
1625 | 1627 | blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, |
1626 | 1628 | SCSI_MAX_SG_CHAIN_SEGMENTS)); |
1629 | + | |
1630 | + if (scsi_host_prot_dma(shost)) { | |
1631 | + shost->sg_prot_tablesize = | |
1632 | + min_not_zero(shost->sg_prot_tablesize, | |
1633 | + (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS); | |
1634 | + BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize); | |
1635 | + blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize); | |
1636 | + } | |
1627 | 1637 | |
1628 | 1638 | blk_queue_max_hw_sectors(q, shost->max_sectors); |
1629 | 1639 | blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); |
drivers/scsi/scsi_sysfs.c
... | ... | @@ -251,6 +251,7 @@ |
251 | 251 | shost_rd_attr(cmd_per_lun, "%hd\n"); |
252 | 252 | shost_rd_attr(can_queue, "%hd\n"); |
253 | 253 | shost_rd_attr(sg_tablesize, "%hu\n"); |
254 | +shost_rd_attr(sg_prot_tablesize, "%hu\n"); | |
254 | 255 | shost_rd_attr(unchecked_isa_dma, "%d\n"); |
255 | 256 | shost_rd_attr(prot_capabilities, "%u\n"); |
256 | 257 | shost_rd_attr(prot_guard_type, "%hd\n"); |
... | ... | @@ -262,6 +263,7 @@ |
262 | 263 | &dev_attr_cmd_per_lun.attr, |
263 | 264 | &dev_attr_can_queue.attr, |
264 | 265 | &dev_attr_sg_tablesize.attr, |
266 | + &dev_attr_sg_prot_tablesize.attr, | |
265 | 267 | &dev_attr_unchecked_isa_dma.attr, |
266 | 268 | &dev_attr_proc_name.attr, |
267 | 269 | &dev_attr_scan.attr, |
drivers/scsi/sd_dif.c
... | ... | @@ -375,21 +375,20 @@ |
375 | 375 | unsigned int i, j; |
376 | 376 | u32 phys, virt; |
377 | 377 | |
378 | - /* Already remapped? */ | |
379 | - if (rq->cmd_flags & REQ_INTEGRITY) | |
380 | - return 0; | |
381 | - | |
382 | 378 | sdkp = rq->bio->bi_bdev->bd_disk->private_data; |
383 | 379 | |
384 | 380 | if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION) |
385 | 381 | return 0; |
386 | 382 | |
387 | - rq->cmd_flags |= REQ_INTEGRITY; | |
388 | 383 | phys = hw_sector & 0xffffffff; |
389 | 384 | |
390 | 385 | __rq_for_each_bio(bio, rq) { |
391 | 386 | struct bio_vec *iv; |
392 | 387 | |
388 | + /* Already remapped? */ | |
389 | + if (bio_flagged(bio, BIO_MAPPED_INTEGRITY)) | |
390 | + break; | |
391 | + | |
393 | 392 | virt = bio->bi_integrity->bip_sector & 0xffffffff; |
394 | 393 | |
395 | 394 | bip_for_each_vec(iv, bio->bi_integrity, i) { |
... | ... | @@ -408,6 +407,8 @@ |
408 | 407 | |
409 | 408 | kunmap_atomic(sdt, KM_USER0); |
410 | 409 | } |
410 | + | |
411 | + bio->bi_flags |= BIO_MAPPED_INTEGRITY; | |
411 | 412 | } |
412 | 413 | |
413 | 414 | return 0; |
drivers/scsi/sg.c
... | ... | @@ -1660,7 +1660,7 @@ |
1660 | 1660 | if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO && |
1661 | 1661 | dxfer_dir != SG_DXFER_UNKNOWN && !iov_count && |
1662 | 1662 | !sfp->parentdp->device->host->unchecked_isa_dma && |
1663 | - blk_rq_aligned(q, hp->dxferp, dxfer_len)) | |
1663 | + blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len)) | |
1664 | 1664 | md = NULL; |
1665 | 1665 | else |
1666 | 1666 | md = &map_data; |
fs/jbd/commit.c
fs/jbd2/commit.c
... | ... | @@ -360,7 +360,7 @@ |
360 | 360 | int tag_bytes = journal_tag_bytes(journal); |
361 | 361 | struct buffer_head *cbh = NULL; /* For transactional checksums */ |
362 | 362 | __u32 crc32_sum = ~0; |
363 | - int write_op = WRITE; | |
363 | + int write_op = WRITE_SYNC; | |
364 | 364 | |
365 | 365 | /* |
366 | 366 | * First job: lock down the current transaction and wait for |
fs/partitions/check.c
... | ... | @@ -352,6 +352,7 @@ |
352 | 352 | { |
353 | 353 | struct hd_struct *p = dev_to_part(dev); |
354 | 354 | free_part_stats(p); |
355 | + free_part_info(p); | |
355 | 356 | kfree(p); |
356 | 357 | } |
357 | 358 | |
358 | 359 | |
359 | 360 | |
... | ... | @@ -364,17 +365,25 @@ |
364 | 365 | static void delete_partition_rcu_cb(struct rcu_head *head) |
365 | 366 | { |
366 | 367 | struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); |
368 | + struct gendisk *disk = part_to_disk(part); | |
369 | + struct request_queue *q = disk->queue; | |
370 | + unsigned long flags; | |
367 | 371 | |
368 | 372 | part->start_sect = 0; |
369 | 373 | part->nr_sects = 0; |
370 | 374 | part_stat_set_all(part, 0); |
371 | 375 | put_device(part_to_dev(part)); |
376 | + | |
377 | + spin_lock_irqsave(q->queue_lock, flags); | |
378 | + elv_quiesce_end(q); | |
379 | + spin_unlock_irqrestore(q->queue_lock, flags); | |
372 | 380 | } |
373 | 381 | |
374 | 382 | void delete_partition(struct gendisk *disk, int partno) |
375 | 383 | { |
376 | 384 | struct disk_part_tbl *ptbl = disk->part_tbl; |
377 | 385 | struct hd_struct *part; |
386 | + struct request_queue *q = disk->queue; | |
378 | 387 | |
379 | 388 | if (partno >= ptbl->len) |
380 | 389 | return; |
... | ... | @@ -389,6 +398,10 @@ |
389 | 398 | kobject_put(part->holder_dir); |
390 | 399 | device_del(part_to_dev(part)); |
391 | 400 | |
401 | + spin_lock_irq(q->queue_lock); | |
402 | + elv_quiesce_start(q); | |
403 | + spin_unlock_irq(q->queue_lock); | |
404 | + | |
392 | 405 | call_rcu(&part->rcu_head, delete_partition_rcu_cb); |
393 | 406 | } |
394 | 407 | |
... | ... | @@ -401,7 +414,8 @@ |
401 | 414 | whole_disk_show, NULL); |
402 | 415 | |
403 | 416 | struct hd_struct *add_partition(struct gendisk *disk, int partno, |
404 | - sector_t start, sector_t len, int flags) | |
417 | + sector_t start, sector_t len, int flags, | |
418 | + struct partition_meta_info *info) | |
405 | 419 | { |
406 | 420 | struct hd_struct *p; |
407 | 421 | dev_t devt = MKDEV(0, 0); |
... | ... | @@ -438,6 +452,14 @@ |
438 | 452 | p->partno = partno; |
439 | 453 | p->policy = get_disk_ro(disk); |
440 | 454 | |
455 | + if (info) { | |
456 | + struct partition_meta_info *pinfo = alloc_part_info(disk); | |
457 | + if (!pinfo) | |
458 | + goto out_free_stats; | |
459 | + memcpy(pinfo, info, sizeof(*info)); | |
460 | + p->info = pinfo; | |
461 | + } | |
462 | + | |
441 | 463 | dname = dev_name(ddev); |
442 | 464 | if (isdigit(dname[strlen(dname) - 1])) |
443 | 465 | dev_set_name(pdev, "%sp%d", dname, partno); |
... | ... | @@ -451,7 +473,7 @@ |
451 | 473 | |
452 | 474 | err = blk_alloc_devt(p, &devt); |
453 | 475 | if (err) |
454 | - goto out_free_stats; | |
476 | + goto out_free_info; | |
455 | 477 | pdev->devt = devt; |
456 | 478 | |
457 | 479 | /* delay uevent until 'holders' subdir is created */ |
... | ... | @@ -481,6 +503,8 @@ |
481 | 503 | |
482 | 504 | return p; |
483 | 505 | |
506 | +out_free_info: | |
507 | + free_part_info(p); | |
484 | 508 | out_free_stats: |
485 | 509 | free_part_stats(p); |
486 | 510 | out_free: |
... | ... | @@ -642,6 +666,7 @@ |
642 | 666 | /* add partitions */ |
643 | 667 | for (p = 1; p < state->limit; p++) { |
644 | 668 | sector_t size, from; |
669 | + struct partition_meta_info *info = NULL; | |
645 | 670 | |
646 | 671 | size = state->parts[p].size; |
647 | 672 | if (!size) |
648 | 673 | |
... | ... | @@ -675,8 +700,12 @@ |
675 | 700 | size = get_capacity(disk) - from; |
676 | 701 | } |
677 | 702 | } |
703 | + | |
704 | + if (state->parts[p].has_info) | |
705 | + info = &state->parts[p].info; | |
678 | 706 | part = add_partition(disk, p, from, size, |
679 | - state->parts[p].flags); | |
707 | + state->parts[p].flags, | |
708 | + &state->parts[p].info); | |
680 | 709 | if (IS_ERR(part)) { |
681 | 710 | printk(KERN_ERR " %s: p%d could not be added: %ld\n", |
682 | 711 | disk->disk_name, p, -PTR_ERR(part)); |
fs/partitions/check.h
1 | 1 | #include <linux/pagemap.h> |
2 | 2 | #include <linux/blkdev.h> |
3 | +#include <linux/genhd.h> | |
3 | 4 | |
4 | 5 | /* |
5 | 6 | * add_gd_partition adds a partitions details to the devices partition |
... | ... | @@ -12,6 +13,8 @@ |
12 | 13 | sector_t from; |
13 | 14 | sector_t size; |
14 | 15 | int flags; |
16 | + bool has_info; | |
17 | + struct partition_meta_info info; | |
15 | 18 | } parts[DISK_MAX_PARTS]; |
16 | 19 | int next; |
17 | 20 | int limit; |
fs/partitions/efi.c
... | ... | @@ -94,6 +94,7 @@ |
94 | 94 | * |
95 | 95 | ************************************************************/ |
96 | 96 | #include <linux/crc32.h> |
97 | +#include <linux/ctype.h> | |
97 | 98 | #include <linux/math64.h> |
98 | 99 | #include <linux/slab.h> |
99 | 100 | #include "check.h" |
... | ... | @@ -604,6 +605,7 @@ |
604 | 605 | gpt_entry *ptes = NULL; |
605 | 606 | u32 i; |
606 | 607 | unsigned ssz = bdev_logical_block_size(state->bdev) / 512; |
608 | + u8 unparsed_guid[37]; | |
607 | 609 | |
608 | 610 | if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { |
609 | 611 | kfree(gpt); |
... | ... | @@ -614,6 +616,9 @@ |
614 | 616 | pr_debug("GUID Partition Table is valid! Yea!\n"); |
615 | 617 | |
616 | 618 | for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { |
619 | + struct partition_meta_info *info; | |
620 | + unsigned label_count = 0; | |
621 | + unsigned label_max; | |
617 | 622 | u64 start = le64_to_cpu(ptes[i].starting_lba); |
618 | 623 | u64 size = le64_to_cpu(ptes[i].ending_lba) - |
619 | 624 | le64_to_cpu(ptes[i].starting_lba) + 1ULL; |
... | ... | @@ -627,6 +632,26 @@ |
627 | 632 | if (!efi_guidcmp(ptes[i].partition_type_guid, |
628 | 633 | PARTITION_LINUX_RAID_GUID)) |
629 | 634 | state->parts[i + 1].flags = ADDPART_FLAG_RAID; |
635 | + | |
636 | + info = &state->parts[i + 1].info; | |
637 | + /* Instead of doing a manual swap to big endian, reuse the | |
638 | + * common ASCII hex format as the interim. | |
639 | + */ | |
640 | + efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid); | |
641 | + part_pack_uuid(unparsed_guid, info->uuid); | |
642 | + | |
643 | + /* Naively convert UTF16-LE to 7 bits. */ | |
644 | + label_max = min(sizeof(info->volname) - 1, | |
645 | + sizeof(ptes[i].partition_name)); | |
646 | + info->volname[label_max] = 0; | |
647 | + while (label_count < label_max) { | |
648 | + u8 c = ptes[i].partition_name[label_count] & 0xff; | |
649 | + if (c && !isprint(c)) | |
650 | + c = '!'; | |
651 | + info->volname[label_count] = c; | |
652 | + label_count++; | |
653 | + } | |
654 | + state->parts[i + 1].has_info = true; | |
630 | 655 | } |
631 | 656 | kfree(ptes); |
632 | 657 | kfree(gpt); |
include/linux/bio.h
... | ... | @@ -346,8 +346,15 @@ |
346 | 346 | } |
347 | 347 | |
348 | 348 | #else |
349 | -#define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) | |
350 | -#define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) | |
349 | +static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) | |
350 | +{ | |
351 | + return page_address(bvec->bv_page) + bvec->bv_offset; | |
352 | +} | |
353 | + | |
354 | +static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) | |
355 | +{ | |
356 | + *flags = 0; | |
357 | +} | |
351 | 358 | #endif |
352 | 359 | |
353 | 360 | static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, |
... | ... | @@ -495,6 +502,10 @@ |
495 | 502 | |
496 | 503 | #define bip_for_each_vec(bvl, bip, i) \ |
497 | 504 | __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) |
505 | + | |
506 | +#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ | |
507 | + for_each_bio(_bio) \ | |
508 | + bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) | |
498 | 509 | |
499 | 510 | #define bio_integrity(bio) (bio->bi_integrity != NULL) |
500 | 511 |
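The bvec_kmap_irq()/bvec_kunmap_irq() conversion above keeps the calling convention unchanged; callers only gain type checking in the !CONFIG_HIGHMEM case. A minimal caller sketch (helper name hypothetical):

	/* Hypothetical caller: zero one bio_vec's payload. The flags argument
	 * only matters when CONFIG_HIGHMEM is set; the new static inlines
	 * above make the non-highmem stubs type-check the same way. */
	static void zero_bvec(struct bio_vec *bvec)
	{
		unsigned long flags;
		char *buf = bvec_kmap_irq(bvec, &flags);

		memset(buf, 0, bvec->bv_len);
		bvec_kunmap_irq(buf, &flags);
	}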
include/linux/blk_types.h
... | ... | @@ -97,6 +97,7 @@ |
97 | 97 | #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ |
98 | 98 | #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ |
99 | 99 | #define BIO_QUIET 11 /* Make BIO Quiet */ |
100 | +#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */ | |
100 | 101 | #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) |
101 | 102 | |
102 | 103 | /* |
... | ... | @@ -130,6 +131,8 @@ |
130 | 131 | /* bio only flags */ |
131 | 132 | __REQ_UNPLUG, /* unplug the immediately after submission */ |
132 | 133 | __REQ_RAHEAD, /* read ahead, can fail anytime */ |
134 | + __REQ_THROTTLED, /* This bio has already been subjected to | |
135 | + * throttling rules. Don't do it again. */ | |
133 | 136 | |
134 | 137 | /* request only flags */ |
135 | 138 | __REQ_SORTED, /* elevator knows about this request */ |
... | ... | @@ -146,7 +149,6 @@ |
146 | 149 | __REQ_ORDERED_COLOR, /* is before or after barrier */ |
147 | 150 | __REQ_ALLOCED, /* request came from our alloc pool */ |
148 | 151 | __REQ_COPY_USER, /* contains copies of user pages */ |
149 | - __REQ_INTEGRITY, /* integrity metadata has been remapped */ | |
150 | 152 | __REQ_FLUSH, /* request for cache flush */ |
151 | 153 | __REQ_IO_STAT, /* account I/O stat */ |
152 | 154 | __REQ_MIXED_MERGE, /* merge of different types, fail separately */ |
... | ... | @@ -172,6 +174,7 @@ |
172 | 174 | |
173 | 175 | #define REQ_UNPLUG (1 << __REQ_UNPLUG) |
174 | 176 | #define REQ_RAHEAD (1 << __REQ_RAHEAD) |
177 | +#define REQ_THROTTLED (1 << __REQ_THROTTLED) | |
175 | 178 | |
176 | 179 | #define REQ_SORTED (1 << __REQ_SORTED) |
177 | 180 | #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) |
... | ... | @@ -187,7 +190,6 @@ |
187 | 190 | #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) |
188 | 191 | #define REQ_ALLOCED (1 << __REQ_ALLOCED) |
189 | 192 | #define REQ_COPY_USER (1 << __REQ_COPY_USER) |
190 | -#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) | |
191 | 193 | #define REQ_FLUSH (1 << __REQ_FLUSH) |
192 | 194 | #define REQ_IO_STAT (1 << __REQ_IO_STAT) |
193 | 195 | #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) |
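REQ_THROTTLED is a bookkeeping marker for the new throttling code: a bio is charged against its group's limits once, flagged, and recognized on re-entry so it is not charged twice. A rough sketch of that pattern (function name hypothetical, not the actual blk-throttle.c logic):

	/* Hypothetical re-entry guard: a bio already subjected to the
	 * throttling rules is passed straight through and the marker
	 * cleared; otherwise it is marked before being charged. */
	static bool example_bio_needs_throttling(struct bio *bio)
	{
		if (bio->bi_rw & REQ_THROTTLED) {
			bio->bi_rw &= ~REQ_THROTTLED;
			return false;		/* already accounted for */
		}
		bio->bi_rw |= REQ_THROTTLED;
		return true;			/* charge against group limits */
	}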
include/linux/blkdev.h
... | ... | @@ -115,6 +115,7 @@ |
115 | 115 | void *elevator_private3; |
116 | 116 | |
117 | 117 | struct gendisk *rq_disk; |
118 | + struct hd_struct *part; | |
118 | 119 | unsigned long start_time; |
119 | 120 | #ifdef CONFIG_BLK_CGROUP |
120 | 121 | unsigned long long start_time_ns; |
... | ... | @@ -124,6 +125,9 @@ |
124 | 125 | * physical address coalescing is performed. |
125 | 126 | */ |
126 | 127 | unsigned short nr_phys_segments; |
128 | +#if defined(CONFIG_BLK_DEV_INTEGRITY) | |
129 | + unsigned short nr_integrity_segments; | |
130 | +#endif | |
127 | 131 | |
128 | 132 | unsigned short ioprio; |
129 | 133 | |
... | ... | @@ -243,6 +247,7 @@ |
243 | 247 | |
244 | 248 | unsigned short logical_block_size; |
245 | 249 | unsigned short max_segments; |
250 | + unsigned short max_integrity_segments; | |
246 | 251 | |
247 | 252 | unsigned char misaligned; |
248 | 253 | unsigned char discard_misaligned; |
... | ... | @@ -367,6 +372,11 @@ |
367 | 372 | #if defined(CONFIG_BLK_DEV_BSG) |
368 | 373 | struct bsg_class_device bsg_dev; |
369 | 374 | #endif |
375 | + | |
376 | +#ifdef CONFIG_BLK_DEV_THROTTLING | |
377 | + /* Throttle data */ | |
378 | + struct throtl_data *td; | |
379 | +#endif | |
370 | 380 | }; |
371 | 381 | |
372 | 382 | #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ |
... | ... | @@ -851,7 +861,7 @@ |
851 | 861 | extern void blk_queue_max_discard_sectors(struct request_queue *q, |
852 | 862 | unsigned int max_discard_sectors); |
853 | 863 | extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); |
854 | -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); | |
864 | +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); | |
855 | 865 | extern void blk_queue_alignment_offset(struct request_queue *q, |
856 | 866 | unsigned int alignment); |
857 | 867 | extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); |
... | ... | @@ -1004,7 +1014,7 @@ |
1004 | 1014 | return q->limits.physical_block_size; |
1005 | 1015 | } |
1006 | 1016 | |
1007 | -static inline int bdev_physical_block_size(struct block_device *bdev) | |
1017 | +static inline unsigned int bdev_physical_block_size(struct block_device *bdev) | |
1008 | 1018 | { |
1009 | 1019 | return queue_physical_block_size(bdev_get_queue(bdev)); |
1010 | 1020 | } |
1011 | 1021 | |
... | ... | @@ -1093,11 +1103,11 @@ |
1093 | 1103 | return q ? q->dma_alignment : 511; |
1094 | 1104 | } |
1095 | 1105 | |
1096 | -static inline int blk_rq_aligned(struct request_queue *q, void *addr, | |
1106 | +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, | |
1097 | 1107 | unsigned int len) |
1098 | 1108 | { |
1099 | 1109 | unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; |
1100 | - return !((unsigned long)addr & alignment) && !(len & alignment); | |
1110 | + return !(addr & alignment) && !(len & alignment); | |
1101 | 1111 | } |
1102 | 1112 | |
1103 | 1113 | /* assumes size > 256 */ |
... | ... | @@ -1127,6 +1137,7 @@ |
1127 | 1137 | |
1128 | 1138 | struct work_struct; |
1129 | 1139 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); |
1140 | +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); | |
1130 | 1141 | |
1131 | 1142 | #ifdef CONFIG_BLK_CGROUP |
1132 | 1143 | /* |
... | ... | @@ -1170,6 +1181,24 @@ |
1170 | 1181 | } |
1171 | 1182 | #endif |
1172 | 1183 | |
1184 | +#ifdef CONFIG_BLK_DEV_THROTTLING | |
1185 | +extern int blk_throtl_init(struct request_queue *q); | |
1186 | +extern void blk_throtl_exit(struct request_queue *q); | |
1187 | +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio); | |
1188 | +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay); | |
1189 | +extern void throtl_shutdown_timer_wq(struct request_queue *q); | |
1190 | +#else /* CONFIG_BLK_DEV_THROTTLING */ | |
1191 | +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio) | |
1192 | +{ | |
1193 | + return 0; | |
1194 | +} | |
1195 | + | |
1196 | +static inline int blk_throtl_init(struct request_queue *q) { return 0; } | |
1197 | +static inline int blk_throtl_exit(struct request_queue *q) { return 0; } | |
1198 | +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {} | |
1199 | +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {} | |
1200 | +#endif /* CONFIG_BLK_DEV_THROTTLING */ | |
1201 | + | |
1173 | 1202 | #define MODULE_ALIAS_BLOCKDEV(major,minor) \ |
1174 | 1203 | MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) |
1175 | 1204 | #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ |
... | ... | @@ -1213,8 +1242,13 @@ |
1213 | 1242 | extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); |
1214 | 1243 | extern void blk_integrity_unregister(struct gendisk *); |
1215 | 1244 | extern int blk_integrity_compare(struct gendisk *, struct gendisk *); |
1216 | -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); | |
1217 | -extern int blk_rq_count_integrity_sg(struct request *); | |
1245 | +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, | |
1246 | + struct scatterlist *); | |
1247 | +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); | |
1248 | +extern int blk_integrity_merge_rq(struct request_queue *, struct request *, | |
1249 | + struct request *); | |
1250 | +extern int blk_integrity_merge_bio(struct request_queue *, struct request *, | |
1251 | + struct bio *); | |
1218 | 1252 | |
1219 | 1253 | static inline |
1220 | 1254 | struct blk_integrity *bdev_get_integrity(struct block_device *bdev) |
1221 | 1255 | |
1222 | 1256 | |
... | ... | @@ -1235,16 +1269,32 @@ |
1235 | 1269 | return bio_integrity(rq->bio); |
1236 | 1270 | } |
1237 | 1271 | |
1272 | +static inline void blk_queue_max_integrity_segments(struct request_queue *q, | |
1273 | + unsigned int segs) | |
1274 | +{ | |
1275 | + q->limits.max_integrity_segments = segs; | |
1276 | +} | |
1277 | + | |
1278 | +static inline unsigned short | |
1279 | +queue_max_integrity_segments(struct request_queue *q) | |
1280 | +{ | |
1281 | + return q->limits.max_integrity_segments; | |
1282 | +} | |
1283 | + | |
1238 | 1284 | #else /* CONFIG_BLK_DEV_INTEGRITY */ |
1239 | 1285 | |
1240 | 1286 | #define blk_integrity_rq(rq) (0) |
1241 | -#define blk_rq_count_integrity_sg(a) (0) | |
1242 | -#define blk_rq_map_integrity_sg(a, b) (0) | |
1287 | +#define blk_rq_count_integrity_sg(a, b) (0) | |
1288 | +#define blk_rq_map_integrity_sg(a, b, c) (0) | |
1243 | 1289 | #define bdev_get_integrity(a) (0) |
1244 | 1290 | #define blk_get_integrity(a) (0) |
1245 | 1291 | #define blk_integrity_compare(a, b) (0) |
1246 | 1292 | #define blk_integrity_register(a, b) (0) |
1247 | 1293 | #define blk_integrity_unregister(a) do { } while (0); |
1294 | +#define blk_queue_max_integrity_segments(a, b) do { } while (0); | |
1295 | +#define queue_max_integrity_segments(a) (0) | |
1296 | +#define blk_integrity_merge_rq(a, b, c) (0) | |
1297 | +#define blk_integrity_merge_bio(a, b, c) (0) | |
1248 | 1298 | |
1249 | 1299 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
1250 | 1300 |
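The new max_integrity_segments queue limit is meant to be set by the low-level driver during queue setup, analogous to the scsi_lib.c hunk earlier in this diff. A hedged sketch with hypothetical names:

	/* Hypothetical LLD queue setup: advertise both the data and the
	 * protection-information scatterlist limits so request merging
	 * stays within what the controller can handle. */
	static void example_set_queue_limits(struct request_queue *q,
					     unsigned short data_segs,
					     unsigned short prot_segs)
	{
		blk_queue_max_segments(q, data_segs);
		blk_queue_max_integrity_segments(q, prot_segs);
	}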
include/linux/elevator.h
... | ... | @@ -122,6 +122,8 @@ |
122 | 122 | extern int elv_set_request(struct request_queue *, struct request *, gfp_t); |
123 | 123 | extern void elv_put_request(struct request_queue *, struct request *); |
124 | 124 | extern void elv_drain_elevator(struct request_queue *); |
125 | +extern void elv_quiesce_start(struct request_queue *); | |
126 | +extern void elv_quiesce_end(struct request_queue *); | |
125 | 127 | |
126 | 128 | /* |
127 | 129 | * io scheduler registration |
include/linux/genhd.h
... | ... | @@ -12,6 +12,7 @@ |
12 | 12 | #include <linux/types.h> |
13 | 13 | #include <linux/kdev_t.h> |
14 | 14 | #include <linux/rcupdate.h> |
15 | +#include <linux/slab.h> | |
15 | 16 | |
16 | 17 | #ifdef CONFIG_BLOCK |
17 | 18 | |
... | ... | @@ -86,7 +87,15 @@ |
86 | 87 | unsigned long io_ticks; |
87 | 88 | unsigned long time_in_queue; |
88 | 89 | }; |
89 | - | |
90 | + | |
91 | +#define PARTITION_META_INFO_VOLNAMELTH 64 | |
92 | +#define PARTITION_META_INFO_UUIDLTH 16 | |
93 | + | |
94 | +struct partition_meta_info { | |
95 | + u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */ | |
96 | + u8 volname[PARTITION_META_INFO_VOLNAMELTH]; | |
97 | +}; | |
98 | + | |
90 | 99 | struct hd_struct { |
91 | 100 | sector_t start_sect; |
92 | 101 | sector_t nr_sects; |
... | ... | @@ -95,6 +104,7 @@ |
95 | 104 | struct device __dev; |
96 | 105 | struct kobject *holder_dir; |
97 | 106 | int policy, partno; |
107 | + struct partition_meta_info *info; | |
98 | 108 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
99 | 109 | int make_it_fail; |
100 | 110 | #endif |
... | ... | @@ -130,6 +140,7 @@ |
130 | 140 | struct rcu_head rcu_head; |
131 | 141 | int len; |
132 | 142 | struct hd_struct __rcu *last_lookup; |
143 | + struct gendisk *disk; | |
133 | 144 | struct hd_struct __rcu *part[]; |
134 | 145 | }; |
135 | 146 | |
... | ... | @@ -181,6 +192,30 @@ |
181 | 192 | return NULL; |
182 | 193 | } |
183 | 194 | |
195 | +static inline void part_pack_uuid(const u8 *uuid_str, u8 *to) | |
196 | +{ | |
197 | + int i; | |
198 | + for (i = 0; i < 16; ++i) { | |
199 | + *to++ = (hex_to_bin(*uuid_str) << 4) | | |
200 | + (hex_to_bin(*(uuid_str + 1))); | |
201 | + uuid_str += 2; | |
202 | + switch (i) { | |
203 | + case 3: | |
204 | + case 5: | |
205 | + case 7: | |
206 | + case 9: | |
207 | + uuid_str++; | |
208 | + continue; | |
209 | + } | |
210 | + } | |
211 | +} | |
212 | + | |
213 | +static inline char *part_unpack_uuid(const u8 *uuid, char *out) | |
214 | +{ | |
215 | + sprintf(out, "%pU", uuid); | |
216 | + return out; | |
217 | +} | |
218 | + | |
184 | 219 | static inline int disk_max_parts(struct gendisk *disk) |
185 | 220 | { |
186 | 221 | if (disk->flags & GENHD_FL_EXT_DEVT) |
... | ... | @@ -342,6 +377,19 @@ |
342 | 377 | return part->in_flight[0] + part->in_flight[1]; |
343 | 378 | } |
344 | 379 | |
380 | +static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) | |
381 | +{ | |
382 | + if (disk) | |
383 | + return kzalloc_node(sizeof(struct partition_meta_info), | |
384 | + GFP_KERNEL, disk->node_id); | |
385 | + return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL); | |
386 | +} | |
387 | + | |
388 | +static inline void free_part_info(struct hd_struct *part) | |
389 | +{ | |
390 | + kfree(part->info); | |
391 | +} | |
392 | + | |
345 | 393 | /* block/blk-core.c */ |
346 | 394 | extern void part_round_stats(int cpu, struct hd_struct *part); |
347 | 395 | |
... | ... | @@ -533,7 +581,9 @@ |
533 | 581 | extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); |
534 | 582 | extern struct hd_struct * __must_check add_partition(struct gendisk *disk, |
535 | 583 | int partno, sector_t start, |
536 | - sector_t len, int flags); | |
584 | + sector_t len, int flags, | |
585 | + struct partition_meta_info | |
586 | + *info); | |
537 | 587 | extern void delete_partition(struct gendisk *, int); |
538 | 588 | extern void printk_all_partitions(void); |
539 | 589 |
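part_pack_uuid() and part_unpack_uuid() are inverses of each other; a small round-trip sketch (function name and UUID value are illustrative only):

	/* Hypothetical round trip: pack a textual GPT UUID into the 16 byte
	 * big-endian form stored in struct partition_meta_info, then format
	 * it back into text via %pU. */
	static void example_uuid_round_trip(void)
	{
		u8 raw[PARTITION_META_INFO_UUIDLTH];
		char text[37];	/* "%pU" emits 36 characters plus a NUL */

		part_pack_uuid((const u8 *)"00112233-4455-6677-8899-aabbccddeeff",
			       raw);
		part_unpack_uuid(raw, text);
	}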
include/linux/kernel.h
... | ... | @@ -652,6 +652,16 @@ |
652 | 652 | _max1 > _max2 ? _max1 : _max2; }) |
653 | 653 | |
654 | 654 | /** |
655 | + * min_not_zero - return the minimum that is _not_ zero, unless both are zero | |
656 | + * @x: value1 | |
657 | + * @y: value2 | |
658 | + */ | |
659 | +#define min_not_zero(x, y) ({ \ | |
660 | + typeof(x) __x = (x); \ | |
661 | + typeof(y) __y = (y); \ | |
662 | + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) | |
663 | + | |
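min_not_zero() consolidates the three ad-hoc copies removed from drbd, dm-snap and dm-table above. As a quick illustration of its semantics, a stand-alone userspace rendition (not part of the patch) with an equivalent definition:

	#include <assert.h>

	#define min(x, y) ((x) < (y) ? (x) : (y))
	#define min_not_zero(x, y) ({		\
		typeof(x) __x = (x);		\
		typeof(y) __y = (y);		\
		__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })

	int main(void)
	{
		assert(min_not_zero(0U, 4096U) == 4096U);  /* zero means "no limit" */
		assert(min_not_zero(512U, 4096U) == 512U); /* plain minimum otherwise */
		assert(min_not_zero(0U, 0U) == 0U);        /* both unlimited stays 0 */
		return 0;
	}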
664 | +/** | |
655 | 665 | * clamp - return a value clamped to a given range with strict typechecking |
656 | 666 | * @val: current value |
657 | 667 | * @min: minimum allowable value |
include/linux/sched.h
... | ... | @@ -336,6 +336,9 @@ |
336 | 336 | extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, |
337 | 337 | void __user *buffer, |
338 | 338 | size_t *lenp, loff_t *ppos); |
339 | +#else | |
340 | +/* Avoid need for ifdefs elsewhere in the code */ | |
341 | +enum { sysctl_hung_task_timeout_secs = 0 }; | |
339 | 342 | #endif |
340 | 343 | |
341 | 344 | /* Attach to any functions which should be ignored in wchan output. */ |
include/scsi/scsi.h
... | ... | @@ -32,6 +32,12 @@ |
32 | 32 | #endif |
33 | 33 | |
34 | 34 | /* |
35 | + * DIX-capable adapters effectively support infinite chaining for the | |
36 | + * protection information scatterlist | |
37 | + */ | |
38 | +#define SCSI_MAX_PROT_SG_SEGMENTS 0xFFFF | |
39 | + | |
40 | +/* | |
35 | 41 | * Special value for scanning to specify scanning or rescanning of all |
36 | 42 | * possible channels, (target) ids, or luns on a given shost. |
37 | 43 | */ |
include/scsi/scsi_host.h
... | ... | @@ -388,6 +388,7 @@ |
388 | 388 | * of scatter-gather. |
389 | 389 | */ |
390 | 390 | unsigned short sg_tablesize; |
391 | + unsigned short sg_prot_tablesize; | |
391 | 392 | |
392 | 393 | /* |
393 | 394 | * Set this if the host adapter has limitations beside segment count. |
... | ... | @@ -599,6 +600,7 @@ |
599 | 600 | int can_queue; |
600 | 601 | short cmd_per_lun; |
601 | 602 | short unsigned int sg_tablesize; |
603 | + short unsigned int sg_prot_tablesize; | |
602 | 604 | short unsigned int max_sectors; |
603 | 605 | unsigned long dma_boundary; |
604 | 606 | /* |
... | ... | @@ -821,6 +823,11 @@ |
821 | 823 | static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) |
822 | 824 | { |
823 | 825 | return shost->prot_capabilities; |
826 | +} | |
827 | + | |
828 | +static inline int scsi_host_prot_dma(struct Scsi_Host *shost) | |
829 | +{ | |
830 | + return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; | |
824 | 831 | } |
825 | 832 | |
826 | 833 | static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) |
init/Kconfig
... | ... | @@ -661,11 +661,14 @@ |
661 | 661 | |
662 | 662 | Currently, CFQ IO scheduler uses it to recognize task groups and |
663 | 663 | control disk bandwidth allocation (proportional time slice allocation) |
664 | - to such task groups. | |
664 | + to such task groups. It is also used by the bio throttling logic in | 
665 | + the block layer to implement upper limits on IO rates for a device. | 
665 | 666 | |
666 | 667 | This option only enables generic Block IO controller infrastructure. |
667 | - One needs to also enable actual IO controlling logic in CFQ for it | |
668 | - to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). | |
668 | + One needs to also enable actual IO controlling logic/policy. For | |
669 | + enabling proportional weight division of disk bandwidth in CFQ, set | 
670 | + CONFIG_CFQ_GROUP_IOSCHED=y and for enabling the throttling policy set | 
671 | + CONFIG_BLK_DEV_THROTTLING=y. | 
669 | 672 | |
670 | 673 | See Documentation/cgroups/blkio-controller.txt for more information. |
671 | 674 |
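Putting the help text into practice, a configuration enabling both policies could contain the following fragment (a sketch, not a complete .config):

	CONFIG_BLK_CGROUP=y
	CONFIG_CFQ_GROUP_IOSCHED=y
	CONFIG_BLK_DEV_THROTTLING=y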
init/do_mounts.c
... | ... | @@ -58,6 +58,62 @@ |
58 | 58 | __setup("ro", readonly); |
59 | 59 | __setup("rw", readwrite); |
60 | 60 | |
61 | +#ifdef CONFIG_BLOCK | |
62 | +/** | |
63 | + * match_dev_by_uuid - callback for finding a partition using its uuid | |
64 | + * @dev: device passed in by the caller | |
65 | + * @data: opaque pointer to a 16 byte binary UUID as packed by part_pack_uuid() | 
66 | + * | |
67 | + * Returns 1 if the device matches, and 0 otherwise. | |
68 | + */ | |
69 | +static int match_dev_by_uuid(struct device *dev, void *data) | |
70 | +{ | |
71 | + u8 *uuid = data; | |
72 | + struct hd_struct *part = dev_to_part(dev); | |
73 | + | |
74 | + if (!part->info) | |
75 | + goto no_match; | |
76 | + | |
77 | + if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) | |
78 | + goto no_match; | |
79 | + | |
80 | + return 1; | |
81 | +no_match: | |
82 | + return 0; | |
83 | +} | |
84 | + | |
85 | + | |
86 | +/** | |
87 | + * devt_from_partuuid - looks up the dev_t of a partition by its UUID | |
88 | + * @uuid_str: 36 byte char array containing a hex ascii UUID | 
89 | + * | |
90 | + * The function will return the first partition which contains a matching | |
91 | + * UUID value in its partition_meta_info struct. This does not search | |
92 | + * by filesystem UUIDs. | |
93 | + * | |
94 | + * Returns the matching dev_t on success or 0 on failure. | |
95 | + */ | |
96 | +static dev_t __init devt_from_partuuid(char *uuid_str) | |
97 | +{ | |
98 | + dev_t res = 0; | |
99 | + struct device *dev = NULL; | |
100 | + u8 uuid[16]; | |
101 | + | |
102 | + /* Pack the requested UUID in the expected format. */ | |
103 | + part_pack_uuid(uuid_str, uuid); | |
104 | + | |
105 | + dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); | |
106 | + if (!dev) | |
107 | + goto done; | |
108 | + | |
109 | + res = dev->devt; | |
110 | + put_device(dev); | |
111 | + | |
112 | +done: | |
113 | + return res; | |
114 | +} | |
115 | +#endif | |
116 | + | |
61 | 117 | /* |
62 | 118 | * Convert a name into device number. We accept the following variants: |
63 | 119 | * |
... | ... | @@ -68,6 +124,8 @@ |
68 | 124 | * of partition - device number of disk plus the partition number |
69 | 125 | * 5) /dev/<disk_name>p<decimal> - same as the above, that form is |
70 | 126 | * used when disk name of partitioned disk ends on a digit. |
127 | + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the | |
128 | + * unique id of a partition if the partition table provides it. | |
71 | 129 | * |
72 | 130 | * If name doesn't have fall into the categories above, we return (0,0). |
73 | 131 | * block_class is used to check if something is a disk name. If the disk |
... | ... | @@ -81,6 +139,18 @@ |
81 | 139 | char *p; |
82 | 140 | dev_t res = 0; |
83 | 141 | int part; |
142 | + | |
143 | +#ifdef CONFIG_BLOCK | |
144 | + if (strncmp(name, "PARTUUID=", 9) == 0) { | |
145 | + name += 9; | |
146 | + if (strlen(name) != 36) | |
147 | + goto fail; | |
148 | + res = devt_from_partuuid(name); | |
149 | + if (!res) | |
150 | + goto fail; | |
151 | + goto done; | |
152 | + } | |
153 | +#endif | |
84 | 154 | |
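For illustration, a boot command line exercising the branch just added (reusing the placeholder UUID from the comment above) would be:

	root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF ro

name_to_dev_t() strips the "PARTUUID=" prefix, requires exactly 36 remaining characters, and resolves the device through devt_from_partuuid() defined earlier in this file.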
85 | 155 | if (strncmp(name, "/dev/", 5) != 0) { |
86 | 156 | unsigned maj, min; |