Commit c10b61f0910466b4b99c266a7d76ac4390743fb5

Authored by Jeff Moyer
Committed by Jens Axboe
1 parent fbbf055692

cfq: Don't allow queue merges for queues that have no process references

Hi,

A user reported a kernel bug when running a particular program that did
the following:

created 32 threads
- each thread took a mutex, grabbed a global offset, added a buffer size
  to that offset, released the lock
- read from the given offset in the file
- created a new thread to do the same
- exited

The result is that cfq's close cooperator logic would trigger, as the
threads were issuing I/O within the mean seek distance of one another.
This workload managed to routinely trigger a use-after-free when
walking the chain of merge candidates for a particular cfqq
(cfqq->new_cfqq).  The logic used for setting up queue merges looks like
this:

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
	new_process_refs = cfqq_process_refs(new_cfqq);
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		atomic_add(process_refs, &new_cfqq->ref);
	} else {
		new_cfqq->new_cfqq = cfqq;
		atomic_add(new_process_refs, &cfqq->ref);
	}
}
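
For context (not part of the patch): "process references" are the
queue's total reference count minus the references held by allocated
requests.  cfqq_process_refs() in this kernel looked roughly like the
following; this is quoted from memory, so treat it as a sketch rather
than the exact code:

static int cfqq_process_refs(struct cfq_queue *cfqq)
{
	int process_refs, io_refs;

	/* every allocated request holds a reference on the cfqq */
	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
	process_refs = atomic_read(&cfqq->ref) - io_refs;
	BUG_ON(process_refs < 0);
	return process_refs;
}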

When a merge candidate is found, we add the process references for the
queue with fewer references to the queue with more.  The actual merging
of queues happens when a new request is issued for a given cfqq.  In the
case of the test program, it only does a single pread call to read in
1MB, so the actual merge never happens.
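
The deferred merge itself is performed by cfq_merge_cfqqs(), which runs
from the request setup path once the task issues another request and
its cfqq has ->new_cfqq set.  Roughly (again a sketch from memory, not
quoted from this tree):

static struct cfq_queue *
cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
		struct cfq_queue *cfqq)
{
	/* switch the task's cic over to the merge target ... */
	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
	cfq_mark_cfqq_coop(cfqq->new_cfqq);
	/* ... and drop the old queue */
	cfq_put_queue(cfqq);
	return cic_to_cfqq(cic, 1);
}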

Normally, this is fine, as when the queue exits, we simply drop the
references we took on the other cfqqs in the merge chain:

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
	 */
	__cfqq = cfqq->new_cfqq;
	while (__cfqq) {
		if (__cfqq == cfqq) {
			WARN(1, "cfqq->new_cfqq loop detected\n");
			break;
		}
		next = __cfqq->new_cfqq;
		cfq_put_queue(__cfqq);
		__cfqq = next;
	}

However, there is a hole in this logic.  Consider the following (and
keep in mind that each I/O keeps a reference to the cfqq):

q1->new_cfqq = q2   // q2 now has 2 process references
q3->new_cfqq = q2   // q2 now has 3 process references

// the process associated with q2 exits
// q2 now has 2 process references

// q1 exits, drops its reference on q2
// q2 now has 1 process reference

// q3 exits, so has 0 process references, and hence drops its references
// to q2, which leaves q2 also with 0 process references

q4 comes along and wants to merge with q3

q3->new_cfqq still points at q2!  We follow that link and end up at an
already freed cfqq.

So, the fix is to not follow a merge chain if the top-most queue does
not have a process reference; otherwise, any queue in the chain could
already have been freed.  I also changed the logic to disallow merging
with a queue that does not have any process references.  Previously, we
did this check for one of the merge candidates, but not the other, which
doesn't really make sense.
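
Concretely, with both changes the patched cfq_setup_merge() ends up
looking roughly like this (reconstructed from the description above
rather than copied verbatim from the diff):

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

	/*
	 * If there are no process references on the new_cfqq, then it is
	 * unsafe to follow the ->new_cfqq chain, as other cfqq's in the
	 * chain may have dropped their last reference (not just their
	 * last process reference).
	 */
	if (!cfqq_process_refs(new_cfqq))
		return;

	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
	new_process_refs = cfqq_process_refs(new_cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		atomic_add(process_refs, &new_cfqq->ref);
	} else {
		new_cfqq->new_cfqq = cfqq;
		atomic_add(new_process_refs, &cfqq->ref);
	}
}

The early return ensures we never walk new_cfqq->new_cfqq for a queue
whose owning processes have all gone away.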

Without the attached patch, my system would BUG within a couple of
seconds of running the reproducer program.  With the patch applied, my
system ran the program for over an hour without issues.

This addresses the following bugzilla:
    https://bugzilla.kernel.org/show_bug.cgi?id=16217

Thanks a ton to Phil Carns for providing the bug report and an excellent
reproducer.

[ Note for stable: this applies to 2.6.32/33/34 ].

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Reported-by: Phil Carns <carns@mcs.anl.gov>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

Showing 1 changed file (block/cfq-iosched.c) with 11 additions and 2 deletions:

1 /* 1 /*
2 * CFQ, or complete fairness queueing, disk scheduler. 2 * CFQ, or complete fairness queueing, disk scheduler.
3 * 3 *
4 * Based on ideas from a previously unfinished io 4 * Based on ideas from a previously unfinished io
5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 * 6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */ 8 */
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/blkdev.h> 11 #include <linux/blkdev.h>
12 #include <linux/elevator.h> 12 #include <linux/elevator.h>
13 #include <linux/jiffies.h> 13 #include <linux/jiffies.h>
14 #include <linux/rbtree.h> 14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h> 15 #include <linux/ioprio.h>
16 #include <linux/blktrace_api.h> 16 #include <linux/blktrace_api.h>
17 #include "blk-cgroup.h" 17 #include "blk-cgroup.h"
18 18
19 /* 19 /*
20 * tunables 20 * tunables
21 */ 21 */
22 /* max queue in one round of service */ 22 /* max queue in one round of service */
23 static const int cfq_quantum = 8; 23 static const int cfq_quantum = 8;
24 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 24 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
25 /* maximum backwards seek, in KiB */ 25 /* maximum backwards seek, in KiB */
26 static const int cfq_back_max = 16 * 1024; 26 static const int cfq_back_max = 16 * 1024;
27 /* penalty of a backwards seek */ 27 /* penalty of a backwards seek */
28 static const int cfq_back_penalty = 2; 28 static const int cfq_back_penalty = 2;
29 static const int cfq_slice_sync = HZ / 10; 29 static const int cfq_slice_sync = HZ / 10;
30 static int cfq_slice_async = HZ / 25; 30 static int cfq_slice_async = HZ / 25;
31 static const int cfq_slice_async_rq = 2; 31 static const int cfq_slice_async_rq = 2;
32 static int cfq_slice_idle = HZ / 125; 32 static int cfq_slice_idle = HZ / 125;
33 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 33 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
34 static const int cfq_hist_divisor = 4; 34 static const int cfq_hist_divisor = 4;
35 35
36 /* 36 /*
37 * offset from end of service tree 37 * offset from end of service tree
38 */ 38 */
39 #define CFQ_IDLE_DELAY (HZ / 5) 39 #define CFQ_IDLE_DELAY (HZ / 5)
40 40
41 /* 41 /*
42 * below this threshold, we consider thinktime immediate 42 * below this threshold, we consider thinktime immediate
43 */ 43 */
44 #define CFQ_MIN_TT (2) 44 #define CFQ_MIN_TT (2)
45 45
46 #define CFQ_SLICE_SCALE (5) 46 #define CFQ_SLICE_SCALE (5)
47 #define CFQ_HW_QUEUE_MIN (5) 47 #define CFQ_HW_QUEUE_MIN (5)
48 #define CFQ_SERVICE_SHIFT 12 48 #define CFQ_SERVICE_SHIFT 12
49 49
50 #define CFQQ_SEEK_THR (sector_t)(8 * 100) 50 #define CFQQ_SEEK_THR (sector_t)(8 * 100)
51 #define CFQQ_CLOSE_THR (sector_t)(8 * 1024) 51 #define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
52 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 52 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
53 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 53 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
54 54
55 #define RQ_CIC(rq) \ 55 #define RQ_CIC(rq) \
56 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
57 #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 57 #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
58 #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 58 #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
59 59
60 static struct kmem_cache *cfq_pool; 60 static struct kmem_cache *cfq_pool;
61 static struct kmem_cache *cfq_ioc_pool; 61 static struct kmem_cache *cfq_ioc_pool;
62 62
63 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 63 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
64 static struct completion *ioc_gone; 64 static struct completion *ioc_gone;
65 static DEFINE_SPINLOCK(ioc_gone_lock); 65 static DEFINE_SPINLOCK(ioc_gone_lock);
66 66
67 static DEFINE_SPINLOCK(cic_index_lock); 67 static DEFINE_SPINLOCK(cic_index_lock);
68 static DEFINE_IDA(cic_index_ida); 68 static DEFINE_IDA(cic_index_ida);
69 69
70 #define CFQ_PRIO_LISTS IOPRIO_BE_NR 70 #define CFQ_PRIO_LISTS IOPRIO_BE_NR
71 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 71 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
72 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 72 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
73 73
74 #define sample_valid(samples) ((samples) > 80) 74 #define sample_valid(samples) ((samples) > 80)
75 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 75 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
76 76
77 /* 77 /*
78 * Most of our rbtree usage is for sorting with min extraction, so 78 * Most of our rbtree usage is for sorting with min extraction, so
79 * if we cache the leftmost node we don't have to walk down the tree 79 * if we cache the leftmost node we don't have to walk down the tree
80 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 80 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
81 * move this into the elevator for the rq sorting as well. 81 * move this into the elevator for the rq sorting as well.
82 */ 82 */
83 struct cfq_rb_root { 83 struct cfq_rb_root {
84 struct rb_root rb; 84 struct rb_root rb;
85 struct rb_node *left; 85 struct rb_node *left;
86 unsigned count; 86 unsigned count;
87 unsigned total_weight; 87 unsigned total_weight;
88 u64 min_vdisktime; 88 u64 min_vdisktime;
89 struct rb_node *active; 89 struct rb_node *active;
90 }; 90 };
91 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 91 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
92 .count = 0, .min_vdisktime = 0, } 92 .count = 0, .min_vdisktime = 0, }
93 93
94 /* 94 /*
95 * Per process-grouping structure 95 * Per process-grouping structure
96 */ 96 */
97 struct cfq_queue { 97 struct cfq_queue {
98 /* reference count */ 98 /* reference count */
99 atomic_t ref; 99 atomic_t ref;
100 /* various state flags, see below */ 100 /* various state flags, see below */
101 unsigned int flags; 101 unsigned int flags;
102 /* parent cfq_data */ 102 /* parent cfq_data */
103 struct cfq_data *cfqd; 103 struct cfq_data *cfqd;
104 /* service_tree member */ 104 /* service_tree member */
105 struct rb_node rb_node; 105 struct rb_node rb_node;
106 /* service_tree key */ 106 /* service_tree key */
107 unsigned long rb_key; 107 unsigned long rb_key;
108 /* prio tree member */ 108 /* prio tree member */
109 struct rb_node p_node; 109 struct rb_node p_node;
110 /* prio tree root we belong to, if any */ 110 /* prio tree root we belong to, if any */
111 struct rb_root *p_root; 111 struct rb_root *p_root;
112 /* sorted list of pending requests */ 112 /* sorted list of pending requests */
113 struct rb_root sort_list; 113 struct rb_root sort_list;
114 /* if fifo isn't expired, next request to serve */ 114 /* if fifo isn't expired, next request to serve */
115 struct request *next_rq; 115 struct request *next_rq;
116 /* requests queued in sort_list */ 116 /* requests queued in sort_list */
117 int queued[2]; 117 int queued[2];
118 /* currently allocated requests */ 118 /* currently allocated requests */
119 int allocated[2]; 119 int allocated[2];
120 /* fifo list of requests in sort_list */ 120 /* fifo list of requests in sort_list */
121 struct list_head fifo; 121 struct list_head fifo;
122 122
123 /* time when queue got scheduled in to dispatch first request. */ 123 /* time when queue got scheduled in to dispatch first request. */
124 unsigned long dispatch_start; 124 unsigned long dispatch_start;
125 unsigned int allocated_slice; 125 unsigned int allocated_slice;
126 unsigned int slice_dispatch; 126 unsigned int slice_dispatch;
127 /* time when first request from queue completed and slice started. */ 127 /* time when first request from queue completed and slice started. */
128 unsigned long slice_start; 128 unsigned long slice_start;
129 unsigned long slice_end; 129 unsigned long slice_end;
130 long slice_resid; 130 long slice_resid;
131 131
132 /* pending metadata requests */ 132 /* pending metadata requests */
133 int meta_pending; 133 int meta_pending;
134 /* number of requests that are on the dispatch list or inside driver */ 134 /* number of requests that are on the dispatch list or inside driver */
135 int dispatched; 135 int dispatched;
136 136
137 /* io prio of this group */ 137 /* io prio of this group */
138 unsigned short ioprio, org_ioprio; 138 unsigned short ioprio, org_ioprio;
139 unsigned short ioprio_class, org_ioprio_class; 139 unsigned short ioprio_class, org_ioprio_class;
140 140
141 pid_t pid; 141 pid_t pid;
142 142
143 u32 seek_history; 143 u32 seek_history;
144 sector_t last_request_pos; 144 sector_t last_request_pos;
145 145
146 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
147 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg; 149 struct cfq_group *orig_cfqg;
150 }; 150 };
151 151
152 /* 152 /*
153 * First index in the service_trees. 153 * First index in the service_trees.
154 * IDLE is handled separately, so it has negative index 154 * IDLE is handled separately, so it has negative index
155 */ 155 */
156 enum wl_prio_t { 156 enum wl_prio_t {
157 BE_WORKLOAD = 0, 157 BE_WORKLOAD = 0,
158 RT_WORKLOAD = 1, 158 RT_WORKLOAD = 1,
159 IDLE_WORKLOAD = 2, 159 IDLE_WORKLOAD = 2,
160 }; 160 };
161 161
162 /* 162 /*
163 * Second index in the service_trees. 163 * Second index in the service_trees.
164 */ 164 */
165 enum wl_type_t { 165 enum wl_type_t {
166 ASYNC_WORKLOAD = 0, 166 ASYNC_WORKLOAD = 0,
167 SYNC_NOIDLE_WORKLOAD = 1, 167 SYNC_NOIDLE_WORKLOAD = 1,
168 SYNC_WORKLOAD = 2 168 SYNC_WORKLOAD = 2
169 }; 169 };
170 170
171 /* This is per cgroup per device grouping structure */ 171 /* This is per cgroup per device grouping structure */
172 struct cfq_group { 172 struct cfq_group {
173 /* group service_tree member */ 173 /* group service_tree member */
174 struct rb_node rb_node; 174 struct rb_node rb_node;
175 175
176 /* group service_tree key */ 176 /* group service_tree key */
177 u64 vdisktime; 177 u64 vdisktime;
178 unsigned int weight; 178 unsigned int weight;
179 bool on_st; 179 bool on_st;
180 180
181 /* number of cfqq currently on this group */ 181 /* number of cfqq currently on this group */
182 int nr_cfqq; 182 int nr_cfqq;
183 183
184 /* Per group busy queus average. Useful for workload slice calc. */ 184 /* Per group busy queus average. Useful for workload slice calc. */
185 unsigned int busy_queues_avg[2]; 185 unsigned int busy_queues_avg[2];
186 /* 186 /*
187 * rr lists of queues with requests, onle rr for each priority class. 187 * rr lists of queues with requests, onle rr for each priority class.
188 * Counts are embedded in the cfq_rb_root 188 * Counts are embedded in the cfq_rb_root
189 */ 189 */
190 struct cfq_rb_root service_trees[2][3]; 190 struct cfq_rb_root service_trees[2][3];
191 struct cfq_rb_root service_tree_idle; 191 struct cfq_rb_root service_tree_idle;
192 192
193 unsigned long saved_workload_slice; 193 unsigned long saved_workload_slice;
194 enum wl_type_t saved_workload; 194 enum wl_type_t saved_workload;
195 enum wl_prio_t saved_serving_prio; 195 enum wl_prio_t saved_serving_prio;
196 struct blkio_group blkg; 196 struct blkio_group blkg;
197 #ifdef CONFIG_CFQ_GROUP_IOSCHED 197 #ifdef CONFIG_CFQ_GROUP_IOSCHED
198 struct hlist_node cfqd_node; 198 struct hlist_node cfqd_node;
199 atomic_t ref; 199 atomic_t ref;
200 #endif 200 #endif
201 }; 201 };
202 202
203 /* 203 /*
204 * Per block device queue structure 204 * Per block device queue structure
205 */ 205 */
206 struct cfq_data { 206 struct cfq_data {
207 struct request_queue *queue; 207 struct request_queue *queue;
208 /* Root service tree for cfq_groups */ 208 /* Root service tree for cfq_groups */
209 struct cfq_rb_root grp_service_tree; 209 struct cfq_rb_root grp_service_tree;
210 struct cfq_group root_group; 210 struct cfq_group root_group;
211 211
212 /* 212 /*
213 * The priority currently being served 213 * The priority currently being served
214 */ 214 */
215 enum wl_prio_t serving_prio; 215 enum wl_prio_t serving_prio;
216 enum wl_type_t serving_type; 216 enum wl_type_t serving_type;
217 unsigned long workload_expires; 217 unsigned long workload_expires;
218 struct cfq_group *serving_group; 218 struct cfq_group *serving_group;
219 bool noidle_tree_requires_idle; 219 bool noidle_tree_requires_idle;
220 220
221 /* 221 /*
222 * Each priority tree is sorted by next_request position. These 222 * Each priority tree is sorted by next_request position. These
223 * trees are used when determining if two or more queues are 223 * trees are used when determining if two or more queues are
224 * interleaving requests (see cfq_close_cooperator). 224 * interleaving requests (see cfq_close_cooperator).
225 */ 225 */
226 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 226 struct rb_root prio_trees[CFQ_PRIO_LISTS];
227 227
228 unsigned int busy_queues; 228 unsigned int busy_queues;
229 229
230 int rq_in_driver; 230 int rq_in_driver;
231 int rq_in_flight[2]; 231 int rq_in_flight[2];
232 232
233 /* 233 /*
234 * queue-depth detection 234 * queue-depth detection
235 */ 235 */
236 int rq_queued; 236 int rq_queued;
237 int hw_tag; 237 int hw_tag;
238 /* 238 /*
239 * hw_tag can be 239 * hw_tag can be
240 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 240 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
241 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 241 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
242 * 0 => no NCQ 242 * 0 => no NCQ
243 */ 243 */
244 int hw_tag_est_depth; 244 int hw_tag_est_depth;
245 unsigned int hw_tag_samples; 245 unsigned int hw_tag_samples;
246 246
247 /* 247 /*
248 * idle window management 248 * idle window management
249 */ 249 */
250 struct timer_list idle_slice_timer; 250 struct timer_list idle_slice_timer;
251 struct work_struct unplug_work; 251 struct work_struct unplug_work;
252 252
253 struct cfq_queue *active_queue; 253 struct cfq_queue *active_queue;
254 struct cfq_io_context *active_cic; 254 struct cfq_io_context *active_cic;
255 255
256 /* 256 /*
257 * async queue for each priority case 257 * async queue for each priority case
258 */ 258 */
259 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 259 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
260 struct cfq_queue *async_idle_cfqq; 260 struct cfq_queue *async_idle_cfqq;
261 261
262 sector_t last_position; 262 sector_t last_position;
263 263
264 /* 264 /*
265 * tunables, see top of file 265 * tunables, see top of file
266 */ 266 */
267 unsigned int cfq_quantum; 267 unsigned int cfq_quantum;
268 unsigned int cfq_fifo_expire[2]; 268 unsigned int cfq_fifo_expire[2];
269 unsigned int cfq_back_penalty; 269 unsigned int cfq_back_penalty;
270 unsigned int cfq_back_max; 270 unsigned int cfq_back_max;
271 unsigned int cfq_slice[2]; 271 unsigned int cfq_slice[2];
272 unsigned int cfq_slice_async_rq; 272 unsigned int cfq_slice_async_rq;
273 unsigned int cfq_slice_idle; 273 unsigned int cfq_slice_idle;
274 unsigned int cfq_latency; 274 unsigned int cfq_latency;
275 unsigned int cfq_group_isolation; 275 unsigned int cfq_group_isolation;
276 276
277 unsigned int cic_index; 277 unsigned int cic_index;
278 struct list_head cic_list; 278 struct list_head cic_list;
279 279
280 /* 280 /*
281 * Fallback dummy cfqq for extreme OOM conditions 281 * Fallback dummy cfqq for extreme OOM conditions
282 */ 282 */
283 struct cfq_queue oom_cfqq; 283 struct cfq_queue oom_cfqq;
284 284
285 unsigned long last_delayed_sync; 285 unsigned long last_delayed_sync;
286 286
287 /* List of cfq groups being managed on this device*/ 287 /* List of cfq groups being managed on this device*/
288 struct hlist_head cfqg_list; 288 struct hlist_head cfqg_list;
289 struct rcu_head rcu; 289 struct rcu_head rcu;
290 }; 290 };
291 291
292 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 292 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
293 293
294 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 294 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
295 enum wl_prio_t prio, 295 enum wl_prio_t prio,
296 enum wl_type_t type) 296 enum wl_type_t type)
297 { 297 {
298 if (!cfqg) 298 if (!cfqg)
299 return NULL; 299 return NULL;
300 300
301 if (prio == IDLE_WORKLOAD) 301 if (prio == IDLE_WORKLOAD)
302 return &cfqg->service_tree_idle; 302 return &cfqg->service_tree_idle;
303 303
304 return &cfqg->service_trees[prio][type]; 304 return &cfqg->service_trees[prio][type];
305 } 305 }
306 306
307 enum cfqq_state_flags { 307 enum cfqq_state_flags {
308 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 308 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
309 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 309 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
310 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 310 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
311 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 311 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
312 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 312 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
313 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 313 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
314 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 314 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
315 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 315 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
316 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 316 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
317 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 317 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
318 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 318 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
319 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 319 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
320 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 320 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
321 }; 321 };
322 322
323 #define CFQ_CFQQ_FNS(name) \ 323 #define CFQ_CFQQ_FNS(name) \
324 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 324 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
325 { \ 325 { \
326 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 326 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
327 } \ 327 } \
328 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 328 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
329 { \ 329 { \
330 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 330 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
331 } \ 331 } \
332 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 332 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
333 { \ 333 { \
334 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 334 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
335 } 335 }
336 336
337 CFQ_CFQQ_FNS(on_rr); 337 CFQ_CFQQ_FNS(on_rr);
338 CFQ_CFQQ_FNS(wait_request); 338 CFQ_CFQQ_FNS(wait_request);
339 CFQ_CFQQ_FNS(must_dispatch); 339 CFQ_CFQQ_FNS(must_dispatch);
340 CFQ_CFQQ_FNS(must_alloc_slice); 340 CFQ_CFQQ_FNS(must_alloc_slice);
341 CFQ_CFQQ_FNS(fifo_expire); 341 CFQ_CFQQ_FNS(fifo_expire);
342 CFQ_CFQQ_FNS(idle_window); 342 CFQ_CFQQ_FNS(idle_window);
343 CFQ_CFQQ_FNS(prio_changed); 343 CFQ_CFQQ_FNS(prio_changed);
344 CFQ_CFQQ_FNS(slice_new); 344 CFQ_CFQQ_FNS(slice_new);
345 CFQ_CFQQ_FNS(sync); 345 CFQ_CFQQ_FNS(sync);
346 CFQ_CFQQ_FNS(coop); 346 CFQ_CFQQ_FNS(coop);
347 CFQ_CFQQ_FNS(split_coop); 347 CFQ_CFQQ_FNS(split_coop);
348 CFQ_CFQQ_FNS(deep); 348 CFQ_CFQQ_FNS(deep);
349 CFQ_CFQQ_FNS(wait_busy); 349 CFQ_CFQQ_FNS(wait_busy);
350 #undef CFQ_CFQQ_FNS 350 #undef CFQ_CFQQ_FNS
351 351
352 #ifdef CONFIG_CFQ_GROUP_IOSCHED 352 #ifdef CONFIG_CFQ_GROUP_IOSCHED
353 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 353 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
354 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 354 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
355 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 355 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
356 blkg_path(&(cfqq)->cfqg->blkg), ##args); 356 blkg_path(&(cfqq)->cfqg->blkg), ##args);
357 357
358 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 358 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
359 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 359 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
360 blkg_path(&(cfqg)->blkg), ##args); \ 360 blkg_path(&(cfqg)->blkg), ##args); \
361 361
362 #else 362 #else
363 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 363 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
364 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 364 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
365 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 365 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
366 #endif 366 #endif
367 #define cfq_log(cfqd, fmt, args...) \ 367 #define cfq_log(cfqd, fmt, args...) \
368 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 368 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
369 369
370 /* Traverses through cfq group service trees */ 370 /* Traverses through cfq group service trees */
371 #define for_each_cfqg_st(cfqg, i, j, st) \ 371 #define for_each_cfqg_st(cfqg, i, j, st) \
372 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 372 for (i = 0; i <= IDLE_WORKLOAD; i++) \
373 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 373 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
374 : &cfqg->service_tree_idle; \ 374 : &cfqg->service_tree_idle; \
375 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 375 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
376 (i == IDLE_WORKLOAD && j == 0); \ 376 (i == IDLE_WORKLOAD && j == 0); \
377 j++, st = i < IDLE_WORKLOAD ? \ 377 j++, st = i < IDLE_WORKLOAD ? \
378 &cfqg->service_trees[i][j]: NULL) \ 378 &cfqg->service_trees[i][j]: NULL) \
379 379
380 380
381 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 381 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
382 { 382 {
383 if (cfq_class_idle(cfqq)) 383 if (cfq_class_idle(cfqq))
384 return IDLE_WORKLOAD; 384 return IDLE_WORKLOAD;
385 if (cfq_class_rt(cfqq)) 385 if (cfq_class_rt(cfqq))
386 return RT_WORKLOAD; 386 return RT_WORKLOAD;
387 return BE_WORKLOAD; 387 return BE_WORKLOAD;
388 } 388 }
389 389
390 390
391 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 391 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
392 { 392 {
393 if (!cfq_cfqq_sync(cfqq)) 393 if (!cfq_cfqq_sync(cfqq))
394 return ASYNC_WORKLOAD; 394 return ASYNC_WORKLOAD;
395 if (!cfq_cfqq_idle_window(cfqq)) 395 if (!cfq_cfqq_idle_window(cfqq))
396 return SYNC_NOIDLE_WORKLOAD; 396 return SYNC_NOIDLE_WORKLOAD;
397 return SYNC_WORKLOAD; 397 return SYNC_WORKLOAD;
398 } 398 }
399 399
400 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 400 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
401 struct cfq_data *cfqd, 401 struct cfq_data *cfqd,
402 struct cfq_group *cfqg) 402 struct cfq_group *cfqg)
403 { 403 {
404 if (wl == IDLE_WORKLOAD) 404 if (wl == IDLE_WORKLOAD)
405 return cfqg->service_tree_idle.count; 405 return cfqg->service_tree_idle.count;
406 406
407 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 407 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
408 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 408 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
409 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 409 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
410 } 410 }
411 411
412 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 412 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
413 struct cfq_group *cfqg) 413 struct cfq_group *cfqg)
414 { 414 {
415 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 415 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
416 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 416 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
417 } 417 }
418 418
419 static void cfq_dispatch_insert(struct request_queue *, struct request *); 419 static void cfq_dispatch_insert(struct request_queue *, struct request *);
420 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 420 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
421 struct io_context *, gfp_t); 421 struct io_context *, gfp_t);
422 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 422 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
423 struct io_context *); 423 struct io_context *);
424 424
425 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 425 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
426 bool is_sync) 426 bool is_sync)
427 { 427 {
428 return cic->cfqq[is_sync]; 428 return cic->cfqq[is_sync];
429 } 429 }
430 430
431 static inline void cic_set_cfqq(struct cfq_io_context *cic, 431 static inline void cic_set_cfqq(struct cfq_io_context *cic,
432 struct cfq_queue *cfqq, bool is_sync) 432 struct cfq_queue *cfqq, bool is_sync)
433 { 433 {
434 cic->cfqq[is_sync] = cfqq; 434 cic->cfqq[is_sync] = cfqq;
435 } 435 }
436 436
437 #define CIC_DEAD_KEY 1ul 437 #define CIC_DEAD_KEY 1ul
438 #define CIC_DEAD_INDEX_SHIFT 1 438 #define CIC_DEAD_INDEX_SHIFT 1
439 439
440 static inline void *cfqd_dead_key(struct cfq_data *cfqd) 440 static inline void *cfqd_dead_key(struct cfq_data *cfqd)
441 { 441 {
442 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 442 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
443 } 443 }
444 444
445 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 445 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
446 { 446 {
447 struct cfq_data *cfqd = cic->key; 447 struct cfq_data *cfqd = cic->key;
448 448
449 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) 449 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
450 return NULL; 450 return NULL;
451 451
452 return cfqd; 452 return cfqd;
453 } 453 }
454 454
455 /* 455 /*
456 * We regard a request as SYNC, if it's either a read or has the SYNC bit 456 * We regard a request as SYNC, if it's either a read or has the SYNC bit
457 * set (in which case it could also be direct WRITE). 457 * set (in which case it could also be direct WRITE).
458 */ 458 */
459 static inline bool cfq_bio_sync(struct bio *bio) 459 static inline bool cfq_bio_sync(struct bio *bio)
460 { 460 {
461 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); 461 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
462 } 462 }
463 463
464 /* 464 /*
465 * scheduler run of queue, if there are requests pending and no one in the 465 * scheduler run of queue, if there are requests pending and no one in the
466 * driver that will restart queueing 466 * driver that will restart queueing
467 */ 467 */
468 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 468 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
469 { 469 {
470 if (cfqd->busy_queues) { 470 if (cfqd->busy_queues) {
471 cfq_log(cfqd, "schedule dispatch"); 471 cfq_log(cfqd, "schedule dispatch");
472 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 472 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
473 } 473 }
474 } 474 }
475 475
476 static int cfq_queue_empty(struct request_queue *q) 476 static int cfq_queue_empty(struct request_queue *q)
477 { 477 {
478 struct cfq_data *cfqd = q->elevator->elevator_data; 478 struct cfq_data *cfqd = q->elevator->elevator_data;
479 479
480 return !cfqd->rq_queued; 480 return !cfqd->rq_queued;
481 } 481 }
482 482
483 /* 483 /*
484 * Scale schedule slice based on io priority. Use the sync time slice only 484 * Scale schedule slice based on io priority. Use the sync time slice only
485 * if a queue is marked sync and has sync io queued. A sync queue with async 485 * if a queue is marked sync and has sync io queued. A sync queue with async
486 * io only, should not get full sync slice length. 486 * io only, should not get full sync slice length.
487 */ 487 */
488 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 488 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
489 unsigned short prio) 489 unsigned short prio)
490 { 490 {
491 const int base_slice = cfqd->cfq_slice[sync]; 491 const int base_slice = cfqd->cfq_slice[sync];
492 492
493 WARN_ON(prio >= IOPRIO_BE_NR); 493 WARN_ON(prio >= IOPRIO_BE_NR);
494 494
495 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 495 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
496 } 496 }
497 497
498 static inline int 498 static inline int
499 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 499 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
500 { 500 {
501 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 501 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
502 } 502 }
503 503
504 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 504 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
505 { 505 {
506 u64 d = delta << CFQ_SERVICE_SHIFT; 506 u64 d = delta << CFQ_SERVICE_SHIFT;
507 507
508 d = d * BLKIO_WEIGHT_DEFAULT; 508 d = d * BLKIO_WEIGHT_DEFAULT;
509 do_div(d, cfqg->weight); 509 do_div(d, cfqg->weight);
510 return d; 510 return d;
511 } 511 }
512 512
513 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 513 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
514 { 514 {
515 s64 delta = (s64)(vdisktime - min_vdisktime); 515 s64 delta = (s64)(vdisktime - min_vdisktime);
516 if (delta > 0) 516 if (delta > 0)
517 min_vdisktime = vdisktime; 517 min_vdisktime = vdisktime;
518 518
519 return min_vdisktime; 519 return min_vdisktime;
520 } 520 }
521 521
522 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 522 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
523 { 523 {
524 s64 delta = (s64)(vdisktime - min_vdisktime); 524 s64 delta = (s64)(vdisktime - min_vdisktime);
525 if (delta < 0) 525 if (delta < 0)
526 min_vdisktime = vdisktime; 526 min_vdisktime = vdisktime;
527 527
528 return min_vdisktime; 528 return min_vdisktime;
529 } 529 }
530 530
531 static void update_min_vdisktime(struct cfq_rb_root *st) 531 static void update_min_vdisktime(struct cfq_rb_root *st)
532 { 532 {
533 u64 vdisktime = st->min_vdisktime; 533 u64 vdisktime = st->min_vdisktime;
534 struct cfq_group *cfqg; 534 struct cfq_group *cfqg;
535 535
536 if (st->active) { 536 if (st->active) {
537 cfqg = rb_entry_cfqg(st->active); 537 cfqg = rb_entry_cfqg(st->active);
538 vdisktime = cfqg->vdisktime; 538 vdisktime = cfqg->vdisktime;
539 } 539 }
540 540
541 if (st->left) { 541 if (st->left) {
542 cfqg = rb_entry_cfqg(st->left); 542 cfqg = rb_entry_cfqg(st->left);
543 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 543 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
544 } 544 }
545 545
546 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 546 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
547 } 547 }
548 548
549 /* 549 /*
550 * get averaged number of queues of RT/BE priority. 550 * get averaged number of queues of RT/BE priority.
551 * average is updated, with a formula that gives more weight to higher numbers, 551 * average is updated, with a formula that gives more weight to higher numbers,
552 * to quickly follows sudden increases and decrease slowly 552 * to quickly follows sudden increases and decrease slowly
553 */ 553 */
554 554
555 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 555 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
556 struct cfq_group *cfqg, bool rt) 556 struct cfq_group *cfqg, bool rt)
557 { 557 {
558 unsigned min_q, max_q; 558 unsigned min_q, max_q;
559 unsigned mult = cfq_hist_divisor - 1; 559 unsigned mult = cfq_hist_divisor - 1;
560 unsigned round = cfq_hist_divisor / 2; 560 unsigned round = cfq_hist_divisor / 2;
561 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 561 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
562 562
563 min_q = min(cfqg->busy_queues_avg[rt], busy); 563 min_q = min(cfqg->busy_queues_avg[rt], busy);
564 max_q = max(cfqg->busy_queues_avg[rt], busy); 564 max_q = max(cfqg->busy_queues_avg[rt], busy);
565 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 565 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
566 cfq_hist_divisor; 566 cfq_hist_divisor;
567 return cfqg->busy_queues_avg[rt]; 567 return cfqg->busy_queues_avg[rt];
568 } 568 }
569 569
570 static inline unsigned 570 static inline unsigned
571 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 571 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
572 { 572 {
573 struct cfq_rb_root *st = &cfqd->grp_service_tree; 573 struct cfq_rb_root *st = &cfqd->grp_service_tree;
574 574
575 return cfq_target_latency * cfqg->weight / st->total_weight; 575 return cfq_target_latency * cfqg->weight / st->total_weight;
576 } 576 }
577 577
578 static inline void 578 static inline void
579 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 579 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
580 { 580 {
581 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 581 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
582 if (cfqd->cfq_latency) { 582 if (cfqd->cfq_latency) {
583 /* 583 /*
584 * interested queues (we consider only the ones with the same 584 * interested queues (we consider only the ones with the same
585 * priority class in the cfq group) 585 * priority class in the cfq group)
586 */ 586 */
587 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 587 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
588 cfq_class_rt(cfqq)); 588 cfq_class_rt(cfqq));
589 unsigned sync_slice = cfqd->cfq_slice[1]; 589 unsigned sync_slice = cfqd->cfq_slice[1];
590 unsigned expect_latency = sync_slice * iq; 590 unsigned expect_latency = sync_slice * iq;
591 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 591 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
592 592
593 if (expect_latency > group_slice) { 593 if (expect_latency > group_slice) {
594 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 594 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
595 /* scale low_slice according to IO priority 595 /* scale low_slice according to IO priority
596 * and sync vs async */ 596 * and sync vs async */
597 unsigned low_slice = 597 unsigned low_slice =
598 min(slice, base_low_slice * slice / sync_slice); 598 min(slice, base_low_slice * slice / sync_slice);
599 /* the adapted slice value is scaled to fit all iqs 599 /* the adapted slice value is scaled to fit all iqs
600 * into the target latency */ 600 * into the target latency */
601 slice = max(slice * group_slice / expect_latency, 601 slice = max(slice * group_slice / expect_latency,
602 low_slice); 602 low_slice);
603 } 603 }
604 } 604 }
605 cfqq->slice_start = jiffies; 605 cfqq->slice_start = jiffies;
606 cfqq->slice_end = jiffies + slice; 606 cfqq->slice_end = jiffies + slice;
607 cfqq->allocated_slice = slice; 607 cfqq->allocated_slice = slice;
608 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 608 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
609 } 609 }
610 610
611 /* 611 /*
612 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 612 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
613 * isn't valid until the first request from the dispatch is activated 613 * isn't valid until the first request from the dispatch is activated
614 * and the slice time set. 614 * and the slice time set.
615 */ 615 */
616 static inline bool cfq_slice_used(struct cfq_queue *cfqq) 616 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
617 { 617 {
618 if (cfq_cfqq_slice_new(cfqq)) 618 if (cfq_cfqq_slice_new(cfqq))
619 return 0; 619 return 0;
620 if (time_before(jiffies, cfqq->slice_end)) 620 if (time_before(jiffies, cfqq->slice_end))
621 return 0; 621 return 0;
622 622
623 return 1; 623 return 1;
624 } 624 }
625 625
626 /* 626 /*
627 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 627 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
628 * We choose the request that is closest to the head right now. Distance 628 * We choose the request that is closest to the head right now. Distance
629 * behind the head is penalized and only allowed to a certain extent. 629 * behind the head is penalized and only allowed to a certain extent.
630 */ 630 */
631 static struct request * 631 static struct request *
632 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 632 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
633 { 633 {
634 sector_t s1, s2, d1 = 0, d2 = 0; 634 sector_t s1, s2, d1 = 0, d2 = 0;
635 unsigned long back_max; 635 unsigned long back_max;
636 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 636 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
637 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 637 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
638 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 638 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
639 639
640 if (rq1 == NULL || rq1 == rq2) 640 if (rq1 == NULL || rq1 == rq2)
641 return rq2; 641 return rq2;
642 if (rq2 == NULL) 642 if (rq2 == NULL)
643 return rq1; 643 return rq1;
644 644
645 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 645 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
646 return rq1; 646 return rq1;
647 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 647 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
648 return rq2; 648 return rq2;
649 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 649 if (rq_is_meta(rq1) && !rq_is_meta(rq2))
650 return rq1; 650 return rq1;
651 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 651 else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
652 return rq2; 652 return rq2;
653 653
654 s1 = blk_rq_pos(rq1); 654 s1 = blk_rq_pos(rq1);
655 s2 = blk_rq_pos(rq2); 655 s2 = blk_rq_pos(rq2);
656 656
657 /* 657 /*
658 * by definition, 1KiB is 2 sectors 658 * by definition, 1KiB is 2 sectors
659 */ 659 */
660 back_max = cfqd->cfq_back_max * 2; 660 back_max = cfqd->cfq_back_max * 2;
661 661
662 /* 662 /*
663 * Strict one way elevator _except_ in the case where we allow 663 * Strict one way elevator _except_ in the case where we allow
664 * short backward seeks which are biased as twice the cost of a 664 * short backward seeks which are biased as twice the cost of a
665 * similar forward seek. 665 * similar forward seek.
666 */ 666 */
667 if (s1 >= last) 667 if (s1 >= last)
668 d1 = s1 - last; 668 d1 = s1 - last;
669 else if (s1 + back_max >= last) 669 else if (s1 + back_max >= last)
670 d1 = (last - s1) * cfqd->cfq_back_penalty; 670 d1 = (last - s1) * cfqd->cfq_back_penalty;
671 else 671 else
672 wrap |= CFQ_RQ1_WRAP; 672 wrap |= CFQ_RQ1_WRAP;
673 673
674 if (s2 >= last) 674 if (s2 >= last)
675 d2 = s2 - last; 675 d2 = s2 - last;
676 else if (s2 + back_max >= last) 676 else if (s2 + back_max >= last)
677 d2 = (last - s2) * cfqd->cfq_back_penalty; 677 d2 = (last - s2) * cfqd->cfq_back_penalty;
678 else 678 else
679 wrap |= CFQ_RQ2_WRAP; 679 wrap |= CFQ_RQ2_WRAP;
680 680
681 /* Found required data */ 681 /* Found required data */
682 682
683 /* 683 /*
684 * By doing switch() on the bit mask "wrap" we avoid having to 684 * By doing switch() on the bit mask "wrap" we avoid having to
685 * check two variables for all permutations: --> faster! 685 * check two variables for all permutations: --> faster!
686 */ 686 */
687 switch (wrap) { 687 switch (wrap) {
688 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 688 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
689 if (d1 < d2) 689 if (d1 < d2)
690 return rq1; 690 return rq1;
691 else if (d2 < d1) 691 else if (d2 < d1)
692 return rq2; 692 return rq2;
693 else { 693 else {
694 if (s1 >= s2) 694 if (s1 >= s2)
695 return rq1; 695 return rq1;
696 else 696 else
697 return rq2; 697 return rq2;
698 } 698 }
699 699
700 case CFQ_RQ2_WRAP: 700 case CFQ_RQ2_WRAP:
701 return rq1; 701 return rq1;
702 case CFQ_RQ1_WRAP: 702 case CFQ_RQ1_WRAP:
703 return rq2; 703 return rq2;
704 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 704 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
705 default: 705 default:
706 /* 706 /*
707 * Since both rqs are wrapped, 707 * Since both rqs are wrapped,
708 * start with the one that's further behind head 708 * start with the one that's further behind head
709 * (--> only *one* back seek required), 709 * (--> only *one* back seek required),
710 * since back seek takes more time than forward. 710 * since back seek takes more time than forward.
711 */ 711 */
712 if (s1 <= s2) 712 if (s1 <= s2)
713 return rq1; 713 return rq1;
714 else 714 else
715 return rq2; 715 return rq2;
716 } 716 }
717 } 717 }
718 718
719 /* 719 /*
720 * The below is leftmost cache rbtree addon 720 * The below is leftmost cache rbtree addon
721 */ 721 */
722 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 722 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
723 { 723 {
724 /* Service tree is empty */ 724 /* Service tree is empty */
725 if (!root->count) 725 if (!root->count)
726 return NULL; 726 return NULL;
727 727
728 if (!root->left) 728 if (!root->left)
729 root->left = rb_first(&root->rb); 729 root->left = rb_first(&root->rb);
730 730
731 if (root->left) 731 if (root->left)
732 return rb_entry(root->left, struct cfq_queue, rb_node); 732 return rb_entry(root->left, struct cfq_queue, rb_node);
733 733
734 return NULL; 734 return NULL;
735 } 735 }
736 736
737 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 737 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
738 { 738 {
739 if (!root->left) 739 if (!root->left)
740 root->left = rb_first(&root->rb); 740 root->left = rb_first(&root->rb);
741 741
742 if (root->left) 742 if (root->left)
743 return rb_entry_cfqg(root->left); 743 return rb_entry_cfqg(root->left);
744 744
745 return NULL; 745 return NULL;
746 } 746 }
747 747
748 static void rb_erase_init(struct rb_node *n, struct rb_root *root) 748 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
749 { 749 {
750 rb_erase(n, root); 750 rb_erase(n, root);
751 RB_CLEAR_NODE(n); 751 RB_CLEAR_NODE(n);
752 } 752 }
753 753
754 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 754 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
755 { 755 {
756 if (root->left == n) 756 if (root->left == n)
757 root->left = NULL; 757 root->left = NULL;
758 rb_erase_init(n, &root->rb); 758 rb_erase_init(n, &root->rb);
759 --root->count; 759 --root->count;
760 } 760 }
761 761
762 /* 762 /*
763 * would be nice to take fifo expire time into account as well 763 * would be nice to take fifo expire time into account as well
764 */ 764 */
765 static struct request * 765 static struct request *
766 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 766 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
767 struct request *last) 767 struct request *last)
768 { 768 {
769 struct rb_node *rbnext = rb_next(&last->rb_node); 769 struct rb_node *rbnext = rb_next(&last->rb_node);
770 struct rb_node *rbprev = rb_prev(&last->rb_node); 770 struct rb_node *rbprev = rb_prev(&last->rb_node);
771 struct request *next = NULL, *prev = NULL; 771 struct request *next = NULL, *prev = NULL;
772 772
773 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 773 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
774 774
775 if (rbprev) 775 if (rbprev)
776 prev = rb_entry_rq(rbprev); 776 prev = rb_entry_rq(rbprev);
777 777
778 if (rbnext) 778 if (rbnext)
779 next = rb_entry_rq(rbnext); 779 next = rb_entry_rq(rbnext);
780 else { 780 else {
781 rbnext = rb_first(&cfqq->sort_list); 781 rbnext = rb_first(&cfqq->sort_list);
782 if (rbnext && rbnext != &last->rb_node) 782 if (rbnext && rbnext != &last->rb_node)
783 next = rb_entry_rq(rbnext); 783 next = rb_entry_rq(rbnext);
784 } 784 }
785 785
786 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 786 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
787 } 787 }
788 788
789 static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 789 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
790 struct cfq_queue *cfqq) 790 struct cfq_queue *cfqq)
791 { 791 {
792 /* 792 /*
793 * just an approximation, should be ok. 793 * just an approximation, should be ok.
794 */ 794 */
795 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 795 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
796 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 796 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
797 } 797 }
798 798
799 static inline s64 799 static inline s64
800 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 800 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
801 { 801 {
802 return cfqg->vdisktime - st->min_vdisktime; 802 return cfqg->vdisktime - st->min_vdisktime;
803 } 803 }
804 804
805 static void 805 static void
806 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 806 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
807 { 807 {
808 struct rb_node **node = &st->rb.rb_node; 808 struct rb_node **node = &st->rb.rb_node;
809 struct rb_node *parent = NULL; 809 struct rb_node *parent = NULL;
810 struct cfq_group *__cfqg; 810 struct cfq_group *__cfqg;
811 s64 key = cfqg_key(st, cfqg); 811 s64 key = cfqg_key(st, cfqg);
812 int left = 1; 812 int left = 1;
813 813
814 while (*node != NULL) { 814 while (*node != NULL) {
815 parent = *node; 815 parent = *node;
816 __cfqg = rb_entry_cfqg(parent); 816 __cfqg = rb_entry_cfqg(parent);
817 817
818 if (key < cfqg_key(st, __cfqg)) 818 if (key < cfqg_key(st, __cfqg))
819 node = &parent->rb_left; 819 node = &parent->rb_left;
820 else { 820 else {
821 node = &parent->rb_right; 821 node = &parent->rb_right;
822 left = 0; 822 left = 0;
823 } 823 }
824 } 824 }
825 825
826 if (left) 826 if (left)
827 st->left = &cfqg->rb_node; 827 st->left = &cfqg->rb_node;
828 828
829 rb_link_node(&cfqg->rb_node, parent, node); 829 rb_link_node(&cfqg->rb_node, parent, node);
830 rb_insert_color(&cfqg->rb_node, &st->rb); 830 rb_insert_color(&cfqg->rb_node, &st->rb);
831 } 831 }
832 832
833 static void 833 static void
834 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 834 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
835 { 835 {
836 struct cfq_rb_root *st = &cfqd->grp_service_tree; 836 struct cfq_rb_root *st = &cfqd->grp_service_tree;
837 struct cfq_group *__cfqg; 837 struct cfq_group *__cfqg;
838 struct rb_node *n; 838 struct rb_node *n;
839 839
840 cfqg->nr_cfqq++; 840 cfqg->nr_cfqq++;
841 if (cfqg->on_st) 841 if (cfqg->on_st)
842 return; 842 return;
843 843
844 /* 844 /*
845 * Currently put the group at the end. Later implement something 845 * Currently put the group at the end. Later implement something
846 * so that groups get lesser vtime based on their weights, so that 846 * so that groups get lesser vtime based on their weights, so that
847 * if group does not loose all if it was not continously backlogged. 847 * if group does not loose all if it was not continously backlogged.
848 */ 848 */
849 n = rb_last(&st->rb); 849 n = rb_last(&st->rb);
850 if (n) { 850 if (n) {
851 __cfqg = rb_entry_cfqg(n); 851 __cfqg = rb_entry_cfqg(n);
852 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 852 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
853 } else 853 } else
854 cfqg->vdisktime = st->min_vdisktime; 854 cfqg->vdisktime = st->min_vdisktime;
855 855
856 __cfq_group_service_tree_add(st, cfqg); 856 __cfq_group_service_tree_add(st, cfqg);
857 cfqg->on_st = true; 857 cfqg->on_st = true;
858 st->total_weight += cfqg->weight; 858 st->total_weight += cfqg->weight;
859 } 859 }

static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	if (st->active == &cfqg->rb_node)
		st->active = NULL;

	BUG_ON(cfqg->nr_cfqq < 1);
	cfqg->nr_cfqq--;

	/* If there are other cfq queues under this group, don't delete it */
	if (cfqg->nr_cfqq)
		return;

	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
	cfqg->on_st = false;
	st->total_weight -= cfqg->weight;
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->saved_workload_slice = 0;
	blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
{
	unsigned int slice_used;

	/*
	 * Queue got expired before even a single request completed or
	 * got expired immediately after first request completion.
	 */
	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
		/*
		 * Also charge the seek time incurred to the group, otherwise
		 * if there are multiple queues in the group, each can dispatch
		 * a single request on seeky media and cause lots of seek time
		 * and group will never know it.
		 */
		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
					1);
	} else {
		slice_used = jiffies - cfqq->slice_start;
		if (slice_used > cfqq->allocated_slice)
			slice_used = cfqq->allocated_slice;
	}

	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
	return slice_used;
}

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
				struct cfq_queue *cfqq)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	unsigned int used_sl, charge_sl;
	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
			- cfqg->service_tree_idle.count;

	BUG_ON(nr_sync < 0);
	used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);

	if (!cfq_cfqq_sync(cfqq) && !nr_sync)
		charge_sl = cfqq->allocated_slice;

	/* Can't update vdisktime while group is on service tree */
	cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
	__cfq_group_service_tree_add(st, cfqg);

	/* This group is being expired. Save the context */
	if (time_after(cfqd->workload_expires, jiffies)) {
		cfqg->saved_workload_slice = cfqd->workload_expires
						- jiffies;
		cfqg->saved_workload = cfqd->serving_type;
		cfqg->saved_serving_prio = cfqd->serving_prio;
	} else
		cfqg->saved_workload_slice = 0;

	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
			st->min_vdisktime);
	blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
	blkiocg_set_start_empty_time(&cfqg->blkg);
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct cfq_group, blkg);
	return NULL;
}

void
cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	cfqg_of_blkg(blkg)->weight = weight;
}

static struct cfq_group *
cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct cfq_group *cfqg = NULL;
	void *key = cfqd;
	int i, j;
	struct cfq_rb_root *st;
	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
	unsigned int major, minor;

	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		cfqg->blkg.dev = MKDEV(major, minor);
		goto done;
	}
	if (cfqg || !create)
		goto done;

	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
	if (!cfqg)
		goto done;

	for_each_cfqg_st(cfqg, i, j, st)
		*st = CFQ_RB_ROOT;
	RB_CLEAR_NODE(&cfqg->rb_node);

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * elevator which will be dropped by either elevator exit
	 * or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&cfqg->ref, 1);

	/* Add group onto cgroup list */
	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
	blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
					MKDEV(major, minor));
	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);

	/* Add group on cfqd list */
	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);

done:
	return cfqg;
}

/*
 * Search for the cfq group current task belongs to. If create = 1, then also
 * create the cfq group if it does not exist. request_queue lock must be held.
 */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	struct cgroup *cgroup;
	struct cfq_group *cfqg = NULL;

	rcu_read_lock();
	cgroup = task_cgroup(current, blkio_subsys_id);
	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
	if (!cfqg && create)
		cfqg = &cfqd->root_group;
	rcu_read_unlock();
	return cfqg;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	atomic_inc(&cfqg->ref);
	return cfqg;
}

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
	/* Currently, all async queues are mapped to root group */
	if (!cfq_cfqq_sync(cfqq))
		cfqg = &cfqq->cfqd->root_group;

	cfqq->cfqg = cfqg;
	/* cfqq reference on cfqg */
	atomic_inc(&cfqq->cfqg->ref);
}

static void cfq_put_cfqg(struct cfq_group *cfqg)
{
	struct cfq_rb_root *st;
	int i, j;

	BUG_ON(atomic_read(&cfqg->ref) <= 0);
	if (!atomic_dec_and_test(&cfqg->ref))
		return;
	for_each_cfqg_st(cfqg, i, j, st)
		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
	kfree(cfqg);
}

static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));

	hlist_del_init(&cfqg->cfqd_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	cfq_put_cfqg(cfqg);
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
	struct hlist_node *pos, *n;
	struct cfq_group *cfqg;

	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * cfqg also.
		 */
		if (!blkiocg_del_blkio_group(&cfqg->blkg))
			cfq_destroy_cfqg(cfqd, cfqg);
	}
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid cfq_data pointer as long as we are
 * under rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if elevator was exiting, cgroup deletion
 * path got to it first.
 */
void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long flags;
	struct cfq_data *cfqd = key;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

#else /* GROUP_IOSCHED */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	return &cfqd->root_group;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	return cfqg;
}

static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
	cfqq->cfqg = cfqg;
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}

#endif /* GROUP_IOSCHED */

/*
 * The cfqd->service_trees holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				 bool add_front)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
	unsigned long rb_key;
	struct cfq_rb_root *service_tree;
	int left;
	int new_cfqq = 1;
	int group_changed = 0;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (!cfqd->cfq_group_isolation
	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
		/* Move this cfq to root group */
		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
		if (!RB_EMPTY_NODE(&cfqq->rb_node))
			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
		cfqq->orig_cfqg = cfqq->cfqg;
		cfqq->cfqg = &cfqd->root_group;
		atomic_inc(&cfqd->root_group.ref);
		group_changed = 1;
	} else if (!cfqd->cfq_group_isolation
		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
		/* cfqq is sequential now and needs to go to its original group */
		BUG_ON(cfqq->cfqg != &cfqd->root_group);
		if (!RB_EMPTY_NODE(&cfqq->rb_node))
			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
		cfq_put_cfqg(cfqq->cfqg);
		cfqq->cfqg = cfqq->orig_cfqg;
		cfqq->orig_cfqg = NULL;
		group_changed = 1;
		cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
	}
#endif

	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
						cfqq_type(cfqq));
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
		parent = rb_last(&service_tree->rb);
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
		/*
		 * Get our rb key offset. Subtract any residual slice
		 * value carried from last service. A negative resid
		 * count indicates slice overrun, and this should position
		 * the next service time further away in the tree.
		 */
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
		rb_key -= cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else {
		rb_key = -HZ;
		__cfqq = cfq_rb_first(service_tree);
		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
	}

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		new_cfqq = 0;
		/*
		 * same position, nothing more to do
		 */
		if (rb_key == cfqq->rb_key &&
		    cfqq->service_tree == service_tree)
			return;

		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}

	left = 1;
	parent = NULL;
	cfqq->service_tree = service_tree;
	p = &service_tree->rb.rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

		/*
		 * sort by key, that represents service time.
		 */
		if (time_before(rb_key, __cfqq->rb_key))
			n = &(*p)->rb_left;
		else {
			n = &(*p)->rb_right;
			left = 0;
		}

		p = n;
	}

	if (left)
		service_tree->left = &cfqq->rb_node;

	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
	service_tree->count++;
	if ((add_front || !new_cfqq) && !group_changed)
		return;
	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
}

static struct cfq_queue *
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector. Smallest to the left,
		 * largest to the right.
		 */
		if (sector > blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_right;
		else if (sector < blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		cfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
	return cfqq;
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
}

/*
 * Update cfqq's position in the service tree.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
	if (cfq_cfqq_on_rr(cfqq)) {
		cfq_service_tree_add(cfqd, cfqq, 0);
		cfq_prio_tree_add(cfqd, cfqq);
	}
}

/*
 * add to busy list of queues for service, trying to be fair in ordering
 * the pending list according to last request service
 */
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
	cfqd->busy_queues++;

	cfq_resort_rr_list(cfqd, cfqq);
}

/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
static void cfq_del_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	const int sync = rq_is_sync(rq);

	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;

	elv_rb_del(&cfqq->sort_list, rq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
		/*
		 * Queue will be deleted from service tree when we actually
		 * expire it later. Right now just remove it from prio tree
		 * as it is empty.
		 */
		if (cfqq->p_root) {
			rb_erase(&cfqq->p_node, cfqq->p_root);
			cfqq->p_root = NULL;
		}
	}
}

static void cfq_add_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	struct request *__alias, *prev;

	cfqq->queued[rq_is_sync(rq)]++;

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
		cfq_dispatch_insert(cfqd->queue, __alias);

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);

	/*
	 * check if this request is a better next-serve candidate
	 */
	prev = cfqq->next_rq;
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

	BUG_ON(!cfqq->next_rq);
}

static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
{
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
					rq_is_sync(rq));
	cfq_add_rq_rb(rq);
	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
}

static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
	struct task_struct *tsk = current;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	if (!cic)
		return NULL;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

		return elv_rb_find(&cfqq->sort_list, sector);
	}

	return NULL;
}

static void cfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	cfqd->rq_in_driver++;
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
						cfqd->rq_in_driver);

	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
}

static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
						cfqd->rq_in_driver);
}

static void cfq_remove_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);

	list_del_init(&rq->queuelist);
	cfq_del_rq_rb(rq);

	cfqq->cfqd->rq_queued--;
	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
					rq_is_sync(rq));
	if (rq_is_meta(rq)) {
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
}

static int cfq_merge(struct request_queue *q, struct request **req,
		     struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = cfq_find_rq_fmerge(cfqd, bio);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}

static void cfq_merged_request(struct request_queue *q, struct request *req,
			       int type)
{
	if (type == ELEVATOR_FRONT_MERGE) {
		struct cfq_queue *cfqq = RQ_CFQQ(req);

		cfq_reposition_rq_rb(cfqq, req);
	}
}

static void cfq_bio_merged(struct request_queue *q, struct request *req,
			   struct bio *bio)
{
	blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
					cfq_bio_sync(bio));
}

static void
cfq_merged_requests(struct request_queue *q, struct request *rq,
		    struct request *next)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
		list_move(&rq->queuelist, &next->queuelist);
		rq_set_fifo_time(rq, rq_fifo_time(next));
	}

	if (cfqq->next_rq == next)
		cfqq->next_rq = rq;
	cfq_remove_request(next);
	blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
					rq_is_sync(next));
}

static int cfq_allow_merge(struct request_queue *q, struct request *rq,
			   struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the cfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	cic = cfq_cic_lookup(cfqd, current->io_context);
	if (!cic)
		return false;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	return cfqq == RQ_CFQQ(rq);
}

static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	del_timer(&cfqd->idle_slice_timer);
	blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
}

static void __cfq_set_active_queue(struct cfq_data *cfqd,
				   struct cfq_queue *cfqq)
{
	if (cfqq) {
		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
				cfqd->serving_prio, cfqd->serving_type);
		blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
		cfqq->slice_start = 0;
		cfqq->dispatch_start = jiffies;
		cfqq->allocated_slice = 0;
		cfqq->slice_end = 0;
		cfqq->slice_dispatch = 0;

		cfq_clear_cfqq_wait_request(cfqq);
		cfq_clear_cfqq_must_dispatch(cfqq);
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
		cfq_mark_cfqq_slice_new(cfqq);

		cfq_del_timer(cfqd, cfqq);
	}

	cfqd->active_queue = cfqq;
}

/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		    bool timed_out)
{
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

	if (cfq_cfqq_wait_request(cfqq))
		cfq_del_timer(cfqd, cfqq);

	cfq_clear_cfqq_wait_request(cfqq);
	cfq_clear_cfqq_wait_busy(cfqq);

	/*
	 * If this cfqq is shared between multiple processes, check to
	 * make sure that those processes are still issuing I/Os within
	 * the mean seek distance. If not, it may be time to break the
	 * queues apart again.
	 */
	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
		cfq_mark_cfqq_split_coop(cfqq);

	/*
	 * store what was left of this slice, if the queue idled/timed out
	 */
	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
		cfqq->slice_resid = cfqq->slice_end - jiffies;
		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
	}

	cfq_group_served(cfqd, cfqq->cfqg, cfqq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
		cfq_del_cfqq_rr(cfqd, cfqq);

	cfq_resort_rr_list(cfqd, cfqq);

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
		cfqd->grp_service_tree.active = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}
}

static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
		__cfq_slice_expired(cfqd, cfqq, timed_out);
}

/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
	struct cfq_rb_root *service_tree =
		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
					cfqd->serving_type);

	if (!cfqd->rq_queued)
		return NULL;

	/* There is nothing to dispatch */
	if (!service_tree)
		return NULL;
	if (RB_EMPTY_ROOT(&service_tree->rb))
		return NULL;
	return cfq_rb_first(service_tree);
}

static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
{
	struct cfq_group *cfqg;
	struct cfq_queue *cfqq;
	int i, j;
	struct cfq_rb_root *st;

	if (!cfqd->rq_queued)
		return NULL;

	cfqg = cfq_get_next_cfqg(cfqd);
	if (!cfqg)
		return NULL;

	for_each_cfqg_st(cfqg, i, j, st)
		if ((cfqq = cfq_rb_first(st)) != NULL)
			return cfqq;
	return NULL;
}

/*
 * Get and set a new active queue for service.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	if (!cfqq)
		cfqq = cfq_get_next_queue(cfqd);

	__cfq_set_active_queue(cfqd, cfqq);
	return cfqq;
}

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
	if (blk_rq_pos(rq) >= cfqd->last_position)
		return blk_rq_pos(rq) - cfqd->last_position;
	else
		return cfqd->last_position - blk_rq_pos(rq);
}

static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			       struct request *rq)
{
	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
}

static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
				    struct cfq_queue *cur_cfqq)
{
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
	struct rb_node *parent, *node;
	struct cfq_queue *__cfqq;
	sector_t sector = cfqd->last_position;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
	if (__cfqq)
		return __cfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector.
	 */
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	if (blk_rq_pos(__cfqq->next_rq) < sector)
		node = rb_next(&__cfqq->p_node);
	else
		node = rb_prev(&__cfqq->p_node);
	if (!node)
		return NULL;

	__cfqq = rb_entry(node, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	return NULL;
}

/*
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 * closely cooperating with itself.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
 * assumption.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
					      struct cfq_queue *cur_cfqq)
{
	struct cfq_queue *cfqq;

	if (cfq_class_idle(cur_cfqq))
		return NULL;
	if (!cfq_cfqq_sync(cur_cfqq))
		return NULL;
	if (CFQQ_SEEKY(cur_cfqq))
		return NULL;

	/*
	 * Don't search priority tree if it's the only queue in the group.
	 */
	if (cur_cfqq->cfqg->nr_cfqq == 1)
		return NULL;

	/*
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
	 */
	cfqq = cfqq_close(cfqd, cur_cfqq);
	if (!cfqq)
		return NULL;

	/* If new queue belongs to different cfq_group, don't choose it */
	if (cur_cfqq->cfqg != cfqq->cfqg)
		return NULL;

	/*
	 * It only makes sense to merge sync queues.
	 */
	if (!cfq_cfqq_sync(cfqq))
		return NULL;
	if (CFQQ_SEEKY(cfqq))
		return NULL;

	/*
	 * Do not merge queues of different priority classes
	 */
	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
		return NULL;

	return cfqq;
}

/*
 * Determine whether we should enforce idle window for this queue.
 */

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	enum wl_prio_t prio = cfqq_prio(cfqq);
	struct cfq_rb_root *service_tree = cfqq->service_tree;

	BUG_ON(!service_tree);
	BUG_ON(!service_tree->count);

	/* We never do for idle class queues. */
	if (prio == IDLE_WORKLOAD)
		return false;

	/* We do for queues that were marked with idle window flag. */
	if (cfq_cfqq_idle_window(cfqq) &&
	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
		return true;

	/*
	 * Otherwise, we do only if they are the last ones
	 * in their service tree.
	 */
	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
		return 1;
	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
			service_tree->count);
	return 0;
}

static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;
	struct cfq_io_context *cic;
	unsigned long sl;

	/*
	 * SSD device without seek penalty, disable idling. But only do so
	 * for devices that support queuing, otherwise we still have a problem
	 * with sync vs async workloads.
	 */
	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
		return;

	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	WARN_ON(cfq_cfqq_slice_new(cfqq));

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
		return;

	/*
	 * still active requests from this queue, don't idle
	 */
	if (cfqq->dispatched)
		return;

	/*
	 * task has exited, don't wait
	 */
	cic = cfqd->active_cic;
	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
		return;

	/*
	 * If our average think time is larger than the remaining time
	 * slice, then don't idle. This avoids overrunning the allotted
	 * time slice.
	 */
	if (sample_valid(cic->ttime_samples) &&
	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
				cic->ttime_mean);
		return;
	}

	cfq_mark_cfqq_wait_request(cfqq);

	sl = cfqd->cfq_slice_idle;

	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
	blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
}

/*
 * Move request from internal lists to the request queue dispatch list.
 */
static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");

	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
	cfq_remove_request(rq);
	cfqq->dispatched++;
	elv_dispatch_sort(q, rq);

	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
	blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
					rq_data_dir(rq), rq_is_sync(rq));
}

/*
 * return expired entry, or NULL to just start from scratch in rbtree
 */
static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
{
	struct request *rq = NULL;

	if (cfq_cfqq_fifo_expire(cfqq))
		return NULL;

	cfq_mark_cfqq_fifo_expire(cfqq);

	if (list_empty(&cfqq->fifo))
		return NULL;

	rq = rq_entry_fifo(cfqq->fifo.next);
1954 if (time_before(jiffies, rq_fifo_time(rq))) 1954 if (time_before(jiffies, rq_fifo_time(rq)))
1955 rq = NULL; 1955 rq = NULL;
1956 1956
1957 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 1957 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1958 return rq; 1958 return rq;
1959 } 1959 }
1960 1960
1961 static inline int 1961 static inline int
1962 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1962 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1963 { 1963 {
1964 const int base_rq = cfqd->cfq_slice_async_rq; 1964 const int base_rq = cfqd->cfq_slice_async_rq;
1965 1965
1966 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1966 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1967 1967
1968 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1968 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1969 } 1969 }
1970 1970
1971 /* 1971 /*
1972 * Must be called with the queue_lock held. 1972 * Must be called with the queue_lock held.
1973 */ 1973 */
1974 static int cfqq_process_refs(struct cfq_queue *cfqq) 1974 static int cfqq_process_refs(struct cfq_queue *cfqq)
1975 { 1975 {
1976 int process_refs, io_refs; 1976 int process_refs, io_refs;
1977 1977
1978 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 1978 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1979 process_refs = atomic_read(&cfqq->ref) - io_refs; 1979 process_refs = atomic_read(&cfqq->ref) - io_refs;
1980 BUG_ON(process_refs < 0); 1980 BUG_ON(process_refs < 0);
1981 return process_refs; 1981 return process_refs;
1982 } 1982 }
1983 1983
static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

+	/*
+	 * If there are no process references on the new_cfqq, then it is
+	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
+	 * chain may have dropped their last reference (not just their
+	 * last process reference).
+	 */
+	if (!cfqq_process_refs(new_cfqq))
+		return;
+
	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
+	new_process_refs = cfqq_process_refs(new_cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
-	if (process_refs == 0)
+	if (process_refs == 0 || new_process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
-	new_process_refs = cfqq_process_refs(new_cfqq);
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		atomic_add(process_refs, &new_cfqq->ref);
	} else {
		new_cfqq->new_cfqq = cfqq;
		atomic_add(new_process_refs, &cfqq->ref);
	}
}

static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
				struct cfq_group *cfqg, enum wl_prio_t prio)
{
	struct cfq_queue *queue;
	int i;
	bool key_valid = false;
	unsigned long lowest_key = 0;
	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
		/* select the one with lowest rb_key */
		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
		if (queue &&
		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
			lowest_key = queue->rb_key;
			cur_best = i;
			key_valid = true;
		}
	}

	return cur_best;
}

static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	unsigned slice;
	unsigned count;
	struct cfq_rb_root *st;
	unsigned group_slice;

	if (!cfqg) {
		cfqd->serving_prio = IDLE_WORKLOAD;
		cfqd->workload_expires = jiffies + 1;
		return;
	}

	/* Choose next priority. RT > BE > IDLE */
	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
		cfqd->serving_prio = RT_WORKLOAD;
	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
		cfqd->serving_prio = BE_WORKLOAD;
	else {
		cfqd->serving_prio = IDLE_WORKLOAD;
		cfqd->workload_expires = jiffies + 1;
		return;
	}

	/*
	 * For RT and BE, we have to choose also the type
	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
	 * expiration time
	 */
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
	count = st->count;

	/*
	 * check workload expiration, and that we still have other queues ready
	 */
	if (count && !time_after(jiffies, cfqd->workload_expires))
		return;

	/* otherwise select new workload type */
	cfqd->serving_type =
		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
	count = st->count;

	/*
	 * the workload slice is computed as a fraction of target latency
	 * proportional to the number of queues in that workload, over
	 * all the queues in the same priority class
	 */
	group_slice = cfq_group_slice(cfqd, cfqg);

	slice = group_slice * count /
		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));

	if (cfqd->serving_type == ASYNC_WORKLOAD) {
		unsigned int tmp;

		/*
		 * Async queues are currently system wide. Just taking
		 * proportion of queues with-in same group will lead to higher
		 * async ratio system wide as generally root group is going
		 * to have higher weight. A more accurate thing would be to
		 * calculate system wide asnc/sync ratio.
		 */
		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
		tmp = tmp/cfqd->busy_queues;
		slice = min_t(unsigned, slice, tmp);

		/* async workload slice is scaled down according to
		 * the sync/async slice ratio. */
		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
	} else
		/* sync workload slice is at least 2 * cfq_slice_idle */
		slice = max(slice, 2 * cfqd->cfq_slice_idle);

	slice = max_t(unsigned, slice, CFQ_MIN_TT);
	cfq_log(cfqd, "workload slice:%d", slice);
	cfqd->workload_expires = jiffies + slice;
	cfqd->noidle_tree_requires_idle = false;
}

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	struct cfq_group *cfqg;

	if (RB_EMPTY_ROOT(&st->rb))
		return NULL;
	cfqg = cfq_rb_first_group(st);
	st->active = &cfqg->rb_node;
	update_min_vdisktime(st);
	return cfqg;
}

static void cfq_choose_cfqg(struct cfq_data *cfqd)
{
	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);

	cfqd->serving_group = cfqg;

	/* Restore the workload type data */
	if (cfqg->saved_workload_slice) {
		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
		cfqd->serving_type = cfqg->saved_workload;
		cfqd->serving_prio = cfqg->saved_serving_prio;
	} else
		cfqd->workload_expires = jiffies - 1;

	choose_service_tree(cfqd, cfqg);
}

/*
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq, *new_cfqq = NULL;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		goto new_queue;

	if (!cfqd->rq_queued)
		return NULL;

	/*
	 * We were waiting for group to get backlogged. Expire the queue
	 */
	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
		goto expire;

	/*
	 * The active queue has run out of time, expire it and select new.
	 */
	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
		/*
		 * If slice had not expired at the completion of last request
		 * we might not have turned on wait_busy flag. Don't expire
		 * the queue yet. Allow the group to get backlogged.
		 *
		 * The very fact that we have used the slice, that means we
		 * have been idling all along on this queue and it should be
		 * ok to wait for this request to complete.
		 */
		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
			cfqq = NULL;
			goto keep_queue;
		} else
			goto expire;
	}

	/*
	 * The active queue has requests and isn't expired, allow it to
	 * dispatch.
	 */
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
		goto keep_queue;

	/*
	 * If another queue has a request waiting within our mean seek
	 * distance, let it run. The expire code will check for close
	 * cooperators and put the close queue at the front of the service
	 * tree. If possible, merge the expiring queue with the new cfqq.
	 */
	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
	if (new_cfqq) {
		if (!cfqq->new_cfqq)
			cfq_setup_merge(cfqq, new_cfqq);
		goto expire;
	}

	/*
	 * No requests pending. If the active queue still has requests in
	 * flight or is idling for a new request, allow either of these
	 * conditions to happen (or time out) before selecting a new queue.
	 */
	if (timer_pending(&cfqd->idle_slice_timer) ||
	    (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
		cfqq = NULL;
		goto keep_queue;
	}

expire:
	cfq_slice_expired(cfqd, 0);
new_queue:
	/*
	 * Current queue expired. Check if we have to switch to a new
	 * service tree
	 */
	if (!new_cfqq)
		cfq_choose_cfqg(cfqd);

	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
	return cfqq;
}

static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
{
	int dispatched = 0;

	while (cfqq->next_rq) {
		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
		dispatched++;
	}

	BUG_ON(!list_empty(&cfqq->fifo));

	/* By default cfqq is not expired if it is empty. Do it explicitly */
	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
	return dispatched;
}

/*
 * Drain our current requests. Used for barriers and when switching
 * io schedulers on-the-fly.
 */
static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq;
	int dispatched = 0;

	/* Expire the timeslice of the current active queue first */
	cfq_slice_expired(cfqd, 0);
	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
		__cfq_set_active_queue(cfqd, cfqq);
		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
	}

	BUG_ON(cfqd->busy_queues);

	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
	return dispatched;
}

static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
					struct cfq_queue *cfqq)
{
	/* the queue hasn't finished any request, can't estimate */
	if (cfq_cfqq_slice_new(cfqq))
		return 1;
	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
		cfqq->slice_end))
		return 1;

	return 0;
}

static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned int max_dispatch;

	/*
	 * Drain async requests before we start sync IO
	 */
	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
		return false;

	/*
	 * If this is an async queue and we have sync IO in flight, let it wait
	 */
	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
		return false;

	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
	if (cfq_class_idle(cfqq))
		max_dispatch = 1;

	/*
	 * Does this cfqq already have too much IO in flight?
	 */
	if (cfqq->dispatched >= max_dispatch) {
		/*
		 * idle queue must always only have a single IO in flight
		 */
		if (cfq_class_idle(cfqq))
			return false;

		/*
		 * We have other queues, don't allow more IO from this one
		 */
		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
			return false;

		/*
		 * Sole queue user, no limit
		 */
		if (cfqd->busy_queues == 1)
			max_dispatch = -1;
		else
			/*
			 * Normally we start throttling cfqq when cfq_quantum/2
			 * requests have been dispatched. But we can drive
			 * deeper queue depths at the beginning of slice
			 * subjected to upper limit of cfq_quantum.
			 * */
			max_dispatch = cfqd->cfq_quantum;
	}

	/*
	 * Async queues must wait a bit before being allowed dispatch.
	 * We also ramp up the dispatch depth gradually for async IO,
	 * based on the last sync IO we serviced
	 */
	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
		unsigned int depth;

		depth = last_sync / cfqd->cfq_slice[1];
		if (!depth && !cfqq->dispatched)
			depth = 1;
		if (depth < max_dispatch)
			max_dispatch = depth;
	}

	/*
	 * If we're below the current max, allow a dispatch
	 */
	return cfqq->dispatched < max_dispatch;
}

/*
 * Dispatch a request from cfqq, moving them to the request queue
 * dispatch list.
 */
static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct request *rq;

	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

	if (!cfq_may_dispatch(cfqd, cfqq))
		return false;

	/*
	 * follow expired path, else get first next available
	 */
	rq = cfq_check_fifo(cfqq);
	if (!rq)
		rq = cfqq->next_rq;

	/*
	 * insert request into driver dispatch list
	 */
	cfq_dispatch_insert(cfqd->queue, rq);

	if (!cfqd->active_cic) {
		struct cfq_io_context *cic = RQ_CIC(rq);

		atomic_long_inc(&cic->ioc->refcount);
		cfqd->active_cic = cic;
	}

	return true;
}

/*
 * Find the cfqq that we need to service and move a request from that to the
 * dispatch list
 */
static int cfq_dispatch_requests(struct request_queue *q, int force)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq;

	if (!cfqd->busy_queues)
		return 0;

	if (unlikely(force))
		return cfq_forced_dispatch(cfqd);

	cfqq = cfq_select_queue(cfqd);
	if (!cfqq)
		return 0;

	/*
	 * Dispatch a request from this cfqq, if it is allowed
	 */
	if (!cfq_dispatch_request(cfqd, cfqq))
		return 0;

	cfqq->slice_dispatch++;
	cfq_clear_cfqq_must_dispatch(cfqq);

	/*
	 * expire an async queue immediately if it has used up its slice. idle
	 * queue always expire after 1 dispatch round.
	 */
	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
	    cfq_class_idle(cfqq))) {
		cfqq->slice_end = jiffies + 1;
		cfq_slice_expired(cfqd, 0);
	}

	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
	return 1;
}

/*
 * task holds one reference to the queue, dropped when task exits. each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Each cfq queue took a reference on the parent group. Drop it now.
 * queue lock must be held here.
 */
static void cfq_put_queue(struct cfq_queue *cfqq)
{
	struct cfq_data *cfqd = cfqq->cfqd;
	struct cfq_group *cfqg, *orig_cfqg;

	BUG_ON(atomic_read(&cfqq->ref) <= 0);

	if (!atomic_dec_and_test(&cfqq->ref))
		return;

	cfq_log_cfqq(cfqd, cfqq, "put_queue");
	BUG_ON(rb_first(&cfqq->sort_list));
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
	cfqg = cfqq->cfqg;
	orig_cfqg = cfqq->orig_cfqg;

	if (unlikely(cfqd->active_queue == cfqq)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	BUG_ON(cfq_cfqq_on_rr(cfqq));
	kmem_cache_free(cfq_pool, cfqq);
	cfq_put_cfqg(cfqg);
	if (orig_cfqg)
		cfq_put_cfqg(orig_cfqg);
}

/*
 * Must always be called with the rcu_read_lock() held
 */
static void
__call_for_each_cic(struct io_context *ioc,
		    void (*func)(struct io_context *, struct cfq_io_context *))
{
	struct cfq_io_context *cic;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
		func(ioc, cic);
}

/*
 * Call func for each cic attached to this ioc.
 */
static void
call_for_each_cic(struct io_context *ioc,
		  void (*func)(struct io_context *, struct cfq_io_context *))
{
	rcu_read_lock();
	__call_for_each_cic(ioc, func);
	rcu_read_unlock();
}

static void cfq_cic_free_rcu(struct rcu_head *head)
{
	struct cfq_io_context *cic;

	cic = container_of(head, struct cfq_io_context, rcu_head);

	kmem_cache_free(cfq_ioc_pool, cic);
	elv_ioc_count_dec(cfq_ioc_count);

	if (ioc_gone) {
		/*
		 * CFQ scheduler is exiting, grab exit lock and check
		 * the pending io context count. If it hits zero,
		 * complete ioc_gone and set it back to NULL
		 */
		spin_lock(&ioc_gone_lock);
		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
			complete(ioc_gone);
			ioc_gone = NULL;
		}
		spin_unlock(&ioc_gone_lock);
	}
}

static void cfq_cic_free(struct cfq_io_context *cic)
{
	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
}

static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
{
	unsigned long flags;
	unsigned long dead_key = (unsigned long) cic->key;

	BUG_ON(!(dead_key & CIC_DEAD_KEY));

	spin_lock_irqsave(&ioc->lock, flags);
	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
	hlist_del_rcu(&cic->cic_list);
	spin_unlock_irqrestore(&ioc->lock, flags);

	cfq_cic_free(cic);
}

/*
 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
 * and ->trim() which is called with the task lock held
 */
static void cfq_free_io_context(struct io_context *ioc)
{
	/*
	 * ioc->refcount is zero here, or we are called from elv_unregister(),
	 * so no more cic's are allowed to be linked into this ioc. So it
	 * should be ok to iterate over the known list, we will see all cic's
	 * since no new ones are added.
	 */
	__call_for_each_cic(ioc, cic_free_func);
}

static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
	struct cfq_queue *__cfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
	 */
	__cfqq = cfqq->new_cfqq;
	while (__cfqq) {
		if (__cfqq == cfqq) {
			WARN(1, "cfqq->new_cfqq loop detected\n");
			break;
		}
		next = __cfqq->new_cfqq;
		cfq_put_queue(__cfqq);
		__cfqq = next;
	}
}

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (unlikely(cfqq == cfqd->active_queue)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	cfq_put_cooperator(cfqq);

	cfq_put_queue(cfqq);
}

static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
					 struct cfq_io_context *cic)
{
	struct io_context *ioc = cic->ioc;

	list_del_init(&cic->queue_list);

	/*
	 * Make sure dead mark is seen for dead queues
	 */
	smp_wmb();
	cic->key = cfqd_dead_key(cfqd);

	if (ioc->ioc_data == cic)
		rcu_assign_pointer(ioc->ioc_data, NULL);

	if (cic->cfqq[BLK_RW_ASYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
		cic->cfqq[BLK_RW_ASYNC] = NULL;
	}

	if (cic->cfqq[BLK_RW_SYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
		cic->cfqq[BLK_RW_SYNC] = NULL;
	}
}

static void cfq_exit_single_io_context(struct io_context *ioc,
				       struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);

	if (cfqd) {
		struct request_queue *q = cfqd->queue;
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);

		/*
		 * Ensure we get a fresh copy of the ->key to prevent
		 * race between exiting task and queue
		 */
		smp_read_barrier_depends();
		if (cic->key == cfqd)
			__cfq_exit_single_io_context(cfqd, cic);

		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

/*
 * The process that ioc belongs to has exited, we need to clean up
 * and put the internal structures we have that belongs to that process.
 */
static void cfq_exit_io_context(struct io_context *ioc)
{
	call_for_each_cic(ioc, cfq_exit_single_io_context);
}

static struct cfq_io_context *
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
{
	struct cfq_io_context *cic;

	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
							cfqd->queue->node);
	if (cic) {
		cic->last_end_request = jiffies;
		INIT_LIST_HEAD(&cic->queue_list);
		INIT_HLIST_NODE(&cic->cic_list);
		cic->dtor = cfq_free_io_context;
		cic->exit = cfq_exit_io_context;
		elv_ioc_count_inc(cfq_ioc_count);
	}

	return cic;
}

static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
	struct task_struct *tsk = current;
	int ioprio_class;

	if (!cfq_cfqq_prio_changed(cfqq))
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
	switch (ioprio_class) {
	default:
		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * no prio set, inherit CPU scheduling settings
		 */
		cfqq->ioprio = task_nice_ioprio(tsk);
		cfqq->ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		cfqq->ioprio = task_ioprio(ioc);
		cfqq->ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		cfqq->ioprio = task_ioprio(ioc);
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
		cfqq->ioprio = 7;
		cfq_clear_cfqq_idle_window(cfqq);
		break;
	}

	/*
	 * keep track of original prio settings in case we have to temporarily
	 * elevate the priority of this queue
	 */
	cfqq->org_ioprio = cfqq->ioprio;
	cfqq->org_ioprio_class = cfqq->ioprio_class;
	cfq_clear_cfqq_prio_changed(cfqq);
}

static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	struct cfq_queue *cfqq;
	unsigned long flags;

	if (unlikely(!cfqd))
		return;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	cfqq = cic->cfqq[BLK_RW_ASYNC];
	if (cfqq) {
		struct cfq_queue *new_cfqq;
		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
						GFP_ATOMIC);
		if (new_cfqq) {
			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
			cfq_put_queue(cfqq);
		}
	}

	cfqq = cic->cfqq[BLK_RW_SYNC];
	if (cfqq)
		cfq_mark_cfqq_prio_changed(cfqq);

	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

static void cfq_ioc_set_ioprio(struct io_context *ioc)
{
	call_for_each_cic(ioc, changed_ioprio);
	ioc->ioprio_changed = 0;
}

static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			  pid_t pid, bool is_sync)
{
	RB_CLEAR_NODE(&cfqq->rb_node);
	RB_CLEAR_NODE(&cfqq->p_node);
	INIT_LIST_HEAD(&cfqq->fifo);

	atomic_set(&cfqq->ref, 0);
	cfqq->cfqd = cfqd;

	cfq_mark_cfqq_prio_changed(cfqq);

	if (is_sync) {
		if (!cfq_class_idle(cfqq))
			cfq_mark_cfqq_idle_window(cfqq);
		cfq_mark_cfqq_sync(cfqq);
	}
	cfqq->pid = pid;
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
{
	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	unsigned long flags;
	struct request_queue *q;

	if (unlikely(!cfqd))
		return;

	q = cfqd->queue;

	spin_lock_irqsave(q->queue_lock, flags);

	if (sync_cfqq) {
		/*
		 * Drop reference to sync queue. A new sync queue will be
		 * assigned in new group upon arrival of a fresh request.
		 */
		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
		cic_set_cfqq(cic, NULL, 1);
		cfq_put_queue(sync_cfqq);
	}

	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void cfq_ioc_set_cgroup(struct io_context *ioc)
{
	call_for_each_cic(ioc, changed_cgroup);
	ioc->cgroup_changed = 0;
}
#endif  /* CONFIG_CFQ_GROUP_IOSCHED */

static struct cfq_queue *
cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
		     struct io_context *ioc, gfp_t gfp_mask)
{
	struct cfq_queue *cfqq, *new_cfqq = NULL;
	struct cfq_io_context *cic;
	struct cfq_group *cfqg;

retry:
	cfqg = cfq_get_cfqg(cfqd, 1);
	cic = cfq_cic_lookup(cfqd, ioc);
	/* cic always exists here */
	cfqq = cic_to_cfqq(cic, is_sync);

	/*
	 * Always try a new alloc if we fell back to the OOM cfqq
	 * originally, since it should just be a temporary situation.
	 */
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
		cfqq = NULL;
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
			spin_unlock_irq(cfqd->queue->queue_lock);
			new_cfqq = kmem_cache_alloc_node(cfq_pool,
					gfp_mask | __GFP_ZERO,
					cfqd->queue->node);
			spin_lock_irq(cfqd->queue->queue_lock);
			if (new_cfqq)
				goto retry;
		} else {
			cfqq = kmem_cache_alloc_node(cfq_pool,
					gfp_mask | __GFP_ZERO,
					cfqd->queue->node);
		}

		if (cfqq) {
			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
			cfq_init_prio_data(cfqq, ioc);
			cfq_link_cfqq_cfqg(cfqq, cfqg);
			cfq_log_cfqq(cfqd, cfqq, "alloced");
		} else
			cfqq = &cfqd->oom_cfqq;
	}

	if (new_cfqq)
		kmem_cache_free(cfq_pool, new_cfqq);

	return cfqq;
}

static struct cfq_queue **
cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &cfqd->async_cfqq[0][ioprio];
	case IOPRIO_CLASS_BE:
		return &cfqd->async_cfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &cfqd->async_idle_cfqq;
	default:
		BUG();
	}
}

static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
	      gfp_t gfp_mask)
{
	const int ioprio = task_ioprio(ioc);
	const int ioprio_class = task_ioprio_class(ioc);
	struct cfq_queue **async_cfqq = NULL;
	struct cfq_queue *cfqq = NULL;

	if (!is_sync) {
		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
		cfqq = *async_cfqq;
	}

	if (!cfqq)
		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);

	/*
	 * pin the queue now that it's allocated, scheduler exit will prune it
	 */
	if (!is_sync && !(*async_cfqq)) {
		atomic_inc(&cfqq->ref);
		*async_cfqq = cfqq;
	}

	atomic_inc(&cfqq->ref);
	return cfqq;
}

/*
 * We drop cfq io contexts lazily, so we may find a dead one.
 */
static void
cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
		  struct cfq_io_context *cic)
{
	unsigned long flags;

	WARN_ON(!list_empty(&cic->queue_list));
	BUG_ON(cic->key != cfqd_dead_key(cfqd));

	spin_lock_irqsave(&ioc->lock, flags);

	BUG_ON(ioc->ioc_data == cic);

	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
	hlist_del_rcu(&cic->cic_list);
	spin_unlock_irqrestore(&ioc->lock, flags);

	cfq_cic_free(cic);
}

static struct cfq_io_context *
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct cfq_io_context *cic;
	unsigned long flags;

	if (unlikely(!ioc))
		return NULL;

	rcu_read_lock();

	/*
	 * we maintain a last-hit cache, to avoid browsing over the tree
	 */
	cic = rcu_dereference(ioc->ioc_data);
	if (cic && cic->key == cfqd) {
		rcu_read_unlock();
		return cic;
	}

	do {
		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
		rcu_read_unlock();
		if (!cic)
			break;
		if (unlikely(cic->key != cfqd)) {
			cfq_drop_dead_cic(cfqd, ioc, cic);
			rcu_read_lock();
			continue;
2955 } 2964 }
2956 2965
2957 spin_lock_irqsave(&ioc->lock, flags); 2966 spin_lock_irqsave(&ioc->lock, flags);
2958 rcu_assign_pointer(ioc->ioc_data, cic); 2967 rcu_assign_pointer(ioc->ioc_data, cic);
2959 spin_unlock_irqrestore(&ioc->lock, flags); 2968 spin_unlock_irqrestore(&ioc->lock, flags);
2960 break; 2969 break;
2961 } while (1); 2970 } while (1);
2962 2971
2963 return cic; 2972 return cic;
2964 } 2973 }
2965 2974
2966 /* 2975 /*
2967 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 2976 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
2968 * the process specific cfq io context when entered from the block layer. 2977 * the process specific cfq io context when entered from the block layer.
2969 * Also adds the cic to a per-cfqd list, used when this queue is removed. 2978 * Also adds the cic to a per-cfqd list, used when this queue is removed.
2970 */ 2979 */
2971 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 2980 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
2972 struct cfq_io_context *cic, gfp_t gfp_mask) 2981 struct cfq_io_context *cic, gfp_t gfp_mask)
2973 { 2982 {
2974 unsigned long flags; 2983 unsigned long flags;
2975 int ret; 2984 int ret;
2976 2985
2977 ret = radix_tree_preload(gfp_mask); 2986 ret = radix_tree_preload(gfp_mask);
2978 if (!ret) { 2987 if (!ret) {
2979 cic->ioc = ioc; 2988 cic->ioc = ioc;
2980 cic->key = cfqd; 2989 cic->key = cfqd;
2981 2990
2982 spin_lock_irqsave(&ioc->lock, flags); 2991 spin_lock_irqsave(&ioc->lock, flags);
2983 ret = radix_tree_insert(&ioc->radix_root, 2992 ret = radix_tree_insert(&ioc->radix_root,
2984 cfqd->cic_index, cic); 2993 cfqd->cic_index, cic);
2985 if (!ret) 2994 if (!ret)
2986 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 2995 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
2987 spin_unlock_irqrestore(&ioc->lock, flags); 2996 spin_unlock_irqrestore(&ioc->lock, flags);
2988 2997
2989 radix_tree_preload_end(); 2998 radix_tree_preload_end();
2990 2999
2991 if (!ret) { 3000 if (!ret) {
2992 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3001 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2993 list_add(&cic->queue_list, &cfqd->cic_list); 3002 list_add(&cic->queue_list, &cfqd->cic_list);
2994 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3003 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2995 } 3004 }
2996 } 3005 }
2997 3006
2998 if (ret) 3007 if (ret)
2999 printk(KERN_ERR "cfq: cic link failed!\n"); 3008 printk(KERN_ERR "cfq: cic link failed!\n");
3000 3009
3001 return ret; 3010 return ret;
3002 } 3011 }
3003 3012
3004 /* 3013 /*
3005 * Setup general io context and cfq io context. There can be several cfq 3014 * Setup general io context and cfq io context. There can be several cfq
3006 * io contexts per general io context, if this process is doing io to more 3015 * io contexts per general io context, if this process is doing io to more
3007 * than one device managed by cfq. 3016 * than one device managed by cfq.
3008 */ 3017 */
3009 static struct cfq_io_context * 3018 static struct cfq_io_context *
3010 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 3019 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3011 { 3020 {
3012 struct io_context *ioc = NULL; 3021 struct io_context *ioc = NULL;
3013 struct cfq_io_context *cic; 3022 struct cfq_io_context *cic;
3014 3023
3015 might_sleep_if(gfp_mask & __GFP_WAIT); 3024 might_sleep_if(gfp_mask & __GFP_WAIT);
3016 3025
3017 ioc = get_io_context(gfp_mask, cfqd->queue->node); 3026 ioc = get_io_context(gfp_mask, cfqd->queue->node);
3018 if (!ioc) 3027 if (!ioc)
3019 return NULL; 3028 return NULL;
3020 3029
3021 cic = cfq_cic_lookup(cfqd, ioc); 3030 cic = cfq_cic_lookup(cfqd, ioc);
3022 if (cic) 3031 if (cic)
3023 goto out; 3032 goto out;
3024 3033
3025 cic = cfq_alloc_io_context(cfqd, gfp_mask); 3034 cic = cfq_alloc_io_context(cfqd, gfp_mask);
3026 if (cic == NULL) 3035 if (cic == NULL)
3027 goto err; 3036 goto err;
3028 3037
3029 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 3038 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
3030 goto err_free; 3039 goto err_free;
3031 3040
3032 out: 3041 out:
3033 smp_read_barrier_depends(); 3042 smp_read_barrier_depends();
3034 if (unlikely(ioc->ioprio_changed)) 3043 if (unlikely(ioc->ioprio_changed))
3035 cfq_ioc_set_ioprio(ioc); 3044 cfq_ioc_set_ioprio(ioc);
3036 3045
3037 #ifdef CONFIG_CFQ_GROUP_IOSCHED 3046 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3038 if (unlikely(ioc->cgroup_changed)) 3047 if (unlikely(ioc->cgroup_changed))
3039 cfq_ioc_set_cgroup(ioc); 3048 cfq_ioc_set_cgroup(ioc);
3040 #endif 3049 #endif
3041 return cic; 3050 return cic;
3042 err_free: 3051 err_free:
3043 cfq_cic_free(cic); 3052 cfq_cic_free(cic);
3044 err: 3053 err:
3045 put_io_context(ioc); 3054 put_io_context(ioc);
3046 return NULL; 3055 return NULL;
3047 } 3056 }
3048 3057
3049 static void 3058 static void
3050 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3059 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
3051 { 3060 {
3052 unsigned long elapsed = jiffies - cic->last_end_request; 3061 unsigned long elapsed = jiffies - cic->last_end_request;
3053 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3062 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3054 3063
3055 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3064 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
3056 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3065 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
3057 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3066 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
3058 } 3067 }
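
For reference, the think-time tracking above is a fixed-point exponentially weighted moving average: both the sample count and the total are kept scaled by 256 and decayed by 7/8 on every completion, so ttime_mean tracks the recent per-request think time without floating point. The following user-space sketch (hypothetical names, plain unsigned long arithmetic, no kernel types) shows the same arithmetic in isolation:

	#include <stdio.h>

	/* Standalone model of the fixed-point EWMA used for think times. */
	struct ttime_model {
		unsigned long samples;	/* scaled by 256, decays by 7/8 */
		unsigned long total;	/* scaled by 256, decays by 7/8 */
		unsigned long mean;
	};

	static void ttime_update(struct ttime_model *t, unsigned long ttime)
	{
		t->samples = (7 * t->samples + 256) / 8;
		t->total   = (7 * t->total + 256 * ttime) / 8;
		t->mean    = (t->total + 128) / t->samples;	/* rounded division */
	}

	int main(void)
	{
		struct ttime_model t = { 0, 0, 0 };
		/* Feed a few think-time samples (in ticks) and watch the mean adapt. */
		unsigned long input[] = { 2, 2, 2, 10, 10, 10 };
		unsigned long i;

		for (i = 0; i < sizeof(input) / sizeof(input[0]); i++) {
			ttime_update(&t, input[i]);
			printf("sample=%lu mean=%lu\n", input[i], t.mean);
		}
		return 0;
	}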

static void
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct request *rq)
{
	sector_t sdist = 0;
	sector_t n_sec = blk_rq_sectors(rq);
	if (cfqq->last_request_pos) {
		if (cfqq->last_request_pos < blk_rq_pos(rq))
			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
		else
			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
	}

	cfqq->seek_history <<= 1;
	if (blk_queue_nonrot(cfqd->queue))
		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
	else
		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
}
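
The seek_history field updated above is a shift register: each request shifts in one bit recording whether it looked "seeky" (a long seek distance on rotational storage, or a small request on non-rotational storage). The CFQQ_SEEKY() test used elsewhere in the file counts the set bits against a threshold; neither the threshold nor CFQQ_SEEK_THR appears in this hunk, so the sketch below uses made-up constants and a simple popcount test purely for illustration:

	#include <stdint.h>
	#include <stdio.h>

	#define SEEK_THR	1024U	/* assumed seek-distance threshold, in sectors */
	#define SEEKY_BITS	8	/* assumed bit-count threshold, not the kernel's value */

	/* Record one request in a 32-sample history of "was this request seeky?". */
	static uint32_t seek_history_update(uint32_t history, uint32_t seek_distance)
	{
		history <<= 1;
		history |= (seek_distance > SEEK_THR);
		return history;
	}

	/* Classify the queue as seeky if enough recent requests were seeky. */
	static int queue_is_seeky(uint32_t history)
	{
		return __builtin_popcount(history) > SEEKY_BITS;
	}

	int main(void)
	{
		uint32_t history = 0;
		uint32_t distances[] = { 8, 16, 5000, 9000, 12, 7000, 6500, 8000,
					 9500, 10000, 3, 11000 };
		unsigned long i;

		for (i = 0; i < sizeof(distances) / sizeof(distances[0]); i++)
			history = seek_history_update(history, distances[i]);

		printf("history=0x%08x seeky=%d\n", history, queue_is_seeky(history));
		return 0;
	}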

/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter
 */
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct cfq_io_context *cic)
{
	int old_idle, enable_idle;

	/*
	 * Don't idle for async or idle io prio class
	 */
	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
		return;

	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);

	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
		cfq_mark_cfqq_deep(cfqq);

	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
		enable_idle = 0;
	else if (sample_valid(cic->ttime_samples)) {
		if (cic->ttime_mean > cfqd->cfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}

	if (old_idle != enable_idle) {
		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
		if (enable_idle)
			cfq_mark_cfqq_idle_window(cfqq);
		else
			cfq_clear_cfqq_idle_window(cfqq);
	}
}

/*
 * Check if new_cfqq should preempt the currently active queue. Return 0 for
 * no or if we aren't sure, a 1 will cause a preempt.
 */
static bool
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
		   struct request *rq)
{
	struct cfq_queue *cfqq;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		return false;

	if (cfq_class_idle(new_cfqq))
		return false;

	if (cfq_class_idle(cfqq))
		return true;

	/*
	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
	 */
	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
		return false;

	/*
	 * if the new request is sync, but the currently running queue is
	 * not, let the sync request have priority.
	 */
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
		return true;

	if (new_cfqq->cfqg != cfqq->cfqg)
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* Allow preemption only if we are idling on sync-noidle tree */
	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
	    new_cfqq->service_tree->count == 2 &&
	    RB_EMPTY_ROOT(&cfqq->sort_list))
		return true;

	/*
	 * So both queues are sync. Let the new request get disk time if
	 * it's a metadata request and the current queue is doing regular IO.
	 */
	if (rq_is_meta(rq) && !cfqq->meta_pending)
		return true;

	/*
	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
	 */
	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
		return true;

	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
		return false;

	/*
	 * if this request is as-good as one we would expect from the
	 * current cfqq, let it preempt
	 */
	if (cfq_rq_close(cfqd, cfqq, rq))
		return true;

	return false;
}

/*
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "preempt");
	cfq_slice_expired(cfqd, 1);

	/*
	 * Put the new queue at the front of the of the current list,
	 * so we know that it will be selected next.
	 */
	BUG_ON(!cfq_cfqq_on_rr(cfqq));

	cfq_service_tree_add(cfqd, cfqq, 1);

	cfqq->slice_end = 0;
	cfq_mark_cfqq_slice_new(cfqq);
}

/*
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
 * something we should do about it
 */
static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
{
	struct cfq_io_context *cic = RQ_CIC(rq);

	cfqd->rq_queued++;
	if (rq_is_meta(rq))
		cfqq->meta_pending++;

	cfq_update_io_thinktime(cfqd, cic);
	cfq_update_io_seektime(cfqd, cfqq, rq);
	cfq_update_idle_window(cfqd, cfqq, cic);

	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (cfqq == cfqd->active_queue) {
		/*
		 * Remember that we saw a request from this process, but
		 * don't start queuing just yet. Otherwise we risk seeing lots
		 * of tiny requests, because we disrupt the normal plugging
		 * and merging. If the request is already larger than a single
		 * page, let it rip immediately. For that case we assume that
		 * merging is already done. Ditto for a busy system that
		 * has other work pending, don't risk delaying until the
		 * idle timer unplug to continue working.
		 */
		if (cfq_cfqq_wait_request(cfqq)) {
			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
			    cfqd->busy_queues > 1) {
				cfq_del_timer(cfqd, cfqq);
				cfq_clear_cfqq_wait_request(cfqq);
				__blk_run_queue(cfqd->queue);
			} else {
				blkiocg_update_idle_time_stats(
						&cfqq->cfqg->blkg);
				cfq_mark_cfqq_must_dispatch(cfqq);
			}
		}
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired it's mean thinktime or this new queue
		 * has some old slice time left and is of higher priority or
		 * this new queue is RT and the current one is BE
		 */
		cfq_preempt_queue(cfqd, cfqq);
		__blk_run_queue(cfqd->queue);
	}
}

static void cfq_insert_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "insert_request");
	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);

	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
	list_add_tail(&rq->queuelist, &cfqq->fifo);
	cfq_add_rq_rb(rq);
	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
			&cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
	cfq_rq_enqueued(cfqd, cfqq, rq);
}

/*
 * Update hw_tag based on peak queue depth over 50 samples under
 * sufficient load.
 */
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;

	if (cfqd->hw_tag == 1)
		return;

	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
		return;

	/*
	 * If active queue hasn't enough requests and can idle, cfq might not
	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
	 * case
	 */
	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
		return;

	if (cfqd->hw_tag_samples++ < 50)
		return;

	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
		cfqd->hw_tag = 1;
	else
		cfqd->hw_tag = 0;
}
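
The hw_tag estimation above is a one-shot heuristic: track the peak number of requests observed in the driver, and after roughly 50 samples taken under sufficient load decide whether the device appears to queue multiple commands. A stripped-down, user-space model of the same idea (hypothetical names and thresholds, and without the idle-window exception) could look like this:

	#include <stdio.h>

	#define HW_QUEUE_MIN	5	/* same spirit as CFQ_HW_QUEUE_MIN; value assumed */
	#define HW_TAG_SAMPLES	50

	struct hw_tag_est {
		int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing */
		int est_depth;		/* peak requests observed in flight */
		int samples;
	};

	/* Feed one observation of how many requests are currently in the driver. */
	static void hw_tag_sample(struct hw_tag_est *e, int in_driver)
	{
		if (in_driver > e->est_depth)
			e->est_depth = in_driver;

		if (e->hw_tag == 1)
			return;			/* already decided: device queues */

		if (in_driver <= HW_QUEUE_MIN)
			return;			/* not enough load to judge */

		if (e->samples++ < HW_TAG_SAMPLES)
			return;			/* keep collecting */

		e->hw_tag = (e->est_depth >= HW_QUEUE_MIN);
	}

	int main(void)
	{
		struct hw_tag_est e = { -1, 0, 0 };
		int i;

		for (i = 0; i < 100; i++)
			hw_tag_sample(&e, 8);	/* pretend 8 requests stay in flight */

		printf("hw_tag=%d est_depth=%d\n", e.hw_tag, e.est_depth);
		return 0;
	}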

static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct cfq_io_context *cic = cfqd->active_cic;

	/* If there are other queues in the group, don't wait */
	if (cfqq->cfqg->nr_cfqq > 1)
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* if slice left is less than think time, wait busy */
	if (cic && sample_valid(cic->ttime_samples)
	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
		return true;

	/*
	 * If think times is less than a jiffy than ttime_mean=0 and above
	 * will not be true. It might happen that slice has not expired yet
	 * but will expire soon (4-5 ns) during select_queue(). To cover the
	 * case where think time is less than a jiffy, mark the queue wait
	 * busy if only 1 jiffy is left in the slice.
	 */
	if (cfqq->slice_end - jiffies == 1)
		return true;

	return false;
}
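
As the comment notes, a think time below one jiffy rounds ttime_mean down to zero, so the "slice left is less than think time" test can never fire; the extra one-jiffy check closes that gap. A small user-space model of just those two paths (hypothetical names, tick values passed in explicitly, group and slice-used checks omitted) illustrates the decision:

	#include <stdio.h>

	/*
	 * Wait busy if the remaining slice is smaller than the mean think time,
	 * or if exactly one tick remains and the mean rounded down to zero.
	 */
	static int should_wait_busy(unsigned long now, unsigned long slice_end,
				    unsigned long ttime_mean)
	{
		if (slice_end - now < ttime_mean)
			return 1;
		if (slice_end - now == 1)	/* covers ttime_mean == 0 */
			return 1;
		return 0;
	}

	int main(void)
	{
		/* 3 ticks left, mean think time 5 ticks: wait busy. */
		printf("%d\n", should_wait_busy(100, 103, 5));
		/* 1 tick left, sub-tick think time (mean rounds to 0): still wait. */
		printf("%d\n", should_wait_busy(100, 101, 0));
		/* 4 ticks left, mean 2 ticks: no need to wait. */
		printf("%d\n", should_wait_busy(100, 104, 2));
		return 0;
	}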
3350 3359
3351 static void cfq_completed_request(struct request_queue *q, struct request *rq) 3360 static void cfq_completed_request(struct request_queue *q, struct request *rq)
3352 { 3361 {
3353 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3362 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3354 struct cfq_data *cfqd = cfqq->cfqd; 3363 struct cfq_data *cfqd = cfqq->cfqd;
3355 const int sync = rq_is_sync(rq); 3364 const int sync = rq_is_sync(rq);
3356 unsigned long now; 3365 unsigned long now;
3357 3366
3358 now = jiffies; 3367 now = jiffies;
3359 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); 3368 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
3360 3369
3361 cfq_update_hw_tag(cfqd); 3370 cfq_update_hw_tag(cfqd);
3362 3371
3363 WARN_ON(!cfqd->rq_in_driver); 3372 WARN_ON(!cfqd->rq_in_driver);
3364 WARN_ON(!cfqq->dispatched); 3373 WARN_ON(!cfqq->dispatched);
3365 cfqd->rq_in_driver--; 3374 cfqd->rq_in_driver--;
3366 cfqq->dispatched--; 3375 cfqq->dispatched--;
3367 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), 3376 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
3368 rq_io_start_time_ns(rq), rq_data_dir(rq), 3377 rq_io_start_time_ns(rq), rq_data_dir(rq),
3369 rq_is_sync(rq)); 3378 rq_is_sync(rq));
3370 3379
3371 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3380 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3372 3381
3373 if (sync) { 3382 if (sync) {
3374 RQ_CIC(rq)->last_end_request = now; 3383 RQ_CIC(rq)->last_end_request = now;
3375 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3384 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3376 cfqd->last_delayed_sync = now; 3385 cfqd->last_delayed_sync = now;
3377 } 3386 }
3378 3387
3379 /* 3388 /*
3380 * If this is the active queue, check if it needs to be expired, 3389 * If this is the active queue, check if it needs to be expired,
3381 * or if we want to idle in case it has no pending requests. 3390 * or if we want to idle in case it has no pending requests.
3382 */ 3391 */
3383 if (cfqd->active_queue == cfqq) { 3392 if (cfqd->active_queue == cfqq) {
3384 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3393 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3385 3394
3386 if (cfq_cfqq_slice_new(cfqq)) { 3395 if (cfq_cfqq_slice_new(cfqq)) {
3387 cfq_set_prio_slice(cfqd, cfqq); 3396 cfq_set_prio_slice(cfqd, cfqq);
3388 cfq_clear_cfqq_slice_new(cfqq); 3397 cfq_clear_cfqq_slice_new(cfqq);
3389 } 3398 }
3390 3399
3391 /* 3400 /*
3392 * Should we wait for next request to come in before we expire 3401 * Should we wait for next request to come in before we expire
3393 * the queue. 3402 * the queue.
3394 */ 3403 */
3395 if (cfq_should_wait_busy(cfqd, cfqq)) { 3404 if (cfq_should_wait_busy(cfqd, cfqq)) {
3396 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3405 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3397 cfq_mark_cfqq_wait_busy(cfqq); 3406 cfq_mark_cfqq_wait_busy(cfqq);
3398 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3407 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3399 } 3408 }
3400 3409
3401 /* 3410 /*
3402 * Idling is not enabled on: 3411 * Idling is not enabled on:
3403 * - expired queues 3412 * - expired queues
3404 * - idle-priority queues 3413 * - idle-priority queues
3405 * - async queues 3414 * - async queues
3406 * - queues with still some requests queued 3415 * - queues with still some requests queued
3407 * - when there is a close cooperator 3416 * - when there is a close cooperator
3408 */ 3417 */
3409 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3418 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3410 cfq_slice_expired(cfqd, 1); 3419 cfq_slice_expired(cfqd, 1);
3411 else if (sync && cfqq_empty && 3420 else if (sync && cfqq_empty &&
3412 !cfq_close_cooperator(cfqd, cfqq)) { 3421 !cfq_close_cooperator(cfqd, cfqq)) {
3413 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); 3422 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3414 /* 3423 /*
3415 * Idling is enabled for SYNC_WORKLOAD. 3424 * Idling is enabled for SYNC_WORKLOAD.
3416 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3425 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3417 * only if we processed at least one !rq_noidle request 3426 * only if we processed at least one !rq_noidle request
3418 */ 3427 */
3419 if (cfqd->serving_type == SYNC_WORKLOAD 3428 if (cfqd->serving_type == SYNC_WORKLOAD
3420 || cfqd->noidle_tree_requires_idle 3429 || cfqd->noidle_tree_requires_idle
3421 || cfqq->cfqg->nr_cfqq == 1) 3430 || cfqq->cfqg->nr_cfqq == 1)
3422 cfq_arm_slice_timer(cfqd); 3431 cfq_arm_slice_timer(cfqd);
3423 } 3432 }
3424 } 3433 }
3425 3434
3426 if (!cfqd->rq_in_driver) 3435 if (!cfqd->rq_in_driver)
3427 cfq_schedule_dispatch(cfqd); 3436 cfq_schedule_dispatch(cfqd);
3428 } 3437 }
3429 3438
3430 /* 3439 /*
3431 * we temporarily boost lower priority queues if they are holding fs exclusive 3440 * we temporarily boost lower priority queues if they are holding fs exclusive
3432 * resources. they are boosted to normal prio (CLASS_BE/4) 3441 * resources. they are boosted to normal prio (CLASS_BE/4)
3433 */ 3442 */
3434 static void cfq_prio_boost(struct cfq_queue *cfqq) 3443 static void cfq_prio_boost(struct cfq_queue *cfqq)
3435 { 3444 {
3436 if (has_fs_excl()) { 3445 if (has_fs_excl()) {
3437 /* 3446 /*
3438 * boost idle prio on transactions that would lock out other 3447 * boost idle prio on transactions that would lock out other
3439 * users of the filesystem 3448 * users of the filesystem
3440 */ 3449 */
3441 if (cfq_class_idle(cfqq)) 3450 if (cfq_class_idle(cfqq))
3442 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3451 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3443 if (cfqq->ioprio > IOPRIO_NORM) 3452 if (cfqq->ioprio > IOPRIO_NORM)
3444 cfqq->ioprio = IOPRIO_NORM; 3453 cfqq->ioprio = IOPRIO_NORM;
3445 } else { 3454 } else {
3446 /* 3455 /*
3447 * unboost the queue (if needed) 3456 * unboost the queue (if needed)
3448 */ 3457 */
3449 cfqq->ioprio_class = cfqq->org_ioprio_class; 3458 cfqq->ioprio_class = cfqq->org_ioprio_class;
3450 cfqq->ioprio = cfqq->org_ioprio; 3459 cfqq->ioprio = cfqq->org_ioprio;
3451 } 3460 }
3452 } 3461 }
3453 3462
3454 static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3463 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3455 { 3464 {
3456 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3465 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3457 cfq_mark_cfqq_must_alloc_slice(cfqq); 3466 cfq_mark_cfqq_must_alloc_slice(cfqq);
3458 return ELV_MQUEUE_MUST; 3467 return ELV_MQUEUE_MUST;
3459 } 3468 }
3460 3469
3461 return ELV_MQUEUE_MAY; 3470 return ELV_MQUEUE_MAY;
3462 } 3471 }
3463 3472
3464 static int cfq_may_queue(struct request_queue *q, int rw) 3473 static int cfq_may_queue(struct request_queue *q, int rw)
3465 { 3474 {
3466 struct cfq_data *cfqd = q->elevator->elevator_data; 3475 struct cfq_data *cfqd = q->elevator->elevator_data;
3467 struct task_struct *tsk = current; 3476 struct task_struct *tsk = current;
3468 struct cfq_io_context *cic; 3477 struct cfq_io_context *cic;
3469 struct cfq_queue *cfqq; 3478 struct cfq_queue *cfqq;
3470 3479
3471 /* 3480 /*
3472 * don't force setup of a queue from here, as a call to may_queue 3481 * don't force setup of a queue from here, as a call to may_queue
3473 * does not necessarily imply that a request actually will be queued. 3482 * does not necessarily imply that a request actually will be queued.
3474 * so just lookup a possibly existing queue, or return 'may queue' 3483 * so just lookup a possibly existing queue, or return 'may queue'
3475 * if that fails 3484 * if that fails
3476 */ 3485 */
3477 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3486 cic = cfq_cic_lookup(cfqd, tsk->io_context);
3478 if (!cic) 3487 if (!cic)
3479 return ELV_MQUEUE_MAY; 3488 return ELV_MQUEUE_MAY;
3480 3489
3481 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3490 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3482 if (cfqq) { 3491 if (cfqq) {
3483 cfq_init_prio_data(cfqq, cic->ioc); 3492 cfq_init_prio_data(cfqq, cic->ioc);
3484 cfq_prio_boost(cfqq); 3493 cfq_prio_boost(cfqq);
3485 3494
3486 return __cfq_may_queue(cfqq); 3495 return __cfq_may_queue(cfqq);
3487 } 3496 }
3488 3497
3489 return ELV_MQUEUE_MAY; 3498 return ELV_MQUEUE_MAY;
3490 } 3499 }
3491 3500
3492 /* 3501 /*
3493 * queue lock held here 3502 * queue lock held here
3494 */ 3503 */
3495 static void cfq_put_request(struct request *rq) 3504 static void cfq_put_request(struct request *rq)
3496 { 3505 {
3497 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3506 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3498 3507
3499 if (cfqq) { 3508 if (cfqq) {
3500 const int rw = rq_data_dir(rq); 3509 const int rw = rq_data_dir(rq);
3501 3510
3502 BUG_ON(!cfqq->allocated[rw]); 3511 BUG_ON(!cfqq->allocated[rw]);
3503 cfqq->allocated[rw]--; 3512 cfqq->allocated[rw]--;
3504 3513
3505 put_io_context(RQ_CIC(rq)->ioc); 3514 put_io_context(RQ_CIC(rq)->ioc);
3506 3515
3507 rq->elevator_private = NULL; 3516 rq->elevator_private = NULL;
3508 rq->elevator_private2 = NULL; 3517 rq->elevator_private2 = NULL;
3509 3518
3510 /* Put down rq reference on cfqg */ 3519 /* Put down rq reference on cfqg */
3511 cfq_put_cfqg(RQ_CFQG(rq)); 3520 cfq_put_cfqg(RQ_CFQG(rq));
3512 rq->elevator_private3 = NULL; 3521 rq->elevator_private3 = NULL;
3513 3522
3514 cfq_put_queue(cfqq); 3523 cfq_put_queue(cfqq);
3515 } 3524 }
3516 } 3525 }
3517 3526
3518 static struct cfq_queue * 3527 static struct cfq_queue *
3519 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3528 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3520 struct cfq_queue *cfqq) 3529 struct cfq_queue *cfqq)
3521 { 3530 {
3522 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3531 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3523 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3532 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3524 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3533 cfq_mark_cfqq_coop(cfqq->new_cfqq);
3525 cfq_put_queue(cfqq); 3534 cfq_put_queue(cfqq);
3526 return cic_to_cfqq(cic, 1); 3535 return cic_to_cfqq(cic, 1);
3527 } 3536 }
3528 3537
3529 /* 3538 /*
3530 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3539 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3531 * was the last process referring to said cfqq. 3540 * was the last process referring to said cfqq.
3532 */ 3541 */
3533 static struct cfq_queue * 3542 static struct cfq_queue *
3534 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3543 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3535 { 3544 {
3536 if (cfqq_process_refs(cfqq) == 1) { 3545 if (cfqq_process_refs(cfqq) == 1) {
3537 cfqq->pid = current->pid; 3546 cfqq->pid = current->pid;
3538 cfq_clear_cfqq_coop(cfqq); 3547 cfq_clear_cfqq_coop(cfqq);
3539 cfq_clear_cfqq_split_coop(cfqq); 3548 cfq_clear_cfqq_split_coop(cfqq);
3540 return cfqq; 3549 return cfqq;
3541 } 3550 }
3542 3551
3543 cic_set_cfqq(cic, NULL, 1); 3552 cic_set_cfqq(cic, NULL, 1);
3544 3553
3545 cfq_put_cooperator(cfqq); 3554 cfq_put_cooperator(cfqq);
3546 3555
3547 cfq_put_queue(cfqq); 3556 cfq_put_queue(cfqq);
3548 return NULL; 3557 return NULL;
3549 } 3558 }
3550 /* 3559 /*
3551 * Allocate cfq data structures associated with this request. 3560 * Allocate cfq data structures associated with this request.
3552 */ 3561 */
3553 static int 3562 static int
3554 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3563 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3555 { 3564 {
3556 struct cfq_data *cfqd = q->elevator->elevator_data; 3565 struct cfq_data *cfqd = q->elevator->elevator_data;
3557 struct cfq_io_context *cic; 3566 struct cfq_io_context *cic;
3558 const int rw = rq_data_dir(rq); 3567 const int rw = rq_data_dir(rq);
3559 const bool is_sync = rq_is_sync(rq); 3568 const bool is_sync = rq_is_sync(rq);
3560 struct cfq_queue *cfqq; 3569 struct cfq_queue *cfqq;
3561 unsigned long flags; 3570 unsigned long flags;
3562 3571
3563 might_sleep_if(gfp_mask & __GFP_WAIT); 3572 might_sleep_if(gfp_mask & __GFP_WAIT);
3564 3573
3565 cic = cfq_get_io_context(cfqd, gfp_mask); 3574 cic = cfq_get_io_context(cfqd, gfp_mask);
3566 3575
3567 spin_lock_irqsave(q->queue_lock, flags); 3576 spin_lock_irqsave(q->queue_lock, flags);
3568 3577
3569 if (!cic) 3578 if (!cic)
3570 goto queue_fail; 3579 goto queue_fail;
3571 3580
3572 new_queue: 3581 new_queue:
3573 cfqq = cic_to_cfqq(cic, is_sync); 3582 cfqq = cic_to_cfqq(cic, is_sync);
3574 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3583 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3575 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3584 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3576 cic_set_cfqq(cic, cfqq, is_sync); 3585 cic_set_cfqq(cic, cfqq, is_sync);
3577 } else { 3586 } else {
3578 /* 3587 /*
3579 * If the queue was seeky for too long, break it apart. 3588 * If the queue was seeky for too long, break it apart.
3580 */ 3589 */
3581 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 3590 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3582 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3591 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3583 cfqq = split_cfqq(cic, cfqq); 3592 cfqq = split_cfqq(cic, cfqq);
3584 if (!cfqq) 3593 if (!cfqq)
3585 goto new_queue; 3594 goto new_queue;
3586 } 3595 }
3587 3596
3588 /* 3597 /*
3589 * Check to see if this queue is scheduled to merge with 3598 * Check to see if this queue is scheduled to merge with
3590 * another, closely cooperating queue. The merging of 3599 * another, closely cooperating queue. The merging of
3591 * queues happens here as it must be done in process context. 3600 * queues happens here as it must be done in process context.
3592 * The reference on new_cfqq was taken in merge_cfqqs. 3601 * The reference on new_cfqq was taken in merge_cfqqs.
3593 */ 3602 */
3594 if (cfqq->new_cfqq) 3603 if (cfqq->new_cfqq)
3595 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3604 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3596 } 3605 }
3597 3606
3598 cfqq->allocated[rw]++; 3607 cfqq->allocated[rw]++;
3599 atomic_inc(&cfqq->ref); 3608 atomic_inc(&cfqq->ref);
3600 3609
3601 spin_unlock_irqrestore(q->queue_lock, flags); 3610 spin_unlock_irqrestore(q->queue_lock, flags);
3602 3611
3603 rq->elevator_private = cic; 3612 rq->elevator_private = cic;
3604 rq->elevator_private2 = cfqq; 3613 rq->elevator_private2 = cfqq;
3605 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); 3614 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3606 return 0; 3615 return 0;
3607 3616
3608 queue_fail: 3617 queue_fail:
3609 if (cic) 3618 if (cic)
3610 put_io_context(cic->ioc); 3619 put_io_context(cic->ioc);
3611 3620
3612 cfq_schedule_dispatch(cfqd); 3621 cfq_schedule_dispatch(cfqd);
3613 spin_unlock_irqrestore(q->queue_lock, flags); 3622 spin_unlock_irqrestore(q->queue_lock, flags);
3614 cfq_log(cfqd, "set_request fail"); 3623 cfq_log(cfqd, "set_request fail");
3615 return 1; 3624 return 1;
3616 } 3625 }
3617 3626
3618 static void cfq_kick_queue(struct work_struct *work) 3627 static void cfq_kick_queue(struct work_struct *work)
3619 { 3628 {
3620 struct cfq_data *cfqd = 3629 struct cfq_data *cfqd =
3621 container_of(work, struct cfq_data, unplug_work); 3630 container_of(work, struct cfq_data, unplug_work);
3622 struct request_queue *q = cfqd->queue; 3631 struct request_queue *q = cfqd->queue;
3623 3632
3624 spin_lock_irq(q->queue_lock); 3633 spin_lock_irq(q->queue_lock);
3625 __blk_run_queue(cfqd->queue); 3634 __blk_run_queue(cfqd->queue);
3626 spin_unlock_irq(q->queue_lock); 3635 spin_unlock_irq(q->queue_lock);
3627 } 3636 }
3628 3637
3629 /* 3638 /*
3630 * Timer running if the active_queue is currently idling inside its time slice 3639 * Timer running if the active_queue is currently idling inside its time slice
3631 */ 3640 */
3632 static void cfq_idle_slice_timer(unsigned long data) 3641 static void cfq_idle_slice_timer(unsigned long data)
3633 { 3642 {
3634 struct cfq_data *cfqd = (struct cfq_data *) data; 3643 struct cfq_data *cfqd = (struct cfq_data *) data;
3635 struct cfq_queue *cfqq; 3644 struct cfq_queue *cfqq;
3636 unsigned long flags; 3645 unsigned long flags;
3637 int timed_out = 1; 3646 int timed_out = 1;
3638 3647
3639 cfq_log(cfqd, "idle timer fired"); 3648 cfq_log(cfqd, "idle timer fired");
3640 3649
3641 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3650 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3642 3651
3643 cfqq = cfqd->active_queue; 3652 cfqq = cfqd->active_queue;
3644 if (cfqq) { 3653 if (cfqq) {
3645 timed_out = 0; 3654 timed_out = 0;
3646 3655
3647 /* 3656 /*
3648 * We saw a request before the queue expired, let it through 3657 * We saw a request before the queue expired, let it through
3649 */ 3658 */
3650 if (cfq_cfqq_must_dispatch(cfqq)) 3659 if (cfq_cfqq_must_dispatch(cfqq))
3651 goto out_kick; 3660 goto out_kick;
3652 3661
3653 /* 3662 /*
3654 * expired 3663 * expired
3655 */ 3664 */
3656 if (cfq_slice_used(cfqq)) 3665 if (cfq_slice_used(cfqq))
3657 goto expire; 3666 goto expire;
3658 3667
3659 /* 3668 /*
3660 * only expire and reinvoke request handler, if there are 3669 * only expire and reinvoke request handler, if there are
3661 * other queues with pending requests 3670 * other queues with pending requests
3662 */ 3671 */
3663 if (!cfqd->busy_queues) 3672 if (!cfqd->busy_queues)
3664 goto out_cont; 3673 goto out_cont;
3665 3674
3666 /* 3675 /*
3667 * not expired and it has a request pending, let it dispatch 3676 * not expired and it has a request pending, let it dispatch
3668 */ 3677 */
3669 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3678 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3670 goto out_kick; 3679 goto out_kick;
3671 3680
3672 /* 3681 /*
3673 * Queue depth flag is reset only when the idle didn't succeed 3682 * Queue depth flag is reset only when the idle didn't succeed
3674 */ 3683 */
3675 cfq_clear_cfqq_deep(cfqq); 3684 cfq_clear_cfqq_deep(cfqq);
3676 } 3685 }
3677 expire: 3686 expire:
3678 cfq_slice_expired(cfqd, timed_out); 3687 cfq_slice_expired(cfqd, timed_out);
3679 out_kick: 3688 out_kick:
3680 cfq_schedule_dispatch(cfqd); 3689 cfq_schedule_dispatch(cfqd);
3681 out_cont: 3690 out_cont:
3682 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3691 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3683 } 3692 }
3684 3693
3685 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 3694 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3686 { 3695 {
3687 del_timer_sync(&cfqd->idle_slice_timer); 3696 del_timer_sync(&cfqd->idle_slice_timer);
3688 cancel_work_sync(&cfqd->unplug_work); 3697 cancel_work_sync(&cfqd->unplug_work);
3689 } 3698 }
3690 3699
3691 static void cfq_put_async_queues(struct cfq_data *cfqd) 3700 static void cfq_put_async_queues(struct cfq_data *cfqd)
3692 { 3701 {
3693 int i; 3702 int i;
3694 3703
3695 for (i = 0; i < IOPRIO_BE_NR; i++) { 3704 for (i = 0; i < IOPRIO_BE_NR; i++) {
3696 if (cfqd->async_cfqq[0][i]) 3705 if (cfqd->async_cfqq[0][i])
3697 cfq_put_queue(cfqd->async_cfqq[0][i]); 3706 cfq_put_queue(cfqd->async_cfqq[0][i]);
3698 if (cfqd->async_cfqq[1][i]) 3707 if (cfqd->async_cfqq[1][i])
3699 cfq_put_queue(cfqd->async_cfqq[1][i]); 3708 cfq_put_queue(cfqd->async_cfqq[1][i]);
3700 } 3709 }
3701 3710
3702 if (cfqd->async_idle_cfqq) 3711 if (cfqd->async_idle_cfqq)
3703 cfq_put_queue(cfqd->async_idle_cfqq); 3712 cfq_put_queue(cfqd->async_idle_cfqq);
3704 } 3713 }
3705 3714
3706 static void cfq_cfqd_free(struct rcu_head *head) 3715 static void cfq_cfqd_free(struct rcu_head *head)
3707 { 3716 {
3708 kfree(container_of(head, struct cfq_data, rcu)); 3717 kfree(container_of(head, struct cfq_data, rcu));
3709 } 3718 }
3710 3719
3711 static void cfq_exit_queue(struct elevator_queue *e) 3720 static void cfq_exit_queue(struct elevator_queue *e)
3712 { 3721 {
3713 struct cfq_data *cfqd = e->elevator_data; 3722 struct cfq_data *cfqd = e->elevator_data;
3714 struct request_queue *q = cfqd->queue; 3723 struct request_queue *q = cfqd->queue;
3715 3724
3716 cfq_shutdown_timer_wq(cfqd); 3725 cfq_shutdown_timer_wq(cfqd);
3717 3726
3718 spin_lock_irq(q->queue_lock); 3727 spin_lock_irq(q->queue_lock);
3719 3728
3720 if (cfqd->active_queue) 3729 if (cfqd->active_queue)
3721 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3730 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3722 3731
3723 while (!list_empty(&cfqd->cic_list)) { 3732 while (!list_empty(&cfqd->cic_list)) {
3724 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3733 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3725 struct cfq_io_context, 3734 struct cfq_io_context,
3726 queue_list); 3735 queue_list);
3727 3736
3728 __cfq_exit_single_io_context(cfqd, cic); 3737 __cfq_exit_single_io_context(cfqd, cic);
3729 } 3738 }
3730 3739
3731 cfq_put_async_queues(cfqd); 3740 cfq_put_async_queues(cfqd);
3732 cfq_release_cfq_groups(cfqd); 3741 cfq_release_cfq_groups(cfqd);
3733 blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3742 blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3734 3743
3735 spin_unlock_irq(q->queue_lock); 3744 spin_unlock_irq(q->queue_lock);
3736 3745
3737 cfq_shutdown_timer_wq(cfqd); 3746 cfq_shutdown_timer_wq(cfqd);
3738 3747
3739 spin_lock(&cic_index_lock); 3748 spin_lock(&cic_index_lock);
3740 ida_remove(&cic_index_ida, cfqd->cic_index); 3749 ida_remove(&cic_index_ida, cfqd->cic_index);
3741 spin_unlock(&cic_index_lock); 3750 spin_unlock(&cic_index_lock);
3742 3751
3743 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3752 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3744 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3753 call_rcu(&cfqd->rcu, cfq_cfqd_free);
3745 } 3754 }
3746 3755
3747 static int cfq_alloc_cic_index(void) 3756 static int cfq_alloc_cic_index(void)
3748 { 3757 {
3749 int index, error; 3758 int index, error;
3750 3759
3751 do { 3760 do {
3752 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3761 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3753 return -ENOMEM; 3762 return -ENOMEM;
3754 3763
3755 spin_lock(&cic_index_lock); 3764 spin_lock(&cic_index_lock);
3756 error = ida_get_new(&cic_index_ida, &index); 3765 error = ida_get_new(&cic_index_ida, &index);
3757 spin_unlock(&cic_index_lock); 3766 spin_unlock(&cic_index_lock);
3758 if (error && error != -EAGAIN) 3767 if (error && error != -EAGAIN)
3759 return error; 3768 return error;
3760 } while (error); 3769 } while (error);
3761 3770
3762 return index; 3771 return index;
3763 } 3772 }
3764 3773
3765 static void *cfq_init_queue(struct request_queue *q) 3774 static void *cfq_init_queue(struct request_queue *q)
3766 { 3775 {
3767 struct cfq_data *cfqd; 3776 struct cfq_data *cfqd;
3768 int i, j; 3777 int i, j;
3769 struct cfq_group *cfqg; 3778 struct cfq_group *cfqg;
3770 struct cfq_rb_root *st; 3779 struct cfq_rb_root *st;
3771 3780
3772 i = cfq_alloc_cic_index(); 3781 i = cfq_alloc_cic_index();
3773 if (i < 0) 3782 if (i < 0)
3774 return NULL; 3783 return NULL;
3775 3784
3776 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3785 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3777 if (!cfqd) 3786 if (!cfqd)
3778 return NULL; 3787 return NULL;
3779 3788
3780 cfqd->cic_index = i; 3789 cfqd->cic_index = i;
3781 3790
3782 /* Init root service tree */ 3791 /* Init root service tree */
3783 cfqd->grp_service_tree = CFQ_RB_ROOT; 3792 cfqd->grp_service_tree = CFQ_RB_ROOT;
3784 3793
3785 /* Init root group */ 3794 /* Init root group */
3786 cfqg = &cfqd->root_group; 3795 cfqg = &cfqd->root_group;
3787 for_each_cfqg_st(cfqg, i, j, st) 3796 for_each_cfqg_st(cfqg, i, j, st)
3788 *st = CFQ_RB_ROOT; 3797 *st = CFQ_RB_ROOT;
3789 RB_CLEAR_NODE(&cfqg->rb_node); 3798 RB_CLEAR_NODE(&cfqg->rb_node);
3790 3799
3791 /* Give preference to root group over other groups */ 3800 /* Give preference to root group over other groups */
3792 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3801 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3793 3802
3794 #ifdef CONFIG_CFQ_GROUP_IOSCHED 3803 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3795 /* 3804 /*
3796 * Take a reference to root group which we never drop. This is just 3805 * Take a reference to root group which we never drop. This is just
3797 * to make sure that cfq_put_cfqg() does not try to kfree root group 3806 * to make sure that cfq_put_cfqg() does not try to kfree root group
3798 */ 3807 */
3799 atomic_set(&cfqg->ref, 1); 3808 atomic_set(&cfqg->ref, 1);
3800 rcu_read_lock(); 3809 rcu_read_lock();
3801 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 3810 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3802 0); 3811 0);
3803 rcu_read_unlock(); 3812 rcu_read_unlock();
3804 #endif 3813 #endif
3805 /* 3814 /*
3806 * Not strictly needed (since RB_ROOT just clears the node and we 3815 * Not strictly needed (since RB_ROOT just clears the node and we
3807 * zeroed cfqd on alloc), but better be safe in case someone decides 3816 * zeroed cfqd on alloc), but better be safe in case someone decides
3808 * to add magic to the rb code 3817 * to add magic to the rb code
3809 */ 3818 */
3810 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3819 for (i = 0; i < CFQ_PRIO_LISTS; i++)
3811 cfqd->prio_trees[i] = RB_ROOT; 3820 cfqd->prio_trees[i] = RB_ROOT;
3812 3821
3813 /* 3822 /*
3814 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 3823 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3815 * Grab a permanent reference to it, so that the normal code flow 3824 * Grab a permanent reference to it, so that the normal code flow
3816 * will not attempt to free it. 3825 * will not attempt to free it.
3817 */ 3826 */
3818 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3827 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3819 atomic_inc(&cfqd->oom_cfqq.ref); 3828 atomic_inc(&cfqd->oom_cfqq.ref);
3820 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3829 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3821 3830
3822 INIT_LIST_HEAD(&cfqd->cic_list); 3831 INIT_LIST_HEAD(&cfqd->cic_list);
3823 3832
3824 cfqd->queue = q; 3833 cfqd->queue = q;
3825 3834
3826 init_timer(&cfqd->idle_slice_timer); 3835 init_timer(&cfqd->idle_slice_timer);
3827 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3836 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3828 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3837 cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3829 3838
3830 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3839 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3831 3840
3832 cfqd->cfq_quantum = cfq_quantum; 3841 cfqd->cfq_quantum = cfq_quantum;
3833 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3842 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3834 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3843 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3835 cfqd->cfq_back_max = cfq_back_max; 3844 cfqd->cfq_back_max = cfq_back_max;
3836 cfqd->cfq_back_penalty = cfq_back_penalty; 3845 cfqd->cfq_back_penalty = cfq_back_penalty;
3837 cfqd->cfq_slice[0] = cfq_slice_async; 3846 cfqd->cfq_slice[0] = cfq_slice_async;
3838 cfqd->cfq_slice[1] = cfq_slice_sync; 3847 cfqd->cfq_slice[1] = cfq_slice_sync;
3839 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3848 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3840 cfqd->cfq_slice_idle = cfq_slice_idle; 3849 cfqd->cfq_slice_idle = cfq_slice_idle;
3841 cfqd->cfq_latency = 1; 3850 cfqd->cfq_latency = 1;
3842 cfqd->cfq_group_isolation = 0; 3851 cfqd->cfq_group_isolation = 0;
3843 cfqd->hw_tag = -1; 3852 cfqd->hw_tag = -1;
3844 /* 3853 /*
3845 * we optimistically start assuming sync ops weren't delayed in last 3854 * we optimistically start assuming sync ops weren't delayed in last
3846 * second, in order to have larger depth for async operations. 3855 * second, in order to have larger depth for async operations.
3847 */ 3856 */
3848 cfqd->last_delayed_sync = jiffies - HZ; 3857 cfqd->last_delayed_sync = jiffies - HZ;
3849 return cfqd; 3858 return cfqd;
3850 } 3859 }
3851 3860
3852 static void cfq_slab_kill(void) 3861 static void cfq_slab_kill(void)
3853 { 3862 {
3854 /* 3863 /*
3855 * Caller already ensured that pending RCU callbacks are completed, 3864 * Caller already ensured that pending RCU callbacks are completed,
3856 * so we should have no busy allocations at this point. 3865 * so we should have no busy allocations at this point.
3857 */ 3866 */
3858 if (cfq_pool) 3867 if (cfq_pool)
3859 kmem_cache_destroy(cfq_pool); 3868 kmem_cache_destroy(cfq_pool);
3860 if (cfq_ioc_pool) 3869 if (cfq_ioc_pool)
3861 kmem_cache_destroy(cfq_ioc_pool); 3870 kmem_cache_destroy(cfq_ioc_pool);
3862 } 3871 }
3863 3872
3864 static int __init cfq_slab_setup(void) 3873 static int __init cfq_slab_setup(void)
3865 { 3874 {
3866 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3875 cfq_pool = KMEM_CACHE(cfq_queue, 0);
3867 if (!cfq_pool) 3876 if (!cfq_pool)
3868 goto fail; 3877 goto fail;
3869 3878
3870 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3879 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3871 if (!cfq_ioc_pool) 3880 if (!cfq_ioc_pool)
3872 goto fail; 3881 goto fail;
3873 3882
3874 return 0; 3883 return 0;
3875 fail: 3884 fail:
3876 cfq_slab_kill(); 3885 cfq_slab_kill();
3877 return -ENOMEM; 3886 return -ENOMEM;
3878 } 3887 }

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
	struct cfq_data *cfqd = e->elevator_data; \
	unsigned int __data = __VAR; \
	if (__CONV) \
		__data = jiffies_to_msecs(__data); \
	return cfq_var_show(__data, (page)); \
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
#undef SHOW_FUNCTION
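
Each SHOW_FUNCTION() instantiation above defines one sysfs read handler for a
tunable.  As an illustration, SHOW_FUNCTION(cfq_slice_idle_show,
cfqd->cfq_slice_idle, 1) expands to roughly:

	static ssize_t cfq_slice_idle_show(struct elevator_queue *e, char *page)
	{
		struct cfq_data *cfqd = e->elevator_data;
		unsigned int __data = cfqd->cfq_slice_idle;
		if (1)	/* __CONV: value is kept in jiffies, reported in msecs */
			__data = jiffies_to_msecs(__data);
		return cfq_var_show(__data, (page));
	}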

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
	struct cfq_data *cfqd = e->elevator_data; \
	unsigned int __data; \
	int ret = cfq_var_store(&__data, (page), count); \
	if (__data < (MIN)) \
		__data = (MIN); \
	else if (__data > (MAX)) \
		__data = (MAX); \
	if (__CONV) \
		*(__PTR) = msecs_to_jiffies(__data); \
	else \
		*(__PTR) = __data; \
	return ret; \
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
#undef STORE_FUNCTION
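
Correspondingly, each STORE_FUNCTION() instantiation defines the write handler:
the parsed value is clamped to [MIN, MAX] and, when __CONV is set, converted
from milliseconds to jiffies before being stored.  For example,
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1)
expands to roughly:

	static ssize_t cfq_slice_sync_store(struct elevator_queue *e, const char *page, size_t count)
	{
		struct cfq_data *cfqd = e->elevator_data;
		unsigned int __data;
		int ret = cfq_var_store(&__data, (page), count);
		if (__data < (1))
			__data = (1);
		else if (__data > (UINT_MAX))
			__data = (UINT_MAX);
		if (1)	/* __CONV: user writes msecs, value stored in jiffies */
			*(&cfqd->cfq_slice[1]) = msecs_to_jiffies(__data);
		else
			*(&cfqd->cfq_slice[1]) = __data;
		return ret;
	}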

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	CFQ_ATTR(low_latency),
	CFQ_ATTR(group_isolation),
	__ATTR_NULL
};
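
Each CFQ_ATTR(name) entry ties a tunable to the show/store pair generated above,
and the resulting files are exposed in the elevator's sysfs directory (typically
/sys/block/<dev>/queue/iosched/<name>).  Going through the generic __ATTR()
initializer, CFQ_ATTR(quantum) expands to roughly the following elv_fs_entry
initializer (a sketch, not part of this patch):

	{
		.attr	= { .name = "quantum", .mode = S_IRUGO | S_IWUSR },
		.show	= cfq_quantum_show,
		.store	= cfq_quantum_store,
	},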

static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn = cfq_merge,
		.elevator_merged_fn = cfq_merged_request,
		.elevator_merge_req_fn = cfq_merged_requests,
		.elevator_allow_merge_fn = cfq_allow_merge,
		.elevator_bio_merged_fn = cfq_bio_merged,
		.elevator_dispatch_fn = cfq_dispatch_requests,
		.elevator_add_req_fn = cfq_insert_request,
		.elevator_activate_req_fn = cfq_activate_request,
		.elevator_deactivate_req_fn = cfq_deactivate_request,
		.elevator_queue_empty_fn = cfq_queue_empty,
		.elevator_completed_req_fn = cfq_completed_request,
		.elevator_former_req_fn = elv_rb_former_request,
		.elevator_latter_req_fn = elv_rb_latter_request,
		.elevator_set_req_fn = cfq_set_request,
		.elevator_put_req_fn = cfq_put_request,
		.elevator_may_queue_fn = cfq_may_queue,
		.elevator_init_fn = cfq_init_queue,
		.elevator_exit_fn = cfq_exit_queue,
		.trim = cfq_free_io_context,
	},
	.elevator_attrs = cfq_attrs,
	.elevator_name = "cfq",
	.elevator_owner = THIS_MODULE,
};

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkio_policy_type blkio_policy_cfq = {
	.ops = {
		.blkio_unlink_group_fn = cfq_unlink_blkio_group,
		.blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
	},
};
#else
static struct blkio_policy_type blkio_policy_cfq;
#endif
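
When CONFIG_CFQ_GROUP_IOSCHED is not set, blkio_policy_cfq is left as an empty,
zero-filled definition, presumably so that the blkio_policy_register() and
blkio_policy_unregister() calls in cfq_init() and cfq_exit() below compile
unchanged in either configuration.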

static int __init cfq_init(void)
{
	/*
	 * could be 0 on HZ < 1000 setups
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

	if (cfq_slab_setup())
		return -ENOMEM;

	elv_register(&iosched_cfq);
	blkio_policy_register(&blkio_policy_cfq);

	return 0;
}

static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);
	blkio_policy_unregister(&blkio_policy_cfq);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();

	/*
	 * this also protects us from entering cfq_slab_kill() with
	 * pending RCU callbacks
	 */
	if (elv_ioc_count_read(cfq_ioc_count))
		wait_for_completion(&all_gone);
	ida_destroy(&cic_index_ida);
	cfq_slab_kill();
}
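
Note that cfq_exit() tears down in the reverse order of cfq_init(): the blkio
policy and the elevator are unregistered first, and only if cfq_io_context
objects are still outstanding (their frees go through RCU) does it wait on the
all_gone completion before destroying the slab caches in cfq_slab_kill().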

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");