kernel/sched_fair.c

  /*
   * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   *
   *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   *
   *  Interactivity improvements by Mike Galbraith
   *  (C) 2007 Mike Galbraith <efault@gmx.de>
   *
   *  Various enhancements by Dmitry Adamushko.
   *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
   *
   *  Group scheduling enhancements by Srivatsa Vaddagiri
   *  Copyright IBM Corporation, 2007
   *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
   *
   *  Scaled math optimizations by Thomas Gleixner
   *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
   *
   *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
   *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   */
  #include <linux/latencytop.h>
  #include <linux/sched.h>

  /*
   * Targeted preemption latency for CPU-bound tasks:
   * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * NOTE: this latency value is not the same as the concept of
   * 'timeslice length' - timeslices in CFS are of variable length
   * and have no persistent notion like in traditional, time-slice
   * based scheduling concepts.
   *
   * (to see the precise effective timeslice length of your workload,
   *  run vmstat and monitor the context-switches (cs) field)
   */
  unsigned int sysctl_sched_latency = 6000000ULL;
  unsigned int normalized_sysctl_sched_latency = 6000000ULL;
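  /*
   * (A rough illustration of the LOG scaling described below: on an 8-CPU
   * machine the boot-time factor is 1 + ilog(8) = 4, so the effective
   * targeted latency becomes 4 * 6ms = 24ms, while the normalized value
   * above stays at 6ms.)
   */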
  
  /*
   * The initial- and re-scaling of tunables is configurable
   * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
   *
   * Options are:
   * SCHED_TUNABLESCALING_NONE - unscaled, always *1
   * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
   * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
   */
  enum sched_tunable_scaling sysctl_sched_tunable_scaling
  	= SCHED_TUNABLESCALING_LOG;
  
  /*
   * Minimal preemption granularity for CPU-bound tasks:
   * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   */
  unsigned int sysctl_sched_min_granularity = 750000ULL;
  unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  
  /*
   * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
   */
  static unsigned int sched_nr_latency = 8;
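  /*
   * (With the defaults above this is 6ms / 0.75ms = 8; it is recomputed in
   * sched_proc_update_handler() below whenever either tunable is written.)
   */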
  
  /*
   * After fork, child runs first. If set to 0 (default) then
   * parent will (try to) run first.
   */
  unsigned int sysctl_sched_child_runs_first __read_mostly;
  
  /*
   * sys_sched_yield() compat mode
   *
   * This option switches the aggressive yield implementation of the
   * old scheduler back on.
   */
  unsigned int __read_mostly sysctl_sched_compat_yield;
  
  /*
   * SCHED_OTHER wake-up granularity.
   * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * This option delays the preemption effects of decoupled workloads
   * and reduces their over-scheduling. Synchronous workloads will still
   * have immediate wakeup/sleep latencies.
   */
  unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
  unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  static const struct sched_class fair_sched_class;
  /**************************************************************
   * CFS operations on generic schedulable entities:
   */
  #ifdef CONFIG_FAIR_GROUP_SCHED

  /* cpu runqueue to which this cfs_rq is attached */
  static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  {
  	return cfs_rq->rq;
  }
  /* An entity is a task if it doesn't "own" a runqueue */
  #define entity_is_task(se)	(!se->my_q)

  static inline struct task_struct *task_of(struct sched_entity *se)
  {
  #ifdef CONFIG_SCHED_DEBUG
  	WARN_ON_ONCE(!entity_is_task(se));
  #endif
  	return container_of(se, struct task_struct, se);
  }
  /* Walk up scheduling entities hierarchy */
  #define for_each_sched_entity(se) \
  		for (; se; se = se->parent)
  
  static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  {
  	return p->se.cfs_rq;
  }
  
  /* runqueue on which this entity is (to be) queued */
  static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  {
  	return se->cfs_rq;
  }
  
  /* runqueue "owned" by this group */
  static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  {
  	return grp->my_q;
  }
  
  /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
   * another cpu ('this_cpu')
   */
  static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
  {
  	return cfs_rq->tg->cfs_rq[this_cpu];
  }
  
  /* Iterate through all leaf cfs_rq's on a runqueue */
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
  	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
  
  /* Do the two (enqueued) entities belong to the same group ? */
  static inline int
  is_same_group(struct sched_entity *se, struct sched_entity *pse)
  {
  	if (se->cfs_rq == pse->cfs_rq)
  		return 1;
  
  	return 0;
  }
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
  	return se->parent;
  }
  /* return depth at which a sched entity is present in the hierarchy */
  static inline int depth_se(struct sched_entity *se)
  {
  	int depth = 0;
  
  	for_each_sched_entity(se)
  		depth++;
  
  	return depth;
  }
  
  static void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
  	int se_depth, pse_depth;
  
  	/*
  	 * A preemption test can only be made between sibling entities that are
  	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
  	 * hierarchy of both tasks until we find ancestors that are siblings
  	 * under a common parent.
  	 */
  
  	/* First walk up until both entities are at same depth */
  	se_depth = depth_se(*se);
  	pse_depth = depth_se(*pse);
  
  	while (se_depth > pse_depth) {
  		se_depth--;
  		*se = parent_entity(*se);
  	}
  
  	while (pse_depth > se_depth) {
  		pse_depth--;
  		*pse = parent_entity(*pse);
  	}
  
  	while (!is_same_group(*se, *pse)) {
  		*se = parent_entity(*se);
  		*pse = parent_entity(*pse);
  	}
  }
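  /*
   * (Example: if *se sits at depth 3 and *pse at depth 1, *se is first
   * walked up twice so both are at equal depth, then both are walked up
   * together until they land on sibling entities sharing one cfs_rq.)
   */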
  #else	/* !CONFIG_FAIR_GROUP_SCHED */
  
  static inline struct task_struct *task_of(struct sched_entity *se)
  {
  	return container_of(se, struct task_struct, se);
  }

  static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  {
  	return container_of(cfs_rq, struct rq, cfs);
  }
  
  #define entity_is_task(se)	1
  #define for_each_sched_entity(se) \
  		for (; se; se = NULL)

  static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  {
  	return &task_rq(p)->cfs;
  }
  static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  {
  	struct task_struct *p = task_of(se);
  	struct rq *rq = task_rq(p);
  
  	return &rq->cfs;
  }
  
  /* runqueue "owned" by this group */
  static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  {
  	return NULL;
  }
  
  static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
  {
  	return &cpu_rq(this_cpu)->cfs;
  }
  
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
  		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  
  static inline int
  is_same_group(struct sched_entity *se, struct sched_entity *pse)
  {
  	return 1;
  }
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
  	return NULL;
  }
  static inline void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
  }
  #endif	/* CONFIG_FAIR_GROUP_SCHED */
  
  /**************************************************************
   * Scheduling class tree data structure manipulation methods:
   */
  static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
  {
  	s64 delta = (s64)(vruntime - min_vruntime);
  	if (delta > 0)
  		min_vruntime = vruntime;
  
  	return min_vruntime;
  }
  static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
  {
  	s64 delta = (s64)(vruntime - min_vruntime);
  	if (delta < 0)
  		min_vruntime = vruntime;
  
  	return min_vruntime;
  }
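  /*
   * (Both helpers compare through a signed delta instead of comparing the
   * u64 values directly, so the result stays correct even once vruntime
   * wraps: e.g. vruntime = 5 just after a wrap is still treated as being
   * ahead of min_vruntime = ULLONG_MAX - 5.)
   */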
  static inline int entity_before(struct sched_entity *a,
  				struct sched_entity *b)
  {
  	return (s64)(a->vruntime - b->vruntime) < 0;
  }
  static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	return se->vruntime - cfs_rq->min_vruntime;
  }
  static void update_min_vruntime(struct cfs_rq *cfs_rq)
  {
  	u64 vruntime = cfs_rq->min_vruntime;
  
  	if (cfs_rq->curr)
  		vruntime = cfs_rq->curr->vruntime;
  
  	if (cfs_rq->rb_leftmost) {
  		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
  						   struct sched_entity,
  						   run_node);
  		if (!cfs_rq->curr)
  			vruntime = se->vruntime;
  		else
  			vruntime = min_vruntime(vruntime, se->vruntime);
  	}
  
  	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
  }
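  /*
   * (min_vruntime therefore only moves forward: the final max_vruntime()
   * keeps it monotonic, which matters because it is the reference point
   * used by place_entity() and by the enqueue/dequeue normalization below.)
   */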
  /*
   * Enqueue an entity into the rb-tree:
   */
  static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
  	struct rb_node *parent = NULL;
  	struct sched_entity *entry;
  	s64 key = entity_key(cfs_rq, se);
  	int leftmost = 1;
  
  	/*
  	 * Find the right place in the rbtree:
  	 */
  	while (*link) {
  		parent = *link;
  		entry = rb_entry(parent, struct sched_entity, run_node);
  		/*
  		 * We don't care about collisions. Nodes with
  		 * the same key stay together.
  		 */
  		if (key < entity_key(cfs_rq, entry)) {
  			link = &parent->rb_left;
  		} else {
  			link = &parent->rb_right;
  			leftmost = 0;
  		}
  	}
  
  	/*
  	 * Maintain a cache of leftmost tree entries (it is frequently
  	 * used):
  	 */
  	if (leftmost)
  		cfs_rq->rb_leftmost = &se->run_node;
  
  	rb_link_node(&se->run_node, parent, link);
  	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
  }
  static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	if (cfs_rq->rb_leftmost == &se->run_node) {
  		struct rb_node *next_node;
  
  		next_node = rb_next(&se->run_node);
  		cfs_rq->rb_leftmost = next_node;
  	}

  	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
  }
  static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
  {
  	struct rb_node *left = cfs_rq->rb_leftmost;
  
  	if (!left)
  		return NULL;
  
  	return rb_entry(left, struct sched_entity, run_node);
  }
  static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  {
  	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

  	if (!last)
  		return NULL;
  
  	return rb_entry(last, struct sched_entity, run_node);
  }
  /**************************************************************
   * Scheduling class statistics methods:
   */
  #ifdef CONFIG_SCHED_DEBUG
  int sched_proc_update_handler(struct ctl_table *table, int write,
  		void __user *buffer, size_t *lenp,
  		loff_t *ppos)
  {
  	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  	int factor = get_update_sysctl_factor();
  
  	if (ret || !write)
  		return ret;
  
  	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
  					sysctl_sched_min_granularity);
  #define WRT_SYSCTL(name) \
  	(normalized_sysctl_##name = sysctl_##name / (factor))
  	WRT_SYSCTL(sched_min_granularity);
  	WRT_SYSCTL(sched_latency);
  	WRT_SYSCTL(sched_wakeup_granularity);
  	WRT_SYSCTL(sched_shares_ratelimit);
  #undef WRT_SYSCTL
  	return 0;
  }
  #endif
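  /*
   * (Example, assuming the default LOG factor of 1 + ilog(ncpus): writing
   * 24000000 to sched_latency_ns on an 8-CPU box keeps 24ms as the
   * effective value and stores 24ms / 4 = 6ms as the normalized value that
   * any later re-scaling starts from.)
   */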
  
  /*
   * delta /= w
   */
  static inline unsigned long
  calc_delta_fair(unsigned long delta, struct sched_entity *se)
  {
  	if (unlikely(se->load.weight != NICE_0_LOAD))
  		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
  
  	return delta;
  }
  
  /*
   * The idea is to set a period in which each task runs once.
   *
   * When there are too many tasks (more than sched_nr_latency) we have to stretch
   * this period because otherwise the slices get too small.
   *
   * p = (nr <= nl) ? l : l*nr/nl
   */
  static u64 __sched_period(unsigned long nr_running)
  {
  	u64 period = sysctl_sched_latency;
  	unsigned long nr_latency = sched_nr_latency;
  
  	if (unlikely(nr_running > nr_latency)) {
  		period = sysctl_sched_min_granularity;
  		period *= nr_running;
  	}
  
  	return period;
  }
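  /*
   * (With the defaults, 6ms latency / 0.75ms granularity / nr_latency 8:
   * 4 runnable tasks keep the 6ms period, while 16 runnable tasks stretch
   * it to 16 * 0.75ms = 12ms so each slice stays at least the minimum
   * granularity.)
   */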
  /*
   * We calculate the wall-time slice from the period by taking a part
   * proportional to the weight.
   *
   * s = p*P[w/rw]
   */
  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

  	for_each_sched_entity(se) {
  		struct load_weight *load;
  		struct load_weight lw;
  
  		cfs_rq = cfs_rq_of(se);
  		load = &cfs_rq->load;

  		if (unlikely(!se->on_rq)) {
  			lw = cfs_rq->load;
  
  			update_load_add(&lw, se->load.weight);
  			load = &lw;
  		}
  		slice = calc_delta_mine(slice, se->load.weight, load);
  	}
  	return slice;
  }
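  /*
   * (Example: with a 6ms period and two runnable nice-0 tasks of equal
   * weight each gets a 3ms wall-time slice; a task owning a quarter of the
   * queue's total weight would get 1.5ms.)
   */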
  /*
   * We calculate the vruntime slice of a to be inserted task
   *
   * vs = s/w
   */
  static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
   */
  static inline void
  __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  	      unsigned long delta_exec)
  {
  	unsigned long delta_exec_weighted;

  	schedstat_set(curr->statistics.exec_max,
  		      max((u64)delta_exec, curr->statistics.exec_max));
  
  	curr->sum_exec_runtime += delta_exec;
  	schedstat_add(cfs_rq, exec_clock, delta_exec);
  	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

  	curr->vruntime += delta_exec_weighted;
  	update_min_vruntime(cfs_rq);
  }
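  /*
   * (vruntime therefore advances at a weight-relative rate: a nice-0 task
   * accrues it at wall-clock speed, while a task with twice NICE_0_LOAD
   * accrues it at roughly half that rate, which is what keeps the rbtree
   * ordering fair between different weights.)
   */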
  static void update_curr(struct cfs_rq *cfs_rq)
  {
  	struct sched_entity *curr = cfs_rq->curr;
  	u64 now = rq_of(cfs_rq)->clock;
  	unsigned long delta_exec;
  
  	if (unlikely(!curr))
  		return;
  
  	/*
  	 * Get the amount of time the current task was running
  	 * since the last time we changed load (this cannot
  	 * overflow on 32 bits):
  	 */
  	delta_exec = (unsigned long)(now - curr->exec_start);
  	if (!delta_exec)
  		return;

  	__update_curr(cfs_rq, curr, delta_exec);
  	curr->exec_start = now;
  
  	if (entity_is_task(curr)) {
  		struct task_struct *curtask = task_of(curr);
  		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
  		cpuacct_charge(curtask, delta_exec);
  		account_group_exec_runtime(curtask, delta_exec);
  	}
  }
  
  static inline void
  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
  }
  /*
   * Task is being enqueued - update stats:
   */
  static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/*
  	 * Are we enqueueing a waiting task? (for current tasks
  	 * a dequeue/enqueue event is a NOP)
  	 */
  	if (se != cfs_rq->curr)
  		update_stats_wait_start(cfs_rq, se);
  }
  static void
  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
  			rq_of(cfs_rq)->clock - se->statistics.wait_start));
  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
  	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
  			rq_of(cfs_rq)->clock - se->statistics.wait_start);
  #ifdef CONFIG_SCHEDSTATS
  	if (entity_is_task(se)) {
  		trace_sched_stat_wait(task_of(se),
  			rq_of(cfs_rq)->clock - se->statistics.wait_start);
  	}
  #endif
  	schedstat_set(se->statistics.wait_start, 0);
  }
  
  static inline void
  update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/*
  	 * Mark the end of the wait period if dequeueing a
  	 * waiting task:
  	 */
  	if (se != cfs_rq->curr)
  		update_stats_wait_end(cfs_rq, se);
  }
  
  /*
   * We are picking a new current task - update its stats:
   */
  static inline void
  update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/*
  	 * We are starting a new run period:
  	 */
  	se->exec_start = rq_of(cfs_rq)->clock;
  }
  /**************************************************
   * Scheduling class queueing methods:
   */
  #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
  static void
  add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
  {
  	cfs_rq->task_weight += weight;
  }
  #else
  static inline void
  add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
  {
  }
  #endif
  static void
  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	update_load_add(&cfs_rq->load, se->load.weight);
  	if (!parent_entity(se))
  		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
  	if (entity_is_task(se)) {
  		add_cfs_task_weight(cfs_rq, se->load.weight);
  		list_add(&se->group_node, &cfs_rq->tasks);
  	}
  	cfs_rq->nr_running++;
  	se->on_rq = 1;
  }
  
  static void
  account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	update_load_sub(&cfs_rq->load, se->load.weight);
  	if (!parent_entity(se))
  		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
  	if (entity_is_task(se)) {
  		add_cfs_task_weight(cfs_rq, -se->load.weight);
  		list_del_init(&se->group_node);
  	}
  	cfs_rq->nr_running--;
  	se->on_rq = 0;
  }
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHEDSTATS
  	struct task_struct *tsk = NULL;
  
  	if (entity_is_task(se))
  		tsk = task_of(se);
  	if (se->statistics.sleep_start) {
  		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
  
  		if ((s64)delta < 0)
  			delta = 0;
  		if (unlikely(delta > se->statistics.sleep_max))
  			se->statistics.sleep_max = delta;

  		se->statistics.sleep_start = 0;
  		se->statistics.sum_sleep_runtime += delta;

  		if (tsk) {
  			account_scheduler_latency(tsk, delta >> 10, 1);
  			trace_sched_stat_sleep(tsk, delta);
  		}
  	}
  	if (se->statistics.block_start) {
  		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
  
  		if ((s64)delta < 0)
  			delta = 0;
  		if (unlikely(delta > se->statistics.block_max))
  			se->statistics.block_max = delta;

  		se->statistics.block_start = 0;
  		se->statistics.sum_sleep_runtime += delta;

  		if (tsk) {
  			if (tsk->in_iowait) {
  				se->statistics.iowait_sum += delta;
  				se->statistics.iowait_count++;
  				trace_sched_stat_iowait(tsk, delta);
  			}
  			/*
  			 * Blocking time is in units of nanosecs, so shift by
  			 * 20 to get a milliseconds-range estimation of the
  			 * amount of time that the task spent sleeping:
  			 */
  			if (unlikely(prof_on == SLEEP_PROFILING)) {
  				profile_hits(SLEEP_PROFILING,
  						(void *)get_wchan(tsk),
  						delta >> 20);
  			}
  			account_scheduler_latency(tsk, delta >> 10, 0);
  		}
  	}
  #endif
  }
  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHED_DEBUG
  	s64 d = se->vruntime - cfs_rq->min_vruntime;
  
  	if (d < 0)
  		d = -d;
  
  	if (d > 3*sysctl_sched_latency)
  		schedstat_inc(cfs_rq, nr_spread_over);
  #endif
  }
  static void
  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
  {
  	u64 vruntime = cfs_rq->min_vruntime;

  	/*
  	 * The 'current' period is already promised to the current tasks;
  	 * however, the extra weight of the new task will slow them down a
  	 * little. Place the new task so that it fits in the slot that
  	 * stays open at the end.
  	 */
  	if (initial && sched_feat(START_DEBIT))
  		vruntime += sched_vslice(cfs_rq, se);

  	/* sleeps up to a single latency don't count. */
  	if (!initial) {
  		unsigned long thresh = sysctl_sched_latency;

  		/*
  		 * Halve their sleep time's effect, to allow
  		 * for a gentler effect of sleepers:
  		 */
  		if (sched_feat(GENTLE_FAIR_SLEEPERS))
  			thresh >>= 1;

  		vruntime -= thresh;
  	}
  	/* ensure we never gain time by being placed backwards. */
  	vruntime = max_vruntime(se->vruntime, vruntime);
  	se->vruntime = vruntime;
  }
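  /*
   * (Example with the 6ms default and GENTLE_FAIR_SLEEPERS: a task waking
   * from sleep is placed about 3ms before min_vruntime, a bounded head
   * start, while a freshly forked task with START_DEBIT gets one vslice
   * added to its vruntime so it cannot immediately preempt its parent; the
   * max_vruntime() above ensures an entity is never moved backwards.)
   */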
  
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
  	/*
  	 * Update the normalized vruntime before updating min_vruntime
  	 * through calling update_curr().
  	 */
  	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
  		se->vruntime += cfs_rq->min_vruntime;
  
  	/*
  	 * Update run-time statistics of the 'current'.
  	 */
  	update_curr(cfs_rq);
  	account_entity_enqueue(cfs_rq, se);

  	if (flags & ENQUEUE_WAKEUP) {
  		place_entity(cfs_rq, se, 0);
  		enqueue_sleeper(cfs_rq, se);
  	}

  	update_stats_enqueue(cfs_rq, se);
  	check_spread(cfs_rq, se);
  	if (se != cfs_rq->curr)
  		__enqueue_entity(cfs_rq, se);
  }
  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	if (!se || cfs_rq->last == se)
  		cfs_rq->last = NULL;
  	if (!se || cfs_rq->next == se)
  		cfs_rq->next = NULL;
  }
  static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	for_each_sched_entity(se)
  		__clear_buddies(cfs_rq_of(se), se);
  }
  static void
  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
  	/*
  	 * Update run-time statistics of the 'current'.
  	 */
  	update_curr(cfs_rq);
  	update_stats_dequeue(cfs_rq, se);
  	if (flags & DEQUEUE_SLEEP) {
  #ifdef CONFIG_SCHEDSTATS
  		if (entity_is_task(se)) {
  			struct task_struct *tsk = task_of(se);
  
  			if (tsk->state & TASK_INTERRUPTIBLE)
  				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
  			if (tsk->state & TASK_UNINTERRUPTIBLE)
  				se->statistics.block_start = rq_of(cfs_rq)->clock;
  		}
  #endif
  	}
  	clear_buddies(cfs_rq, se);

  	if (se != cfs_rq->curr)
  		__dequeue_entity(cfs_rq, se);
  	account_entity_dequeue(cfs_rq, se);
  	update_min_vruntime(cfs_rq);
  
  	/*
  	 * Normalize the entity after updating the min_vruntime because the
  	 * update can refer to the ->curr item and we need to reflect this
  	 * movement in our normalized position.
  	 */
  	if (!(flags & DEQUEUE_SLEEP))
  		se->vruntime -= cfs_rq->min_vruntime;
  }
  
  /*
   * Preempt the current task with a newly woken task if needed:
   */
  static void
  check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
  {
  	unsigned long ideal_runtime, delta_exec;
  	ideal_runtime = sched_slice(cfs_rq, curr);
  	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
  	if (delta_exec > ideal_runtime) {
  		resched_task(rq_of(cfs_rq)->curr);
  		/*
  		 * The current task ran long enough, ensure it doesn't get
  		 * re-elected due to buddy favours.
  		 */
  		clear_buddies(cfs_rq, curr);
  		return;
  	}
  
  	/*
  	 * Ensure that a task that missed wakeup preemption by a
  	 * narrow margin doesn't have to wait for a full slice.
  	 * This also mitigates buddy induced latencies under load.
  	 */
  	if (!sched_feat(WAKEUP_PREEMPT))
  		return;
  
  	if (delta_exec < sysctl_sched_min_granularity)
  		return;
  
  	if (cfs_rq->nr_running > 1) {
  		struct sched_entity *se = __pick_next_entity(cfs_rq);
  		s64 delta = curr->vruntime - se->vruntime;
  
  		if (delta > ideal_runtime)
  			resched_task(rq_of(cfs_rq)->curr);
  	}
  }
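  /*
   * (Example: two equally weighted nice-0 tasks share a 6ms period, so
   * ideal_runtime is 3ms; the running task is rescheduled once it has run
   * 3ms since it was last picked, and the last test above also reschedules
   * early when the current task's vruntime has pulled more than
   * ideal_runtime ahead of the leftmost waiter's.)
   */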
  static void
  set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/* 'current' is not kept within the tree. */
  	if (se->on_rq) {
  		/*
  		 * Any task has to be enqueued before it gets to execute on
  		 * a CPU. So account for the time it spent waiting on the
  		 * runqueue.
  		 */
  		update_stats_wait_end(cfs_rq, se);
  		__dequeue_entity(cfs_rq, se);
  	}
  	update_stats_curr_start(cfs_rq, se);
  	cfs_rq->curr = se;
  #ifdef CONFIG_SCHEDSTATS
  	/*
  	 * Track our maximum slice length, if the CPU's load is at
  	 * least twice that of our own weight (i.e. don't track it
  	 * when there are only lesser-weight tasks around):
  	 */
  	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
  		se->statistics.slice_max = max(se->statistics.slice_max,
  			se->sum_exec_runtime - se->prev_sum_exec_runtime);
  	}
  #endif
  	se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
  static int
  wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
  {
  	struct sched_entity *se = __pick_next_entity(cfs_rq);
  	struct sched_entity *left = se;

  	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
  		se = cfs_rq->next;

  	/*
  	 * Prefer last buddy, try to return the CPU to a preempted task.
  	 */
  	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
  		se = cfs_rq->last;
  
  	clear_buddies(cfs_rq, se);
  
  	return se;
  }
  static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
  {
  	/*
  	 * If still on the runqueue then deactivate_task()
  	 * was not called and update_curr() has to be done:
  	 */
  	if (prev->on_rq)
  		update_curr(cfs_rq);

  	check_spread(cfs_rq, prev);
  	if (prev->on_rq) {
  		update_stats_wait_start(cfs_rq, prev);
  		/* Put 'current' back into the tree. */
  		__enqueue_entity(cfs_rq, prev);
  	}
  	cfs_rq->curr = NULL;
  }
  static void
  entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
  {
  	/*
  	 * Update run-time statistics of the 'current'.
  	 */
  	update_curr(cfs_rq);

  #ifdef CONFIG_SCHED_HRTICK
  	/*
  	 * queued ticks are scheduled to match the slice, so don't bother
  	 * validating it and just reschedule.
  	 */
  	if (queued) {
  		resched_task(rq_of(cfs_rq)->curr);
  		return;
  	}
  	/*
  	 * don't let the period tick interfere with the hrtick preemption
  	 */
  	if (!sched_feat(DOUBLE_TICK) &&
  			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
  		return;
  #endif
  	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
  		check_preempt_tick(cfs_rq, curr);
  }
  
  /**************************************************
   * CFS operations on tasks:
   */
  #ifdef CONFIG_SCHED_HRTICK
  static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
  	struct sched_entity *se = &p->se;
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
  	WARN_ON(task_rq(p) != rq);
  
  	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
  		u64 slice = sched_slice(cfs_rq, se);
  		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
  		s64 delta = slice - ran;
  
  		if (delta < 0) {
  			if (rq->curr == p)
  				resched_task(p);
  			return;
  		}
  
  		/*
  		 * Don't schedule slices shorter than 10000ns, that just
  		 * doesn't make sense. Rely on vruntime for fairness.
  		 */
  		if (rq->curr != p)
  			delta = max_t(s64, 10000LL, delta);

  		hrtick_start(rq, delta);
  	}
  }
  
  /*
   * called from enqueue/dequeue and updates the hrtick when the
   * current task is from our class and nr_running is low enough
   * to matter.
   */
  static void hrtick_update(struct rq *rq)
  {
  	struct task_struct *curr = rq->curr;
  
  	if (curr->sched_class != &fair_sched_class)
  		return;
  
  	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
  		hrtick_start_fair(rq, curr);
  }
  #else /* !CONFIG_SCHED_HRTICK */
  static inline void
  hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
  }
  
  static inline void hrtick_update(struct rq *rq)
  {
  }
  #endif
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
   * then put the task into the rbtree:
   */
  static void
  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  {
  	struct cfs_rq *cfs_rq;
  	struct sched_entity *se = &p->se;
  
  	for_each_sched_entity(se) {
  		if (se->on_rq)
  			break;
  		cfs_rq = cfs_rq_of(se);
  		enqueue_entity(cfs_rq, se, flags);
  		flags = ENQUEUE_WAKEUP;
  	}

  	hrtick_update(rq);
  }
  
  /*
   * The dequeue_task method is called before nr_running is
   * decreased. We remove the task from the rbtree and
   * update the fair scheduling stats:
   */
  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  {
  	struct cfs_rq *cfs_rq;
  	struct sched_entity *se = &p->se;
  
  	for_each_sched_entity(se) {
  		cfs_rq = cfs_rq_of(se);
  		dequeue_entity(cfs_rq, se, flags);
  		/* Don't dequeue parent if it has other entities besides us */
  		if (cfs_rq->load.weight)
  			break;
  		flags |= DEQUEUE_SLEEP;
  	}

  	hrtick_update(rq);
  }
  
  /*
   * sched_yield() support is very simple - we dequeue and enqueue.
   *
   * If compat_yield is turned on then we requeue to the end of the tree.
   */
  static void yield_task_fair(struct rq *rq)
  {
  	struct task_struct *curr = rq->curr;
  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  	struct sched_entity *rightmost, *se = &curr->se;
  
  	/*
  	 * Are we the only task in the tree?
  	 */
  	if (unlikely(cfs_rq->nr_running == 1))
  		return;
  	clear_buddies(cfs_rq, se);
  	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
  		update_rq_clock(rq);
  		/*
  		 * Update run-time statistics of the 'current'.
  		 */
  		update_curr(cfs_rq);
  
  		return;
  	}
  	/*
  	 * Find the rightmost entry in the rbtree:
  	 */
  	rightmost = __pick_last_entity(cfs_rq);
  	/*
  	 * Already in the rightmost position?
  	 */
  	if (unlikely(!rightmost || entity_before(rightmost, se)))
  		return;
  
  	/*
  	 * Minimally necessary key value to be last in the tree:
  	 * Upon rescheduling, sched_class::put_prev_task() will place
  	 * 'current' within the tree based on its new key value.
  	 */
  	se->vruntime = rightmost->vruntime + 1;
  }
  #ifdef CONFIG_SMP

  static void task_waking_fair(struct rq *rq, struct task_struct *p)
  {
  	struct sched_entity *se = &p->se;
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
  	se->vruntime -= cfs_rq->min_vruntime;
  }
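  /*
   * (This mirrors the addition of cfs_rq->min_vruntime in enqueue_entity():
   * while the task is in flight its vruntime is kept cfs_rq-relative, so it
   * can be re-based against the destination runqueue's min_vruntime at
   * enqueue time.)
   */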
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /*
   * effective_load() calculates the load change as seen from the root_task_group
   *
   * Adding load to a group doesn't make a group heavier, but can cause movement
   * of group shares between cpus. Assuming the shares were perfectly aligned one
   * can calculate the shift in shares.
   *
   * The problem is that perfectly aligning the shares is rather expensive, hence
   * we try to avoid doing that too often - see update_shares(), which ratelimits
   * this change.
   *
   * We compensate this by not only taking the current delta into account, but
   * also considering the delta between when the shares were last adjusted and
   * now.
   *
   * We still saw a performance dip; some tracing showed us that between
   * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
   * significantly. Therefore try to bias the error in direction of failing
   * the affine wakeup.
   *
   */
f1d239f73   Peter Zijlstra   sched: incrementa...
1096
1097
  static long effective_load(struct task_group *tg, int cpu,
  		long wl, long wg)
bb3469ac9   Peter Zijlstra   sched: hierarchic...
1098
  {
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1099
  	struct sched_entity *se = tg->se[cpu];
f1d239f73   Peter Zijlstra   sched: incrementa...
1100
1101
1102
1103
1104
  
  	if (!tg->parent)
  		return wl;
  
  	/*
f5bfb7d9f   Peter Zijlstra   sched: bias effec...
1105
1106
1107
1108
1109
  	 * By not taking the decrease of shares on the other cpu into
  	 * account our error leans towards reducing the affine wakeups.
  	 */
  	if (!wl && sched_feat(ASYM_EFF_LOAD))
  		return wl;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1110
  	for_each_sched_entity(se) {
cb5ef42a0   Peter Zijlstra   sched: optimize e...
1111
  		long S, rw, s, a, b;
940959e93   Peter Zijlstra   sched: fixlet for...
1112
1113
1114
1115
1116
1117
1118
1119
1120
  		long more_w;
  
  		/*
  		 * Instead of using this increment, also add the difference
  		 * between when the shares were last updated and now.
  		 */
  		more_w = se->my_q->load.weight - se->my_q->rq_weight;
  		wl += more_w;
  		wg += more_w;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1121
1122
1123
  
  		S = se->my_q->tg->shares;
  		s = se->my_q->shares;
f1d239f73   Peter Zijlstra   sched: incrementa...
1124
  		rw = se->my_q->rq_weight;
bb3469ac9   Peter Zijlstra   sched: hierarchic...
1125

cb5ef42a0   Peter Zijlstra   sched: optimize e...
1126
1127
  		a = S*(rw + wl);
  		b = S*rw + s*wg;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1128

940959e93   Peter Zijlstra   sched: fixlet for...
1129
1130
1131
1132
  		wl = s*(a-b);
  
  		if (likely(b))
  			wl /= b;
83378269a   Peter Zijlstra   sched: correct wa...
1133
1134
1135
1136
1137
1138
1139
  		/*
  		 * Assume the group is already running and will
  		 * thus already be accounted for in the weight.
  		 *
  		 * That is, moving shares between CPUs does not
  		 * alter the group weight.
  		 */
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1140
  		wg = 0;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1141
  	}
bb3469ac9   Peter Zijlstra   sched: hierarchic...
1142

4be9daaa1   Peter Zijlstra   sched: fix task_h...
1143
  	return wl;
bb3469ac9   Peter Zijlstra   sched: hierarchic...
1144
  }
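  /*
   * Standalone numeric sketch (userspace C, not kernel code) of one level of
   * the effective_load() recurrence above: with total group shares S, this
   * cpu's shares s, this cpu's group runqueue weight rw and the proposed
   * deltas wl (this cpu) and wg (whole group), the change seen one level up
   * is wl' = s*(a - b)/b where a = S*(rw + wl) and b = S*rw + s*wg.
   * toy_effective_load_step() is an invented name used only here.
   */
  static long long toy_effective_load_step(long long S, long long s,
  					 long long rw, long long wl,
  					 long long wg)
  {
  	long long a = S * (rw + wl);
  	long long b = S * rw + s * wg;

  	if (!b)			/* mirrors the 'if (likely(b))' guard above */
  		return s * (a - b);
  	return s * (a - b) / b;
  }

  /*
   * Example: S = 1024, s = 512, rw = 2048 and wl = wg = 1024 (a weight-1024
   * task waking on this cpu) give a = 1024*3072, b = 1024*2048 + 512*1024,
   * hence wl' = 512*524288/2621440 = 102 extra units seen by the parent.
   */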
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1145

bb3469ac9   Peter Zijlstra   sched: hierarchic...
1146
  #else
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1147

83378269a   Peter Zijlstra   sched: correct wa...
1148
1149
  static inline unsigned long effective_load(struct task_group *tg, int cpu,
  		unsigned long wl, unsigned long wg)
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1150
  {
83378269a   Peter Zijlstra   sched: correct wa...
1151
  	return wl;
bb3469ac9   Peter Zijlstra   sched: hierarchic...
1152
  }
4be9daaa1   Peter Zijlstra   sched: fix task_h...
1153

bb3469ac9   Peter Zijlstra   sched: hierarchic...
1154
  #endif
c88d59108   Peter Zijlstra   sched: Merge sele...
1155
  static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
098fb9db2   Ingo Molnar   sched: clean up w...
1156
  {
c88d59108   Peter Zijlstra   sched: Merge sele...
1157
1158
  	unsigned long this_load, load;
  	int idx, this_cpu, prev_cpu;
098fb9db2   Ingo Molnar   sched: clean up w...
1159
  	unsigned long tl_per_task;
c88d59108   Peter Zijlstra   sched: Merge sele...
1160
  	struct task_group *tg;
83378269a   Peter Zijlstra   sched: correct wa...
1161
  	unsigned long weight;
b3137bc8e   Mike Galbraith   sched: stop wake_...
1162
  	int balanced;
098fb9db2   Ingo Molnar   sched: clean up w...
1163

c88d59108   Peter Zijlstra   sched: Merge sele...
1164
1165
1166
1167
1168
  	idx	  = sd->wake_idx;
  	this_cpu  = smp_processor_id();
  	prev_cpu  = task_cpu(p);
  	load	  = source_load(prev_cpu, idx);
  	this_load = target_load(this_cpu, idx);
098fb9db2   Ingo Molnar   sched: clean up w...
1169
1170
  
  	/*
b3137bc8e   Mike Galbraith   sched: stop wake_...
1171
1172
1173
1174
  	 * If sync wakeup then subtract the (maximum possible)
  	 * effect of the currently running task from the load
  	 * of the current CPU:
  	 */
f3b577dec   Daniel J Blueman   rcu: apply RCU pr...
1175
  	rcu_read_lock();
83378269a   Peter Zijlstra   sched: correct wa...
1176
1177
1178
  	if (sync) {
  		tg = task_group(current);
  		weight = current->se.load.weight;
c88d59108   Peter Zijlstra   sched: Merge sele...
1179
  		this_load += effective_load(tg, this_cpu, -weight, -weight);
83378269a   Peter Zijlstra   sched: correct wa...
1180
1181
  		load += effective_load(tg, prev_cpu, 0, -weight);
  	}
b3137bc8e   Mike Galbraith   sched: stop wake_...
1182

83378269a   Peter Zijlstra   sched: correct wa...
1183
1184
  	tg = task_group(p);
  	weight = p->se.load.weight;
b3137bc8e   Mike Galbraith   sched: stop wake_...
1185

71a29aa7b   Peter Zijlstra   sched: Deal with ...
1186
1187
  	/*
  	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
c88d59108   Peter Zijlstra   sched: Merge sele...
1188
1189
1190
  	 * due to the sync cause above having dropped this_load to 0, we'll
  	 * always have an imbalance, but there's really nothing you can do
  	 * about that, so that's good too.
71a29aa7b   Peter Zijlstra   sched: Deal with ...
1191
1192
1193
1194
  	 *
  	 * Otherwise check if either cpu is near enough in load to allow this
  	 * task to be woken on this_cpu.
  	 */
e51fd5e22   Peter Zijlstra   sched: Fix wake_a...
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
  	if (this_load) {
  		unsigned long this_eff_load, prev_eff_load;
  
  		this_eff_load = 100;
  		this_eff_load *= power_of(prev_cpu);
  		this_eff_load *= this_load +
  			effective_load(tg, this_cpu, weight, weight);
  
  		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
  		prev_eff_load *= power_of(this_cpu);
  		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
  
  		balanced = this_eff_load <= prev_eff_load;
  	} else
  		balanced = true;
f3b577dec   Daniel J Blueman   rcu: apply RCU pr...
1210
  	rcu_read_unlock();
b3137bc8e   Mike Galbraith   sched: stop wake_...
1211
1212
  
  	/*
4ae7d5cef   Ingo Molnar   sched: improve af...
1213
1214
1215
  	 * If the currently running task will sleep within
  	 * a reasonable amount of time then attract this newly
  	 * woken task:
098fb9db2   Ingo Molnar   sched: clean up w...
1216
  	 */
2fb7635c4   Peter Zijlstra   sched: sync wakeu...
1217
1218
  	if (sync && balanced)
  		return 1;
098fb9db2   Ingo Molnar   sched: clean up w...
1219

41acab885   Lucas De Marchi   sched: Implement ...
1220
  	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
098fb9db2   Ingo Molnar   sched: clean up w...
1221
  	tl_per_task = cpu_avg_load_per_task(this_cpu);
c88d59108   Peter Zijlstra   sched: Merge sele...
1222
1223
1224
  	if (balanced ||
  	    (this_load <= load &&
  	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
098fb9db2   Ingo Molnar   sched: clean up w...
1225
1226
1227
1228
1229
  		/*
  		 * This domain has SD_WAKE_AFFINE and
  		 * p is cache cold in this domain, and
  		 * there is no bad imbalance.
  		 */
c88d59108   Peter Zijlstra   sched: Merge sele...
1230
  		schedstat_inc(sd, ttwu_move_affine);
41acab885   Lucas De Marchi   sched: Implement ...
1231
  		schedstat_inc(p, se.statistics.nr_wakeups_affine);
098fb9db2   Ingo Molnar   sched: clean up w...
1232
1233
1234
1235
1236
  
  		return 1;
  	}
  	return 0;
  }
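  /*
   * Standalone sketch (userspace C, not kernel code) of the balance test in
   * wake_affine() above, taking the already effective_load()-adjusted loads
   * as inputs: the wakeup counts as balanced when this cpu's prospective
   * load, scaled by prev_cpu's power, does not exceed prev_cpu's load scaled
   * by this cpu's power and relaxed by half of the domain's imbalance_pct.
   * toy_wake_balanced() is an invented name.
   */
  static int toy_wake_balanced(unsigned long this_load, unsigned long prev_load,
  			     unsigned long this_power, unsigned long prev_power,
  			     unsigned int imbalance_pct)
  {
  	unsigned long long this_eff, prev_eff;

  	if (!this_load)
  		return 1;	/* nothing would be queued here: trivially balanced */

  	this_eff = 100ULL * prev_power * this_load;
  	prev_eff = (100ULL + (imbalance_pct - 100) / 2) * this_power * prev_load;

  	return this_eff <= prev_eff;
  }

  /*
   * Example: this_load = 1024, prev_load = 2048, equal cpu power of 1024 and
   * imbalance_pct = 125 compare 100*1024*1024 = 104857600 against
   * 112*1024*2048 = 234881024, so the affine wakeup is considered balanced.
   */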
aaee1203c   Peter Zijlstra   sched: Move sched...
1237
1238
1239
1240
1241
  /*
   * find_idlest_group finds and returns the least busy CPU group within the
   * domain.
   */
  static struct sched_group *
78e7ed53c   Peter Zijlstra   sched: Tweak wake...
1242
  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5158f4e44   Peter Zijlstra   sched: Clean up t...
1243
  		  int this_cpu, int load_idx)
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1244
  {
b3bd3de66   Andi Kleen   gcc-4.6: kernel/*...
1245
  	struct sched_group *idlest = NULL, *group = sd->groups;
aaee1203c   Peter Zijlstra   sched: Move sched...
1246
  	unsigned long min_load = ULONG_MAX, this_load = 0;
aaee1203c   Peter Zijlstra   sched: Move sched...
1247
  	int imbalance = 100 + (sd->imbalance_pct-100)/2;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1248

aaee1203c   Peter Zijlstra   sched: Move sched...
1249
1250
1251
1252
  	do {
  		unsigned long load, avg_load;
  		int local_group;
  		int i;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1253

aaee1203c   Peter Zijlstra   sched: Move sched...
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
  		/* Skip over this group if it has no CPUs allowed */
  		if (!cpumask_intersects(sched_group_cpus(group),
  					&p->cpus_allowed))
  			continue;
  
  		local_group = cpumask_test_cpu(this_cpu,
  					       sched_group_cpus(group));
  
  		/* Tally up the load of all CPUs in the group */
  		avg_load = 0;
  
  		for_each_cpu(i, sched_group_cpus(group)) {
  			/* Bias balancing toward cpus of our domain */
  			if (local_group)
  				load = source_load(i, load_idx);
  			else
  				load = target_load(i, load_idx);
  
  			avg_load += load;
  		}
  
  		/* Adjust by relative CPU power of the group */
  		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
  
  		if (local_group) {
  			this_load = avg_load;
aaee1203c   Peter Zijlstra   sched: Move sched...
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
  		} else if (avg_load < min_load) {
  			min_load = avg_load;
  			idlest = group;
  		}
  	} while (group = group->next, group != sd->groups);
  
  	if (!idlest || 100*this_load < imbalance*min_load)
  		return NULL;
  	return idlest;
  }
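  /*
   * Standalone sketch (userspace C, not kernel code) of the acceptance test
   * at the end of find_idlest_group() above: with imbalance = 100 +
   * (imbalance_pct - 100)/2, a remote group is only worth returning when
   * 100 * this_load >= imbalance * min_load, i.e. the local group carries at
   * least imbalance/100 times the load of the best candidate.
   * toy_worth_moving() is an invented name.
   */
  static int toy_worth_moving(unsigned long this_load, unsigned long min_load,
  			    unsigned int imbalance_pct)
  {
  	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

  	return 100ULL * this_load >= (unsigned long long)imbalance * min_load;
  }

  /*
   * Example: imbalance_pct = 125 gives imbalance = 112, so with this_load of
   * 1000 a candidate group must average no more than 1000*100/112 ~= 892.
   */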
  
  /*
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
  find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
  	unsigned long load, min_load = ULONG_MAX;
  	int idlest = -1;
  	int i;
  
  	/* Traverse only the allowed CPUs */
  	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
  		load = weighted_cpuload(i);
  
  		if (load < min_load || (load == min_load && i == this_cpu)) {
  			min_load = load;
  			idlest = i;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1308
1309
  		}
  	}
aaee1203c   Peter Zijlstra   sched: Move sched...
1310
1311
  	return idlest;
  }
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1312

aaee1203c   Peter Zijlstra   sched: Move sched...
1313
  /*
a50bde513   Peter Zijlstra   sched: Cleanup se...
1314
1315
   * Try and locate an idle CPU in the sched_domain.
   */
99bd5e2f2   Suresh Siddha   sched: Fix select...
1316
  static int select_idle_sibling(struct task_struct *p, int target)
a50bde513   Peter Zijlstra   sched: Cleanup se...
1317
1318
1319
  {
  	int cpu = smp_processor_id();
  	int prev_cpu = task_cpu(p);
99bd5e2f2   Suresh Siddha   sched: Fix select...
1320
  	struct sched_domain *sd;
a50bde513   Peter Zijlstra   sched: Cleanup se...
1321
1322
1323
  	int i;
  
  	/*
99bd5e2f2   Suresh Siddha   sched: Fix select...
1324
1325
  	 * If the task is going to be woken-up on this cpu and if it is
  	 * already idle, then it is the right target.
a50bde513   Peter Zijlstra   sched: Cleanup se...
1326
  	 */
99bd5e2f2   Suresh Siddha   sched: Fix select...
1327
1328
1329
1330
1331
1332
1333
1334
  	if (target == cpu && idle_cpu(cpu))
  		return cpu;
  
  	/*
  	 * If the task is going to be woken-up on the cpu where it previously
  	 * ran and if it is currently idle, then it is the right target.
  	 */
  	if (target == prev_cpu && idle_cpu(prev_cpu))
fe3bcfe1f   Peter Zijlstra   sched: More gener...
1335
  		return prev_cpu;
a50bde513   Peter Zijlstra   sched: Cleanup se...
1336
1337
  
  	/*
99bd5e2f2   Suresh Siddha   sched: Fix select...
1338
  	 * Otherwise, iterate the domains and find an eligible idle cpu.
a50bde513   Peter Zijlstra   sched: Cleanup se...
1339
  	 */
99bd5e2f2   Suresh Siddha   sched: Fix select...
1340
1341
  	for_each_domain(target, sd) {
  		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
fe3bcfe1f   Peter Zijlstra   sched: More gener...
1342
  			break;
99bd5e2f2   Suresh Siddha   sched: Fix select...
1343
1344
1345
1346
1347
1348
  
  		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
  			if (idle_cpu(i)) {
  				target = i;
  				break;
  			}
a50bde513   Peter Zijlstra   sched: Cleanup se...
1349
  		}
99bd5e2f2   Suresh Siddha   sched: Fix select...
1350
1351
1352
1353
1354
1355
1356
1357
  
  		/*
  		 * Let's stop looking for an idle sibling once we have reached
  		 * the domain that spans both the current cpu and prev_cpu.
  		 */
  		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
  		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
  			break;
a50bde513   Peter Zijlstra   sched: Cleanup se...
1358
1359
1360
1361
1362
1363
  	}
  
  	return target;
  }
  
  /*
aaee1203c   Peter Zijlstra   sched: Move sched...
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
   * sched_balance_self: balance the current task (running on cpu) in domains
   * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
   * SD_BALANCE_EXEC.
   *
   * Balance, ie. select the least loaded group.
   *
   * Returns the target CPU number, or the same CPU if no balancing is needed.
   *
   * preempt must be disabled.
   */
0017d7350   Peter Zijlstra   sched: Fix TASK_W...
1374
1375
  static int
  select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
aaee1203c   Peter Zijlstra   sched: Move sched...
1376
  {
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1377
  	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
c88d59108   Peter Zijlstra   sched: Merge sele...
1378
1379
1380
  	int cpu = smp_processor_id();
  	int prev_cpu = task_cpu(p);
  	int new_cpu = cpu;
99bd5e2f2   Suresh Siddha   sched: Fix select...
1381
  	int want_affine = 0;
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1382
  	int want_sd = 1;
5158f4e44   Peter Zijlstra   sched: Clean up t...
1383
  	int sync = wake_flags & WF_SYNC;
c88d59108   Peter Zijlstra   sched: Merge sele...
1384

0763a660a   Peter Zijlstra   sched: Rename sel...
1385
  	if (sd_flag & SD_BALANCE_WAKE) {
beac4c7e4   Mike Galbraith   sched: Remove AFF...
1386
  		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
c88d59108   Peter Zijlstra   sched: Merge sele...
1387
1388
1389
  			want_affine = 1;
  		new_cpu = prev_cpu;
  	}
aaee1203c   Peter Zijlstra   sched: Move sched...
1390
1391
  
  	for_each_domain(cpu, tmp) {
e4f428884   Peter Zijlstra   sched: Select_tas...
1392
1393
  		if (!(tmp->flags & SD_LOAD_BALANCE))
  			continue;
aaee1203c   Peter Zijlstra   sched: Move sched...
1394
  		/*
ae154be1f   Peter Zijlstra   sched: Weaken SD_...
1395
1396
  		 * If power savings logic is enabled for a domain, check whether
  		 * we are not overloaded; if so, don't balance any wider.
aaee1203c   Peter Zijlstra   sched: Move sched...
1397
  		 */
59abf0264   Peter Zijlstra   sched: Add SD_PRE...
1398
  		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
ae154be1f   Peter Zijlstra   sched: Weaken SD_...
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
  			unsigned long power = 0;
  			unsigned long nr_running = 0;
  			unsigned long capacity;
  			int i;
  
  			for_each_cpu(i, sched_domain_span(tmp)) {
  				power += power_of(i);
  				nr_running += cpu_rq(i)->cfs.nr_running;
  			}
  
  			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
59abf0264   Peter Zijlstra   sched: Add SD_PRE...
1410
1411
1412
1413
  			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
  				nr_running /= 2;
  
  			if (nr_running < capacity)
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1414
  				want_sd = 0;
ae154be1f   Peter Zijlstra   sched: Weaken SD_...
1415
  		}
aaee1203c   Peter Zijlstra   sched: Move sched...
1416

fe3bcfe1f   Peter Zijlstra   sched: More gener...
1417
  		/*
99bd5e2f2   Suresh Siddha   sched: Fix select...
1418
1419
  		 * If both cpu and prev_cpu are part of this domain,
  		 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1f   Peter Zijlstra   sched: More gener...
1420
  		 */
99bd5e2f2   Suresh Siddha   sched: Fix select...
1421
1422
1423
1424
  		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
  		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
  			affine_sd = tmp;
  			want_affine = 0;
c88d59108   Peter Zijlstra   sched: Merge sele...
1425
  		}
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1426
1427
  		if (!want_sd && !want_affine)
  			break;
0763a660a   Peter Zijlstra   sched: Rename sel...
1428
  		if (!(tmp->flags & sd_flag))
c88d59108   Peter Zijlstra   sched: Merge sele...
1429
  			continue;
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1430
1431
1432
  		if (want_sd)
  			sd = tmp;
  	}
8b911acdf   Mike Galbraith   sched: Fix select...
1433
  #ifdef CONFIG_FAIR_GROUP_SCHED
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1434
1435
1436
1437
1438
  	if (sched_feat(LB_SHARES_UPDATE)) {
  		/*
  		 * Pick the largest domain to update shares over
  		 */
  		tmp = sd;
669c55e9f   Peter Zijlstra   sched: Pre-comput...
1439
  		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1440
  			tmp = affine_sd;
0017d7350   Peter Zijlstra   sched: Fix TASK_W...
1441
1442
  		if (tmp) {
  			raw_spin_unlock(&rq->lock);
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
1443
  			update_shares(tmp);
0017d7350   Peter Zijlstra   sched: Fix TASK_W...
1444
1445
  			raw_spin_lock(&rq->lock);
  		}
c88d59108   Peter Zijlstra   sched: Merge sele...
1446
  	}
8b911acdf   Mike Galbraith   sched: Fix select...
1447
  #endif
aaee1203c   Peter Zijlstra   sched: Move sched...
1448

8b911acdf   Mike Galbraith   sched: Fix select...
1449
  	if (affine_sd) {
99bd5e2f2   Suresh Siddha   sched: Fix select...
1450
1451
1452
1453
  		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
  			return select_idle_sibling(p, cpu);
  		else
  			return select_idle_sibling(p, prev_cpu);
8b911acdf   Mike Galbraith   sched: Fix select...
1454
  	}
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1455

aaee1203c   Peter Zijlstra   sched: Move sched...
1456
  	while (sd) {
5158f4e44   Peter Zijlstra   sched: Clean up t...
1457
  		int load_idx = sd->forkexec_idx;
aaee1203c   Peter Zijlstra   sched: Move sched...
1458
  		struct sched_group *group;
c88d59108   Peter Zijlstra   sched: Merge sele...
1459
  		int weight;
098fb9db2   Ingo Molnar   sched: clean up w...
1460

0763a660a   Peter Zijlstra   sched: Rename sel...
1461
  		if (!(sd->flags & sd_flag)) {
aaee1203c   Peter Zijlstra   sched: Move sched...
1462
1463
1464
  			sd = sd->child;
  			continue;
  		}
098fb9db2   Ingo Molnar   sched: clean up w...
1465

5158f4e44   Peter Zijlstra   sched: Clean up t...
1466
1467
  		if (sd_flag & SD_BALANCE_WAKE)
  			load_idx = sd->wake_idx;
098fb9db2   Ingo Molnar   sched: clean up w...
1468

5158f4e44   Peter Zijlstra   sched: Clean up t...
1469
  		group = find_idlest_group(sd, p, cpu, load_idx);
aaee1203c   Peter Zijlstra   sched: Move sched...
1470
1471
1472
1473
  		if (!group) {
  			sd = sd->child;
  			continue;
  		}
4ae7d5cef   Ingo Molnar   sched: improve af...
1474

d7c33c493   Peter Zijlstra   sched: Fix task a...
1475
  		new_cpu = find_idlest_cpu(group, p, cpu);
aaee1203c   Peter Zijlstra   sched: Move sched...
1476
1477
1478
1479
  		if (new_cpu == -1 || new_cpu == cpu) {
  			/* Now try balancing at a lower domain level of cpu */
  			sd = sd->child;
  			continue;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1480
  		}
aaee1203c   Peter Zijlstra   sched: Move sched...
1481
1482
1483
  
  		/* Now try balancing at a lower domain level of new_cpu */
  		cpu = new_cpu;
669c55e9f   Peter Zijlstra   sched: Pre-comput...
1484
  		weight = sd->span_weight;
aaee1203c   Peter Zijlstra   sched: Move sched...
1485
1486
  		sd = NULL;
  		for_each_domain(cpu, tmp) {
669c55e9f   Peter Zijlstra   sched: Pre-comput...
1487
  			if (weight <= tmp->span_weight)
aaee1203c   Peter Zijlstra   sched: Move sched...
1488
  				break;
0763a660a   Peter Zijlstra   sched: Rename sel...
1489
  			if (tmp->flags & sd_flag)
aaee1203c   Peter Zijlstra   sched: Move sched...
1490
1491
1492
  				sd = tmp;
  		}
  		/* while loop will break here if sd == NULL */
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1493
  	}
c88d59108   Peter Zijlstra   sched: Merge sele...
1494
  	return new_cpu;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
1495
1496
  }
  #endif /* CONFIG_SMP */
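  /*
   * Standalone sketch (userspace C, not kernel code) of the power-savings
   * check in select_task_rq_fair() above: a domain is dropped from wider
   * balancing (want_sd = 0) when its runnable tasks fit within its capacity,
   * where capacity is the domain's summed cpu power expressed in units of
   * SCHED_LOAD_SCALE (1024), rounded to the nearest whole cpu.
   * toy_domain_has_spare_capacity() and TOY_LOAD_SCALE are invented names.
   */
  #define TOY_LOAD_SCALE	1024UL

  static int toy_domain_has_spare_capacity(unsigned long total_power,
  					 unsigned long nr_running,
  					 int powersave_balance)
  {
  	/* DIV_ROUND_CLOSEST(total_power, TOY_LOAD_SCALE) */
  	unsigned long capacity =
  		(total_power + TOY_LOAD_SCALE / 2) / TOY_LOAD_SCALE;

  	if (powersave_balance)		/* SD_POWERSAVINGS_BALANCE halves nr_running */
  		nr_running /= 2;

  	return nr_running < capacity;
  }

  /*
   * Example: four cpus of power 1024 give capacity 4; with three runnable
   * tasks and SD_POWERSAVINGS_BALANCE set, 3/2 = 1 < 4, so want_sd is cleared.
   */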
e52fb7c09   Peter Zijlstra   sched: prefer wakers
1497
1498
  static unsigned long
  wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
0bbd3336e   Peter Zijlstra   sched: fix wakeup...
1499
1500
1501
1502
  {
  	unsigned long gran = sysctl_sched_wakeup_granularity;
  
  	/*
e52fb7c09   Peter Zijlstra   sched: prefer wakers
1503
1504
  	 * Since it is curr that is running now, convert the gran from
  	 * real-time to virtual-time in its units.
13814d42e   Mike Galbraith   sched: Remove ASY...
1505
1506
1507
1508
1509
1510
1511
1512
1513
  	 *
  	 * By using 'se' instead of 'curr' we penalize light tasks, so
  	 * they get preempted more easily. That is, if 'se' < 'curr' then
  	 * the resulting gran will be larger, penalizing the lighter task;
  	 * if, on the other hand, 'se' > 'curr' then the resulting gran will
  	 * be smaller, again penalizing the lighter task.
  	 *
  	 * This is especially important for buddies when the leftmost
  	 * task is higher priority than the buddy.
0bbd3336e   Peter Zijlstra   sched: fix wakeup...
1514
  	 */
13814d42e   Mike Galbraith   sched: Remove ASY...
1515
1516
  	if (unlikely(se->load.weight != NICE_0_LOAD))
  		gran = calc_delta_fair(gran, se);
0bbd3336e   Peter Zijlstra   sched: fix wakeup...
1517
1518
1519
1520
1521
  
  	return gran;
  }
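  /*
   * Standalone sketch (userspace C, not kernel code) of the conversion that
   * calc_delta_fair() performs for wakeup_gran() above: a wall-clock
   * granularity is scaled by NICE_0_LOAD / weight of the waking entity, so a
   * heavy waker sees a smaller virtual granularity (preempts more readily)
   * and a light waker a larger one.  The real code uses precomputed inverse
   * weights; this sketch simply divides.  toy_calc_delta_fair() and
   * TOY_NICE_0_LOAD are invented names; weight is assumed non-zero.
   */
  #define TOY_NICE_0_LOAD	1024ULL

  static unsigned long long toy_calc_delta_fair(unsigned long long delta_ns,
  					      unsigned long weight)
  {
  	return delta_ns * TOY_NICE_0_LOAD / weight;
  }

  /*
   * Example: for a 1 ms granularity, a waking nice 0 task (weight 1024) keeps
   * 1 ms, a nice +5 task (weight 335) gets ~3.06 ms (harder for it to
   * preempt), and a nice -5 task (weight 3121) gets ~0.33 ms.
   */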
  
  /*
464b75273   Peter Zijlstra   sched: re-instate...
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
   * Should 'se' preempt 'curr'.
   *
   *             |s1
   *        |s2
   *   |s3
   *         g
   *      |<--->|c
   *
   *  w(c, s1) = -1
   *  w(c, s2) =  0
   *  w(c, s3) =  1
   *
   */
  static int
  wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
  {
  	s64 gran, vdiff = curr->vruntime - se->vruntime;
  
  	if (vdiff <= 0)
  		return -1;
e52fb7c09   Peter Zijlstra   sched: prefer wakers
1542
  	gran = wakeup_gran(curr, se);
464b75273   Peter Zijlstra   sched: re-instate...
1543
1544
1545
1546
1547
  	if (vdiff > gran)
  		return 1;
  
  	return 0;
  }
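  /*
   * Worked example (userspace C, not kernel code) of the three-way decision
   * in wakeup_preempt_entity() above, matching the s1/s2/s3 picture: -1 means
   * the waking entity's vruntime is not smaller than curr's, 0 means it is
   * smaller but only within the (virtual) granularity, and 1 means it is
   * smaller by more than the granularity and should preempt.
   * toy_wakeup_preempt() is an invented name and takes the virtual
   * granularity directly.
   */
  static int toy_wakeup_preempt(long long curr_vruntime, long long se_vruntime,
  			      long long vgran)
  {
  	long long vdiff = curr_vruntime - se_vruntime;

  	if (vdiff <= 0)
  		return -1;	/* s1: se's vruntime is not smaller than curr's */
  	if (vdiff > vgran)
  		return 1;	/* s3: smaller by more than vgran, preempt */
  	return 0;		/* s2: smaller, but only within vgran */
  }

  /*
   * Example: with vgran = 1000000 ns, curr at 50 ms and se at 48.5 ms returns
   * 1 (preempt), se at 49.5 ms returns 0, and se at 51 ms returns -1.
   */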
02479099c   Peter Zijlstra   sched: fix buddie...
1548
1549
  static void set_last_buddy(struct sched_entity *se)
  {
6bc912b71   Peter Zijlstra   sched: SCHED_OTHE...
1550
1551
1552
1553
  	if (likely(task_of(se)->policy != SCHED_IDLE)) {
  		for_each_sched_entity(se)
  			cfs_rq_of(se)->last = se;
  	}
02479099c   Peter Zijlstra   sched: fix buddie...
1554
1555
1556
1557
  }
  
  static void set_next_buddy(struct sched_entity *se)
  {
6bc912b71   Peter Zijlstra   sched: SCHED_OTHE...
1558
1559
1560
1561
  	if (likely(task_of(se)->policy != SCHED_IDLE)) {
  		for_each_sched_entity(se)
  			cfs_rq_of(se)->next = se;
  	}
02479099c   Peter Zijlstra   sched: fix buddie...
1562
  }
464b75273   Peter Zijlstra   sched: re-instate...
1563
  /*
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1564
1565
   * Preempt the current task with a newly woken task if needed:
   */
5a9b86f64   Peter Zijlstra   sched: Rename fla...
1566
  static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1567
1568
  {
  	struct task_struct *curr = rq->curr;
8651a86c3   Srivatsa Vaddagiri   sched: group sche...
1569
  	struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e457   Mike Galbraith   sched: fix wakeup...
1570
  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceaca   Mike Galbraith   sched: Strengthen...
1571
  	int scale = cfs_rq->nr_running >= sched_nr_latency;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1572

3a7e73a2e   Peter Zijlstra   sched: Clean up c...
1573
1574
  	if (unlikely(rt_prio(p->prio)))
  		goto preempt;
aa2ac2522   Peter Zijlstra   sched: fix overlo...
1575

d95f98d06   Peter Zijlstra   sched: fix fair p...
1576
1577
  	if (unlikely(p->sched_class != &fair_sched_class))
  		return;
4ae7d5cef   Ingo Molnar   sched: improve af...
1578
1579
  	if (unlikely(se == pse))
  		return;
f685ceaca   Mike Galbraith   sched: Strengthen...
1580
  	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
3cb63d527   Mike Galbraith   sched: Complete b...
1581
  		set_next_buddy(pse);
57fdc26d4   Peter Zijlstra   sched: fixup budd...
1582

aec0a5142   Bharata B Rao   sched: call resch...
1583
1584
1585
1586
1587
1588
  	/*
  	 * We can come here with TIF_NEED_RESCHED already set from new task
  	 * wake up path.
  	 */
  	if (test_tsk_need_resched(curr))
  		return;
91c234b4e   Ingo Molnar   sched: do not wak...
1589
  	/*
6bc912b71   Peter Zijlstra   sched: SCHED_OTHE...
1590
  	 * Batch and idle tasks do not preempt (their preemption is driven by
91c234b4e   Ingo Molnar   sched: do not wak...
1591
1592
  	 * the tick):
  	 */
6bc912b71   Peter Zijlstra   sched: SCHED_OTHE...
1593
  	if (unlikely(p->policy != SCHED_NORMAL))
91c234b4e   Ingo Molnar   sched: do not wak...
1594
  		return;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1595

6bc912b71   Peter Zijlstra   sched: SCHED_OTHE...
1596
  	/* Idle tasks are by definition preempted by everybody. */
3a7e73a2e   Peter Zijlstra   sched: Clean up c...
1597
1598
  	if (unlikely(curr->policy == SCHED_IDLE))
  		goto preempt;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1599

ad4b78bbc   Peter Zijlstra   sched: Add new wa...
1600
1601
  	if (!sched_feat(WAKEUP_PREEMPT))
  		return;
3a7e73a2e   Peter Zijlstra   sched: Clean up c...
1602
  	update_curr(cfs_rq);
464b75273   Peter Zijlstra   sched: re-instate...
1603
  	find_matching_se(&se, &pse);
002f128b4   Paul Turner   sched: remove red...
1604
  	BUG_ON(!pse);
3a7e73a2e   Peter Zijlstra   sched: Clean up c...
1605
1606
  	if (wakeup_preempt_entity(se, pse) == 1)
  		goto preempt;
464b75273   Peter Zijlstra   sched: re-instate...
1607

3a7e73a2e   Peter Zijlstra   sched: Clean up c...
1608
  	return;
a65ac745e   Jupyung Lee   sched: Move updat...
1609

3a7e73a2e   Peter Zijlstra   sched: Clean up c...
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
  preempt:
  	resched_task(curr);
  	/*
  	 * Only set the backward buddy when the current task is still
  	 * on the rq. This can happen when a wakeup gets interleaved
  	 * with schedule on the ->pre_schedule() or idle_balance()
  	 * point, either of which can drop the rq lock.
  	 *
  	 * Also, during early boot the idle thread is in the fair class,
  	 * for obvious reasons it's a bad idea to schedule back to it.
  	 */
  	if (unlikely(!se->on_rq || curr == rq->idle))
  		return;
  
  	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
  		set_last_buddy(se);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1626
  }
fb8d47240   Ingo Molnar   sched: remove the...
1627
  static struct task_struct *pick_next_task_fair(struct rq *rq)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1628
  {
8f4d37ec0   Peter Zijlstra   sched: high-res p...
1629
  	struct task_struct *p;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1630
1631
  	struct cfs_rq *cfs_rq = &rq->cfs;
  	struct sched_entity *se;
36ace27e3   Tim Blechmann   sched: Optimize b...
1632
  	if (!cfs_rq->nr_running)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1633
1634
1635
  		return NULL;
  
  	do {
9948f4b2a   Ingo Molnar   sched: remove the...
1636
  		se = pick_next_entity(cfs_rq);
f4b6755fb   Peter Zijlstra   sched: cleanup fa...
1637
  		set_next_entity(cfs_rq, se);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1638
1639
  		cfs_rq = group_cfs_rq(se);
  	} while (cfs_rq);
8f4d37ec0   Peter Zijlstra   sched: high-res p...
1640
1641
1642
1643
  	p = task_of(se);
  	hrtick_start_fair(rq, p);
  
  	return p;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1644
1645
1646
1647
1648
  }
  
  /*
   * Account for a descheduled task:
   */
31ee529cc   Ingo Molnar   sched: remove the...
1649
  static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1650
1651
1652
1653
1654
1655
  {
  	struct sched_entity *se = &prev->se;
  	struct cfs_rq *cfs_rq;
  
  	for_each_sched_entity(se) {
  		cfs_rq = cfs_rq_of(se);
ab6cde269   Ingo Molnar   sched: remove the...
1656
  		put_prev_entity(cfs_rq, se);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1657
1658
  	}
  }
681f3e685   Peter Williams   sched: isolate SM...
1659
  #ifdef CONFIG_SMP
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
1660
1661
1662
  /**************************************************
   * Fair scheduling class load-balancing methods:
   */
1e3c88bde   Peter Zijlstra   sched: Move load ...
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
  /*
   * pull_task - move a task from a remote runqueue to the local runqueue.
   * Both runqueues must be locked.
   */
  static void pull_task(struct rq *src_rq, struct task_struct *p,
  		      struct rq *this_rq, int this_cpu)
  {
  	deactivate_task(src_rq, p, 0);
  	set_task_cpu(p, this_cpu);
  	activate_task(this_rq, p, 0);
  	check_preempt_curr(this_rq, p, 0);
  }
  
  /*
   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
   */
  static
  int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  		     struct sched_domain *sd, enum cpu_idle_type idle,
  		     int *all_pinned)
  {
  	int tsk_cache_hot = 0;
  	/*
  	 * We do not migrate tasks that are:
  	 * 1) running (obviously), or
  	 * 2) not allowed to run on this CPU due to cpus_allowed, or
  	 * 3) cache-hot on their current CPU.
  	 */
  	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
41acab885   Lucas De Marchi   sched: Implement ...
1692
  		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1e3c88bde   Peter Zijlstra   sched: Move load ...
1693
1694
1695
1696
1697
  		return 0;
  	}
  	*all_pinned = 0;
  
  	if (task_running(rq, p)) {
41acab885   Lucas De Marchi   sched: Implement ...
1698
  		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1e3c88bde   Peter Zijlstra   sched: Move load ...
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
  		return 0;
  	}
  
  	/*
  	 * Aggressive migration if:
  	 * 1) task is cache cold, or
  	 * 2) too many balance attempts have failed.
  	 */
  
  	tsk_cache_hot = task_hot(p, rq->clock, sd);
  	if (!tsk_cache_hot ||
  		sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
  		if (tsk_cache_hot) {
  			schedstat_inc(sd, lb_hot_gained[idle]);
41acab885   Lucas De Marchi   sched: Implement ...
1714
  			schedstat_inc(p, se.statistics.nr_forced_migrations);
1e3c88bde   Peter Zijlstra   sched: Move load ...
1715
1716
1717
1718
1719
1720
  		}
  #endif
  		return 1;
  	}
  
  	if (tsk_cache_hot) {
41acab885   Lucas De Marchi   sched: Implement ...
1721
  		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1e3c88bde   Peter Zijlstra   sched: Move load ...
1722
1723
1724
1725
  		return 0;
  	}
  	return 1;
  }
897c395f4   Peter Zijlstra   sched: Remove rq_...
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
  /*
   * move_one_task tries to move exactly one task from busiest to this_rq, as
   * part of active balancing operations within "domain".
   * Returns 1 if successful and 0 otherwise.
   *
   * Called with both runqueues locked.
   */
  static int
  move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
  	      struct sched_domain *sd, enum cpu_idle_type idle)
  {
  	struct task_struct *p, *n;
  	struct cfs_rq *cfs_rq;
  	int pinned = 0;
  
  	for_each_leaf_cfs_rq(busiest, cfs_rq) {
  		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
  
  			if (!can_migrate_task(p, busiest, this_cpu,
  						sd, idle, &pinned))
  				continue;
  
  			pull_task(busiest, p, this_rq, this_cpu);
  			/*
  			 * Right now, this is only the second place pull_task()
  			 * is called, so we can safely collect pull_task()
  			 * stats here rather than inside pull_task().
  			 */
  			schedstat_inc(sd, lb_gained[idle]);
  			return 1;
  		}
  	}
  
  	return 0;
  }
1e3c88bde   Peter Zijlstra   sched: Move load ...
1761
1762
1763
1764
  static unsigned long
  balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
  	      unsigned long max_load_move, struct sched_domain *sd,
  	      enum cpu_idle_type idle, int *all_pinned,
ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1765
  	      int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1e3c88bde   Peter Zijlstra   sched: Move load ...
1766
1767
  {
  	int loops = 0, pulled = 0, pinned = 0;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1768
  	long rem_load_move = max_load_move;
ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1769
  	struct task_struct *p, *n;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1770
1771
1772
1773
1774
  
  	if (max_load_move == 0)
  		goto out;
  
  	pinned = 1;
ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1775
1776
1777
  	list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
  		if (loops++ > sysctl_sched_nr_migrate)
  			break;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1778

ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1779
1780
1781
  		if ((p->se.load.weight >> 1) > rem_load_move ||
  		    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
  			continue;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1782

ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1783
1784
1785
  		pull_task(busiest, p, this_rq, this_cpu);
  		pulled++;
  		rem_load_move -= p->se.load.weight;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1786
1787
  
  #ifdef CONFIG_PREEMPT
ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1788
1789
1790
1791
1792
1793
1794
  		/*
  		 * NEWIDLE balancing is a source of latency, so preemptible
  		 * kernels will stop after the first task is pulled to minimize
  		 * the critical section.
  		 */
  		if (idle == CPU_NEWLY_IDLE)
  			break;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1795
  #endif
ee00e66ff   Peter Zijlstra   sched: Remove rq_...
1796
1797
1798
1799
1800
1801
  		/*
  		 * We only want to steal up to the prescribed amount of
  		 * weighted load.
  		 */
  		if (rem_load_move <= 0)
  			break;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1802
1803
  		if (p->prio < *this_best_prio)
  			*this_best_prio = p->prio;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
  	}
  out:
  	/*
  	 * Right now, this is one of only two places pull_task() is called,
  	 * so we can safely collect pull_task() stats here rather than
  	 * inside pull_task().
  	 */
  	schedstat_add(sd, lb_gained[idle], pulled);
  
  	if (all_pinned)
  		*all_pinned = pinned;
  
  	return max_load_move - rem_load_move;
  }
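  /*
   * Standalone sketch (userspace C, not kernel code) of the per-task filter
   * in balance_tasks() above: a task is skipped when half of its weight
   * already exceeds the remaining budget, so one heavy task cannot blow far
   * past the requested amount of load.  toy_task_fits_budget() is an
   * invented name.
   */
  static int toy_task_fits_budget(unsigned long task_weight, long rem_load_move)
  {
  	return (long)(task_weight >> 1) <= rem_load_move;
  }

  /*
   * Example: with 600 units of budget left, a weight-1024 task (512 <= 600)
   * may still be pulled, while a weight-1536 task (768 > 600) is skipped.
   */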
230059de7   Peter Zijlstra   sched: Remove fro...
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  		  unsigned long max_load_move,
  		  struct sched_domain *sd, enum cpu_idle_type idle,
  		  int *all_pinned, int *this_best_prio)
  {
  	long rem_load_move = max_load_move;
  	int busiest_cpu = cpu_of(busiest);
  	struct task_group *tg;
  
  	rcu_read_lock();
  	update_h_load(busiest_cpu);
  
  	list_for_each_entry_rcu(tg, &task_groups, list) {
  		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
  		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
  		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
  		u64 rem_load, moved_load;
  
  		/*
  		 * empty group
  		 */
  		if (!busiest_cfs_rq->task_weight)
  			continue;
  
  		rem_load = (u64)rem_load_move * busiest_weight;
  		rem_load = div_u64(rem_load, busiest_h_load + 1);
  
  		moved_load = balance_tasks(this_rq, this_cpu, busiest,
  				rem_load, sd, idle, all_pinned, this_best_prio,
  				busiest_cfs_rq);
  
  		if (!moved_load)
  			continue;
  
  		moved_load *= busiest_h_load;
  		moved_load = div_u64(moved_load, busiest_weight + 1);
  
  		rem_load_move -= moved_load;
  		if (rem_load_move < 0)
  			break;
  	}
  	rcu_read_unlock();
  
  	return max_load_move - rem_load_move;
  }
  #else
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
  		  unsigned long max_load_move,
  		  struct sched_domain *sd, enum cpu_idle_type idle,
  		  int *all_pinned, int *this_best_prio)
  {
  	return balance_tasks(this_rq, this_cpu, busiest,
  			max_load_move, sd, idle, all_pinned,
  			this_best_prio, &busiest->cfs);
  }
  #endif
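  /*
   * Standalone sketch (userspace C, not kernel code) of the unit conversion
   * in the group-scheduling load_balance_fair() above: the remaining budget
   * is in root-level (hierarchical) load, while tasks on a group cfs_rq are
   * weighed in local load, so the budget is scaled by weight/h_load on the
   * way in and the amount actually moved by h_load/weight on the way out;
   * the +1 in each divisor avoids a division by zero.  Both helper names
   * are invented.
   */
  static unsigned long long toy_to_local_load(unsigned long long root_load,
  					    unsigned long group_weight,
  					    unsigned long group_h_load)
  {
  	return root_load * group_weight / (group_h_load + 1);
  }

  static unsigned long long toy_to_root_load(unsigned long long local_load,
  					    unsigned long group_weight,
  					    unsigned long group_h_load)
  {
  	return local_load * group_h_load / (group_weight + 1);
  }

  /*
   * Example: a group with load.weight 2048 whose hierarchical load h_load is
   * only 512 turns a root-level budget of 1000 into a local budget of
   * 1000*2048/513 ~= 3992, and 3992 moved locally converts back to roughly
   * 3992*512/2049 ~= 997 at the root level.
   */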
1e3c88bde   Peter Zijlstra   sched: Move load ...
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
  /*
   * move_tasks tries to move up to max_load_move weighted load from busiest to
   * this_rq, as part of a balancing operation within domain "sd".
   * Returns 1 if successful and 0 otherwise.
   *
   * Called with both runqueues locked.
   */
  static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
  		      unsigned long max_load_move,
  		      struct sched_domain *sd, enum cpu_idle_type idle,
  		      int *all_pinned)
  {
3d45fd804   Peter Zijlstra   sched: Remove the...
1889
  	unsigned long total_load_moved = 0, load_moved;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1890
1891
1892
  	int this_best_prio = this_rq->curr->prio;
  
  	do {
3d45fd804   Peter Zijlstra   sched: Remove the...
1893
  		load_moved = load_balance_fair(this_rq, this_cpu, busiest,
1e3c88bde   Peter Zijlstra   sched: Move load ...
1894
1895
  				max_load_move - total_load_moved,
  				sd, idle, all_pinned, &this_best_prio);
3d45fd804   Peter Zijlstra   sched: Remove the...
1896
1897
  
  		total_load_moved += load_moved;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1898
1899
1900
1901
1902
1903
1904
1905
1906
  
  #ifdef CONFIG_PREEMPT
  		/*
  		 * NEWIDLE balancing is a source of latency, so preemptible
  		 * kernels will stop after the first task is pulled to minimize
  		 * the critical section.
  		 */
  		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
  			break;
baa8c1102   Peter Zijlstra   sched: Add a lock...
1907
1908
1909
1910
  
  		if (raw_spin_is_contended(&this_rq->lock) ||
  				raw_spin_is_contended(&busiest->lock))
  			break;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1911
  #endif
3d45fd804   Peter Zijlstra   sched: Remove the...
1912
  	} while (load_moved && max_load_move > total_load_moved);
1e3c88bde   Peter Zijlstra   sched: Move load ...
1913
1914
1915
  
  	return total_load_moved > 0;
  }
1e3c88bde   Peter Zijlstra   sched: Move load ...
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
  /********** Helpers for find_busiest_group ************************/
  /*
   * sd_lb_stats - Structure to store the statistics of a sched_domain
   * 		during load balancing.
   */
  struct sd_lb_stats {
  	struct sched_group *busiest; /* Busiest group in this sd */
  	struct sched_group *this;  /* Local group in this sd */
  	unsigned long total_load;  /* Total load of all groups in sd */
  	unsigned long total_pwr;   /*	Total power of all groups in sd */
  	unsigned long avg_load;	   /* Average load across all groups in sd */
  
  	/** Statistics of this group */
  	unsigned long this_load;
  	unsigned long this_load_per_task;
  	unsigned long this_nr_running;
  
  	/* Statistics of the busiest group */
  	unsigned long max_load;
  	unsigned long busiest_load_per_task;
  	unsigned long busiest_nr_running;
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
1937
  	unsigned long busiest_group_capacity;
1e3c88bde   Peter Zijlstra   sched: Move load ...
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
  
  	int group_imb; /* Is there an imbalance in this sd? */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  	int power_savings_balance; /* Is powersave balance needed for this sd */
  	struct sched_group *group_min; /* Least loaded group in sd */
  	struct sched_group *group_leader; /* Group which relieves group_min */
  	unsigned long min_load_per_task; /* load_per_task in group_min */
  	unsigned long leader_nr_running; /* Nr running of group_leader */
  	unsigned long min_nr_running; /* Nr running of group_min */
  #endif
  };
  
  /*
   * sg_lb_stats - stats of a sched_group required for load_balancing
   */
  struct sg_lb_stats {
  	unsigned long avg_load; /* Avg load across the CPUs of the group */
  	unsigned long group_load; /* Total load over the CPUs of the group */
  	unsigned long sum_nr_running; /* Nr tasks running in the group */
  	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
  	unsigned long group_capacity;
  	int group_imb; /* Is there an imbalance in the group? */
  };
  
  /**
   * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
   * @group: The group whose first cpu is to be returned.
   */
  static inline unsigned int group_first_cpu(struct sched_group *group)
  {
  	return cpumask_first(sched_group_cpus(group));
  }
  
  /**
   * get_sd_load_idx - Obtain the load index for a given sched domain.
   * @sd: The sched_domain whose load_idx is to be obtained.
   * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
   */
  static inline int get_sd_load_idx(struct sched_domain *sd,
  					enum cpu_idle_type idle)
  {
  	int load_idx;
  
  	switch (idle) {
  	case CPU_NOT_IDLE:
  		load_idx = sd->busy_idx;
  		break;
  
  	case CPU_NEWLY_IDLE:
  		load_idx = sd->newidle_idx;
  		break;
  	default:
  		load_idx = sd->idle_idx;
  		break;
  	}
  
  	return load_idx;
  }
  
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  /**
   * init_sd_power_savings_stats - Initialize power savings statistics for
   * the given sched_domain, during load balancing.
   *
   * @sd: Sched domain whose power-savings statistics are to be initialized.
   * @sds: Variable containing the statistics for sd.
   * @idle: Idle status of the CPU at which we're performing load-balancing.
   */
  static inline void init_sd_power_savings_stats(struct sched_domain *sd,
  	struct sd_lb_stats *sds, enum cpu_idle_type idle)
  {
  	/*
  	 * Busy processors will not participate in power savings
  	 * balance.
  	 */
  	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
  		sds->power_savings_balance = 0;
  	else {
  		sds->power_savings_balance = 1;
  		sds->min_nr_running = ULONG_MAX;
  		sds->leader_nr_running = 0;
  	}
  }
  
  /**
   * update_sd_power_savings_stats - Update the power saving stats for a
   * sched_domain while performing load balancing.
   *
   * @group: sched_group belonging to the sched_domain under consideration.
   * @sds: Variable containing the statistics of the sched_domain
   * @local_group: Does group contain the CPU for which we're performing
   * 		load balancing?
   * @sgs: Variable containing the statistics of the group.
   */
  static inline void update_sd_power_savings_stats(struct sched_group *group,
  	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
  {
  
  	if (!sds->power_savings_balance)
  		return;
  
  	/*
  	 * If the local group is idle or completely loaded,
  	 * there is no need to do power savings balance at this domain.
  	 */
  	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
  				!sds->this_nr_running))
  		sds->power_savings_balance = 0;
  
  	/*
  	 * If a group is already running at full capacity or idle,
  	 * don't include that group in power savings calculations
  	 */
  	if (!sds->power_savings_balance ||
  		sgs->sum_nr_running >= sgs->group_capacity ||
  		!sgs->sum_nr_running)
  		return;
  
  	/*
  	 * Calculate the group which has the least non-idle load.
  	 * This is the group from which we need to pick up load
  	 * in order to save power.
  	 */
  	if ((sgs->sum_nr_running < sds->min_nr_running) ||
  	    (sgs->sum_nr_running == sds->min_nr_running &&
  	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
  		sds->group_min = group;
  		sds->min_nr_running = sgs->sum_nr_running;
  		sds->min_load_per_task = sgs->sum_weighted_load /
  						sgs->sum_nr_running;
  	}
  
  	/*
  	 * Calculate the group which is nearly at its capacity
  	 * but still has some room to pick up load from another
  	 * group and save more power.
  	 */
  	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
  		return;
  
  	if (sgs->sum_nr_running > sds->leader_nr_running ||
  	    (sgs->sum_nr_running == sds->leader_nr_running &&
  	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
  		sds->group_leader = group;
  		sds->leader_nr_running = sgs->sum_nr_running;
  	}
  }
  
  /**
   * check_power_save_busiest_group - see if there is potential for some power-savings balance
   * @sds: Variable containing the statistics of the sched_domain
   *	under consideration.
   * @this_cpu: Cpu at which we're currently performing load-balancing.
   * @imbalance: Variable to store the imbalance.
   *
   * Description:
   * Check if we have potential to perform some power-savings balance.
   * If yes, set the busiest group to be the least loaded group in the
   * sched_domain, so that its CPUs can be put to idle.
   *
   * Returns 1 if there is potential to perform power-savings balance.
   * Else returns 0.
   */
  static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  					int this_cpu, unsigned long *imbalance)
  {
  	if (!sds->power_savings_balance)
  		return 0;
  
  	if (sds->this != sds->group_leader ||
  			sds->group_leader == sds->group_min)
  		return 0;
  
  	*imbalance = sds->min_load_per_task;
  	sds->busiest = sds->group_min;
  
  	return 1;
  
  }
  #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  static inline void init_sd_power_savings_stats(struct sched_domain *sd,
  	struct sd_lb_stats *sds, enum cpu_idle_type idle)
  {
  	return;
  }
  
  static inline void update_sd_power_savings_stats(struct sched_group *group,
  	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
  {
  	return;
  }
  
  static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  					int this_cpu, unsigned long *imbalance)
  {
  	return 0;
  }
  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
  
  unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
  {
  	return SCHED_LOAD_SCALE;
  }
  
  unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
  {
  	return default_scale_freq_power(sd, cpu);
  }
  
  unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
  {
669c55e9f   Peter Zijlstra   sched: Pre-comput...
2151
  	unsigned long weight = sd->span_weight;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
  	unsigned long smt_gain = sd->smt_gain;
  
  	smt_gain /= weight;
  
  	return smt_gain;
  }
  
  unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
  {
  	return default_scale_smt_power(sd, cpu);
  }
  
  unsigned long scale_rt_power(int cpu)
  {
  	struct rq *rq = cpu_rq(cpu);
  	u64 total, available;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
  	total = sched_avg_period() + (rq->clock - rq->age_stamp);
  	available = total - rq->rt_avg;
  
  	if (unlikely((s64)total < SCHED_LOAD_SCALE))
  		total = SCHED_LOAD_SCALE;
  
  	total >>= SCHED_LOAD_SHIFT;
  
  	return div_u64(available, total);
  }
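  /*
   * Standalone sketch (userspace C, not kernel code) of scale_rt_power()
   * above: the share of the averaging period left over after RT activity is
   * returned as a fixed-point factor where TOY_LOAD_UNIT (1024) means the
   * whole cpu is available to CFS.  toy_scale_rt_power(), TOY_LOAD_SHIFT and
   * TOY_LOAD_UNIT are invented names; rt_avg_ns is assumed <= period_ns.
   */
  #define TOY_LOAD_SHIFT	10
  #define TOY_LOAD_UNIT	(1UL << TOY_LOAD_SHIFT)

  static unsigned long toy_scale_rt_power(unsigned long long period_ns,
  					unsigned long long rt_avg_ns)
  {
  	unsigned long long total = period_ns;
  	unsigned long long available = total - rt_avg_ns;

  	if (total < TOY_LOAD_UNIT)	/* mirrors the minimum-total guard above */
  		total = TOY_LOAD_UNIT;

  	total >>= TOY_LOAD_SHIFT;

  	return (unsigned long)(available / total);
  }

  /*
   * Example: a 1 s period with 250 ms of RT time leaves 75% of the cpu, and
   * the returned factor is 768, i.e. three quarters of TOY_LOAD_UNIT.
   */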
  
  static void update_cpu_power(struct sched_domain *sd, int cpu)
  {
669c55e9f   Peter Zijlstra   sched: Pre-comput...
2181
  	unsigned long weight = sd->span_weight;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2182
2183
  	unsigned long power = SCHED_LOAD_SCALE;
  	struct sched_group *sdg = sd->groups;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2184
2185
2186
2187
2188
2189
2190
2191
  	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
  		if (sched_feat(ARCH_POWER))
  			power *= arch_scale_smt_power(sd, cpu);
  		else
  			power *= default_scale_smt_power(sd, cpu);
  
  		power >>= SCHED_LOAD_SHIFT;
  	}
9d5efe05e   Srivatsa Vaddagiri   sched: Fix capaci...
2192
2193
2194
2195
2196
2197
2198
2199
  	sdg->cpu_power_orig = power;
  
  	if (sched_feat(ARCH_POWER))
  		power *= arch_scale_freq_power(sd, cpu);
  	else
  		power *= default_scale_freq_power(sd, cpu);
  
  	power >>= SCHED_LOAD_SHIFT;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2200
2201
2202
2203
2204
  	power *= scale_rt_power(cpu);
  	power >>= SCHED_LOAD_SHIFT;
  
  	if (!power)
  		power = 1;
e51fd5e22   Peter Zijlstra   sched: Fix wake_a...
2205
  	cpu_rq(cpu)->cpu_power = power;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
  	sdg->cpu_power = power;
  }
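  /*
   * Standalone sketch (userspace C, not kernel code) of the cumulative
   * scaling in update_cpu_power() above: starting from one full unit (1024),
   * the power is multiplied in turn by fixed-point SMT, frequency and
   * RT-availability factors (1024 meaning "no reduction"), shifting right by
   * 10 after each step, and is never allowed to reach zero.  toy_cpu_power()
   * and the TOY_* macros are invented names.
   */
  #define TOY_POWER_SHIFT	10
  #define TOY_POWER_UNIT	(1UL << TOY_POWER_SHIFT)

  static unsigned long toy_cpu_power(unsigned long smt_factor,
  				   unsigned long freq_factor,
  				   unsigned long rt_factor)
  {
  	unsigned long power = TOY_POWER_UNIT;

  	power = (power * smt_factor) >> TOY_POWER_SHIFT;
  	power = (power * freq_factor) >> TOY_POWER_SHIFT;
  	power = (power * rt_factor) >> TOY_POWER_SHIFT;

  	return power ? power : 1;	/* never report zero power */
  }

  /*
   * Example: an SMT sibling (factor 589, i.e. smt_gain 1178 split two ways)
   * at full frequency (1024) losing 25% of its time to RT work (factor 768)
   * ends up with power 441 out of 1024.
   */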
  
  static void update_group_power(struct sched_domain *sd, int cpu)
  {
  	struct sched_domain *child = sd->child;
  	struct sched_group *group, *sdg = sd->groups;
  	unsigned long power;
  
  	if (!child) {
  		update_cpu_power(sd, cpu);
  		return;
  	}
  
  	power = 0;
  
  	group = child->groups;
  	do {
  		power += group->cpu_power;
  		group = group->next;
  	} while (group != child->groups);
  
  	sdg->cpu_power = power;
  }
9d5efe05e   Srivatsa Vaddagiri   sched: Fix capaci...
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
  /*
   * Try and fix up capacity for tiny siblings; this is needed when
   * things like SD_ASYM_PACKING need f_b_g to select another sibling
   * which on its own isn't powerful enough.
   *
   * See update_sd_pick_busiest() and check_asym_packing().
   */
  static inline int
  fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  {
  	/*
  	 * Only siblings can have significantly less than SCHED_LOAD_SCALE
  	 */
  	if (sd->level != SD_LV_SIBLING)
  		return 0;
  
  	/*
  	 * If ~90% of the cpu_power is still there, we're good.
  	 */
694f5a111   Michael Neuling   sched: Fix fix_sm...
2249
  	if (group->cpu_power * 32 > group->cpu_power_orig * 29)
9d5efe05e   Srivatsa Vaddagiri   sched: Fix capaci...
2250
2251
2252
2253
  		return 1;
  
  	return 0;
  }
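  /*
   * Standalone sketch (userspace C, not kernel code) of the threshold used by
   * fix_small_capacity() above: a sibling group whose rounded capacity came
   * out as zero is still credited with one unit as long as its current
   * cpu_power stays above 29/32 (~90.6%) of its original cpu_power.
   * toy_sibling_keeps_capacity() is an invented name.
   */
  static int toy_sibling_keeps_capacity(unsigned long cpu_power,
  				      unsigned long cpu_power_orig)
  {
  	return cpu_power * 32 > cpu_power_orig * 29;
  }

  /*
   * Example: with cpu_power_orig = 589 (an SMT sibling), cpu_power has to
   * stay above 589*29/32 ~= 533 for the group to keep a capacity of 1.
   */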
1e3c88bde   Peter Zijlstra   sched: Move load ...
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
  /**
   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   * @sd: The sched_domain whose statistics are to be updated.
   * @group: sched_group whose statistics are to be updated.
   * @this_cpu: Cpu for which load balance is currently performed.
   * @idle: Idle status of this_cpu
   * @load_idx: Load index of sched_domain of this_cpu for load calc.
   * @sd_idle: Idle status of the sched_domain containing group.
   * @local_group: Does group contain this_cpu.
   * @cpus: Set of cpus considered for load balancing.
   * @balance: Should we balance.
   * @sgs: variable to hold the statistics for this group.
   */
  static inline void update_sg_lb_stats(struct sched_domain *sd,
  			struct sched_group *group, int this_cpu,
  			enum cpu_idle_type idle, int load_idx, int *sd_idle,
  			int local_group, const struct cpumask *cpus,
  			int *balance, struct sg_lb_stats *sgs)
  {
  	unsigned long load, max_cpu_load, min_cpu_load;
  	int i;
  	unsigned int balance_cpu = -1, first_idle_cpu = 0;
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
2276
  	unsigned long avg_load_per_task = 0;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2277

871e35bc9   Gautham R Shenoy   sched: Fix the pl...
2278
  	if (local_group)
1e3c88bde   Peter Zijlstra   sched: Move load ...
2279
  		balance_cpu = group_first_cpu(group);
1e3c88bde   Peter Zijlstra   sched: Move load ...
2280
2281
  
  	/* Tally up the load of all CPUs in the group */
1e3c88bde   Peter Zijlstra   sched: Move load ...
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
  	max_cpu_load = 0;
  	min_cpu_load = ~0UL;
  
  	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
  		struct rq *rq = cpu_rq(i);
  
  		if (*sd_idle && rq->nr_running)
  			*sd_idle = 0;
  
  		/* Bias balancing toward cpus of our domain */
  		if (local_group) {
  			if (idle_cpu(i) && !first_idle_cpu) {
  				first_idle_cpu = 1;
  				balance_cpu = i;
  			}
  
  			load = target_load(i, load_idx);
  		} else {
  			load = source_load(i, load_idx);
  			if (load > max_cpu_load)
  				max_cpu_load = load;
  			if (min_cpu_load > load)
  				min_cpu_load = load;
  		}
  
  		sgs->group_load += load;
  		sgs->sum_nr_running += rq->nr_running;
  		sgs->sum_weighted_load += weighted_cpuload(i);
1e3c88bde   Peter Zijlstra   sched: Move load ...
2310
2311
2312
2313
2314
2315
2316
2317
  	}
  
  	/*
  	 * The first idle cpu or the first cpu (busiest) in this sched group
  	 * is eligible for doing load balancing at this and higher
  	 * domains. In the newly idle case, we will allow all the cpus
  	 * to do the newly idle load balance.
  	 */
bbc8cb5ba   Peter Zijlstra   sched: Reduce upd...
2318
2319
2320
2321
2322
2323
  	if (idle != CPU_NEWLY_IDLE && local_group) {
  		if (balance_cpu != this_cpu) {
  			*balance = 0;
  			return;
  		}
  		update_group_power(sd, this_cpu);
1e3c88bde   Peter Zijlstra   sched: Move load ...
2324
2325
2326
2327
  	}
  
  	/* Adjust by relative CPU power of the group */
  	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2328
2329
2330
2331
2332
2333
2334
2335
2336
  	/*
  	 * Consider the group unbalanced when the imbalance is larger
  	 * than the average weight of two tasks.
  	 *
  	 * APZ: with cgroup the avg task weight can vary wildly and
  	 *      might not be a suitable number - should we keep a
  	 *      normalized nr_running number somewhere that negates
  	 *      the hierarchy?
  	 */
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
2337
2338
  	if (sgs->sum_nr_running)
  		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
1e3c88bde   Peter Zijlstra   sched: Move load ...
2339
2340
2341
2342
2343
2344
  
  	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
  		sgs->group_imb = 1;
  
  	sgs->group_capacity =
  		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
  	if (!sgs->group_capacity)
  		sgs->group_capacity = fix_small_capacity(sd, group);
  }
  
  /**
   * update_sd_pick_busiest - return 1 on busiest group
   * @sd: sched_domain whose statistics are to be checked
   * @sds: sched_domain statistics
   * @sg: sched_group candidate to be checked for being the busiest
   * @sgs: sched_group statistics
   * @this_cpu: the current cpu
   *
   * Determine if @sg is a busier group than the previously selected
   * busiest group.
   */
  static bool update_sd_pick_busiest(struct sched_domain *sd,
  				   struct sd_lb_stats *sds,
  				   struct sched_group *sg,
  				   struct sg_lb_stats *sgs,
  				   int this_cpu)
  {
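	/*
	 * A group at or below the current maximum average load cannot be the
	 * busiest; one running more tasks than it has capacity for, or one
	 * flagged as internally imbalanced, always is.
	 */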
  	if (sgs->avg_load <= sds->max_load)
  		return false;
  
  	if (sgs->sum_nr_running > sgs->group_capacity)
  		return true;
  
  	if (sgs->group_imb)
  		return true;
  
  	/*
  	 * ASYM_PACKING needs to move all the work to the lowest
  	 * numbered CPUs in the group, therefore mark all groups
  	 * higher than ourself as busy.
  	 */
  	if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
  	    this_cpu < group_first_cpu(sg)) {
  		if (!sds->busiest)
  			return true;
  
  		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
  			return true;
  	}
  
  	return false;
  }
  
  /**
   * update_sd_lb_stats - Update sched_group's statistics for load balancing.
   * @sd: sched_domain whose statistics are to be updated.
   * @this_cpu: Cpu for which load balance is currently performed.
   * @idle: Idle status of this_cpu
   * @sd_idle: Idle status of the sched_domain containing sg.
   * @cpus: Set of cpus considered for load balancing.
   * @balance: Should we balance.
   * @sds: variable to hold the statistics for this sched_domain.
   */
  static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
  			enum cpu_idle_type idle, int *sd_idle,
  			const struct cpumask *cpus, int *balance,
  			struct sd_lb_stats *sds)
  {
  	struct sched_domain *child = sd->child;
  	struct sched_group *sg = sd->groups;
  	struct sg_lb_stats sgs;
  	int load_idx, prefer_sibling = 0;
  
  	if (child && child->flags & SD_PREFER_SIBLING)
  		prefer_sibling = 1;
  
  	init_sd_power_savings_stats(sd, sds, idle);
  	load_idx = get_sd_load_idx(sd, idle);
  
  	do {
  		int local_group;
  		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
  		memset(&sgs, 0, sizeof(sgs));
  		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
  				local_group, cpus, balance, &sgs);
  		if (local_group && !(*balance))
  			return;
  
  		sds->total_load += sgs.group_load;
  		sds->total_pwr += sg->cpu_power;
  
  		/*
  		 * In case the child domain prefers tasks go to siblings
  		 * first, lower the sg capacity to one so that we'll try
  		 * and move all the excess tasks away.
  		 */
  		if (prefer_sibling)
  			sgs.group_capacity = min(sgs.group_capacity, 1UL);
  
  		if (local_group) {
  			sds->this_load = sgs.avg_load;
  			sds->this = sg;
  			sds->this_nr_running = sgs.sum_nr_running;
  			sds->this_load_per_task = sgs.sum_weighted_load;
  		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
  			sds->max_load = sgs.avg_load;
  			sds->busiest = sg;
  			sds->busiest_nr_running = sgs.sum_nr_running;
  			sds->busiest_group_capacity = sgs.group_capacity;
  			sds->busiest_load_per_task = sgs.sum_weighted_load;
  			sds->group_imb = sgs.group_imb;
  		}
  		update_sd_power_savings_stats(sg, sds, local_group, &sgs);
  		sg = sg->next;
  	} while (sg != sd->groups);
  }
  int __weak arch_sd_sibling_asym_packing(void)
  {
         return 0*SD_ASYM_PACKING;
  }
  
  /**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
   *
 * This is primarily intended to be used at the sibling level.  Some
   * cores like POWER7 prefer to use lower numbered SMT threads.  In the
   * case of POWER7, it can move to lower SMT modes only when higher
   * threads are idle.  When in lower SMT modes, the threads will
 * perform better since they share fewer core resources.  Hence when we
   * have idle threads, we want them to be the higher ones.
   *
   * This packing function is run on idle threads.  It checks to see if
   * the busiest CPU in this domain (core in the P7 case) has a higher
   * CPU number than the packing function is being run on.  Here we are
 * assuming a lower CPU number is equivalent to a lower SMT thread
 * number.
   *
   * Returns 1 when packing is required and a task should be moved to
   * this CPU.  The amount of the imbalance is returned in *imbalance.
   *
   * @sd: The sched_domain whose packing is to be checked.
   * @sds: Statistics of the sched_domain which is to be packed
   * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: returns amount of imbalance due to packing.
   */
  static int check_asym_packing(struct sched_domain *sd,
  			      struct sd_lb_stats *sds,
  			      int this_cpu, unsigned long *imbalance)
  {
  	int busiest_cpu;
  
  	if (!(sd->flags & SD_ASYM_PACKING))
  		return 0;
  
  	if (!sds->busiest)
  		return 0;
  
  	busiest_cpu = group_first_cpu(sds->busiest);
  	if (this_cpu > busiest_cpu)
  		return 0;
  
  	*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
  				       SCHED_LOAD_SCALE);
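	/*
	 * max_load is normalized by cpu_power; the line above converts it
	 * back into absolute weighted load so the caller knows how much to
	 * pull towards this lower-numbered cpu.
	 */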
  	return 1;
  }
  
  /**
   * fix_small_imbalance - Calculate the minor imbalance that exists
   *			amongst the groups of a sched_domain, during
   *			load balancing.
   * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
   * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
   * @imbalance: Variable to store the imbalance.
   */
  static inline void fix_small_imbalance(struct sd_lb_stats *sds,
  				int this_cpu, unsigned long *imbalance)
  {
  	unsigned long tmp, pwr_now = 0, pwr_move = 0;
  	unsigned int imbn = 2;
  	unsigned long scaled_busy_load_per_task;
  
  	if (sds->this_nr_running) {
  		sds->this_load_per_task /= sds->this_nr_running;
  		if (sds->busiest_load_per_task >
  				sds->this_load_per_task)
  			imbn = 1;
  	} else
  		sds->this_load_per_task =
  			cpu_avg_load_per_task(this_cpu);
  	scaled_busy_load_per_task = sds->busiest_load_per_task
  						 * SCHED_LOAD_SCALE;
  	scaled_busy_load_per_task /= sds->busiest->cpu_power;
  
  	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
  			(scaled_busy_load_per_task * imbn)) {
  		*imbalance = sds->busiest_load_per_task;
  		return;
  	}
  
  	/*
  	 * OK, we don't have enough imbalance to justify moving tasks,
  	 * however we may be able to increase total CPU power used by
  	 * moving them.
  	 */
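	/*
	 * pwr_now estimates the CPU power usefully consumed as things stand,
	 * pwr_move the same quantity after hypothetically moving one
	 * busiest-group task here; a move is only recommended if the
	 * estimate improves.
	 */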
  
  	pwr_now += sds->busiest->cpu_power *
  			min(sds->busiest_load_per_task, sds->max_load);
  	pwr_now += sds->this->cpu_power *
  			min(sds->this_load_per_task, sds->this_load);
  	pwr_now /= SCHED_LOAD_SCALE;
  
  	/* Amount of load we'd subtract */
  	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
  		sds->busiest->cpu_power;
  	if (sds->max_load > tmp)
  		pwr_move += sds->busiest->cpu_power *
  			min(sds->busiest_load_per_task, sds->max_load - tmp);
  
  	/* Amount of load we'd add */
  	if (sds->max_load * sds->busiest->cpu_power <
  		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
  		tmp = (sds->max_load * sds->busiest->cpu_power) /
  			sds->this->cpu_power;
  	else
  		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
  			sds->this->cpu_power;
  	pwr_move += sds->this->cpu_power *
  			min(sds->this_load_per_task, sds->this_load + tmp);
  	pwr_move /= SCHED_LOAD_SCALE;
  
  	/* Move if we gain throughput */
  	if (pwr_move > pwr_now)
  		*imbalance = sds->busiest_load_per_task;
  }
  
  /**
   * calculate_imbalance - Calculate the amount of imbalance present within the
   *			 groups of a given sched_domain during load balance.
   * @sds: statistics of the sched_domain whose imbalance is to be calculated.
   * @this_cpu: Cpu for which currently load balance is being performed.
   * @imbalance: The variable to store the imbalance.
   */
  static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  		unsigned long *imbalance)
  {
  	unsigned long max_pull, load_above_capacity = ~0UL;
  
  	sds->busiest_load_per_task /= sds->busiest_nr_running;
  	if (sds->group_imb) {
  		sds->busiest_load_per_task =
  			min(sds->busiest_load_per_task, sds->avg_load);
  	}
  	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load (as we skip the groups at or below
	 * their cpu_power while calculating max_load).
  	 */
  	if (sds->max_load < sds->avg_load) {
  		*imbalance = 0;
  		return fix_small_imbalance(sds, this_cpu, imbalance);
  	}
  	if (!sds->group_imb) {
  		/*
  		 * Don't want to pull so many tasks that a group would go idle.
  		 */
  		load_above_capacity = (sds->busiest_nr_running -
  						sds->busiest_group_capacity);
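		/*
		 * The excess is a plain task count; scale it by
		 * SCHED_LOAD_SCALE and normalize by cpu_power so that it is
		 * comparable with avg_load/max_load above.
		 */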
  
  		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
  
  		load_above_capacity /= sds->busiest->cpu_power;
  	}
  
  	/*
  	 * We're trying to get all the cpus to the average_load, so we don't
  	 * want to push ourselves above the average load, nor do we wish to
  	 * reduce the max loaded cpu below the average load. At the same time,
  	 * we also don't want to reduce the group load below the group capacity
  	 * (so that we can implement power-savings policies etc). Thus we look
  	 * for the minimum possible imbalance.
  	 * Be careful of negative numbers as they'll appear as very large values
  	 * with unsigned longs.
  	 */
  	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
  
  	/* How much load to actually move to equalise the imbalance */
  	*imbalance = min(max_pull * sds->busiest->cpu_power,
  		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
  			/ SCHED_LOAD_SCALE;
  
  	/*
	 * If *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved, so let
	 * fix_small_imbalance() decide whether to bump its value to force
	 * at least one task to be moved.
  	 */
  	if (*imbalance < sds->busiest_load_per_task)
  		return fix_small_imbalance(sds, this_cpu, imbalance);
  
  }
  /******* find_busiest_group() helpers end here *********************/
  
  /**
   * find_busiest_group - Returns the busiest group within the sched_domain
   * if there is an imbalance. If there isn't an imbalance, and
   * the user has opted for power-savings, it returns a group whose
   * CPUs can be put to idle by rebalancing those tasks elsewhere, if
   * such a group exists.
   *
   * Also calculates the amount of weighted load which should be moved
   * to restore balance.
   *
   * @sd: The sched_domain whose busiest group is to be returned.
   * @this_cpu: The cpu for which load balancing is currently being performed.
   * @imbalance: Variable which stores amount of weighted load which should
   *		be moved to restore balance/put a group to idle.
   * @idle: The idle status of this_cpu.
   * @sd_idle: The idleness of sd
   * @cpus: The set of CPUs under consideration for load-balancing.
   * @balance: Pointer to a variable indicating if this_cpu
   *	is the appropriate cpu to perform load balancing at this_level.
   *
   * Returns:	- the busiest group if imbalance exists.
   *		- If no imbalance and user has opted for power-savings balance,
   *		   return the least loaded group whose CPUs can be
   *		   put to idle by rebalancing its tasks onto our group.
   */
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
  		   unsigned long *imbalance, enum cpu_idle_type idle,
  		   int *sd_idle, const struct cpumask *cpus, int *balance)
  {
  	struct sd_lb_stats sds;
  
  	memset(&sds, 0, sizeof(sds));
  
  	/*
	 * Compute the various statistics relevant for load balancing at
  	 * this level.
  	 */
  	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
  					balance, &sds);
  
  	/* Cases where imbalance does not exist from POV of this_cpu */
  	/* 1) this_cpu is not the appropriate cpu to perform load balancing
  	 *    at this level.
  	 * 2) There is no busy sibling group to pull from.
  	 * 3) This group is the busiest group.
	 * 4) This group is busier than the average busyness at this
  	 *    sched_domain.
  	 * 5) The imbalance is within the specified limit.
  	 */
  	if (!(*balance))
  		goto ret;
  	if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
  	    check_asym_packing(sd, &sds, this_cpu, imbalance))
  		return sds.busiest;
  	if (!sds.busiest || sds.busiest_nr_running == 0)
  		goto out_balanced;
  
  	if (sds.this_load >= sds.max_load)
  		goto out_balanced;
  
  	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
  
  	if (sds.this_load >= sds.avg_load)
  		goto out_balanced;
  
  	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
  		goto out_balanced;
  	/* Looks like there is an imbalance. Compute it */
  	calculate_imbalance(&sds, this_cpu, imbalance);
  	return sds.busiest;
  
  out_balanced:
  	/*
  	 * There is no obvious imbalance. But check if we can do some balancing
  	 * to save power.
  	 */
  	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
  		return sds.busiest;
  ret:
  	*imbalance = 0;
  	return NULL;
  }
  
  /*
   * find_busiest_queue - find the busiest runqueue among the cpus in group.
   */
  static struct rq *
  find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
  		   enum cpu_idle_type idle, unsigned long imbalance,
  		   const struct cpumask *cpus)
  {
  	struct rq *busiest = NULL, *rq;
  	unsigned long max_load = 0;
  	int i;
  
  	for_each_cpu(i, sched_group_cpus(group)) {
  		unsigned long power = power_of(i);
  		unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
  		unsigned long wl;
  		if (!capacity)
  			capacity = fix_small_capacity(sd, group);
  		if (!cpumask_test_cpu(i, cpus))
  			continue;
  
  		rq = cpu_rq(i);
  		wl = weighted_cpuload(i);

  		/*
  		 * When comparing with imbalance, use weighted_cpuload()
  		 * which is not scaled with the cpu power.
  		 */
  		if (capacity && rq->nr_running == 1 && wl > imbalance)
  			continue;
  		/*
  		 * For the load comparisons with the other cpu's, consider
  		 * the weighted_cpuload() scaled with the cpu power, so that
  		 * the load can be moved away from the cpu that is potentially
  		 * running at a lower capacity.
  		 */
  		wl = (wl * SCHED_LOAD_SCALE) / power;
  		if (wl > max_load) {
  			max_load = wl;
  			busiest = rq;
  		}
  	}
  
  	return busiest;
  }
  
  /*
   * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
   * so long as it is large enough.
   */
  #define MAX_PINNED_INTERVAL	512
  
  /* Working cpumask for load_balance and load_balance_newidle. */
  static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
  static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
  			       int busiest_cpu, int this_cpu)
  {
  	if (idle == CPU_NEWLY_IDLE) {
  
  		/*
  		 * ASYM_PACKING needs to force migrate tasks from busy but
  		 * higher numbered CPUs in order to pack all tasks in the
  		 * lowest numbered CPUs.
  		 */
  		if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
  			return 1;
  		/*
  		 * The only task running in a non-idle cpu can be moved to this
		 * cpu in an attempt to completely free up the other CPU
  		 * package.
  		 *
  		 * The package power saving logic comes from
		 * find_busiest_group(). If there is no imbalance, then
  		 * f_b_g() will return NULL. However when sched_mc={1,2} then
  		 * f_b_g() will select a group from which a running task may be
  		 * pulled to this cpu in order to make the other package idle.
  		 * If there is no opportunity to make a package idle and if
		 * there is no imbalance, then f_b_g() will return NULL and no
  		 * action will be taken in load_balance_newidle().
  		 *
  		 * Under normal task pull operation due to imbalance, there
  		 * will be more than one task in the source run queue and
  		 * move_tasks() will succeed.  ld_moved will be true and this
  		 * active balance code will not be triggered.
  		 */
  		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
  		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
  			return 0;
  
  		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
  			return 0;
  	}
  
  	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
  }
  static int active_load_balance_cpu_stop(void *data);
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
  			struct sched_domain *sd, enum cpu_idle_type idle,
  			int *balance)
  {
  	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
  	struct sched_group *group;
  	unsigned long imbalance;
  	struct rq *busiest;
  	unsigned long flags;
  	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
  
  	cpumask_copy(cpus, cpu_active_mask);
  
  	/*
  	 * When power savings policy is enabled for the parent domain, idle
  	 * sibling can pick up load irrespective of busy siblings. In this case,
  	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
  	 * portraying it as CPU_NOT_IDLE.
  	 */
  	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
  	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
  		sd_idle = 1;
  
  	schedstat_inc(sd, lb_count[idle]);
  
  redo:
  	update_shares(sd);
  	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
  				   cpus, balance);
  
  	if (*balance == 0)
  		goto out_balanced;
  
  	if (!group) {
  		schedstat_inc(sd, lb_nobusyg[idle]);
  		goto out_balanced;
  	}
  	busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
  	if (!busiest) {
  		schedstat_inc(sd, lb_nobusyq[idle]);
  		goto out_balanced;
  	}
  
  	BUG_ON(busiest == this_rq);
  
  	schedstat_add(sd, lb_imbalance[idle], imbalance);
  
  	ld_moved = 0;
  	if (busiest->nr_running > 1) {
  		/*
  		 * Attempt to move tasks. If find_busiest_group has found
  		 * an imbalance but busiest->nr_running <= 1, the group is
  		 * still unbalanced. ld_moved simply stays zero, so it is
  		 * correctly treated as an imbalance.
  		 */
  		local_irq_save(flags);
  		double_rq_lock(this_rq, busiest);
  		ld_moved = move_tasks(this_rq, this_cpu, busiest,
  				      imbalance, sd, idle, &all_pinned);
  		double_rq_unlock(this_rq, busiest);
  		local_irq_restore(flags);
  
  		/*
  		 * some other cpu did the load balance for us.
  		 */
  		if (ld_moved && this_cpu != smp_processor_id())
  			resched_cpu(this_cpu);
  
  		/* All tasks on this runqueue were pinned by CPU affinity */
  		if (unlikely(all_pinned)) {
  			cpumask_clear_cpu(cpu_of(busiest), cpus);
  			if (!cpumask_empty(cpus))
  				goto redo;
  			goto out_balanced;
  		}
  	}
  
  	if (!ld_moved) {
  		schedstat_inc(sd, lb_failed[idle]);
  		sd->nr_balance_failed++;
  		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
  					this_cpu)) {
  			raw_spin_lock_irqsave(&busiest->lock, flags);
  			/* don't kick the active_load_balance_cpu_stop,
  			 * if the curr task on busiest cpu can't be
  			 * moved to this_cpu
  			 */
  			if (!cpumask_test_cpu(this_cpu,
  					      &busiest->curr->cpus_allowed)) {
  				raw_spin_unlock_irqrestore(&busiest->lock,
  							    flags);
  				all_pinned = 1;
  				goto out_one_pinned;
  			}
  			/*
  			 * ->active_balance synchronizes accesses to
  			 * ->active_balance_work.  Once set, it's cleared
  			 * only after active load balance is finished.
  			 */
  			if (!busiest->active_balance) {
  				busiest->active_balance = 1;
  				busiest->push_cpu = this_cpu;
  				active_balance = 1;
  			}
  			raw_spin_unlock_irqrestore(&busiest->lock, flags);

  			if (active_balance)
  				stop_one_cpu_nowait(cpu_of(busiest),
  					active_load_balance_cpu_stop, busiest,
  					&busiest->active_balance_work);
  
  			/*
  			 * We've kicked active balancing, reset the failure
  			 * counter.
  			 */
  			sd->nr_balance_failed = sd->cache_nice_tries+1;
  		}
  	} else
  		sd->nr_balance_failed = 0;
  
  	if (likely(!active_balance)) {
  		/* We were unbalanced, so reset the balancing interval */
  		sd->balance_interval = sd->min_interval;
  	} else {
  		/*
  		 * If we've begun active balancing, start to back off. This
  		 * case may not be covered by the all_pinned logic if there
  		 * is only 1 task on the busy runqueue (because we don't call
  		 * move_tasks).
  		 */
  		if (sd->balance_interval < sd->max_interval)
  			sd->balance_interval *= 2;
  	}
  
  	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
  	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
  		ld_moved = -1;
  
  	goto out;
  
  out_balanced:
  	schedstat_inc(sd, lb_balanced[idle]);
  
  	sd->nr_balance_failed = 0;
  
  out_one_pinned:
  	/* tune up the balancing interval */
  	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
  			(sd->balance_interval < sd->max_interval))
  		sd->balance_interval *= 2;
  
  	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
  	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
  		ld_moved = -1;
  	else
  		ld_moved = 0;
  out:
  	if (ld_moved)
  		update_shares(sd);
  	return ld_moved;
  }
  
  /*
   * idle_balance is called by schedule() if this_cpu is about to become
   * idle. Attempts to pull tasks from other CPUs.
   */
  static void idle_balance(int this_cpu, struct rq *this_rq)
  {
  	struct sched_domain *sd;
  	int pulled_task = 0;
  	unsigned long next_balance = jiffies + HZ;
  
  	this_rq->idle_stamp = this_rq->clock;
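	/*
	 * If this cpu's average idle period is shorter than the cost of a
	 * migration, a newly-idle pull is unlikely to pay off, so bail out
	 * before touching any domain.
	 */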
  
  	if (this_rq->avg_idle < sysctl_sched_migration_cost)
  		return;
  	/*
  	 * Drop the rq->lock, but keep IRQ/preempt disabled.
  	 */
  	raw_spin_unlock(&this_rq->lock);
  	for_each_domain(this_cpu, sd) {
  		unsigned long interval;
  		int balance = 1;
  
  		if (!(sd->flags & SD_LOAD_BALANCE))
  			continue;
  		if (sd->flags & SD_BALANCE_NEWIDLE) {
  			/* If we've pulled tasks over stop searching: */
  			pulled_task = load_balance(this_cpu, this_rq,
  						   sd, CPU_NEWLY_IDLE, &balance);
  		}
  
  		interval = msecs_to_jiffies(sd->balance_interval);
  		if (time_after(next_balance, sd->last_balance + interval))
  			next_balance = sd->last_balance + interval;
  		if (pulled_task) {
  			this_rq->idle_stamp = 0;
  			break;
  		}
  	}
  
  	raw_spin_lock(&this_rq->lock);
  	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
  		/*
  		 * We are going idle. next_balance may be set based on
  		 * a busy processor. So reset next_balance.
  		 */
  		this_rq->next_balance = next_balance;
  	}
  }
  
  /*
   * active_load_balance_cpu_stop is run by cpu stopper. It pushes
   * running tasks off the busiest CPU onto idle CPUs. It requires at
   * least 1 task to be running on each physical CPU where possible, and
   * avoids physical / logical imbalances.
   */
  static int active_load_balance_cpu_stop(void *data)
  {
  	struct rq *busiest_rq = data;
  	int busiest_cpu = cpu_of(busiest_rq);
  	int target_cpu = busiest_rq->push_cpu;
  	struct rq *target_rq = cpu_rq(target_cpu);
  	struct sched_domain *sd;
  
  	raw_spin_lock_irq(&busiest_rq->lock);
  
  	/* make sure the requested cpu hasn't gone down in the meantime */
  	if (unlikely(busiest_cpu != smp_processor_id() ||
  		     !busiest_rq->active_balance))
  		goto out_unlock;
  
  	/* Is there any task to move? */
  	if (busiest_rq->nr_running <= 1)
  		goto out_unlock;
  
  	/*
  	 * This condition is "impossible", if it occurs
  	 * we need to fix it. Originally reported by
  	 * Bjorn Helgaas on a 128-cpu setup.
  	 */
  	BUG_ON(busiest_rq == target_rq);
  
  	/* move a task from busiest_rq to target_rq */
  	double_lock_balance(busiest_rq, target_rq);
  
  	/* Search for an sd spanning us and the target CPU. */
  	for_each_domain(target_cpu, sd) {
  		if ((sd->flags & SD_LOAD_BALANCE) &&
  		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
  				break;
  	}
  
  	if (likely(sd)) {
  		schedstat_inc(sd, alb_count);
  
  		if (move_one_task(target_rq, target_cpu, busiest_rq,
  				  sd, CPU_IDLE))
  			schedstat_inc(sd, alb_pushed);
  		else
  			schedstat_inc(sd, alb_failed);
  	}
  	double_unlock_balance(busiest_rq, target_rq);
  out_unlock:
  	busiest_rq->active_balance = 0;
  	raw_spin_unlock_irq(&busiest_rq->lock);
  	return 0;
  }
  
  #ifdef CONFIG_NO_HZ
  
  static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
  
  static void trigger_sched_softirq(void *data)
  {
  	raise_softirq_irqoff(SCHED_SOFTIRQ);
  }
  
  static inline void init_sched_softirq_csd(struct call_single_data *csd)
  {
  	csd->func = trigger_sched_softirq;
  	csd->info = NULL;
  	csd->flags = 0;
  	csd->priv = 0;
  }
  
  /*
   * idle load balancing details
   * - One of the idle CPUs nominates itself as idle load_balancer, while
   *   entering idle.
   * - This idle load balancer CPU will also go into tickless mode when
   *   it is idle, just like all other idle CPUs
 * - When one of the busy CPUs notices that there may be an idle rebalancing
 *   needed, it will kick the idle load balancer, which then does idle
   *   load balancing for all the idle CPUs.
   */
  static struct {
  	atomic_t load_balancer;
  	atomic_t first_pick_cpu;
  	atomic_t second_pick_cpu;
  	cpumask_var_t idle_cpus_mask;
  	cpumask_var_t grp_idle_mask;
  	unsigned long next_balance;     /* in jiffy units */
  } nohz ____cacheline_aligned;
  
  int get_nohz_load_balancer(void)
  {
  	return atomic_read(&nohz.load_balancer);
  }
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  /**
   * lowest_flag_domain - Return lowest sched_domain containing flag.
   * @cpu:	The cpu whose lowest level of sched domain is to
   *		be returned.
   * @flag:	The flag to check for the lowest sched_domain
   *		for the given cpu.
   *
   * Returns the lowest sched_domain of a cpu which contains the given flag.
   */
  static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  {
  	struct sched_domain *sd;
  
  	for_each_domain(cpu, sd)
  		if (sd && (sd->flags & flag))
  			break;
  
  	return sd;
  }
  
  /**
   * for_each_flag_domain - Iterates over sched_domains containing the flag.
   * @cpu:	The cpu whose domains we're iterating over.
   * @sd:		variable holding the value of the power_savings_sd
   *		for cpu.
   * @flag:	The flag to filter the sched_domains to be iterated.
   *
   * Iterates over all the scheduler domains for a given cpu that has the 'flag'
   * set, starting from the lowest sched_domain to the highest.
   */
  #define for_each_flag_domain(cpu, sd, flag) \
  	for (sd = lowest_flag_domain(cpu, flag); \
  		(sd && (sd->flags & flag)); sd = sd->parent)
  
  /**
   * is_semi_idle_group - Checks if the given sched_group is semi-idle.
   * @ilb_group:	group to be checked for semi-idleness
   *
   * Returns:	1 if the group is semi-idle. 0 otherwise.
   *
 * We define a sched_group to be semi-idle if it has at least one idle CPU
 * and at least one non-idle CPU. This helper function checks if the given
   * sched_group is semi-idle or not.
   */
  static inline int is_semi_idle_group(struct sched_group *ilb_group)
  {
  	cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
  					sched_group_cpus(ilb_group));
  
  	/*
	 * A sched_group is semi-idle when it has at least one busy cpu
	 * and at least one idle cpu.
  	 */
  	if (cpumask_empty(nohz.grp_idle_mask))
  		return 0;
  	if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
  		return 0;
  
  	return 1;
  }
  /**
   * find_new_ilb - Finds the optimum idle load balancer for nomination.
   * @cpu:	The cpu which is nominating a new idle_load_balancer.
   *
   * Returns:	Returns the id of the idle load balancer if it exists,
   *		Else, returns >= nr_cpu_ids.
   *
   * This algorithm picks the idle load balancer such that it belongs to a
   * semi-idle powersavings sched_domain. The idea is to try and avoid
   * completely idle packages/cores just for the purpose of idle load balancing
 * when there are other idle cpus which are better suited for that job.
   */
  static int find_new_ilb(int cpu)
  {
  	struct sched_domain *sd;
  	struct sched_group *ilb_group;
  
  	/*
  	 * Have idle load balancer selection from semi-idle packages only
  	 * when power-aware load balancing is enabled
  	 */
  	if (!(sched_smt_power_savings || sched_mc_power_savings))
  		goto out_done;
  
  	/*
  	 * Optimize for the case when we have no idle CPUs or only one
  	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
  	 */
  	if (cpumask_weight(nohz.idle_cpus_mask) < 2)
  		goto out_done;
  
  	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
  		ilb_group = sd->groups;
  
  		do {
  			if (is_semi_idle_group(ilb_group))
  				return cpumask_first(nohz.grp_idle_mask);
  
  			ilb_group = ilb_group->next;
  
  		} while (ilb_group != sd->groups);
  	}
  
  out_done:
  	return nr_cpu_ids;
  }
  #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
  static inline int find_new_ilb(int call_cpu)
  {
  	return nr_cpu_ids;
  }
  #endif
  
  /*
   * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
   * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
   * CPU (if there is one).
   */
  static void nohz_balancer_kick(int cpu)
  {
  	int ilb_cpu;
  
  	nohz.next_balance++;
  
  	ilb_cpu = get_nohz_load_balancer();
  
  	if (ilb_cpu >= nr_cpu_ids) {
  		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
  		if (ilb_cpu >= nr_cpu_ids)
  			return;
  	}
  
  	if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
  		struct call_single_data *cp;
  
  		cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
  		cp = &per_cpu(remote_sched_softirq_cb, cpu);
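		/*
		 * IPI the chosen ilb cpu; trigger_sched_softirq() raises
		 * SCHED_SOFTIRQ there, and run_rebalance_domains() then acts
		 * on the nohz_balance_kick flag set above.
		 */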
  		__smp_call_function_single(ilb_cpu, cp, 0);
  	}
  	return;
  }
  
  /*
   * This routine will try to nominate the ilb (idle load balancing)
   * owner among the cpus whose ticks are stopped. ilb owner will do the idle
   * load balancing on behalf of all those cpus.
   *
 * When the ilb owner becomes busy, we will not have a new ilb owner until some
   * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
   * idle load balancing by kicking one of the idle CPUs.
   *
   * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
   * ilb owner CPU in future (when there is a need for idle load balancing on
   * behalf of all idle CPUs).
   */
  void select_nohz_load_balancer(int stop_tick)
  {
  	int cpu = smp_processor_id();
  
  	if (stop_tick) {
  		if (!cpu_active(cpu)) {
  			if (atomic_read(&nohz.load_balancer) != cpu)
  				return;
  
  			/*
  			 * If we are going offline and still the leader,
  			 * give up!
  			 */
  			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
  					   nr_cpu_ids) != cpu)
  				BUG();
  			return;
  		}
  		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
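		/*
		 * A cpu that stops its tick can no longer serve as the
		 * first/second kicker, so release those slots if we hold
		 * them.
		 */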
  		if (atomic_read(&nohz.first_pick_cpu) == cpu)
  			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
  		if (atomic_read(&nohz.second_pick_cpu) == cpu)
  			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);

  		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
  			int new_ilb;
  			/* make me the ilb owner */
  			if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
  					   cpu) != nr_cpu_ids)
  				return;
  			/*
  			 * Check to see if there is a more power-efficient
  			 * ilb.
  			 */
  			new_ilb = find_new_ilb(cpu);
  			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
  				atomic_set(&nohz.load_balancer, nr_cpu_ids);
  				resched_cpu(new_ilb);
  				return;
  			}
  			return;
  		}
  	} else {
  		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
  			return;

  		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
  
  		if (atomic_read(&nohz.load_balancer) == cpu)
  			if (atomic_cmpxchg(&nohz.load_balancer, cpu,
  					   nr_cpu_ids) != cpu)
  				BUG();
  	}
  	return;
  }
  #endif
  
  static DEFINE_SPINLOCK(balancing);
  
  /*
   * It checks each scheduling domain to see if it is due to be balanced,
   * and initiates a balancing operation if so.
   *
   * Balancing parameters are set up in arch_init_sched_domains.
   */
  static void rebalance_domains(int cpu, enum cpu_idle_type idle)
  {
  	int balance = 1;
  	struct rq *rq = cpu_rq(cpu);
  	unsigned long interval;
  	struct sched_domain *sd;
  	/* Earliest time when we have to do rebalance again */
  	unsigned long next_balance = jiffies + 60*HZ;
  	int update_next_balance = 0;
  	int need_serialize;
  
  	for_each_domain(cpu, sd) {
  		if (!(sd->flags & SD_LOAD_BALANCE))
  			continue;
  
  		interval = sd->balance_interval;
  		if (idle != CPU_IDLE)
  			interval *= sd->busy_factor;
  
  		/* scale ms to jiffies */
  		interval = msecs_to_jiffies(interval);
  		if (unlikely(!interval))
  			interval = 1;
  		if (interval > HZ*NR_CPUS/10)
  			interval = HZ*NR_CPUS/10;
  
  		need_serialize = sd->flags & SD_SERIALIZE;
  
  		if (need_serialize) {
  			if (!spin_trylock(&balancing))
  				goto out;
  		}
  
  		if (time_after_eq(jiffies, sd->last_balance + interval)) {
  			if (load_balance(cpu, rq, sd, idle, &balance)) {
  				/*
  				 * We've pulled tasks over so either we're no
  				 * longer idle, or one of our SMT siblings is
  				 * not idle.
  				 */
  				idle = CPU_NOT_IDLE;
  			}
  			sd->last_balance = jiffies;
  		}
  		if (need_serialize)
  			spin_unlock(&balancing);
  out:
  		if (time_after(next_balance, sd->last_balance + interval)) {
  			next_balance = sd->last_balance + interval;
  			update_next_balance = 1;
  		}
  
  		/*
  		 * Stop the load balance at this level. There is another
  		 * CPU in our sched group which is doing load balancing more
  		 * actively.
  		 */
  		if (!balance)
  			break;
  	}
  
  	/*
  	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to a null domain, for example, it will not be
  	 * updated.
  	 */
  	if (likely(update_next_balance))
  		rq->next_balance = next_balance;
  }
  #ifdef CONFIG_NO_HZ
  /*
   * In CONFIG_NO_HZ case, the idle balance kickee will do the
   * rebalancing for all the cpus for whom scheduler ticks are stopped.
   */
  static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
  {
  	struct rq *this_rq = cpu_rq(this_cpu);
  	struct rq *rq;
  	int balance_cpu;
  
  	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
  		return;
  
  	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
  		if (balance_cpu == this_cpu)
  			continue;
  
  		/*
  		 * If this cpu gets work to do, stop the load balancing
  		 * work being done for other cpus. Next load
  		 * balancing owner will pick it up.
  		 */
  		if (need_resched()) {
  			this_rq->nohz_balance_kick = 0;
  			break;
  		}
  
  		raw_spin_lock_irq(&this_rq->lock);
  		update_rq_clock(this_rq);
  		update_cpu_load(this_rq);
  		raw_spin_unlock_irq(&this_rq->lock);
  
  		rebalance_domains(balance_cpu, CPU_IDLE);
  
  		rq = cpu_rq(balance_cpu);
  		if (time_after(this_rq->next_balance, rq->next_balance))
  			this_rq->next_balance = rq->next_balance;
  	}
  	nohz.next_balance = this_rq->next_balance;
  	this_rq->nohz_balance_kick = 0;
  }
  
  /*
   * Current heuristic for kicking the idle load balancer
 * - first_pick_cpu is one of the busy CPUs. It will kick
 *   the idle load balancer when it has more than one process active. This
   *   eliminates the need for idle load balancing altogether when we have
   *   only one running process in the system (common case).
 * - If there is more than one busy CPU, the idle load balancer may have
   *   to run for active_load_balance to happen (i.e., two busy CPUs are
   *   SMT or core siblings and can run better if they move to different
   *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
   *   which will kick idle load balancer as soon as it has any load.
   */
  static inline int nohz_kick_needed(struct rq *rq, int cpu)
  {
  	unsigned long now = jiffies;
  	int ret;
  	int first_pick_cpu, second_pick_cpu;
  
  	if (time_before(now, nohz.next_balance))
  		return 0;
  	if (rq->idle_at_tick)
  		return 0;
  
  	first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
  	second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
  
  	if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
  	    second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
  		return 0;
  
  	ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
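	/*
	 * ret is the previous holder: nr_cpu_ids means we just claimed
	 * first_pick_cpu, cpu means we already held it.  The first_pick cpu
	 * kicks only with more than one runnable task; otherwise we try for
	 * second_pick_cpu below, which kicks on any load.
	 */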
  	if (ret == nr_cpu_ids || ret == cpu) {
  		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
  		if (rq->nr_running > 1)
  			return 1;
  	} else {
  		ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
  		if (ret == nr_cpu_ids || ret == cpu) {
  			if (rq->nr_running)
  				return 1;
  		}
  	}
  	return 0;
  }
  #else
  static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
  #endif
  
  /*
   * run_rebalance_domains is triggered when needed from the scheduler tick.
   * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
   */
  static void run_rebalance_domains(struct softirq_action *h)
  {
  	int this_cpu = smp_processor_id();
  	struct rq *this_rq = cpu_rq(this_cpu);
  	enum cpu_idle_type idle = this_rq->idle_at_tick ?
  						CPU_IDLE : CPU_NOT_IDLE;
  
  	rebalance_domains(this_cpu, idle);
  	/*
  	 * If this cpu has a pending nohz_balance_kick, then do the
  	 * balancing on behalf of the other idle cpus whose ticks are
  	 * stopped.
  	 */
  	nohz_idle_balance(this_cpu, idle);
  }
  
  static inline int on_null_domain(int cpu)
  {
  	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
  }
  
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   */
  static inline void trigger_load_balance(struct rq *rq, int cpu)
  {
  	/* Don't need to rebalance while attached to NULL domain */
  	if (time_after_eq(jiffies, rq->next_balance) &&
  	    likely(!on_null_domain(cpu)))
  		raise_softirq(SCHED_SOFTIRQ);
  #ifdef CONFIG_NO_HZ
  	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
  		nohz_balancer_kick(cpu);
  #endif
  }
  static void rq_online_fair(struct rq *rq)
  {
  	update_sysctl();
  }
  
  static void rq_offline_fair(struct rq *rq)
  {
  	update_sysctl();
  }
  #else	/* CONFIG_SMP */
  
  /*
   * on UP we do not need to balance between CPUs:
   */
  static inline void idle_balance(int cpu, struct rq *rq)
  {
  }
  #endif /* CONFIG_SMP */

  /*
   * scheduler tick hitting a task of our scheduling class:
   */
  static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  {
  	struct cfs_rq *cfs_rq;
  	struct sched_entity *se = &curr->se;
  
  	for_each_sched_entity(se) {
  		cfs_rq = cfs_rq_of(se);
  		entity_tick(cfs_rq, se, queued);
  	}
  }
  
  /*
   * called on fork with the child task as argument from the parent's context
   *  - child not yet on the tasklist
   *  - preemption disabled
   */
  static void task_fork_fair(struct task_struct *p)
  {
  	struct cfs_rq *cfs_rq = task_cfs_rq(current);
  	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
  	int this_cpu = smp_processor_id();
  	struct rq *rq = this_rq();
  	unsigned long flags;
  	raw_spin_lock_irqsave(&rq->lock, flags);

  	update_rq_clock(rq);
  	if (unlikely(task_cpu(p) != this_cpu))
  		__set_task_cpu(p, this_cpu);

  	update_curr(cfs_rq);

  	if (curr)
  		se->vruntime = curr->vruntime;
  	place_entity(cfs_rq, se, 1);

  	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
  		/*
  		 * Upon rescheduling, sched_class::put_prev_task() will place
  		 * 'current' within the tree based on its new key value.
  		 */
  		swap(curr->vruntime, se->vruntime);
aec0a5142   Bharata B Rao   sched: call resch...
3620
  		resched_task(rq->curr);
4d78e7b65   Peter Zijlstra   sched: new task p...
3621
  	}
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3622

88ec22d3e   Peter Zijlstra   sched: Remove the...
3623
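	/*
	 * The child is not on any runqueue yet: store its vruntime relative
	 * to min_vruntime so it can be re-normalized against whichever
	 * cfs_rq it is eventually enqueued on.
	 */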
  	se->vruntime -= cfs_rq->min_vruntime;
  	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void prio_changed_fair(struct rq *rq, struct task_struct *p,
			      int oldprio, int running)
{
	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's.
	 */
	if (running) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}
  
  /*
   * We switched to the sched_fair class.
   */
  static void switched_to_fair(struct rq *rq, struct task_struct *p,
  			     int running)
  {
  	/*
  	 * We were most likely switched from sched_rt, so
  	 * kick off the schedule if running, otherwise just see
  	 * if we can still preempt the current task.
  	 */
  	if (running)
  		resched_task(rq->curr);
  	else
		check_preempt_curr(rq, p, 0);
  }
  /* Account for a task changing its policy or group.
   *
   * This routine is mostly called to set cfs_rq->curr field when a task
   * migrates between groups/classes.
   */
  static void set_curr_task_fair(struct rq *rq)
  {
  	struct sched_entity *se = &rq->curr->se;
  
  	for_each_sched_entity(se)
  		set_next_entity(cfs_rq_of(se), se);
  }
  #ifdef CONFIG_FAIR_GROUP_SCHED
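/*
 * Called when a task moves to a different task group: bring the
 * destination cfs_rq up to date and, if the task is not currently
 * runnable, give it a fresh placement relative to that queue's
 * min_vruntime.
 */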
static void moved_group_fair(struct task_struct *p, int on_rq)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);

	update_curr(cfs_rq);
	if (!on_rq)
		place_entity(cfs_rq, &p->se, 1);
}
  #endif
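/*
 * Report the effective time slice of a SCHED_OTHER task, in jiffies,
 * for sched_rr_get_interval().
 */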
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));

	return rr_interval;
}
  /*
   * All the scheduling class methods:
   */
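
/*
 * Note: ->next links the fair class to the idle class, the lowest class
 * consulted when picking the next task.
 */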
static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.moved_group		= moved_group_fair,
#endif
};
  
  #ifdef CONFIG_SCHED_DEBUG
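/*
 * Dump every leaf cfs_rq of the given CPU for the scheduler debug output;
 * the list is walked under rcu_read_lock().
 */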
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
  #endif