kernel/sched/fair.c

  /*
   * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   *
   *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   *
   *  Interactivity improvements by Mike Galbraith
   *  (C) 2007 Mike Galbraith <efault@gmx.de>
   *
   *  Various enhancements by Dmitry Adamushko.
   *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
   *
   *  Group scheduling enhancements by Srivatsa Vaddagiri
   *  Copyright IBM Corporation, 2007
   *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
   *
   *  Scaled math optimizations by Thomas Gleixner
   *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
   *
   *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
   *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   */
  #include <linux/latencytop.h>
  #include <linux/sched.h>
  #include <linux/cpumask.h>
  #include <linux/slab.h>
  #include <linux/profile.h>
  #include <linux/interrupt.h>
  #include <linux/mempolicy.h>
  #include <linux/migrate.h>
  #include <linux/task_work.h>
  
  #include <trace/events/sched.h>
  
  #include "sched.h"
  /*
   * Targeted preemption latency for CPU-bound tasks:
   * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * NOTE: this latency value is not the same as the concept of
   * 'timeslice length' - timeslices in CFS are of variable length
   * and have no persistent notion like in traditional, time-slice
   * based scheduling concepts.
   *
   * (to see the precise effective timeslice length of your workload,
   *  run vmstat and monitor the context-switches (cs) field)
   */
  unsigned int sysctl_sched_latency = 6000000ULL;
  unsigned int normalized_sysctl_sched_latency = 6000000ULL;
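  /*
   * Editor's note (worked example): with the default SCHED_TUNABLESCALING_LOG
   * policy, get_update_sysctl_factor() below scales the normalized 6ms value
   * by 1 + ilog2(min(num_online_cpus(), 8)); on a machine with 8 or more CPUs
   * the effective sysctl_sched_latency therefore becomes 24ms.
   */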
  
  /*
   * The initial- and re-scaling of tunables is configurable
   * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
   *
   * Options are:
   * SCHED_TUNABLESCALING_NONE - unscaled, always *1
   * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
   * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
   */
  enum sched_tunable_scaling sysctl_sched_tunable_scaling
  	= SCHED_TUNABLESCALING_LOG;
  
  /*
   * Minimal preemption granularity for CPU-bound tasks:
   * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   */
  unsigned int sysctl_sched_min_granularity = 750000ULL;
  unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

  /*
   * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
   */
  static unsigned int sched_nr_latency = 8;
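  /*
   * Editor's note: with the defaults above this is DIV_ROUND_UP(6ms, 0.75ms)
   * = 8; the value is recomputed in sched_proc_update_handler() whenever
   * either sysctl is changed.
   */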
  
  /*
   * After fork, child runs first. If set to 0 (default) then
   * parent will (try to) run first.
   */
  unsigned int sysctl_sched_child_runs_first __read_mostly;
  
  /*
   * SCHED_OTHER wake-up granularity.
   * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * This option delays the preemption effects of decoupled workloads
   * and reduces their over-scheduling. Synchronous workloads will still
   * have immediate wakeup/sleep latencies.
   */
  unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
  unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  /*
   * The exponential sliding window over which load is averaged for shares
   * distribution.
   * (default: 10msec)
   */
  unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  #ifdef CONFIG_CFS_BANDWIDTH
  /*
   * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
   * each time a cfs_rq requests quota.
   *
   * Note: in the case that the slice exceeds the runtime remaining (either due
   * to consumption or the quota being specified to be smaller than the slice)
   * we will always only issue the remaining available time.
   *
   * default: 5 msec, units: microseconds
    */
  unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
  #endif
  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
  {
  	lw->weight += inc;
  	lw->inv_weight = 0;
  }
  
  static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  {
  	lw->weight -= dec;
  	lw->inv_weight = 0;
  }
  
  static inline void update_load_set(struct load_weight *lw, unsigned long w)
  {
  	lw->weight = w;
  	lw->inv_weight = 0;
  }
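  /*
   * Editor's note: inv_weight is only invalidated (set to 0) here; it is
   * re-derived lazily by __update_inv_weight() the next time __calc_delta()
   * needs it.
   */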
  /*
   * Increase the granularity value when there are more CPUs,
   * because with more CPUs the 'effective latency' as visible
   * to users decreases. But the relationship is not linear,
   * so pick a second-best guess by going with the log2 of the
   * number of CPUs.
   *
   * This idea comes from the SD scheduler of Con Kolivas:
   */
  static int get_update_sysctl_factor(void)
  {
  	unsigned int cpus = min_t(int, num_online_cpus(), 8);
  	unsigned int factor;
  
  	switch (sysctl_sched_tunable_scaling) {
  	case SCHED_TUNABLESCALING_NONE:
  		factor = 1;
  		break;
  	case SCHED_TUNABLESCALING_LINEAR:
  		factor = cpus;
  		break;
  	case SCHED_TUNABLESCALING_LOG:
  	default:
  		factor = 1 + ilog2(cpus);
  		break;
  	}
  
  	return factor;
  }
  
  static void update_sysctl(void)
  {
  	unsigned int factor = get_update_sysctl_factor();
  
  #define SET_SYSCTL(name) \
  	(sysctl_##name = (factor) * normalized_sysctl_##name)
  	SET_SYSCTL(sched_min_granularity);
  	SET_SYSCTL(sched_latency);
  	SET_SYSCTL(sched_wakeup_granularity);
  #undef SET_SYSCTL
  }
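  /*
   * Editor's note (worked example): on a 4-CPU system the default LOG scaling
   * gives factor = 1 + ilog2(4) = 3, so the effective tunables become
   * sched_min_granularity = 2.25ms, sched_latency = 18ms and
   * sched_wakeup_granularity = 3ms.
   */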
  
  void sched_init_granularity(void)
  {
  	update_sysctl();
  }
  #define WMULT_CONST	(~0U)
  #define WMULT_SHIFT	32
  static void __update_inv_weight(struct load_weight *lw)
  {
  	unsigned long w;
  
  	if (likely(lw->inv_weight))
  		return;
  
  	w = scale_load_down(lw->weight);
  
  	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
  		lw->inv_weight = 1;
  	else if (unlikely(!w))
  		lw->inv_weight = WMULT_CONST;
  	else
  		lw->inv_weight = WMULT_CONST / w;
  }
  
  /*
   * delta_exec * weight / lw.weight
   *   OR
   * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
   *
   * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
   * we're guaranteed shift stays positive because inv_weight is guaranteed to
   * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
   *
   * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
   * weight/lw.weight <= 1, and therefore our shift will also be positive.
   */
  static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
  {
  	u64 fact = scale_load_down(weight);
  	int shift = WMULT_SHIFT;
  	__update_inv_weight(lw);
  	if (unlikely(fact >> 32)) {
  		while (fact >> 32) {
  			fact >>= 1;
  			shift--;
  		}
  	}
  	/* hint to use a 32x32->64 mul */
  	fact = (u64)(u32)fact * lw->inv_weight;
  	while (fact >> 32) {
  		fact >>= 1;
  		shift--;
  	}
  	return mul_u64_u32_shr(delta_exec, fact, shift);
  }
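  /*
   * Editor's note (illustrative sketch, not part of the original file): with
   * inv_weight ~= 2^32 / lw->weight cached by __update_inv_weight(), the
   * division reduces to a multiply and a shift, roughly:
   *
   *	u64 approx_calc_delta(u64 delta_exec, u32 weight, u32 rq_weight)
   *	{
   *		u64 inv = 0xffffffffULL / rq_weight;	// ~2^32 / rq_weight
   *		u64 fact = (u64)weight * inv;
   *
   *		return (delta_exec * fact) >> 32;	// ~delta_exec * weight / rq_weight
   *	}
   *
   * Unlike the sketch, __calc_delta() narrows 'fact' (reducing 'shift'
   * accordingly) and uses mul_u64_u32_shr() so the intermediate product can
   * never overflow 64 bits.  E.g. delta_exec = 1000000 (1ms), weight = 1024
   * and rq_weight = 2048 yields ~500000, i.e. half of delta_exec.
   */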
  
  
  const struct sched_class fair_sched_class;

  /**************************************************************
   * CFS operations on generic schedulable entities:
   */
  #ifdef CONFIG_FAIR_GROUP_SCHED
  /* cpu runqueue to which this cfs_rq is attached */
  static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  {
  	return cfs_rq->rq;
  }
  /* An entity is a task if it doesn't "own" a runqueue */
  #define entity_is_task(se)	(!se->my_q)
  static inline struct task_struct *task_of(struct sched_entity *se)
  {
  #ifdef CONFIG_SCHED_DEBUG
  	WARN_ON_ONCE(!entity_is_task(se));
  #endif
  	return container_of(se, struct task_struct, se);
  }
  /* Walk up scheduling entities hierarchy */
  #define for_each_sched_entity(se) \
  		for (; se; se = se->parent)
  
  static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  {
  	return p->se.cfs_rq;
  }
  
  /* runqueue on which this entity is (to be) queued */
  static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  {
  	return se->cfs_rq;
  }
  
  /* runqueue "owned" by this group */
  static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  {
  	return grp->my_q;
  }
  static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
  				       int force_update);
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
  	if (!cfs_rq->on_list) {
  		/*
  		 * Ensure we either appear before our parent (if already
  		 * enqueued) or force our parent to appear after us when it is
  		 * enqueued.  The fact that we always enqueue bottom-up
  		 * reduces this to two cases.
  		 */
  		if (cfs_rq->tg->parent &&
  		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
  			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
  				&rq_of(cfs_rq)->leaf_cfs_rq_list);
  		} else {
  			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
  				&rq_of(cfs_rq)->leaf_cfs_rq_list);
  		}
  
  		cfs_rq->on_list = 1;
  		/* We should have no load, but we need to update last_decay. */
  		update_cfs_rq_blocked_load(cfs_rq, 0);
  	}
  }
  
  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
  	if (cfs_rq->on_list) {
  		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
  		cfs_rq->on_list = 0;
  	}
  }
  /* Iterate thr' all leaf cfs_rq's on a runqueue */
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
  	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
  
  /* Do the two (enqueued) entities belong to the same group ? */
  static inline struct cfs_rq *
  is_same_group(struct sched_entity *se, struct sched_entity *pse)
  {
  	if (se->cfs_rq == pse->cfs_rq)
  		return se->cfs_rq;
  	return NULL;
  }
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
  	return se->parent;
  }
  static void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
  	int se_depth, pse_depth;
  
  	/*
  	 * preemption test can be made between sibling entities who are in the
  	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
  	 * both tasks until we find their ancestors who are siblings of common
  	 * parent.
  	 */
  
  	/* First walk up until both entities are at same depth */
  	se_depth = (*se)->depth;
  	pse_depth = (*pse)->depth;
  
  	while (se_depth > pse_depth) {
  		se_depth--;
  		*se = parent_entity(*se);
  	}
  
  	while (pse_depth > se_depth) {
  		pse_depth--;
  		*pse = parent_entity(*pse);
  	}
  
  	while (!is_same_group(*se, *pse)) {
  		*se = parent_entity(*se);
  		*pse = parent_entity(*pse);
  	}
  }
  #else	/* !CONFIG_FAIR_GROUP_SCHED */
  
  static inline struct task_struct *task_of(struct sched_entity *se)
  {
  	return container_of(se, struct task_struct, se);
  }
  static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  {
  	return container_of(cfs_rq, struct rq, cfs);
  }
  
  #define entity_is_task(se)	1
  #define for_each_sched_entity(se) \
  		for (; se; se = NULL)
  static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
  {
  	return &task_rq(p)->cfs;
  }
  static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
  {
  	struct task_struct *p = task_of(se);
  	struct rq *rq = task_rq(p);
  
  	return &rq->cfs;
  }
  
  /* runqueue "owned" by this group */
  static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
  {
  	return NULL;
  }
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
  }
  
  static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
  }
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
  		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
  	return NULL;
  }
  static inline void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
  }
  #endif	/* CONFIG_FAIR_GROUP_SCHED */
  static __always_inline
  void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
  
  /**************************************************************
   * Scheduling class tree data structure manipulation methods:
   */
  static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
  {
  	s64 delta = (s64)(vruntime - max_vruntime);
  	if (delta > 0)
  		max_vruntime = vruntime;
  	return max_vruntime;
  }

  static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
  {
  	s64 delta = (s64)(vruntime - min_vruntime);
  	if (delta < 0)
  		min_vruntime = vruntime;
  
  	return min_vruntime;
  }
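  /*
   * Editor's note: both helpers above compare through a signed (s64) delta
   * rather than comparing the u64 values directly, so they still pick the
   * later/earlier vruntime correctly once vruntime wraps around.
   */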
  static inline int entity_before(struct sched_entity *a,
  				struct sched_entity *b)
  {
  	return (s64)(a->vruntime - b->vruntime) < 0;
  }
  static void update_min_vruntime(struct cfs_rq *cfs_rq)
  {
  	u64 vruntime = cfs_rq->min_vruntime;
  
  	if (cfs_rq->curr)
  		vruntime = cfs_rq->curr->vruntime;
  
  	if (cfs_rq->rb_leftmost) {
  		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
  						   struct sched_entity,
  						   run_node);
  		if (!cfs_rq->curr)
  			vruntime = se->vruntime;
  		else
  			vruntime = min_vruntime(vruntime, se->vruntime);
  	}
  	/* ensure we never gain time by being placed backwards. */
  	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
  #ifndef CONFIG_64BIT
  	smp_wmb();
  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  #endif
  }
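  /*
   * Editor's note: min_vruntime only ever moves forward and serves as the
   * baseline against which newly enqueued and waking entities are placed.
   * The min_vruntime_copy/smp_wmb() pair lets lockless readers on 32-bit
   * detect a torn update of the 64-bit value.
   */
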
  /*
   * Enqueue an entity into the rb-tree:
   */
  static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
  	struct rb_node *parent = NULL;
  	struct sched_entity *entry;
  	int leftmost = 1;
  
  	/*
  	 * Find the right place in the rbtree:
  	 */
  	while (*link) {
  		parent = *link;
  		entry = rb_entry(parent, struct sched_entity, run_node);
  		/*
  		 * We dont care about collisions. Nodes with
  		 * the same key stay together.
  		 */
  		if (entity_before(se, entry)) {
  			link = &parent->rb_left;
  		} else {
  			link = &parent->rb_right;
  			leftmost = 0;
  		}
  	}
  
  	/*
  	 * Maintain a cache of leftmost tree entries (it is frequently
  	 * used):
  	 */
  	if (leftmost)
  		cfs_rq->rb_leftmost = &se->run_node;
  
  	rb_link_node(&se->run_node, parent, link);
  	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
  }
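  /*
   * Editor's note: the tree is keyed on vruntime (via entity_before()), and
   * caching the leftmost node lets __pick_first_entity() return the entity
   * with the smallest vruntime in O(1).
   */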
  static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	if (cfs_rq->rb_leftmost == &se->run_node) {
  		struct rb_node *next_node;
  
  		next_node = rb_next(&se->run_node);
  		cfs_rq->rb_leftmost = next_node;
  	}
  	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
  }
  struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  {
  	struct rb_node *left = cfs_rq->rb_leftmost;
  
  	if (!left)
  		return NULL;
  
  	return rb_entry(left, struct sched_entity, run_node);
  }
  static struct sched_entity *__pick_next_entity(struct sched_entity *se)
  {
  	struct rb_node *next = rb_next(&se->run_node);
  
  	if (!next)
  		return NULL;
  
  	return rb_entry(next, struct sched_entity, run_node);
  }
  
  #ifdef CONFIG_SCHED_DEBUG
  struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  {
  	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
  	if (!last)
  		return NULL;
  
  	return rb_entry(last, struct sched_entity, run_node);
  }
  /**************************************************************
   * Scheduling class statistics methods:
   */
  int sched_proc_update_handler(struct ctl_table *table, int write,
  		void __user *buffer, size_t *lenp,
  		loff_t *ppos)
  {
  	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  	int factor = get_update_sysctl_factor();
  
  	if (ret || !write)
  		return ret;
  
  	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
  					sysctl_sched_min_granularity);
  #define WRT_SYSCTL(name) \
  	(normalized_sysctl_##name = sysctl_##name / (factor))
  	WRT_SYSCTL(sched_min_granularity);
  	WRT_SYSCTL(sched_latency);
  	WRT_SYSCTL(sched_wakeup_granularity);
  #undef WRT_SYSCTL
  	return 0;
  }
  #endif
  
  /*
   * delta /= w
   */
  static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
  {
  	if (unlikely(se->load.weight != NICE_0_LOAD))
  		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
  
  	return delta;
  }
  
  /*
   * The idea is to set a period in which each task runs once.
   *
   * When there are too many tasks (sched_nr_latency) we have to stretch
   * this period because otherwise the slices get too small.
   *
   * p = (nr <= nl) ? l : l*nr/nl
   */
  static u64 __sched_period(unsigned long nr_running)
  {
  	u64 period = sysctl_sched_latency;
  	unsigned long nr_latency = sched_nr_latency;
  
  	if (unlikely(nr_running > nr_latency)) {
  		period = sysctl_sched_min_granularity;
  		period *= nr_running;
  	}
  
  	return period;
  }
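  /*
   * Editor's note (worked example): with the defaults (6ms latency, 0.75ms
   * min granularity, sched_nr_latency = 8), 8 or fewer runnable tasks share
   * a 6ms period, while e.g. 16 runnable tasks stretch it to 16 * 0.75ms =
   * 12ms so the per-task slice does not shrink below the minimum granularity.
   */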
  /*
   * We calculate the wall-time slice from the period by taking a part
   * proportional to the weight.
   *
   * s = p*P[w/rw]
   */
  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
  	for_each_sched_entity(se) {
  		struct load_weight *load;
  		struct load_weight lw;
  
  		cfs_rq = cfs_rq_of(se);
  		load = &cfs_rq->load;
  		if (unlikely(!se->on_rq)) {
  			lw = cfs_rq->load;
  
  			update_load_add(&lw, se->load.weight);
  			load = &lw;
  		}
  		slice = __calc_delta(slice, se->load.weight, load);
  	}
  	return slice;
  }
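  /*
   * Editor's note (worked example): two runnable nice-0 tasks share a 6ms
   * period, so each gets a ~3ms wall-time slice here; sched_vslice() below
   * rescales that slice by NICE_0_LOAD / se->load.weight, which leaves it
   * unchanged for a nice-0 task.
   */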

  /*
   * We calculate the vruntime slice of a to-be-inserted task.
   *
   * vs = s/w
   */
  static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  #ifdef CONFIG_SMP
  static unsigned long task_h_load(struct task_struct *p);
  static inline void __update_task_entity_contrib(struct sched_entity *se);
  
  /* Give a new task initial runnable averages so its load looks heavy during its infancy */
  void init_task_runnable_average(struct task_struct *p)
  {
  	u32 slice;
  
  	p->se.avg.decay_count = 0;
  	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
  	p->se.avg.runnable_avg_sum = slice;
  	p->se.avg.runnable_avg_period = slice;
  	__update_task_entity_contrib(&p->se);
  }
  #else
  void init_task_runnable_average(struct task_struct *p)
  {
  }
  #endif
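  /*
   * Editor's note: sched_slice() returns nanoseconds; the >> 10 above
   * converts that into the ~1us (1024ns) units used by the per-entity
   * runnable averages, so a new task starts out looking fully loaded for
   * its first slice instead of ramping up from zero.
   */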
  /*
   * Update the current task's runtime statistics.
   */
  static void update_curr(struct cfs_rq *cfs_rq)
  {
  	struct sched_entity *curr = cfs_rq->curr;
  	u64 now = rq_clock_task(rq_of(cfs_rq));
  	u64 delta_exec;
  
  	if (unlikely(!curr))
  		return;
  	delta_exec = now - curr->exec_start;
  	if (unlikely((s64)delta_exec <= 0))
  		return;
  	curr->exec_start = now;
  	schedstat_set(curr->statistics.exec_max,
  		      max(delta_exec, curr->statistics.exec_max));
  
  	curr->sum_exec_runtime += delta_exec;
  	schedstat_add(cfs_rq, exec_clock, delta_exec);
  
  	curr->vruntime += calc_delta_fair(delta_exec, curr);
  	update_min_vruntime(cfs_rq);
  	if (entity_is_task(curr)) {
  		struct task_struct *curtask = task_of(curr);
  		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
  		cpuacct_charge(curtask, delta_exec);
  		account_group_exec_runtime(curtask, delta_exec);
  	}
  
  	account_cfs_rq_runtime(cfs_rq, delta_exec);
  }
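  /*
   * Editor's note: the key step above is curr->vruntime +=
   * calc_delta_fair(delta_exec, curr).  A nice-0 entity accrues vruntime at
   * wall-clock rate, while an entity with twice NICE_0_LOAD accrues it at
   * half that rate, which is what lets heavier entities run longer for the
   * same vruntime budget.
   */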
  
  static inline void
  update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
  }
  /*
   * Task is being enqueued - update stats:
   */
  static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/*
  	 * Are we enqueueing a waiting task? (for current tasks
  	 * a dequeue/enqueue event is a NOP)
  	 */
  	if (se != cfs_rq->curr)
  		update_stats_wait_start(cfs_rq, se);
  }
  static void
  update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
  	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
  	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
  #ifdef CONFIG_SCHEDSTATS
  	if (entity_is_task(se)) {
  		trace_sched_stat_wait(task_of(se),
  			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
  	}
  #endif
  	schedstat_set(se->statistics.wait_start, 0);
  }
  
  static inline void
  update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/*
  	 * Mark the end of the wait period if dequeueing a
  	 * waiting task:
  	 */
  	if (se != cfs_rq->curr)
  		update_stats_wait_end(cfs_rq, se);
  }
  
  /*
   * We are picking a new current task - update its stats:
   */
  static inline void
  update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	/*
  	 * We are starting a new run period:
  	 */
  	se->exec_start = rq_clock_task(rq_of(cfs_rq));
  }
  /**************************************************
   * Scheduling class queueing methods:
   */
  #ifdef CONFIG_NUMA_BALANCING
  /*
   * Approximate time to scan a full NUMA task in ms. The task scan period is
   * calculated based on the tasks virtual memory size and
   * numa_balancing_scan_size.
   */
  unsigned int sysctl_numa_balancing_scan_period_min = 1000;
  unsigned int sysctl_numa_balancing_scan_period_max = 60000;
  
  /* Portion of address space to scan in MB */
  unsigned int sysctl_numa_balancing_scan_size = 256;
  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
  unsigned int sysctl_numa_balancing_scan_delay = 1000;
  static unsigned int task_nr_scan_windows(struct task_struct *p)
  {
  	unsigned long rss = 0;
  	unsigned long nr_scan_pages;
  
  	/*
  	 * Calculations based on RSS as non-present and empty pages are skipped
  	 * by the PTE scanner and NUMA hinting faults should be trapped based
  	 * on resident pages
  	 */
  	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
  	rss = get_mm_rss(p->mm);
  	if (!rss)
  		rss = nr_scan_pages;
  
  	rss = round_up(rss, nr_scan_pages);
  	return rss / nr_scan_pages;
  }
  
  /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
  #define MAX_SCAN_WINDOW 2560
  
  static unsigned int task_scan_min(struct task_struct *p)
  {
  	unsigned int scan, floor;
  	unsigned int windows = 1;
  
  	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
  		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
  	floor = 1000 / windows;
  
  	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
  	return max_t(unsigned int, floor, scan);
  }
  
  static unsigned int task_scan_max(struct task_struct *p)
  {
  	unsigned int smin = task_scan_min(p);
  	unsigned int smax;
  
  	/* Watch for min being lower than max due to floor calculations */
  	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
  	return max(smin, smax);
  }
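  /*
   * Editor's note (worked example): with the default 256MB scan size, a task
   * with 1GB of resident memory spans task_nr_scan_windows() = 4 windows.
   * task_scan_min() is then max(1000ms / 4, 1000ms / (2560 / 256)) =
   * max(250, 100) = 250ms, and task_scan_max() is 60000ms / 4 = 15000ms.
   */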
  static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
  	rq->nr_numa_running += (p->numa_preferred_nid != -1);
  	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
  }
  
  static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
  {
  	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
  	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
  }
  struct numa_group {
  	atomic_t refcount;
  
  	spinlock_t lock; /* nr_tasks, tasks */
  	int nr_tasks;
  	pid_t gid;
  	struct list_head task_list;
  
  	struct rcu_head rcu;
  	nodemask_t active_nodes;
  	unsigned long total_faults;
  	/*
  	 * Faults_cpu is used to decide whether memory should move
  	 * towards the CPU. As a consequence, these stats are weighted
  	 * more by CPU use than by memory faults.
  	 */
  	unsigned long *faults_cpu;
  	unsigned long faults[0];
  };
  /* Shared or private faults. */
  #define NR_NUMA_HINT_FAULT_TYPES 2
  
  /* Memory and CPU locality */
  #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
  
  /* Averaged statistics, and temporary buffers. */
  #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
  pid_t task_numa_group_id(struct task_struct *p)
  {
  	return p->numa_group ? p->numa_group->gid : 0;
  }
  static inline int task_faults_idx(int nid, int priv)
  {
  	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
  }
  
  static inline unsigned long task_faults(struct task_struct *p, int nid)
  {
  	if (!p->numa_faults_memory)
  		return 0;
  	return p->numa_faults_memory[task_faults_idx(nid, 0)] +
  		p->numa_faults_memory[task_faults_idx(nid, 1)];
  }
  static inline unsigned long group_faults(struct task_struct *p, int nid)
  {
  	if (!p->numa_group)
  		return 0;
  	return p->numa_group->faults[task_faults_idx(nid, 0)] +
  		p->numa_group->faults[task_faults_idx(nid, 1)];
  }
  static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
  {
  	return group->faults_cpu[task_faults_idx(nid, 0)] +
  		group->faults_cpu[task_faults_idx(nid, 1)];
  }
  /*
   * These return the fraction of accesses done by a particular task, or
   * task group, on a particular numa node.  The group weight is given a
   * larger multiplier, in order to group tasks together that are almost
   * evenly spread out between numa nodes.
   */
  static inline unsigned long task_weight(struct task_struct *p, int nid)
  {
  	unsigned long total_faults;
  	if (!p->numa_faults_memory)
  		return 0;
  
  	total_faults = p->total_numa_faults;
  
  	if (!total_faults)
  		return 0;
  
  	return 1000 * task_faults(p, nid) / total_faults;
  }
  
  static inline unsigned long group_weight(struct task_struct *p, int nid)
  {
  	if (!p->numa_group || !p->numa_group->total_faults)
  		return 0;
  	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
  }
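  /*
   * Editor's note: both weights above are per-mille fractions (0-1000).  A
   * task that recorded 300 of its 1000 NUMA hinting faults on a node gets a
   * task_weight() of 300 for that node; group_weight() is the same ratio
   * computed over the whole numa_group's fault totals.
   */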
  bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
  				int src_nid, int dst_cpu)
  {
  	struct numa_group *ng = p->numa_group;
  	int dst_nid = cpu_to_node(dst_cpu);
  	int last_cpupid, this_cpupid;
  
  	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
  
  	/*
  	 * Multi-stage node selection is used in conjunction with a periodic
  	 * migration fault to build a temporal task<->page relation. By using
  	 * a two-stage filter we remove short/unlikely relations.
  	 *
  	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
  	 * a task's usage of a particular page (n_p) per total usage of this
  	 * page (n_t) (in a given time-span) to a probability.
  	 *
  	 * Our periodic faults will sample this probability and getting the
  	 * same result twice in a row, given these samples are fully
  	 * independent, is then given by P(n)^2, provided our sample period
  	 * is sufficiently short compared to the usage pattern.
  	 *
  	 * This quadric squishes small probabilities, making it less likely we
  	 * act on an unlikely task<->page relation.
  	 */
  	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
  	if (!cpupid_pid_unset(last_cpupid) &&
  				cpupid_to_nid(last_cpupid) != dst_nid)
  		return false;
  
  	/* Always allow migrate on private faults */
  	if (cpupid_match_pid(p, last_cpupid))
  		return true;
  
  	/* A shared fault, but p->numa_group has not been set up yet. */
  	if (!ng)
  		return true;
  
  	/*
  	 * Do not migrate if the destination is not a node that
  	 * is actively used by this numa group.
  	 */
  	if (!node_isset(dst_nid, ng->active_nodes))
  		return false;
  
  	/*
  	 * Source is a node that is not actively used by this
  	 * numa group, while the destination is. Migrate.
  	 */
  	if (!node_isset(src_nid, ng->active_nodes))
  		return true;
  
  	/*
  	 * Both source and destination are nodes in active
  	 * use by this numa group. Maximize memory bandwidth
  	 * by migrating from more heavily used groups, to less
  	 * heavily used ones, spreading the load around.
  	 * Use a 1/4 hysteresis to avoid spurious page movement.
  	 */
  	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
  }
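  /*
   * Editor's note (worked example of the two-stage filter above): if a task
   * performs ~30% of the accesses to a page, the chance of it taking two
   * consecutive hinting faults on that page is roughly 0.3^2 = 9%, so weak
   * task<->page relations rarely pull the page across nodes.
   */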
  static unsigned long weighted_cpuload(const int cpu);
  static unsigned long source_load(int cpu, int type);
  static unsigned long target_load(int cpu, int type);
  static unsigned long power_of(int cpu);
  static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
  /* Cached statistics for all CPUs within a node */
  struct numa_stats {
  	unsigned long nr_running;
  	unsigned long load;
  
  	/* Total compute capacity of CPUs on a node */
  	unsigned long power;
  
  	/* Approximate capacity in terms of runnable tasks on a node */
  	unsigned long capacity;
  	int has_capacity;
  };

  /*
   * XXX borrowed from update_sg_lb_stats
   */
  static void update_numa_stats(struct numa_stats *ns, int nid)
  {
  	int cpu, cpus = 0;
  
  	memset(ns, 0, sizeof(*ns));
  	for_each_cpu(cpu, cpumask_of_node(nid)) {
  		struct rq *rq = cpu_rq(cpu);
  
  		ns->nr_running += rq->nr_running;
  		ns->load += weighted_cpuload(cpu);
  		ns->power += power_of(cpu);
  
  		cpus++;
  	}
  	/*
  	 * If we raced with hotplug and there are no CPUs left in our mask
  	 * the @ns structure is NULL'ed and task_numa_compare() will
  	 * not find this node attractive.
  	 *
  	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
  	 * and bail there.
  	 */
  	if (!cpus)
  		return;
  	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
  	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
  	ns->has_capacity = (ns->nr_running < ns->capacity);
  }
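  /*
   * Editor's note: load is normalized by SCHED_POWER_SCALE so nodes with
   * different CPU power compare fairly, and capacity is roughly the number
   * of full-power CPUs on the node.  E.g. a node with 4 CPUs at full power
   * has capacity 4, so with 3 running tasks has_capacity is 1.
   */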
  struct task_numa_env {
  	struct task_struct *p;

  	int src_cpu, src_nid;
  	int dst_cpu, dst_nid;

  	struct numa_stats src_stats, dst_stats;

  	int imbalance_pct;
  
  	struct task_struct *best_task;
  	long best_imp;
  	int best_cpu;
  };
  static void task_numa_assign(struct task_numa_env *env,
  			     struct task_struct *p, long imp)
  {
  	if (env->best_task)
  		put_task_struct(env->best_task);
  	if (p)
  		get_task_struct(p);
  
  	env->best_task = p;
  	env->best_imp = imp;
  	env->best_cpu = env->dst_cpu;
  }
  
  /*
   * This checks if the overall compute and NUMA accesses of the system would
   * be improved if the source task were migrated to the target dst_cpu,
   * taking into account that the task running on the dst_cpu may need to
   * be exchanged with the source task.
   */
  static void task_numa_compare(struct task_numa_env *env,
  			      long taskimp, long groupimp)
  {
  	struct rq *src_rq = cpu_rq(env->src_cpu);
  	struct rq *dst_rq = cpu_rq(env->dst_cpu);
  	struct task_struct *cur;
  	long dst_load, src_load;
  	long load;
  	long imp = (groupimp > 0) ? groupimp : taskimp;
  
  	rcu_read_lock();
  	cur = ACCESS_ONCE(dst_rq->curr);
  	if (cur->pid == 0) /* idle */
  		cur = NULL;
  
  	/*
  	 * "imp" is the fault differential for the source task between the
  	 * source and destination node. Calculate the total differential for
  	 * the source task and potential destination task. The more negative
  	 * the value is, the more remote accesses would be expected to
  	 * be incurred if the tasks were swapped.
  	 */
  	if (cur) {
  		/* Skip this swap candidate if cannot move to the source cpu */
  		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
  			goto unlock;
  		/*
  		 * If dst and source tasks are in the same NUMA group, or not
  		 * in any group then look only at task weights.
  		 */
  		if (cur->numa_group == env->p->numa_group) {
  			imp = taskimp + task_weight(cur, env->src_nid) -
  			      task_weight(cur, env->dst_nid);
  			/*
  			 * Add some hysteresis to prevent swapping the
  			 * tasks within a group over tiny differences.
  			 */
  			if (cur->numa_group)
  				imp -= imp/16;
  		} else {
  			/*
  			 * Compare the group weights. If a task is all by
  			 * itself (not part of a group), use the task weight
  			 * instead.
  			 */
  			if (env->p->numa_group)
  				imp = groupimp;
  			else
  				imp = taskimp;
  
  			if (cur->numa_group)
  				imp += group_weight(cur, env->src_nid) -
  				       group_weight(cur, env->dst_nid);
  			else
  				imp += task_weight(cur, env->src_nid) -
  				       task_weight(cur, env->dst_nid);
  		}
  	}
  
  	if (imp < env->best_imp)
  		goto unlock;
  
  	if (!cur) {
  		/* Is there capacity at our destination? */
  		if (env->src_stats.has_capacity &&
  		    !env->dst_stats.has_capacity)
  			goto unlock;
  
  		goto balance;
  	}
  
  	/* Balance doesn't matter much if we're running a task per cpu */
  	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
  		goto assign;
  
  	/*
  	 * In the overloaded case, try and keep the load balanced.
  	 */
  balance:
  	dst_load = env->dst_stats.load;
  	src_load = env->src_stats.load;
  
  	/* XXX missing power terms */
  	load = task_h_load(env->p);
  	dst_load += load;
  	src_load -= load;
  
  	if (cur) {
  		load = task_h_load(cur);
  		dst_load -= load;
  		src_load += load;
  	}
  
  	/* make src_load the smaller */
  	if (dst_load < src_load)
  		swap(dst_load, src_load);
  
  	if (src_load * env->imbalance_pct < dst_load * 100)
  		goto unlock;
  
  assign:
  	task_numa_assign(env, cur, imp);
  unlock:
  	rcu_read_unlock();
  }
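  /*
   * Editor's note: the src_load * imbalance_pct < dst_load * 100 check above
   * (with src_load forced to be the smaller of the two projected loads)
   * rejects a move or swap whenever it would leave one side more than
   * imbalance_pct/100 busier than the other, i.e. a ~12% margin with the
   * default imbalance_pct of 112 set up in task_numa_migrate().
   */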
  static void task_numa_find_cpu(struct task_numa_env *env,
  				long taskimp, long groupimp)
  {
  	int cpu;
  
  	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
  		/* Skip this CPU if the source task cannot migrate */
  		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
  			continue;
  
  		env->dst_cpu = cpu;
  		task_numa_compare(env, taskimp, groupimp);
  	}
  }
  static int task_numa_migrate(struct task_struct *p)
  {
  	struct task_numa_env env = {
  		.p = p,

  		.src_cpu = task_cpu(p),
  		.src_nid = task_node(p),
  
  		.imbalance_pct = 112,
  
  		.best_task = NULL,
  		.best_imp = 0,
  		.best_cpu = -1
  	};
  	struct sched_domain *sd;
  	unsigned long taskweight, groupweight;
  	int nid, ret;
  	long taskimp, groupimp;

  	/*
  	 * Pick the lowest SD_NUMA domain, as that would have the smallest
  	 * imbalance and would be the first to start moving tasks about.
  	 *
  	 * And we want to avoid any moving of tasks about, as that would create
  	 * random movement of tasks -- counter the numa conditions we're trying
  	 * to satisfy here.
58d081b50   Mel Gorman   sched/numa: Avoid...
1180
1181
  	 */
  	rcu_read_lock();
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1182
  	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
46a73e8a1   Rik van Riel   sched/numa: Fix N...
1183
1184
  	if (sd)
  		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
e6628d5b0   Mel Gorman   sched/numa: Resch...
1185
  	rcu_read_unlock();
46a73e8a1   Rik van Riel   sched/numa: Fix N...
  	/*
  	 * Cpusets can break the scheduler domain tree into smaller
  	 * balance domains, some of which do not cross NUMA boundaries.
  	 * Tasks that are "trapped" in such domains cannot be migrated
  	 * elsewhere, so there is no point in (re)trying.
  	 */
  	if (unlikely(!sd)) {
de1b301a1   Wanpeng Li   sched/numa: Use w...
1193
  		p->numa_preferred_nid = task_node(p);
46a73e8a1   Rik van Riel   sched/numa: Fix N...
1194
1195
  		return -EINVAL;
  	}
887c290e8   Rik van Riel   sched/numa: Decid...
1196
1197
  	taskweight = task_weight(p, env.src_nid);
  	groupweight = group_weight(p, env.src_nid);
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1198
  	update_numa_stats(&env.src_stats, env.src_nid);
2c8a50aa8   Mel Gorman   sched/numa: Favor...
1199
  	env.dst_nid = p->numa_preferred_nid;
887c290e8   Rik van Riel   sched/numa: Decid...
1200
1201
  	taskimp = task_weight(p, env.dst_nid) - taskweight;
  	groupimp = group_weight(p, env.dst_nid) - groupweight;
2c8a50aa8   Mel Gorman   sched/numa: Favor...
1202
  	update_numa_stats(&env.dst_stats, env.dst_nid);
58d081b50   Mel Gorman   sched/numa: Avoid...
1203

e1dda8a79   Rik van Riel   sched/numa: Fix p...
1204
1205
  	/* If the preferred nid has capacity, try to use it. */
  	if (env.dst_stats.has_capacity)
887c290e8   Rik van Riel   sched/numa: Decid...
1206
  		task_numa_find_cpu(&env, taskimp, groupimp);
e1dda8a79   Rik van Riel   sched/numa: Fix p...
1207
1208
1209
  
  	/* No space available on the preferred nid. Look elsewhere. */
  	if (env.best_cpu == -1) {
2c8a50aa8   Mel Gorman   sched/numa: Favor...
1210
1211
1212
  		for_each_online_node(nid) {
  			if (nid == env.src_nid || nid == p->numa_preferred_nid)
  				continue;
58d081b50   Mel Gorman   sched/numa: Avoid...
1213

83e1d2cd9   Mel Gorman   sched/numa: Use g...
1214
  			/* Only consider nodes where both task and groups benefit */
887c290e8   Rik van Riel   sched/numa: Decid...
1215
1216
1217
  			taskimp = task_weight(p, nid) - taskweight;
  			groupimp = group_weight(p, nid) - groupweight;
  			if (taskimp < 0 && groupimp < 0)
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1218
  				continue;
2c8a50aa8   Mel Gorman   sched/numa: Favor...
1219
1220
  			env.dst_nid = nid;
  			update_numa_stats(&env.dst_stats, env.dst_nid);
887c290e8   Rik van Riel   sched/numa: Decid...
1221
  			task_numa_find_cpu(&env, taskimp, groupimp);
58d081b50   Mel Gorman   sched/numa: Avoid...
1222
1223
  		}
  	}
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1224
1225
1226
  	/* No better CPU than the current one was found. */
  	if (env.best_cpu == -1)
  		return -EAGAIN;
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
1227
  	sched_setnuma(p, env.dst_nid);
04bb2f947   Rik van Riel   sched/numa: Adjus...
  	/*
  	 * Reset the scan period if the task is being rescheduled on an
  	 * alternative node to recheck if the task is now properly placed.
  	 */
  	p->numa_scan_period = task_scan_min(p);
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1233
  	if (env.best_task == NULL) {
286549dca   Mel Gorman   sched: add tracep...
1234
1235
1236
  		ret = migrate_task_to(p, env.best_cpu);
  		if (ret != 0)
  			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
fb13c7ee0   Mel Gorman   sched/numa: Use a...
  		return ret;
  	}
  
  	ret = migrate_swap(p, env.best_task);
286549dca   Mel Gorman   sched: add tracep...
1241
1242
  	if (ret != 0)
  		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1243
1244
  	put_task_struct(env.best_task);
  	return ret;
e6628d5b0   Mel Gorman   sched/numa: Resch...
1245
  }
6b9a7460b   Mel Gorman   sched/numa: Retry...
1246
1247
1248
  /* Attempt to migrate a task to a CPU on the preferred node. */
  static void numa_migrate_preferred(struct task_struct *p)
  {
2739d3eef   Rik van Riel   sched/numa: Retry...
1249
  	/* This task has no NUMA fault statistics yet */
ff1df896a   Rik van Riel   sched/numa: Renam...
1250
  	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
6b9a7460b   Mel Gorman   sched/numa: Retry...
1251
  		return;
2739d3eef   Rik van Riel   sched/numa: Retry...
  	/* Periodically retry migrating the task to the preferred node */
  	p->numa_migrate_retry = jiffies + HZ;
  
  	/* Success if task is already running on preferred CPU */
de1b301a1   Wanpeng Li   sched/numa: Use w...
1256
  	if (task_node(p) == p->numa_preferred_nid)
6b9a7460b   Mel Gorman   sched/numa: Retry...
1257
1258
1259
  		return;
  
  	/* Otherwise, try migrate to a CPU on the preferred node */
2739d3eef   Rik van Riel   sched/numa: Retry...
1260
  	task_numa_migrate(p);
6b9a7460b   Mel Gorman   sched/numa: Retry...
1261
  }
04bb2f947   Rik van Riel   sched/numa: Adjus...
1262
  /*
20e07dea2   Rik van Riel   sched/numa: Build...
   * Find the nodes on which the workload is actively running. We do this by
   * tracking the nodes from which NUMA hinting faults are triggered. This can
   * be different from the set of nodes where the workload's memory is currently
   * located.
   *
   * The bitmask is used to make smarter decisions on when to do NUMA page
   * migrations. To prevent flip-flopping and excessive page migrations, nodes
   * are added when they cause over 6/16 of the maximum number of faults, but
   * only removed when they drop below 3/16.
   */
  static void update_numa_active_node_mask(struct numa_group *numa_group)
  {
  	unsigned long faults, max_faults = 0;
  	int nid;
  
  	for_each_online_node(nid) {
  		faults = group_faults_cpu(numa_group, nid);
  		if (faults > max_faults)
  			max_faults = faults;
  	}
  
  	for_each_online_node(nid) {
  		faults = group_faults_cpu(numa_group, nid);
  		if (!node_isset(nid, numa_group->active_nodes)) {
  			if (faults > max_faults * 6 / 16)
  				node_set(nid, numa_group->active_nodes);
  		} else if (faults < max_faults * 3 / 16)
  			node_clear(nid, numa_group->active_nodes);
  	}
  }
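
  /*
   * Minimal sketch of the hysteresis above (hypothetical helper, not part of
   * the original file): a node joins the active set once it generates more
   * than 6/16 of the maximum per-node faults and only leaves once it falls
   * below 3/16, so a node hovering in between neither flips in nor out.
   */
  static inline bool numa_node_stays_active(bool currently_active,
  					  unsigned long faults,
  					  unsigned long max_faults)
  {
  	if (!currently_active)
  		return faults > max_faults * 6 / 16;	/* join threshold */
  	return faults >= max_faults * 3 / 16;		/* leave threshold */
  }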
  
  /*
04bb2f947   Rik van Riel   sched/numa: Adjus...
   * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
   * increments. The more local the fault statistics are, the higher the scan
   * period will be for the next scan window. If the local/remote ratio is
   * below NUMA_PERIOD_THRESHOLD (where the range of the ratio is
   * 1..NUMA_PERIOD_SLOTS), the scan period will decrease.
   */
  #define NUMA_PERIOD_SLOTS 10
  #define NUMA_PERIOD_THRESHOLD 3
  
  /*
   * Increase the scan period (slow down scanning) if the majority of
   * our memory is already on our local node, or if the majority of
   * the page accesses are shared with other processes.
   * Otherwise, decrease the scan period.
   */
  static void update_task_scan_period(struct task_struct *p,
  			unsigned long shared, unsigned long private)
  {
  	unsigned int period_slot;
  	int ratio;
  	int diff;
  
  	unsigned long remote = p->numa_faults_locality[0];
  	unsigned long local = p->numa_faults_locality[1];
  
  	/*
  	 * If there were no recorded hinting faults then either the task is
  	 * completely idle or all activity is in areas that are not of interest
  	 * to automatic numa balancing. Scan slower.
  	 */
  	if (local + shared == 0) {
  		p->numa_scan_period = min(p->numa_scan_period_max,
  			p->numa_scan_period << 1);
  
  		p->mm->numa_next_scan = jiffies +
  			msecs_to_jiffies(p->numa_scan_period);
  
  		return;
  	}
  
  	/*
  	 * Prepare to scale scan period relative to the current period.
  	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
  	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
  	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
  	 */
  	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
  	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
  	if (ratio >= NUMA_PERIOD_THRESHOLD) {
  		int slot = ratio - NUMA_PERIOD_THRESHOLD;
  		if (!slot)
  			slot = 1;
  		diff = slot * period_slot;
  	} else {
  		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
  
  		/*
  		 * Scale scan rate increases based on sharing. There is an
  		 * inverse relationship between the degree of sharing and
  		 * the adjustment made to the scanning period. Broadly
  		 * speaking the intent is that there is little point
  		 * scanning faster if shared accesses dominate, as it may
  		 * simply bounce migrations uselessly.
  		 */
04bb2f947   Rik van Riel   sched/numa: Adjus...
  		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
  		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
  	}
  
  	p->numa_scan_period = clamp(p->numa_scan_period + diff,
  			task_scan_min(p), task_scan_max(p));
  	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
  }
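
  /*
   * Worked example for update_task_scan_period() (illustrative numbers only):
   * with a current scan period of 1000ms, period_slot = DIV_ROUND_UP(1000, 10)
   * = 100ms.  Mostly-local faults, say local = 8 and remote = 2, give
   * ratio = 8, so diff = (8 - 3) * 100ms = +500ms and scanning slows down.
   * Mostly-remote faults, say local = 1 and remote = 9, give ratio = 1, so the
   * raw diff is -(3 - 1) * 100ms = -200ms, which is then scaled down further
   * by the private/(private + shared) ratio before being applied and clamped.
   */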
7e2703e60   Rik van Riel   sched/numa: Norma...
  /*
   * Get the fraction of time the task has been running since the last
   * NUMA placement cycle. The scheduler keeps similar statistics, but
   * decays those on a 32ms period, which is orders of magnitude off
   * from the dozens-of-seconds NUMA balancing period. Use the scheduler
   * stats only if the task is so new there are no NUMA statistics yet.
   */
  static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
  {
  	u64 runtime, delta, now;
  	/* Use the start of this time slice to avoid calculations. */
  	now = p->se.exec_start;
  	runtime = p->se.sum_exec_runtime;
  
  	if (p->last_task_numa_placement) {
  		delta = runtime - p->last_sum_exec_runtime;
  		*period = now - p->last_task_numa_placement;
  	} else {
  		delta = p->se.avg.runnable_avg_sum;
  		*period = p->se.avg.runnable_avg_period;
  	}
  
  	p->last_sum_exec_runtime = runtime;
  	p->last_task_numa_placement = now;
  
  	return delta;
  }
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1394
1395
  static void task_numa_placement(struct task_struct *p)
  {
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1396
1397
  	int seq, nid, max_nid = -1, max_group_nid = -1;
  	unsigned long max_faults = 0, max_group_faults = 0;
04bb2f947   Rik van Riel   sched/numa: Adjus...
1398
  	unsigned long fault_types[2] = { 0, 0 };
7e2703e60   Rik van Riel   sched/numa: Norma...
1399
1400
  	unsigned long total_faults;
  	u64 runtime, period;
7dbd13ed0   Mel Gorman   sched/numa: Preve...
1401
  	spinlock_t *group_lock = NULL;
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1402

2832bc19f   Hugh Dickins   sched: numa: ksm:...
1403
  	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1404
1405
1406
  	if (p->numa_scan_seq == seq)
  		return;
  	p->numa_scan_seq = seq;
598f0ec0b   Mel Gorman   sched/numa: Set t...
1407
  	p->numa_scan_period_max = task_scan_max(p);
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1408

7e2703e60   Rik van Riel   sched/numa: Norma...
1409
1410
1411
  	total_faults = p->numa_faults_locality[0] +
  		       p->numa_faults_locality[1];
  	runtime = numa_get_avg_runtime(p, &period);
7dbd13ed0   Mel Gorman   sched/numa: Preve...
1412
1413
1414
  	/* If the task is part of a group prevent parallel updates to group stats */
  	if (p->numa_group) {
  		group_lock = &p->numa_group->lock;
60e69eed8   Mike Galbraith   sched/numa: Fix t...
1415
  		spin_lock_irq(group_lock);
7dbd13ed0   Mel Gorman   sched/numa: Preve...
1416
  	}
688b7585d   Mel Gorman   sched/numa: Selec...
1417
1418
  	/* Find the node with the highest number of faults */
  	for_each_online_node(nid) {
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1419
  		unsigned long faults = 0, group_faults = 0;
ac8e895bd   Mel Gorman   sched/numa: Add i...
1420
  		int priv, i;
745d61476   Mel Gorman   sched/numa: Updat...
1421

be1e4e760   Rik van Riel   sched/numa: Turn ...
1422
  		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
7e2703e60   Rik van Riel   sched/numa: Norma...
1423
  			long diff, f_diff, f_weight;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1424

ac8e895bd   Mel Gorman   sched/numa: Add i...
1425
  			i = task_faults_idx(nid, priv);
745d61476   Mel Gorman   sched/numa: Updat...
1426

ac8e895bd   Mel Gorman   sched/numa: Add i...
1427
  			/* Decay existing window, copy faults since last scan */
35664fd41   Rik van Riel   sched/numa: Do st...
1428
  			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
ff1df896a   Rik van Riel   sched/numa: Renam...
1429
1430
  			fault_types[priv] += p->numa_faults_buffer_memory[i];
  			p->numa_faults_buffer_memory[i] = 0;
fb13c7ee0   Mel Gorman   sched/numa: Use a...
1431

7e2703e60   Rik van Riel   sched/numa: Norma...
  			/*
  			 * Normalize the faults_from, so all tasks in a group
  			 * count according to CPU use, instead of by the raw
  			 * number of faults. Tasks with little runtime have
  			 * little over-all impact on throughput, and thus their
  			 * faults are less important.
  			 */
  			f_weight = div64_u64(runtime << 16, period + 1);
  			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
  				   (total_faults + 1);
35664fd41   Rik van Riel   sched/numa: Do st...
1442
  			f_diff = f_weight - p->numa_faults_cpu[i] / 2;
50ec8a401   Rik van Riel   sched/numa: Track...
1443
  			p->numa_faults_buffer_cpu[i] = 0;
35664fd41   Rik van Riel   sched/numa: Do st...
1444
1445
  			p->numa_faults_memory[i] += diff;
  			p->numa_faults_cpu[i] += f_diff;
ff1df896a   Rik van Riel   sched/numa: Renam...
1446
  			faults += p->numa_faults_memory[i];
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1447
  			p->total_numa_faults += diff;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1448
1449
  			if (p->numa_group) {
  				/* safe because we can only change our own group */
989348b5f   Mel Gorman   sched/numa: Use u...
1450
  				p->numa_group->faults[i] += diff;
50ec8a401   Rik van Riel   sched/numa: Track...
1451
  				p->numa_group->faults_cpu[i] += f_diff;
989348b5f   Mel Gorman   sched/numa: Use u...
1452
1453
  				p->numa_group->total_faults += diff;
  				group_faults += p->numa_group->faults[i];
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1454
  			}
ac8e895bd   Mel Gorman   sched/numa: Add i...
1455
  		}
688b7585d   Mel Gorman   sched/numa: Selec...
  		if (faults > max_faults) {
  			max_faults = faults;
  			max_nid = nid;
  		}
83e1d2cd9   Mel Gorman   sched/numa: Use g...
  
  		if (group_faults > max_group_faults) {
  			max_group_faults = group_faults;
  			max_group_nid = nid;
  		}
  	}
04bb2f947   Rik van Riel   sched/numa: Adjus...
1466
  	update_task_scan_period(p, fault_types[0], fault_types[1]);
7dbd13ed0   Mel Gorman   sched/numa: Preve...
1467
  	if (p->numa_group) {
20e07dea2   Rik van Riel   sched/numa: Build...
1468
  		update_numa_active_node_mask(p->numa_group);
7dbd13ed0   Mel Gorman   sched/numa: Preve...
  		/*
  		 * If the preferred task and group nids are different,
  		 * iterate over the nodes again to find the best place.
  		 */
  		if (max_nid != max_group_nid) {
  			unsigned long weight, max_weight = 0;
  
  			for_each_online_node(nid) {
  				weight = task_weight(p, nid) + group_weight(p, nid);
  				if (weight > max_weight) {
  					max_weight = weight;
  					max_nid = nid;
  				}
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1482
1483
  			}
  		}
7dbd13ed0   Mel Gorman   sched/numa: Preve...
1484

60e69eed8   Mike Galbraith   sched/numa: Fix t...
1485
  		spin_unlock_irq(group_lock);
688b7585d   Mel Gorman   sched/numa: Selec...
1486
  	}
6b9a7460b   Mel Gorman   sched/numa: Retry...
1487
  	/* Preferred node as the node with the most faults */
3a7053b32   Mel Gorman   sched/numa: Favou...
1488
  	if (max_faults && max_nid != p->numa_preferred_nid) {
e6628d5b0   Mel Gorman   sched/numa: Resch...
1489
  		/* Update the preferred nid and migrate task if possible */
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
1490
  		sched_setnuma(p, max_nid);
6b9a7460b   Mel Gorman   sched/numa: Retry...
1491
  		numa_migrate_preferred(p);
3a7053b32   Mel Gorman   sched/numa: Favou...
1492
  	}
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1493
  }
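
  /*
   * Worked example of the decay above (illustrative numbers only): a node
   * holding 400 accumulated memory faults that sees 100 new faults in this
   * scan window gets diff = 100 - 400/2 = -100, leaving 400/2 + 100 = 300,
   * i.e. the history halves every window while fresh faults are added on top.
   */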
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  static inline int get_numa_group(struct numa_group *grp)
  {
  	return atomic_inc_not_zero(&grp->refcount);
  }
  
  static inline void put_numa_group(struct numa_group *grp)
  {
  	if (atomic_dec_and_test(&grp->refcount))
  		kfree_rcu(grp, rcu);
  }
3e6a9418c   Mel Gorman   sched/numa: Take ...
1504
1505
  static void task_numa_group(struct task_struct *p, int cpupid, int flags,
  			int *priv)
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  {
  	struct numa_group *grp, *my_grp;
  	struct task_struct *tsk;
  	bool join = false;
  	int cpu = cpupid_to_cpu(cpupid);
  	int i;
  
  	if (unlikely(!p->numa_group)) {
  		unsigned int size = sizeof(struct numa_group) +
50ec8a401   Rik van Riel   sched/numa: Track...
1515
  				    4*nr_node_ids*sizeof(unsigned long);
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  
  		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
  		if (!grp)
  			return;
  
  		atomic_set(&grp->refcount, 1);
  		spin_lock_init(&grp->lock);
  		INIT_LIST_HEAD(&grp->task_list);
e29cf08b0   Mel Gorman   sched/numa: Repor...
1524
  		grp->gid = p->pid;
50ec8a401   Rik van Riel   sched/numa: Track...
1525
  		/* Second half of the array tracks nids where faults happen */
be1e4e760   Rik van Riel   sched/numa: Turn ...
1526
1527
  		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
  						nr_node_ids;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1528

20e07dea2   Rik van Riel   sched/numa: Build...
1529
  		node_set(task_node(current), grp->active_nodes);
be1e4e760   Rik van Riel   sched/numa: Turn ...
1530
  		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
ff1df896a   Rik van Riel   sched/numa: Renam...
1531
  			grp->faults[i] = p->numa_faults_memory[i];
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1532

989348b5f   Mel Gorman   sched/numa: Use u...
1533
  		grp->total_faults = p->total_numa_faults;
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1534

8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  		list_add(&p->numa_entry, &grp->task_list);
  		grp->nr_tasks++;
  		rcu_assign_pointer(p->numa_group, grp);
  	}
  
  	rcu_read_lock();
  	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
  
  	if (!cpupid_match_pid(tsk, cpupid))
3354781a2   Peter Zijlstra   sched/numa: Reflo...
1544
  		goto no_join;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1545
1546
1547
  
  	grp = rcu_dereference(tsk->numa_group);
  	if (!grp)
3354781a2   Peter Zijlstra   sched/numa: Reflo...
1548
  		goto no_join;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1549
1550
1551
  
  	my_grp = p->numa_group;
  	if (grp == my_grp)
3354781a2   Peter Zijlstra   sched/numa: Reflo...
1552
  		goto no_join;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  
  	/*
  	 * Only join the other group if it's bigger; if we're the bigger group,
  	 * the other task will join us.
  	 */
  	if (my_grp->nr_tasks > grp->nr_tasks)
3354781a2   Peter Zijlstra   sched/numa: Reflo...
1559
  		goto no_join;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  
  	/*
  	 * Tie-break on the grp address.
  	 */
  	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3354781a2   Peter Zijlstra   sched/numa: Reflo...
1565
  		goto no_join;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1566

dabe1d992   Rik van Riel   sched/numa: Be mo...
  	/* Always join threads in the same process. */
  	if (tsk->mm == current->mm)
  		join = true;
  
  	/* Simple filter to avoid false positives due to PID collisions */
  	if (flags & TNF_SHARED)
  		join = true;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1574

3e6a9418c   Mel Gorman   sched/numa: Take ...
1575
1576
  	/* Update priv based on whether false sharing was detected */
  	*priv = !join;
dabe1d992   Rik van Riel   sched/numa: Be mo...
1577
  	if (join && !get_numa_group(grp))
3354781a2   Peter Zijlstra   sched/numa: Reflo...
1578
  		goto no_join;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1579

8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  	rcu_read_unlock();
  
  	if (!join)
  		return;
60e69eed8   Mike Galbraith   sched/numa: Fix t...
1584
1585
  	BUG_ON(irqs_disabled());
  	double_lock_irq(&my_grp->lock, &grp->lock);
989348b5f   Mel Gorman   sched/numa: Use u...
1586

be1e4e760   Rik van Riel   sched/numa: Turn ...
1587
  	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
ff1df896a   Rik van Riel   sched/numa: Renam...
1588
1589
  		my_grp->faults[i] -= p->numa_faults_memory[i];
  		grp->faults[i] += p->numa_faults_memory[i];
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1590
  	}
989348b5f   Mel Gorman   sched/numa: Use u...
1591
1592
  	my_grp->total_faults -= p->total_numa_faults;
  	grp->total_faults += p->total_numa_faults;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  
  	list_move(&p->numa_entry, &grp->task_list);
  	my_grp->nr_tasks--;
  	grp->nr_tasks++;
  
  	spin_unlock(&my_grp->lock);
60e69eed8   Mike Galbraith   sched/numa: Fix t...
1599
  	spin_unlock_irq(&grp->lock);
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  
  	rcu_assign_pointer(p->numa_group, grp);
  
  	put_numa_group(my_grp);
3354781a2   Peter Zijlstra   sched/numa: Reflo...
  	return;
  
  no_join:
  	rcu_read_unlock();
  	return;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  }
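
  /*
   * Layout sketch, inferred from the allocation above (illustrative only):
   * grp->faults[] is one flat array of 4 * nr_node_ids counters; the first
   * NR_NUMA_HINT_FAULT_TYPES * nr_node_ids entries hold the memory-fault
   * statistics and grp->faults_cpu simply points at the second half, which
   * holds the CPU-fault statistics.  That is why the copy loops above can
   * walk NR_NUMA_HINT_FAULT_STATS * nr_node_ids entries in one go.
   */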
  
  void task_numa_free(struct task_struct *p)
  {
  	struct numa_group *grp = p->numa_group;
ff1df896a   Rik van Riel   sched/numa: Renam...
1614
  	void *numa_faults = p->numa_faults_memory;
e9dd685ce   Steven Rostedt   sched/numa: Fix u...
1615
1616
  	unsigned long flags;
  	int i;
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1617
1618
  
  	if (grp) {
e9dd685ce   Steven Rostedt   sched/numa: Fix u...
1619
  		spin_lock_irqsave(&grp->lock, flags);
be1e4e760   Rik van Riel   sched/numa: Turn ...
1620
  		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
ff1df896a   Rik van Riel   sched/numa: Renam...
1621
  			grp->faults[i] -= p->numa_faults_memory[i];
989348b5f   Mel Gorman   sched/numa: Use u...
1622
  		grp->total_faults -= p->total_numa_faults;
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1623

8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1624
1625
  		list_del(&p->numa_entry);
  		grp->nr_tasks--;
e9dd685ce   Steven Rostedt   sched/numa: Fix u...
1626
  		spin_unlock_irqrestore(&grp->lock, flags);
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1627
1628
1629
  		rcu_assign_pointer(p->numa_group, NULL);
  		put_numa_group(grp);
  	}
ff1df896a   Rik van Riel   sched/numa: Renam...
1630
1631
  	p->numa_faults_memory = NULL;
  	p->numa_faults_buffer_memory = NULL;
50ec8a401   Rik van Riel   sched/numa: Track...
1632
1633
  	p->numa_faults_cpu = NULL;
  	p->numa_faults_buffer_cpu = NULL;
82727018b   Rik van Riel   sched/numa: Call ...
1634
  	kfree(numa_faults);
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1635
  }
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1636
1637
1638
  /*
   * Got a PROT_NONE fault for a page on @node.
   */
58b46da33   Rik van Riel   sched/numa: Renam...
1639
  void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1640
1641
  {
  	struct task_struct *p = current;
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1642
  	bool migrated = flags & TNF_MIGRATED;
58b46da33   Rik van Riel   sched/numa: Renam...
1643
  	int cpu_node = task_node(current);
ac8e895bd   Mel Gorman   sched/numa: Add i...
1644
  	int priv;
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1645

10e84b97e   Dave Kleikamp   mm: sched: numa: ...
1646
  	if (!numabalancing_enabled)
1a687c2e9   Mel Gorman   mm: sched: numa: ...
1647
  		return;
9ff1d9ff3   Mel Gorman   sched/numa: Check...
1648
1649
1650
  	/* for example, ksmd faulting in a user's mm */
  	if (!p->mm)
  		return;
82727018b   Rik van Riel   sched/numa: Call ...
1651
1652
1653
  	/* Do not worry about placement if exiting */
  	if (p->state == TASK_DEAD)
  		return;
f809ca9a5   Mel Gorman   sched/numa: Track...
1654
  	/* Allocate buffer to track faults on a per-node basis */
ff1df896a   Rik van Riel   sched/numa: Renam...
1655
  	if (unlikely(!p->numa_faults_memory)) {
be1e4e760   Rik van Riel   sched/numa: Turn ...
1656
1657
  		int size = sizeof(*p->numa_faults_memory) *
  			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
f809ca9a5   Mel Gorman   sched/numa: Track...
1658

be1e4e760   Rik van Riel   sched/numa: Turn ...
1659
  		p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
ff1df896a   Rik van Riel   sched/numa: Renam...
1660
  		if (!p->numa_faults_memory)
f809ca9a5   Mel Gorman   sched/numa: Track...
1661
  			return;
745d61476   Mel Gorman   sched/numa: Updat...
1662

ff1df896a   Rik van Riel   sched/numa: Renam...
1663
  		BUG_ON(p->numa_faults_buffer_memory);
be1e4e760   Rik van Riel   sched/numa: Turn ...
  		/*
  		 * The averaged statistics, shared & private, memory & cpu,
  		 * occupy the first half of the array. The second half of the
  		 * array is for current counters, which are averaged into the
  		 * first set by task_numa_placement.
  		 */
50ec8a401   Rik van Riel   sched/numa: Track...
1670
1671
1672
  		p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
  		p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
  		p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
83e1d2cd9   Mel Gorman   sched/numa: Use g...
1673
  		p->total_numa_faults = 0;
04bb2f947   Rik van Riel   sched/numa: Adjus...
1674
  		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
f809ca9a5   Mel Gorman   sched/numa: Track...
1675
  	}
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1676

fb003b80d   Mel Gorman   sched: numa: Slow...
1677
  	/*
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
  	 * First accesses are treated as private, otherwise consider accesses
  	 * to be private if the accessing pid has not changed
  	 */
  	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
  		priv = 1;
  	} else {
  		priv = cpupid_match_pid(p, last_cpupid);
6688cc054   Peter Zijlstra   mm: numa: Do not ...
1685
  		if (!priv && !(flags & TNF_NO_GROUP))
3e6a9418c   Mel Gorman   sched/numa: Take ...
1686
  			task_numa_group(p, last_cpupid, flags, &priv);
8c8a743c5   Peter Zijlstra   sched/numa: Use {...
1687
  	}
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1688
  	task_numa_placement(p);
f809ca9a5   Mel Gorman   sched/numa: Track...
1689

2739d3eef   Rik van Riel   sched/numa: Retry...
  	/*
  	 * Retry task to preferred node migration periodically, in case it
  	 * previously failed, or the scheduler moved us.
  	 */
  	if (time_after(jiffies, p->numa_migrate_retry))
6b9a7460b   Mel Gorman   sched/numa: Retry...
1695
  		numa_migrate_preferred(p);
b32e86b43   Ingo Molnar   sched/numa: Add d...
1696
1697
  	if (migrated)
  		p->numa_pages_migrated += pages;
58b46da33   Rik van Riel   sched/numa: Renam...
1698
1699
  	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
  	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
04bb2f947   Rik van Riel   sched/numa: Adjus...
1700
  	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1701
  }
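
  /*
   * Layout sketch of the per-task allocation above (illustrative only): a
   * single kzalloc of NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids counters is
   * carved into four equal quarters,
   *
   *   numa_faults_memory        at offset 0
   *   numa_faults_cpu           at offset 2 * nr_node_ids
   *   numa_faults_buffer_memory at offset 4 * nr_node_ids
   *   numa_faults_buffer_cpu    at offset 6 * nr_node_ids
   *
   * with the two "buffer" quarters collecting the current window and the
   * first two holding the decayed averages folded in by task_numa_placement().
   */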
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1702
1703
1704
1705
1706
  static void reset_ptenuma_scan(struct task_struct *p)
  {
  	ACCESS_ONCE(p->mm->numa_scan_seq)++;
  	p->mm->numa_scan_offset = 0;
  }
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
  /*
   * The expensive part of numa migration is done from task_work context.
   * Triggered from task_tick_numa().
   */
  void task_numa_work(struct callback_head *work)
  {
  	unsigned long migrate, next_scan, now = jiffies;
  	struct task_struct *p = current;
  	struct mm_struct *mm = p->mm;
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1716
  	struct vm_area_struct *vma;
9f40604cd   Mel Gorman   sched, numa, mm: ...
1717
  	unsigned long start, end;
598f0ec0b   Mel Gorman   sched/numa: Set t...
1718
  	unsigned long nr_pte_updates = 0;
9f40604cd   Mel Gorman   sched, numa, mm: ...
1719
  	long pages;
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
  
  	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
  
  	work->next = work; /* protect against double add */
  	/*
  	 * Who cares about NUMA placement when they're dying.
  	 *
  	 * NOTE: make sure not to dereference p->mm before this check,
  	 * exit_task_work() happens _after_ exit_mm() so we could be called
  	 * without p->mm even though we still had it when we enqueued this
  	 * work.
  	 */
  	if (p->flags & PF_EXITING)
  		return;
930aa174f   Mel Gorman   sched/numa: Remov...
1734
  	if (!mm->numa_next_scan) {
7e8d16b6c   Mel Gorman   sched/numa: Initi...
1735
1736
  		mm->numa_next_scan = now +
  			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
b8593bfda   Mel Gorman   mm: sched: Adapt ...
1737
1738
1739
  	}
  
  	/*
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
  	 * Enforce maximal scan/migration frequency..
  	 */
  	migrate = mm->numa_next_scan;
  	if (time_before(now, migrate))
  		return;
598f0ec0b   Mel Gorman   sched/numa: Set t...
  	if (p->numa_scan_period == 0) {
  		p->numa_scan_period_max = task_scan_max(p);
  		p->numa_scan_period = task_scan_min(p);
  	}
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1749

fb003b80d   Mel Gorman   sched: numa: Slow...
1750
  	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1751
1752
  	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
  		return;
e14808b49   Mel Gorman   mm: numa: Rate li...
1753
  	/*
19a78d110   Peter Zijlstra   sched/numa: Mitig...
  	 * Delay this task enough that another task of this mm will likely win
  	 * the next time around.
  	 */
  	p->node_stamp += 2 * TICK_NSEC;
9f40604cd   Mel Gorman   sched, numa, mm: ...
  	start = mm->numa_scan_offset;
  	pages = sysctl_numa_balancing_scan_size;
  	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
  	if (!pages)
  		return;
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1763

6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1764
  	down_read(&mm->mmap_sem);
9f40604cd   Mel Gorman   sched, numa, mm: ...
1765
  	vma = find_vma(mm, start);
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1766
1767
  	if (!vma) {
  		reset_ptenuma_scan(p);
9f40604cd   Mel Gorman   sched, numa, mm: ...
1768
  		start = 0;
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1769
1770
  		vma = mm->mmap;
  	}
9f40604cd   Mel Gorman   sched, numa, mm: ...
1771
  	for (; vma; vma = vma->vm_next) {
fc3147245   Mel Gorman   mm: numa: Limit N...
1772
  		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1773
  			continue;
4591ce4f2   Mel Gorman   sched/numa: Do no...
  		/*
  		 * Shared library pages mapped by multiple processes are not
  		 * migrated as it is expected they are cache replicated. Avoid
  		 * hinting faults in read-only file-backed mappings or the vdso
  		 * as migrating the pages will be of marginal benefit.
  		 */
  		if (!vma->vm_mm ||
  		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
  			continue;
3c67f4745   Mel Gorman   sched: numa: skip...
  		/*
  		 * Skip inaccessible VMAs to avoid any confusion between
  		 * PROT_NONE and NUMA hinting ptes
  		 */
  		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
  			continue;
4591ce4f2   Mel Gorman   sched/numa: Do no...
1789

9f40604cd   Mel Gorman   sched, numa, mm: ...
  		do {
  			start = max(start, vma->vm_start);
  			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
  			end = min(end, vma->vm_end);
598f0ec0b   Mel Gorman   sched/numa: Set t...
  			nr_pte_updates += change_prot_numa(vma, start, end);
  
  			/*
  			 * Scan sysctl_numa_balancing_scan_size but ensure that
  			 * at least one PTE is updated so that unused virtual
  			 * address space is quickly skipped.
  			 */
  			if (nr_pte_updates)
  				pages -= (end - start) >> PAGE_SHIFT;
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1803

9f40604cd   Mel Gorman   sched, numa, mm: ...
1804
1805
1806
  			start = end;
  			if (pages <= 0)
  				goto out;
3cf1962cd   Rik van Riel   sched,numa: add c...
1807
1808
  
  			cond_resched();
9f40604cd   Mel Gorman   sched, numa, mm: ...
1809
  		} while (end != vma->vm_end);
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1810
  	}
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1811

9f40604cd   Mel Gorman   sched, numa, mm: ...
1812
  out:
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1813
  	/*
c69307d53   Peter Zijlstra   sched/numa: Fix c...
  	 * It is possible to reach the end of the VMA list but the last few
  	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
  	 * would find the !migratable VMA on the next scan but not reset the
  	 * scanner to the start, so check it now.
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1818
1819
  	 */
  	if (vma)
9f40604cd   Mel Gorman   sched, numa, mm: ...
1820
  		mm->numa_scan_offset = start;
6e5fb223e   Peter Zijlstra   mm: sched: numa: ...
1821
1822
1823
  	else
  		reset_ptenuma_scan(p);
  	up_read(&mm->mmap_sem);
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
  }
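
  /*
   * Worked example for the scan sizing in task_numa_work() (illustrative
   * numbers only): with a 256MB scan size and 4KiB pages,
   * pages = 256 << (20 - 12) = 65536, so each pass marks at most ~64K PTEs
   * for hinting faults, with chunk ends aligned to HPAGE_SIZE per VMA.
   */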
  
  /*
   * Drive the periodic memory faults..
   */
  void task_tick_numa(struct rq *rq, struct task_struct *curr)
  {
  	struct callback_head *work = &curr->numa_work;
  	u64 period, now;
  
  	/*
  	 * We don't care about NUMA placement if we don't have memory.
  	 */
  	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
  		return;
  
  	/*
  	 * Using runtime rather than walltime has the dual advantage that
  	 * we (mostly) drive the selection from busy threads and that the
  	 * task needs to have done some actual work before we bother with
  	 * NUMA placement.
  	 */
  	now = curr->se.sum_exec_runtime;
  	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
  
  	if (now - curr->node_stamp > period) {
4b96a29ba   Peter Zijlstra   mm: sched: numa: ...
1850
  		if (!curr->node_stamp)
598f0ec0b   Mel Gorman   sched/numa: Set t...
1851
  			curr->numa_scan_period = task_scan_min(curr);
19a78d110   Peter Zijlstra   sched/numa: Mitig...
1852
  		curr->node_stamp += period;
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
  
  		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
  			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
  			task_work_add(curr, work, true);
  		}
  	}
  }
  #else
  static void task_tick_numa(struct rq *rq, struct task_struct *curr)
  {
  }
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
  
  static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
  {
  }
  
  static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
  {
  }
cbee9f88e   Peter Zijlstra   mm: numa: Add fau...
1872
  #endif /* CONFIG_NUMA_BALANCING */
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
1873
1874
1875
1876
  static void
  account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	update_load_add(&cfs_rq->load, se->load.weight);
c09595f63   Peter Zijlstra   sched: revert rev...
1877
  	if (!parent_entity(se))
029632fbb   Peter Zijlstra   sched: Make separ...
1878
  		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
367456c75   Peter Zijlstra   sched: Ditch per ...
1879
  #ifdef CONFIG_SMP
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
  	if (entity_is_task(se)) {
  		struct rq *rq = rq_of(cfs_rq);
  
  		account_numa_enqueue(rq, task_of(se));
  		list_add(&se->group_node, &rq->cfs_tasks);
  	}
367456c75   Peter Zijlstra   sched: Ditch per ...
1886
  #endif
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
1887
  	cfs_rq->nr_running++;
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
  }
  
  static void
  account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  	update_load_sub(&cfs_rq->load, se->load.weight);
c09595f63   Peter Zijlstra   sched: revert rev...
1894
  	if (!parent_entity(se))
029632fbb   Peter Zijlstra   sched: Make separ...
1895
  		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
1896
1897
  	if (entity_is_task(se)) {
  		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
b87f17242   Bharata B Rao   sched: maintain o...
1898
  		list_del_init(&se->group_node);
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
1899
  	}
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
1900
  	cfs_rq->nr_running--;
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
1901
  }
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1902
1903
  #ifdef CONFIG_FAIR_GROUP_SCHED
  # ifdef CONFIG_SMP
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
  static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
  {
  	long tg_weight;
  
  	/*
  	 * Use this CPU's actual weight instead of the last load_contribution
  	 * to gain a more accurate current total weight. See
  	 * update_cfs_rq_load_contribution().
  	 */
bf5b986ed   Alex Shi   sched/tg: Use 'un...
1913
  	tg_weight = atomic_long_read(&tg->load_avg);
82958366c   Paul Turner   sched: Replace up...
1914
  	tg_weight -= cfs_rq->tg_load_contrib;
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
  	tg_weight += cfs_rq->load.weight;
  
  	return tg_weight;
  }
6d5ab2932   Paul Turner   sched: Simplify u...
1919
  static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1920
  {
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
1921
  	long tg_weight, load, shares;
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1922

cf5f0acf3   Peter Zijlstra   sched: Add a comm...
1923
  	tg_weight = calc_tg_weight(tg, cfs_rq);
6d5ab2932   Paul Turner   sched: Simplify u...
1924
  	load = cfs_rq->load.weight;
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1925

3ff6dcac7   Yong Zhang   sched: Fix poor i...
1926
  	shares = (tg->shares * load);
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
1927
1928
  	if (tg_weight)
  		shares /= tg_weight;
3ff6dcac7   Yong Zhang   sched: Fix poor i...
  
  	if (shares < MIN_SHARES)
  		shares = MIN_SHARES;
  	if (shares > tg->shares)
  		shares = tg->shares;
  
  	return shares;
  }
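
  /*
   * Worked example for calc_cfs_shares() (illustrative numbers only): with
   * tg->shares = 1024 and this cpu's cfs_rq carrying 2048 of a total group
   * weight of 8192, shares = 1024 * 2048 / 8192 = 256; the result is then
   * clamped to the [MIN_SHARES, tg->shares] range before reweight_entity()
   * applies it to the group's sched_entity on this cpu.
   */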
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1937
  # else /* CONFIG_SMP */
6d5ab2932   Paul Turner   sched: Simplify u...
1938
  static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1939
1940
1941
  {
  	return tg->shares;
  }
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1942
  # endif /* CONFIG_SMP */
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1943
1944
1945
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
  			    unsigned long weight)
  {
19e5eebb8   Paul Turner   sched: Fix intera...
  	if (se->on_rq) {
  		/* commit outstanding execution time */
  		if (cfs_rq->curr == se)
  			update_curr(cfs_rq);
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1950
  		account_entity_dequeue(cfs_rq, se);
19e5eebb8   Paul Turner   sched: Fix intera...
1951
  	}
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
  
  	update_load_set(&se->load, weight);
  
  	if (se->on_rq)
  		account_entity_enqueue(cfs_rq, se);
  }
82958366c   Paul Turner   sched: Replace up...
1958
  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
6d5ab2932   Paul Turner   sched: Simplify u...
1959
  static void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1960
1961
1962
  {
  	struct task_group *tg;
  	struct sched_entity *se;
3ff6dcac7   Yong Zhang   sched: Fix poor i...
1963
  	long shares;
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1964

2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1965
1966
  	tg = cfs_rq->tg;
  	se = tg->se[cpu_of(rq_of(cfs_rq))];
64660c864   Paul Turner   sched: Prevent in...
1967
  	if (!se || throttled_hierarchy(cfs_rq))
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1968
  		return;
3ff6dcac7   Yong Zhang   sched: Fix poor i...
  #ifndef CONFIG_SMP
  	if (likely(se->load.weight == tg->shares))
  		return;
  #endif
6d5ab2932   Paul Turner   sched: Simplify u...
1973
  	shares = calc_cfs_shares(cfs_rq, tg);
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
  
  	reweight_entity(cfs_rq_of(se), se, shares);
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
6d5ab2932   Paul Turner   sched: Simplify u...
1978
  static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
1979
1980
1981
  {
  }
  #endif /* CONFIG_FAIR_GROUP_SCHED */
141965c74   Alex Shi   Revert "sched: In...
1982
  #ifdef CONFIG_SMP
9d85f21c9   Paul Turner   sched: Track the ...
1983
  /*
5b51f2f80   Paul Turner   sched: Make __upd...
   * We choose a half-life close to 1 scheduling period.
   * Note: The tables below are dependent on this value.
   */
  #define LOAD_AVG_PERIOD 32
  #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
  #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
  
  /* Precomputed fixed inverse multiplies for multiplication by y^n */
  static const u32 runnable_avg_yN_inv[] = {
  	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
  	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
  	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
  	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
  	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
  	0x85aac367, 0x82cd8698,
  };
  
  /*
   * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
   * over-estimates when re-combining.
   */
  static const u32 runnable_avg_yN_sum[] = {
  	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
  	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
  	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
  };
  
  /*
9d85f21c9   Paul Turner   sched: Track the ...
   * Approximate:
   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
   */
  static __always_inline u64 decay_load(u64 val, u64 n)
  {
5b51f2f80   Paul Turner   sched: Make __upd...
  	unsigned int local_n;
  
  	if (!n)
  		return val;
  	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
  		return 0;
  
  	/* after bounds checking we can collapse to 32-bit */
  	local_n = n;
  
  	/*
  	 * As y^PERIOD = 1/2, we can combine
  	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
  	 * With a look-up table which covers y^n (n < PERIOD)
  	 *
  	 * To achieve constant time decay_load.
  	 */
  	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
  		val >>= local_n / LOAD_AVG_PERIOD;
  		local_n %= LOAD_AVG_PERIOD;
9d85f21c9   Paul Turner   sched: Track the ...
2037
  	}
5b51f2f80   Paul Turner   sched: Make __upd...
  	val *= runnable_avg_yN_inv[local_n];
  	/* We don't use SRR here since we always want to round down. */
  	return val >> 32;
  }
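
  /*
   * Quick sanity check (illustrative): decay_load(1024, 32) returns ~512,
   * one half-life, and decay_load(1024, 64) returns ~256.  Anything decayed
   * for more than LOAD_AVG_PERIOD * 63 periods is cut off to 0 above, as the
   * remaining contribution is negligible by then.
   */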
  
  /*
   * For updates fully spanning n periods, the contribution to runnable
   * average will be: \Sum 1024*y^n
   *
   * We can compute this reasonably efficiently by combining:
   *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
   */
  static u32 __compute_runnable_contrib(u64 n)
  {
  	u32 contrib = 0;
  
  	if (likely(n <= LOAD_AVG_PERIOD))
  		return runnable_avg_yN_sum[n];
  	else if (unlikely(n >= LOAD_AVG_MAX_N))
  		return LOAD_AVG_MAX;
  
  	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
  	do {
  		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
  		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
  
  		n -= LOAD_AVG_PERIOD;
  	} while (n > LOAD_AVG_PERIOD);
  
  	contrib = decay_load(contrib, n);
  	return contrib + runnable_avg_yN_sum[n];
9d85f21c9   Paul Turner   sched: Track the ...
  }
  
  /*
   * We can represent the historical contribution to runnable average as the
   * coefficients of a geometric series.  To do this we sub-divide our runnable
   * history into segments of approximately 1ms (1024us); label the segment that
   * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
   *
   * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
   *      p0            p1           p2
   *     (now)       (~1ms ago)  (~2ms ago)
   *
   * Let u_i denote the fraction of p_i that the entity was runnable.
   *
   * We then designate the fractions u_i as our co-efficients, yielding the
   * following representation of historical load:
   *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
   *
   * We choose y based on the width of a reasonable scheduling period, fixing:
   *   y^32 = 0.5
   *
   * This means that the contribution to load ~32ms ago (u_32) will be weighted
   * approximately half as much as the contribution to load within the last ms
   * (u_0).
   *
   * When a period "rolls over" and we have new u_0`, multiplying the previous
   * sum again by y is sufficient to update:
   *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
   *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
   */
  static __always_inline int __update_entity_runnable_avg(u64 now,
  							struct sched_avg *sa,
  							int runnable)
  {
5b51f2f80   Paul Turner   sched: Make __upd...
2103
2104
  	u64 delta, periods;
  	u32 runnable_contrib;
9d85f21c9   Paul Turner   sched: Track the ...
  	int delta_w, decayed = 0;
  
  	delta = now - sa->last_runnable_update;
  	/*
  	 * This should only happen when time goes backwards, which it
  	 * unfortunately does during sched clock init when we swap over to TSC.
  	 */
  	if ((s64)delta < 0) {
  		sa->last_runnable_update = now;
  		return 0;
  	}
  
  	/*
  	 * Use 1024ns as the unit of measurement since it's a reasonable
  	 * approximation of 1us and fast to compute.
  	 */
  	delta >>= 10;
  	if (!delta)
  		return 0;
  	sa->last_runnable_update = now;
  
  	/* delta_w is the amount already accumulated against our next period */
  	delta_w = sa->runnable_avg_period % 1024;
  	if (delta + delta_w >= 1024) {
  		/* period roll-over */
  		decayed = 1;
  
  		/*
  		 * Now that we know we're crossing a period boundary, figure
  		 * out how much from delta we need to complete the current
  		 * period and accrue it.
  		 */
  		delta_w = 1024 - delta_w;
5b51f2f80   Paul Turner   sched: Make __upd...
  		if (runnable)
  			sa->runnable_avg_sum += delta_w;
  		sa->runnable_avg_period += delta_w;
  
  		delta -= delta_w;
  
  		/* Figure out how many additional periods this update spans */
  		periods = delta / 1024;
  		delta %= 1024;
  
  		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
  						  periods + 1);
  		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
  						     periods + 1);
  
  		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
  		runnable_contrib = __compute_runnable_contrib(periods);
  		if (runnable)
  			sa->runnable_avg_sum += runnable_contrib;
  		sa->runnable_avg_period += runnable_contrib;
9d85f21c9   Paul Turner   sched: Track the ...
  	}
  
  	/* Remainder of delta accrued against u_0` */
  	if (runnable)
  		sa->runnable_avg_sum += delta;
  	sa->runnable_avg_period += delta;
  
  	return decayed;
  }
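
  /*
   * Steady-state intuition (illustrative): for an entity that is runnable in
   * every period, runnable_avg_sum and runnable_avg_period both converge on
   * LOAD_AVG_MAX (47742), so their ratio approaches 1; an entity runnable in
   * no period decays toward 0.  Everything in between maps to a fraction of
   * the cpu weighted toward the last few 32ms half-lives.
   */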
9ee474f55   Paul Turner   sched: Maintain t...
2167
  /* Synchronize an entity's decay with its parenting cfs_rq.*/
aff3e4988   Paul Turner   sched: Account fo...
2168
  static inline u64 __synchronize_entity_decay(struct sched_entity *se)
9ee474f55   Paul Turner   sched: Maintain t...
  {
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  	u64 decays = atomic64_read(&cfs_rq->decay_counter);
  
  	decays -= se->avg.decay_count;
  	if (!decays)
aff3e4988   Paul Turner   sched: Account fo...
2175
  		return 0;
9ee474f55   Paul Turner   sched: Maintain t...
2176
2177
2178
  
  	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
  	se->avg.decay_count = 0;
aff3e4988   Paul Turner   sched: Account fo...
2179
2180
  
  	return decays;
9ee474f55   Paul Turner   sched: Maintain t...
2181
  }
c566e8e9e   Paul Turner   sched: Aggregate ...
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
  						 int force_update)
  {
  	struct task_group *tg = cfs_rq->tg;
bf5b986ed   Alex Shi   sched/tg: Use 'un...
2187
  	long tg_contrib;
c566e8e9e   Paul Turner   sched: Aggregate ...
2188
2189
2190
  
  	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
  	tg_contrib -= cfs_rq->tg_load_contrib;
bf5b986ed   Alex Shi   sched/tg: Use 'un...
2191
2192
  	if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
  		atomic_long_add(tg_contrib, &tg->load_avg);
c566e8e9e   Paul Turner   sched: Aggregate ...
2193
2194
2195
  		cfs_rq->tg_load_contrib += tg_contrib;
  	}
  }
8165e145c   Paul Turner   sched: Compute lo...
2196

bb17f6557   Paul Turner   sched: Normalize ...
  /*
   * Aggregate cfs_rq runnable averages into an equivalent task_group
   * representation for computing load contributions.
   */
  static inline void __update_tg_runnable_avg(struct sched_avg *sa,
  						  struct cfs_rq *cfs_rq)
  {
  	struct task_group *tg = cfs_rq->tg;
  	long contrib;
  
  	/* The fraction of a cpu used by this cfs_rq */
85b088e93   Michal Nazarewicz   sched/fair: Avoid...
2208
  	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
bb17f6557   Paul Turner   sched: Normalize ...
  			  sa->runnable_avg_period + 1);
  	contrib -= cfs_rq->tg_runnable_contrib;
  
  	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
  		atomic_add(contrib, &tg->runnable_avg);
  		cfs_rq->tg_runnable_contrib += contrib;
  	}
  }
8165e145c   Paul Turner   sched: Compute lo...
  static inline void __update_group_entity_contrib(struct sched_entity *se)
  {
  	struct cfs_rq *cfs_rq = group_cfs_rq(se);
  	struct task_group *tg = cfs_rq->tg;
bb17f6557   Paul Turner   sched: Normalize ...
2221
  	int runnable_avg;
8165e145c   Paul Turner   sched: Compute lo...
2222
2223
2224
  	u64 contrib;
  
  	contrib = cfs_rq->tg_load_contrib * tg->shares;
bf5b986ed   Alex Shi   sched/tg: Use 'un...
2225
2226
  	se->avg.load_avg_contrib = div_u64(contrib,
  				     atomic_long_read(&tg->load_avg) + 1);
bb17f6557   Paul Turner   sched: Normalize ...
  
  	/*
  	 * For group entities we need to compute a correction term in the case
  	 * that they are consuming <1 cpu so that we would contribute the same
  	 * load as a task of equal weight.
  	 *
  	 * Explicitly co-ordinating this measurement would be expensive, but
  	 * fortunately the sum of each cpu's contribution forms a usable
  	 * lower-bound on the true value.
  	 *
  	 * Consider the aggregate of 2 contributions.  Either they are disjoint
  	 * (and the sum represents the true value) or they overlap and we are
  	 * understating by the aggregate of their overlap.
  	 *
  	 * Extending this to N cpus, for a given overlap, the maximum amount we
  	 * can understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
  	 * cpus that overlap for this interval and w_i is the interval width.
  	 *
  	 * On a small machine, the first term is well-bounded, which bounds the
  	 * total error since w_i is a subset of the period.  Whereas on a
  	 * larger machine, while this first term can be larger, if w_i is of
  	 * consequential size then n_i*w_i is guaranteed to quickly converge to
  	 * our upper bound of 1-cpu.
  	 */
  	runnable_avg = atomic_read(&tg->runnable_avg);
  	if (runnable_avg < NICE_0_LOAD) {
  		se->avg.load_avg_contrib *= runnable_avg;
  		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
  	}
8165e145c   Paul Turner   sched: Compute lo...
2256
  }
f5f9739d7   Dietmar Eggemann   sched: Put rq's s...
  
  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
  {
  	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
  	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
  }
6e83125c6   Peter Zijlstra   sched/fair: Remov...
2263
  #else /* CONFIG_FAIR_GROUP_SCHED */
c566e8e9e   Paul Turner   sched: Aggregate ...
2264
2265
  static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
  						 int force_update) {}
bb17f6557   Paul Turner   sched: Normalize ...
2266
2267
  static inline void __update_tg_runnable_avg(struct sched_avg *sa,
  						  struct cfs_rq *cfs_rq) {}
8165e145c   Paul Turner   sched: Compute lo...
2268
  static inline void __update_group_entity_contrib(struct sched_entity *se) {}
f5f9739d7   Dietmar Eggemann   sched: Put rq's s...
2269
  static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
6e83125c6   Peter Zijlstra   sched/fair: Remov...
2270
  #endif /* CONFIG_FAIR_GROUP_SCHED */
c566e8e9e   Paul Turner   sched: Aggregate ...
2271

8165e145c   Paul Turner   sched: Compute lo...
2272
2273
2274
2275
2276
2277
2278
2279
2280
  static inline void __update_task_entity_contrib(struct sched_entity *se)
  {
  	u32 contrib;
  
  	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
  	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
  	contrib /= (se->avg.runnable_avg_period + 1);
  	se->avg.load_avg_contrib = scale_load(contrib);
  }
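
/*
 * Editor's illustration (not part of fair.c): a worked example of the task
 * contribution above, assuming a nice-0 weight of 1024 and treating
 * scale_load()/scale_load_down() as no-ops.
 */
#include <stdio.h>
#include <stdint.h>

/* a task's contribution: its decayed runnable time as a share of the decayed
 * period, weighted by the task's load weight (1024 for a nice-0 task) */
static uint32_t demo_task_contrib(uint32_t runnable_avg_sum,
				  uint32_t runnable_avg_period,
				  uint32_t weight)
{
	/* runnable_avg_sum is bounded (~47742), so sum * 1024 fits in 32 bits */
	uint32_t contrib = runnable_avg_sum * weight;

	return contrib / (runnable_avg_period + 1);
}

int main(void)
{
	/* a nice-0 task runnable for ~75% of its (decayed) lifetime */
	printf("load_avg_contrib = %u\n", demo_task_contrib(34500, 46000, 1024));
	return 0;
}
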
2dac754e1   Paul Turner   sched: Aggregate ...
2281
2282
2283
2284
  /* Compute the current contribution to load_avg by se, return any delta */
  static long __update_entity_load_avg_contrib(struct sched_entity *se)
  {
  	long old_contrib = se->avg.load_avg_contrib;
8165e145c   Paul Turner   sched: Compute lo...
2285
2286
2287
  	if (entity_is_task(se)) {
  		__update_task_entity_contrib(se);
  	} else {
bb17f6557   Paul Turner   sched: Normalize ...
2288
  		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
8165e145c   Paul Turner   sched: Compute lo...
2289
2290
  		__update_group_entity_contrib(se);
  	}
2dac754e1   Paul Turner   sched: Aggregate ...
2291
2292
2293
  
  	return se->avg.load_avg_contrib - old_contrib;
  }
9ee474f55   Paul Turner   sched: Maintain t...
2294
2295
2296
2297
2298
2299
2300
2301
  static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
  						 long load_contrib)
  {
  	if (likely(load_contrib < cfs_rq->blocked_load_avg))
  		cfs_rq->blocked_load_avg -= load_contrib;
  	else
  		cfs_rq->blocked_load_avg = 0;
  }
f1b17280e   Paul Turner   sched: Maintain r...
2302
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
9d85f21c9   Paul Turner   sched: Track the ...
2303
  /* Update a sched_entity's runnable average */
9ee474f55   Paul Turner   sched: Maintain t...
2304
2305
  static inline void update_entity_load_avg(struct sched_entity *se,
  					  int update_cfs_rq)
9d85f21c9   Paul Turner   sched: Track the ...
2306
  {
2dac754e1   Paul Turner   sched: Aggregate ...
2307
2308
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  	long contrib_delta;
f1b17280e   Paul Turner   sched: Maintain r...
2309
  	u64 now;
2dac754e1   Paul Turner   sched: Aggregate ...
2310

f1b17280e   Paul Turner   sched: Maintain r...
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
  	/*
  	 * For a group entity we need to use the cfs_rq_clock_task() of its
  	 * owned cfs_rq, in case it is the parent of a throttled hierarchy.
  	 */
  	if (entity_is_task(se))
  		now = cfs_rq_clock_task(cfs_rq);
  	else
  		now = cfs_rq_clock_task(group_cfs_rq(se));
  
  	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2dac754e1   Paul Turner   sched: Aggregate ...
2321
2322
2323
  		return;
  
  	contrib_delta = __update_entity_load_avg_contrib(se);
9ee474f55   Paul Turner   sched: Maintain t...
2324
2325
2326
  
  	if (!update_cfs_rq)
  		return;
2dac754e1   Paul Turner   sched: Aggregate ...
2327
2328
  	if (se->on_rq)
  		cfs_rq->runnable_load_avg += contrib_delta;
9ee474f55   Paul Turner   sched: Maintain t...
2329
2330
2331
2332
2333
2334
2335
2336
  	else
  		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
  }
  
  /*
   * Decay the load contributed by all blocked children and account this so that
   * their contribution may be appropriately discounted when they wake up.
   */
aff3e4988   Paul Turner   sched: Account fo...
2337
  static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
9ee474f55   Paul Turner   sched: Maintain t...
2338
  {
f1b17280e   Paul Turner   sched: Maintain r...
2339
  	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
9ee474f55   Paul Turner   sched: Maintain t...
2340
2341
2342
  	u64 decays;
  
  	decays = now - cfs_rq->last_decay;
aff3e4988   Paul Turner   sched: Account fo...
2343
  	if (!decays && !force_update)
9ee474f55   Paul Turner   sched: Maintain t...
2344
  		return;
2509940fd   Alex Shi   sched/cfs_rq: Cha...
2345
2346
2347
  	if (atomic_long_read(&cfs_rq->removed_load)) {
  		unsigned long removed_load;
  		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
aff3e4988   Paul Turner   sched: Account fo...
2348
2349
  		subtract_blocked_load_contrib(cfs_rq, removed_load);
  	}
9ee474f55   Paul Turner   sched: Maintain t...
2350

aff3e4988   Paul Turner   sched: Account fo...
2351
2352
2353
2354
2355
2356
  	if (decays) {
  		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
  						      decays);
  		atomic64_add(decays, &cfs_rq->decay_counter);
  		cfs_rq->last_decay = now;
  	}
c566e8e9e   Paul Turner   sched: Aggregate ...
2357
2358
  
  	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
9d85f21c9   Paul Turner   sched: Track the ...
2359
  }
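
/*
 * Editor's illustration (not part of fair.c): decay_load() is not shown in
 * this hunk; for intuition, it halves a load value every 32 periods, and one
 * period is 2^20 ns (~1.05 ms) because 'now' above is the task clock shifted
 * right by 20 bits.  Floating point is used here purely for illustration
 * (link with -lm); the kernel uses a precomputed fixed-point table.
 */
#include <stdio.h>
#include <math.h>

static unsigned long demo_decay_load(unsigned long load, unsigned long n_periods)
{
	/* load is halved every 32 periods */
	return (unsigned long)(load * pow(0.5, n_periods / 32.0));
}

int main(void)
{
	/* 2048 units of blocked load decay to ~1024 after ~33.6 ms asleep */
	printf("after 32 periods: %lu\n", demo_decay_load(2048, 32));
	printf("after 64 periods: %lu\n", demo_decay_load(2048, 64));
	return 0;
}
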
18bf2805d   Ben Segall   sched: Maintain p...
2360

2dac754e1   Paul Turner   sched: Aggregate ...
2361
2362
  /* Add the load generated by se into cfs_rq's child load-average */
  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f55   Paul Turner   sched: Maintain t...
2363
2364
  						  struct sched_entity *se,
  						  int wakeup)
2dac754e1   Paul Turner   sched: Aggregate ...
2365
  {
aff3e4988   Paul Turner   sched: Account fo...
2366
2367
2368
2369
  	/*
  	 * We track migrations using entity decay_count <= 0; on a wake-up
  	 * migration we use a negative decay count to track the remote decays
  	 * accumulated while sleeping.
a75cdaa91   Alex Shi   sched: Set an ini...
2370
2371
2372
2373
  	 *
  	 * Newly forked tasks are enqueued with se->avg.decay_count == 0; they
  	 * are seen by enqueue_entity_load_avg() as a migration with an already
  	 * constructed load_avg_contrib.
aff3e4988   Paul Turner   sched: Account fo...
2374
2375
  	 */
  	if (unlikely(se->avg.decay_count <= 0)) {
78becc270   Frederic Weisbecker   sched: Use an acc...
2376
  		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
aff3e4988   Paul Turner   sched: Account fo...
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
  		if (se->avg.decay_count) {
  			/*
  			 * In a wake-up migration we have to approximate the
  			 * time sleeping.  This is because we can't synchronize
  			 * clock_task between the two cpus, and it is not
  			 * guaranteed to be read-safe.  Instead, we can
  			 * approximate this using our carried decays, which are
  			 * explicitly atomically readable.
  			 */
  			se->avg.last_runnable_update -= (-se->avg.decay_count)
  							<< 20;
  			update_entity_load_avg(se, 0);
  			/* Indicate that we're now synchronized and on-rq */
  			se->avg.decay_count = 0;
  		}
9ee474f55   Paul Turner   sched: Maintain t...
2392
2393
  		wakeup = 0;
  	} else {
9390675af   Vincent Guittot   Revert "sched: Fi...
2394
  		__synchronize_entity_decay(se);
9ee474f55   Paul Turner   sched: Maintain t...
2395
  	}
aff3e4988   Paul Turner   sched: Account fo...
2396
2397
  	/* migrated tasks did not contribute to our blocked load */
  	if (wakeup) {
9ee474f55   Paul Turner   sched: Maintain t...
2398
  		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
aff3e4988   Paul Turner   sched: Account fo...
2399
2400
  		update_entity_load_avg(se, 0);
  	}
9ee474f55   Paul Turner   sched: Maintain t...
2401

2dac754e1   Paul Turner   sched: Aggregate ...
2402
  	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
aff3e4988   Paul Turner   sched: Account fo...
2403
2404
  	/* we force update consideration on load-balancer moves */
  	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2dac754e1   Paul Turner   sched: Aggregate ...
2405
  }
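
/*
 * Editor's illustration (not part of fair.c): how the wake-up migration
 * branch above turns a carried (negative) decay_count into an approximate
 * sleep time by rewinding last_runnable_update one 2^20 ns period per decay.
 * All values below are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* the entity carried 25 remote decay periods while it slept,
	 * i.e. the old cpu drove decay_count down to -25 */
	int64_t decay_count = -25;
	uint64_t last_runnable_update = 1000000000ULL;	/* ns, new cpu's clock */
	uint64_t approx_sleep_ns = (uint64_t)(-decay_count) << 20;

	/* rewind the stamp by ~1.05 ms per carried decay period */
	last_runnable_update -= approx_sleep_ns;

	printf("approximated sleep: %llu ns\n",
	       (unsigned long long)approx_sleep_ns);
	printf("rewound last_runnable_update: %llu ns\n",
	       (unsigned long long)last_runnable_update);
	return 0;
}
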
9ee474f55   Paul Turner   sched: Maintain t...
2406
2407
2408
2409
2410
  /*
   * Remove se's load from this cfs_rq's child load-average; if the entity is
   * transitioning to a blocked state we track its projected decay using
   * blocked_load_avg.
   */
2dac754e1   Paul Turner   sched: Aggregate ...
2411
  static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f55   Paul Turner   sched: Maintain t...
2412
2413
  						  struct sched_entity *se,
  						  int sleep)
2dac754e1   Paul Turner   sched: Aggregate ...
2414
  {
9ee474f55   Paul Turner   sched: Maintain t...
2415
  	update_entity_load_avg(se, 1);
aff3e4988   Paul Turner   sched: Account fo...
2416
2417
  	/* we force update consideration on load-balancer moves */
  	update_cfs_rq_blocked_load(cfs_rq, !sleep);
9ee474f55   Paul Turner   sched: Maintain t...
2418

2dac754e1   Paul Turner   sched: Aggregate ...
2419
  	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
9ee474f55   Paul Turner   sched: Maintain t...
2420
2421
2422
2423
  	if (sleep) {
  		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
  		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
  	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
2dac754e1   Paul Turner   sched: Aggregate ...
2424
  }
642dbc39a   Vincent Guittot   sched: Fix wrong ...
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
  
  /*
   * Update the rq's load with the elapsed running time before entering
   * idle. If the last scheduled task is not a CFS task, idle_enter will
   * be the only way to update the runnable statistic.
   */
  void idle_enter_fair(struct rq *this_rq)
  {
  	update_rq_runnable_avg(this_rq, 1);
  }
  
  /*
   * Update the rq's load with the elapsed idle time before a task is
   * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
   * be the only way to update the runnable statistic.
   */
  void idle_exit_fair(struct rq *this_rq)
  {
  	update_rq_runnable_avg(this_rq, 0);
  }
6e83125c6   Peter Zijlstra   sched/fair: Remov...
2445
  static int idle_balance(struct rq *this_rq);
38033c37f   Peter Zijlstra   sched: Push down ...
2446
  #else /* CONFIG_SMP */
9ee474f55   Paul Turner   sched: Maintain t...
2447
2448
  static inline void update_entity_load_avg(struct sched_entity *se,
  					  int update_cfs_rq) {}
18bf2805d   Ben Segall   sched: Maintain p...
2449
  static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2dac754e1   Paul Turner   sched: Aggregate ...
2450
  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f55   Paul Turner   sched: Maintain t...
2451
2452
  					   struct sched_entity *se,
  					   int wakeup) {}
2dac754e1   Paul Turner   sched: Aggregate ...
2453
  static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f55   Paul Turner   sched: Maintain t...
2454
2455
  					   struct sched_entity *se,
  					   int sleep) {}
aff3e4988   Paul Turner   sched: Account fo...
2456
2457
  static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
  					      int force_update) {}
6e83125c6   Peter Zijlstra   sched/fair: Remov...
2458
2459
2460
2461
2462
  
  static inline int idle_balance(struct rq *rq)
  {
  	return 0;
  }
38033c37f   Peter Zijlstra   sched: Push down ...
2463
  #endif /* CONFIG_SMP */
9d85f21c9   Paul Turner   sched: Track the ...
2464

2396af69b   Ingo Molnar   sched: remove the...
2465
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2466
  {
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2467
  #ifdef CONFIG_SCHEDSTATS
e414314cc   Peter Zijlstra   sched: Fix latenc...
2468
2469
2470
2471
  	struct task_struct *tsk = NULL;
  
  	if (entity_is_task(se))
  		tsk = task_of(se);
41acab885   Lucas De Marchi   sched: Implement ...
2472
  	if (se->statistics.sleep_start) {
78becc270   Frederic Weisbecker   sched: Use an acc...
2473
  		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2474
2475
2476
  
  		if ((s64)delta < 0)
  			delta = 0;
41acab885   Lucas De Marchi   sched: Implement ...
2477
2478
  		if (unlikely(delta > se->statistics.sleep_max))
  			se->statistics.sleep_max = delta;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2479

8c79a045f   Peter Zijlstra   sched/events: Rev...
2480
  		se->statistics.sleep_start = 0;
41acab885   Lucas De Marchi   sched: Implement ...
2481
  		se->statistics.sum_sleep_runtime += delta;
9745512ce   Arjan van de Ven   sched: latencytop...
2482

768d0c272   Peter Zijlstra   sched: Add wait, ...
2483
  		if (tsk) {
e414314cc   Peter Zijlstra   sched: Fix latenc...
2484
  			account_scheduler_latency(tsk, delta >> 10, 1);
768d0c272   Peter Zijlstra   sched: Add wait, ...
2485
2486
  			trace_sched_stat_sleep(tsk, delta);
  		}
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2487
  	}
41acab885   Lucas De Marchi   sched: Implement ...
2488
  	if (se->statistics.block_start) {
78becc270   Frederic Weisbecker   sched: Use an acc...
2489
  		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2490
2491
2492
  
  		if ((s64)delta < 0)
  			delta = 0;
41acab885   Lucas De Marchi   sched: Implement ...
2493
2494
  		if (unlikely(delta > se->statistics.block_max))
  			se->statistics.block_max = delta;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2495

8c79a045f   Peter Zijlstra   sched/events: Rev...
2496
  		se->statistics.block_start = 0;
41acab885   Lucas De Marchi   sched: Implement ...
2497
  		se->statistics.sum_sleep_runtime += delta;
30084fbd1   Ingo Molnar   sched: fix profil...
2498

e414314cc   Peter Zijlstra   sched: Fix latenc...
2499
  		if (tsk) {
8f0dfc34e   Arjan van de Ven   sched: Provide io...
2500
  			if (tsk->in_iowait) {
41acab885   Lucas De Marchi   sched: Implement ...
2501
2502
  				se->statistics.iowait_sum += delta;
  				se->statistics.iowait_count++;
768d0c272   Peter Zijlstra   sched: Add wait, ...
2503
  				trace_sched_stat_iowait(tsk, delta);
8f0dfc34e   Arjan van de Ven   sched: Provide io...
2504
  			}
b781a602a   Andrew Vagin   events, sched: Ad...
2505
  			trace_sched_stat_blocked(tsk, delta);
e414314cc   Peter Zijlstra   sched: Fix latenc...
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
  			/*
  			 * Blocking time is in units of nanosecs, so shift by
  			 * 20 to get a milliseconds-range estimation of the
  			 * amount of time that the task spent sleeping:
  			 */
  			if (unlikely(prof_on == SLEEP_PROFILING)) {
  				profile_hits(SLEEP_PROFILING,
  						(void *)get_wchan(tsk),
  						delta >> 20);
  			}
  			account_scheduler_latency(tsk, delta >> 10, 0);
30084fbd1   Ingo Molnar   sched: fix profil...
2517
  		}
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2518
2519
2520
  	}
  #endif
  }
ddc972975   Peter Zijlstra   sched debug: chec...
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
  static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHED_DEBUG
  	s64 d = se->vruntime - cfs_rq->min_vruntime;
  
  	if (d < 0)
  		d = -d;
  
  	if (d > 3*sysctl_sched_latency)
  		schedstat_inc(cfs_rq, nr_spread_over);
  #endif
  }
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2533
  static void
aeb73b040   Peter Zijlstra   sched: clean up n...
2534
2535
  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
  {
1af5f730f   Peter Zijlstra   sched: more accur...
2536
  	u64 vruntime = cfs_rq->min_vruntime;
94dfb5e75   Peter Zijlstra   sched: add tree b...
2537

2cb8600e6   Peter Zijlstra   sched: documentat...
2538
2539
2540
2541
2542
2543
  	/*
  	 * The 'current' period is already promised to the current tasks,
  	 * however the extra weight of the new task will slow them down a
  	 * little, place the new task so that it fits in the slot that
  	 * stays open at the end.
  	 */
94dfb5e75   Peter Zijlstra   sched: add tree b...
2544
  	if (initial && sched_feat(START_DEBIT))
f9c0b0950   Peter Zijlstra   sched: revert bac...
2545
  		vruntime += sched_vslice(cfs_rq, se);
aeb73b040   Peter Zijlstra   sched: clean up n...
2546

a2e7a7eb2   Mike Galbraith   sched: Remove unn...
2547
  	/* sleeps up to a single latency don't count. */
5ca9880c6   Mike Galbraith   sched: Remove FAI...
2548
  	if (!initial) {
a2e7a7eb2   Mike Galbraith   sched: Remove unn...
2549
  		unsigned long thresh = sysctl_sched_latency;
a7be37ac8   Peter Zijlstra   sched: revert the...
2550

a2e7a7eb2   Mike Galbraith   sched: Remove unn...
2551
  		/*
a2e7a7eb2   Mike Galbraith   sched: Remove unn...
2552
2553
2554
2555
2556
  		 * Halve their sleep time's effect, to allow
  		 * for a gentler effect of sleepers:
  		 */
  		if (sched_feat(GENTLE_FAIR_SLEEPERS))
  			thresh >>= 1;
51e0304ce   Ingo Molnar   sched: Implement ...
2557

a2e7a7eb2   Mike Galbraith   sched: Remove unn...
2558
  		vruntime -= thresh;
aeb73b040   Peter Zijlstra   sched: clean up n...
2559
  	}
b5d9d734a   Mike Galbraith   sched: Ensure tha...
2560
  	/* ensure we never gain time by being placed backwards. */
16c8f1c72   Viresh Kumar   sched/fair: Set s...
2561
  	se->vruntime = max_vruntime(se->vruntime, vruntime);
aeb73b040   Peter Zijlstra   sched: clean up n...
2562
  }
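
/*
 * Editor's illustration (not part of fair.c): the placement policy above in
 * miniature, using the 6 ms default sched_latency and assuming START_DEBIT
 * and GENTLE_FAIR_SLEEPERS are enabled; max_vruntime()'s wraparound handling
 * is omitted for brevity.
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_LATENCY_NS	6000000ULL	/* default 6 ms targeted latency */
#define GENTLE_FAIR_SLEEPERS	1		/* assumed feature settings */
#define START_DEBIT		1

static uint64_t demo_place_entity(uint64_t min_vruntime, uint64_t se_vruntime,
				  uint64_t vslice, int initial)
{
	uint64_t vruntime = min_vruntime;

	if (initial && START_DEBIT)
		vruntime += vslice;		/* a new task starts one slice behind */

	if (!initial) {
		uint64_t thresh = SCHED_LATENCY_NS;

		if (GENTLE_FAIR_SLEEPERS)
			thresh >>= 1;		/* sleepers get half the credit */
		vruntime -= thresh;
	}

	/* never let an entity gain time by being placed backwards */
	return se_vruntime > vruntime ? se_vruntime : vruntime;
}

int main(void)
{
	/* a woken task receives at most ~3 ms of vruntime credit */
	printf("wakeup: %llu\n", (unsigned long long)
	       demo_place_entity(100000000ULL, 90000000ULL, 2000000ULL, 0));
	/* a forked task is debited one virtual slice */
	printf("fork:   %llu\n", (unsigned long long)
	       demo_place_entity(100000000ULL, 0ULL, 2000000ULL, 1));
	return 0;
}
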
d3d9dc330   Paul Turner   sched: Throttle e...
2563
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
aeb73b040   Peter Zijlstra   sched: clean up n...
2564
  static void
88ec22d3e   Peter Zijlstra   sched: Remove the...
2565
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2566
2567
  {
  	/*
88ec22d3e   Peter Zijlstra   sched: Remove the...
2568
  	 * Update the normalized vruntime before updating min_vruntime
0fc576d59   Kamalesh Babulal   sched/fair: Fix t...
2569
  	 * through calling update_curr().
88ec22d3e   Peter Zijlstra   sched: Remove the...
2570
  	 */
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
2571
  	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
88ec22d3e   Peter Zijlstra   sched: Remove the...
2572
2573
2574
  		se->vruntime += cfs_rq->min_vruntime;
  
  	/*
a2a2d6807   Dmitry Adamushko   sched: cleanup, m...
2575
  	 * Update run-time statistics of the 'current'.
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2576
  	 */
b7cc08965   Ingo Molnar   sched: remove the...
2577
  	update_curr(cfs_rq);
f269ae046   Paul Turner   sched: Update_cfs...
2578
  	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
17bc14b76   Linus Torvalds   Revert "sched: Up...
2579
2580
  	account_entity_enqueue(cfs_rq, se);
  	update_cfs_shares(cfs_rq);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2581

88ec22d3e   Peter Zijlstra   sched: Remove the...
2582
  	if (flags & ENQUEUE_WAKEUP) {
aeb73b040   Peter Zijlstra   sched: clean up n...
2583
  		place_entity(cfs_rq, se, 0);
2396af69b   Ingo Molnar   sched: remove the...
2584
  		enqueue_sleeper(cfs_rq, se);
e9acbff64   Ingo Molnar   sched: introduce ...
2585
  	}
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2586

d2417e5a3   Ingo Molnar   sched: remove the...
2587
  	update_stats_enqueue(cfs_rq, se);
ddc972975   Peter Zijlstra   sched debug: chec...
2588
  	check_spread(cfs_rq, se);
83b699ed2   Srivatsa Vaddagiri   sched: revert rec...
2589
2590
  	if (se != cfs_rq->curr)
  		__enqueue_entity(cfs_rq, se);
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
2591
  	se->on_rq = 1;
3d4b47b4b   Peter Zijlstra   sched: Implement ...
2592

d3d9dc330   Paul Turner   sched: Throttle e...
2593
  	if (cfs_rq->nr_running == 1) {
3d4b47b4b   Peter Zijlstra   sched: Implement ...
2594
  		list_add_leaf_cfs_rq(cfs_rq);
d3d9dc330   Paul Turner   sched: Throttle e...
2595
2596
  		check_enqueue_throttle(cfs_rq);
  	}
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2597
  }
2c13c919d   Rik van Riel   sched: Limit the ...
2598
  static void __clear_buddies_last(struct sched_entity *se)
2002c6959   Peter Zijlstra   sched: release bu...
2599
  {
2c13c919d   Rik van Riel   sched: Limit the ...
2600
2601
  	for_each_sched_entity(se) {
  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
f10447998   Peter Zijlstra   sched/fair: Clean...
2602
  		if (cfs_rq->last != se)
2c13c919d   Rik van Riel   sched: Limit the ...
2603
  			break;
f10447998   Peter Zijlstra   sched/fair: Clean...
2604
2605
  
  		cfs_rq->last = NULL;
2c13c919d   Rik van Riel   sched: Limit the ...
2606
2607
  	}
  }
2002c6959   Peter Zijlstra   sched: release bu...
2608

2c13c919d   Rik van Riel   sched: Limit the ...
2609
2610
2611
2612
  static void __clear_buddies_next(struct sched_entity *se)
  {
  	for_each_sched_entity(se) {
  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
f10447998   Peter Zijlstra   sched/fair: Clean...
2613
  		if (cfs_rq->next != se)
2c13c919d   Rik van Riel   sched: Limit the ...
2614
  			break;
f10447998   Peter Zijlstra   sched/fair: Clean...
2615
2616
  
  		cfs_rq->next = NULL;
2c13c919d   Rik van Riel   sched: Limit the ...
2617
  	}
2002c6959   Peter Zijlstra   sched: release bu...
2618
  }
ac53db596   Rik van Riel   sched: Use a budd...
2619
2620
2621
2622
  static void __clear_buddies_skip(struct sched_entity *se)
  {
  	for_each_sched_entity(se) {
  		struct cfs_rq *cfs_rq = cfs_rq_of(se);
f10447998   Peter Zijlstra   sched/fair: Clean...
2623
  		if (cfs_rq->skip != se)
ac53db596   Rik van Riel   sched: Use a budd...
2624
  			break;
f10447998   Peter Zijlstra   sched/fair: Clean...
2625
2626
  
  		cfs_rq->skip = NULL;
ac53db596   Rik van Riel   sched: Use a budd...
2627
2628
  	}
  }
a571bbeaf   Peter Zijlstra   sched: fix buddie...
2629
2630
  static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
2c13c919d   Rik van Riel   sched: Limit the ...
2631
2632
2633
2634
2635
  	if (cfs_rq->last == se)
  		__clear_buddies_last(se);
  
  	if (cfs_rq->next == se)
  		__clear_buddies_next(se);
ac53db596   Rik van Riel   sched: Use a budd...
2636
2637
2638
  
  	if (cfs_rq->skip == se)
  		__clear_buddies_skip(se);
a571bbeaf   Peter Zijlstra   sched: fix buddie...
2639
  }
6c16a6dcb   Peter Zijlstra   sched: Fix compil...
2640
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d3   Paul Turner   sched: Return unu...
2641

bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2642
  static void
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
2643
  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2644
  {
a2a2d6807   Dmitry Adamushko   sched: cleanup, m...
2645
2646
2647
2648
  	/*
  	 * Update run-time statistics of the 'current'.
  	 */
  	update_curr(cfs_rq);
17bc14b76   Linus Torvalds   Revert "sched: Up...
2649
  	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
a2a2d6807   Dmitry Adamushko   sched: cleanup, m...
2650

19b6a2e37   Ingo Molnar   sched: remove the...
2651
  	update_stats_dequeue(cfs_rq, se);
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
2652
  	if (flags & DEQUEUE_SLEEP) {
67e9fb2a3   Peter Zijlstra   sched: add vslice
2653
  #ifdef CONFIG_SCHEDSTATS
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2654
2655
2656
2657
  		if (entity_is_task(se)) {
  			struct task_struct *tsk = task_of(se);
  
  			if (tsk->state & TASK_INTERRUPTIBLE)
78becc270   Frederic Weisbecker   sched: Use an acc...
2658
  				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2659
  			if (tsk->state & TASK_UNINTERRUPTIBLE)
78becc270   Frederic Weisbecker   sched: Use an acc...
2660
  				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2661
  		}
db36cc7d6   Dmitry Adamushko   sched: clean up s...
2662
  #endif
67e9fb2a3   Peter Zijlstra   sched: add vslice
2663
  	}
2002c6959   Peter Zijlstra   sched: release bu...
2664
  	clear_buddies(cfs_rq, se);
4793241be   Peter Zijlstra   sched: backward l...
2665

83b699ed2   Srivatsa Vaddagiri   sched: revert rec...
2666
  	if (se != cfs_rq->curr)
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2667
  		__dequeue_entity(cfs_rq, se);
17bc14b76   Linus Torvalds   Revert "sched: Up...
2668
  	se->on_rq = 0;
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2669
  	account_entity_dequeue(cfs_rq, se);
88ec22d3e   Peter Zijlstra   sched: Remove the...
2670
2671
2672
2673
2674
2675
  
  	/*
  	 * Normalize the entity after updating the min_vruntime because the
  	 * update can refer to the ->curr item and we need to reflect this
  	 * movement in our normalized position.
  	 */
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
2676
  	if (!(flags & DEQUEUE_SLEEP))
88ec22d3e   Peter Zijlstra   sched: Remove the...
2677
  		se->vruntime -= cfs_rq->min_vruntime;
1e8762317   Peter Zijlstra   sched: Fix ->min_...
2678

d8b4986d3   Paul Turner   sched: Return unu...
2679
2680
  	/* return excess runtime on last dequeue */
  	return_cfs_rq_runtime(cfs_rq);
1e8762317   Peter Zijlstra   sched: Fix ->min_...
2681
  	update_min_vruntime(cfs_rq);
17bc14b76   Linus Torvalds   Revert "sched: Up...
2682
  	update_cfs_shares(cfs_rq);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2683
2684
2685
2686
2687
  }
  
  /*
   * Preempt the current task with a newly woken task if needed:
   */
7c92e54f6   Peter Zijlstra   sched: simplify _...
2688
  static void
2e09bf556   Ingo Molnar   sched: wakeup gra...
2689
  check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2690
  {
116978308   Peter Zijlstra   sched: fix ideal_...
2691
  	unsigned long ideal_runtime, delta_exec;
f4cfb33ed   Wang Xingchao   sched: Remove red...
2692
2693
  	struct sched_entity *se;
  	s64 delta;
116978308   Peter Zijlstra   sched: fix ideal_...
2694

6d0f0ebd0   Peter Zijlstra   sched: simplify a...
2695
  	ideal_runtime = sched_slice(cfs_rq, curr);
116978308   Peter Zijlstra   sched: fix ideal_...
2696
  	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b54   Mike Galbraith   sched: clear budd...
2697
  	if (delta_exec > ideal_runtime) {
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2698
  		resched_task(rq_of(cfs_rq)->curr);
a9f3e2b54   Mike Galbraith   sched: clear budd...
2699
2700
2701
2702
2703
  		/*
  		 * The current task ran long enough, ensure it doesn't get
  		 * re-elected due to buddy favours.
  		 */
  		clear_buddies(cfs_rq, curr);
f685ceaca   Mike Galbraith   sched: Strengthen...
2704
2705
2706
2707
2708
2709
2710
2711
  		return;
  	}
  
  	/*
  	 * Ensure that a task that missed wakeup preemption by a
  	 * narrow margin doesn't have to wait for a full slice.
  	 * This also mitigates buddy induced latencies under load.
  	 */
f685ceaca   Mike Galbraith   sched: Strengthen...
2712
2713
  	if (delta_exec < sysctl_sched_min_granularity)
  		return;
f4cfb33ed   Wang Xingchao   sched: Remove red...
2714
2715
  	se = __pick_first_entity(cfs_rq);
  	delta = curr->vruntime - se->vruntime;
f685ceaca   Mike Galbraith   sched: Strengthen...
2716

f4cfb33ed   Wang Xingchao   sched: Remove red...
2717
2718
  	if (delta < 0)
  		return;
d7d829441   Mike Galbraith   sched: Fix signed...
2719

f4cfb33ed   Wang Xingchao   sched: Remove red...
2720
2721
  	if (delta > ideal_runtime)
  		resched_task(rq_of(cfs_rq)->curr);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2722
  }
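
/*
 * Editor's illustration (not part of fair.c): the two preemption conditions
 * above, with MIN_GRANULARITY_NS standing in for
 * sysctl_sched_min_granularity (the value here is only an assumed default).
 */
#include <stdio.h>
#include <stdint.h>

#define MIN_GRANULARITY_NS 750000ULL

/* returns nonzero when 'curr' should be preempted at this tick */
static int demo_check_preempt_tick(uint64_t ideal_runtime, uint64_t delta_exec,
				   int64_t curr_vruntime, int64_t leftmost_vruntime)
{
	int64_t delta;

	if (delta_exec > ideal_runtime)
		return 1;			/* slice is used up */

	if (delta_exec < MIN_GRANULARITY_NS)
		return 0;			/* too early to bother */

	delta = curr_vruntime - leftmost_vruntime;
	if (delta < 0)
		return 0;			/* curr is still the fairest */

	return delta > (int64_t)ideal_runtime;	/* don't starve a near-miss waker */
}

int main(void)
{
	/* ran 2 ms of a 3 ms slice but is 4 ms of vruntime ahead: preempt */
	printf("preempt = %d\n",
	       demo_check_preempt_tick(3000000, 2000000, 14000000, 10000000));
	return 0;
}
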
83b699ed2   Srivatsa Vaddagiri   sched: revert rec...
2723
  static void
8494f412e   Ingo Molnar   sched: remove the...
2724
  set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2725
  {
83b699ed2   Srivatsa Vaddagiri   sched: revert rec...
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
  	/* 'current' is not kept within the tree. */
  	if (se->on_rq) {
  		/*
  		 * Any task has to be enqueued before it gets to execute on
  		 * a CPU. So account for the time it spent waiting on the
  		 * runqueue.
  		 */
  		update_stats_wait_end(cfs_rq, se);
  		__dequeue_entity(cfs_rq, se);
  	}
79303e9e0   Ingo Molnar   sched: remove the...
2736
  	update_stats_curr_start(cfs_rq, se);
429d43bcc   Ingo Molnar   sched: cleanup: s...
2737
  	cfs_rq->curr = se;
eba1ed4b7   Ingo Molnar   sched: debug: tra...
2738
2739
2740
2741
2742
2743
  #ifdef CONFIG_SCHEDSTATS
  	/*
  	 * Track our maximum slice length, if the CPU's load is at
  	 * least twice that of our own weight (i.e. dont track it
  	 * when there are only lesser-weight tasks around):
  	 */
495eca494   Dmitry Adamushko   sched: clean up s...
2744
  	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
41acab885   Lucas De Marchi   sched: Implement ...
2745
  		se->statistics.slice_max = max(se->statistics.slice_max,
eba1ed4b7   Ingo Molnar   sched: debug: tra...
2746
2747
2748
  			se->sum_exec_runtime - se->prev_sum_exec_runtime);
  	}
  #endif
4a55b4503   Peter Zijlstra   sched: improve pr...
2749
  	se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2750
  }
3f3a49048   Peter Zijlstra   sched: virtual ti...
2751
2752
  static int
  wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
ac53db596   Rik van Riel   sched: Use a budd...
2753
2754
2755
2756
2757
2758
2759
  /*
   * Pick the next process, keeping these things in mind, in this order:
   * 1) keep things fair between processes/task groups
   * 2) pick the "next" process, since someone really wants that to run
   * 3) pick the "last" process, for cache locality
   * 4) do not run the "skip" process, if something else is available
   */
678d5718d   Peter Zijlstra   sched/fair: Optim...
2760
2761
  static struct sched_entity *
  pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
aa2ac2522   Peter Zijlstra   sched: fix overlo...
2762
  {
678d5718d   Peter Zijlstra   sched/fair: Optim...
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
  	struct sched_entity *left = __pick_first_entity(cfs_rq);
  	struct sched_entity *se;
  
  	/*
  	 * If curr is set we have to see if it's left of the leftmost entity
  	 * still in the tree, provided there was anything in the tree at all.
  	 */
  	if (!left || (curr && entity_before(curr, left)))
  		left = curr;
  
  	se = left; /* ideally we run the leftmost entity */
f4b6755fb   Peter Zijlstra   sched: cleanup fa...
2774

ac53db596   Rik van Riel   sched: Use a budd...
2775
2776
2777
2778
2779
  	/*
  	 * Avoid running the skip buddy, if running something else can
  	 * be done without getting too unfair.
  	 */
  	if (cfs_rq->skip == se) {
678d5718d   Peter Zijlstra   sched/fair: Optim...
2780
2781
2782
2783
2784
2785
2786
2787
2788
  		struct sched_entity *second;
  
  		if (se == curr) {
  			second = __pick_first_entity(cfs_rq);
  		} else {
  			second = __pick_next_entity(se);
  			if (!second || (curr && entity_before(curr, second)))
  				second = curr;
  		}
ac53db596   Rik van Riel   sched: Use a budd...
2789
2790
2791
  		if (second && wakeup_preempt_entity(second, left) < 1)
  			se = second;
  	}
aa2ac2522   Peter Zijlstra   sched: fix overlo...
2792

f685ceaca   Mike Galbraith   sched: Strengthen...
2793
2794
2795
2796
2797
  	/*
  	 * Prefer last buddy, try to return the CPU to a preempted task.
  	 */
  	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
  		se = cfs_rq->last;
ac53db596   Rik van Riel   sched: Use a budd...
2798
2799
2800
2801
2802
  	/*
  	 * Someone really wants this to run. If it's not unfair, run it.
  	 */
  	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
  		se = cfs_rq->next;
f685ceaca   Mike Galbraith   sched: Strengthen...
2803
  	clear_buddies(cfs_rq, se);
4793241be   Peter Zijlstra   sched: backward l...
2804
2805
  
  	return se;
aa2ac2522   Peter Zijlstra   sched: fix overlo...
2806
  }
678d5718d   Peter Zijlstra   sched/fair: Optim...
2807
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d3d9dc330   Paul Turner   sched: Throttle e...
2808

ab6cde269   Ingo Molnar   sched: remove the...
2809
  static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2810
2811
2812
2813
2814
2815
  {
  	/*
  	 * If still on the runqueue then deactivate_task()
  	 * was not called and update_curr() has to be done:
  	 */
  	if (prev->on_rq)
b7cc08965   Ingo Molnar   sched: remove the...
2816
  		update_curr(cfs_rq);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2817

d3d9dc330   Paul Turner   sched: Throttle e...
2818
2819
  	/* throttle cfs_rqs exceeding runtime */
  	check_cfs_rq_runtime(cfs_rq);
ddc972975   Peter Zijlstra   sched debug: chec...
2820
  	check_spread(cfs_rq, prev);
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2821
  	if (prev->on_rq) {
5870db5b8   Ingo Molnar   sched: remove the...
2822
  		update_stats_wait_start(cfs_rq, prev);
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2823
2824
  		/* Put 'current' back into the tree. */
  		__enqueue_entity(cfs_rq, prev);
9d85f21c9   Paul Turner   sched: Track the ...
2825
  		/* in !on_rq case, update occurred at dequeue */
9ee474f55   Paul Turner   sched: Maintain t...
2826
  		update_entity_load_avg(prev, 1);
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2827
  	}
429d43bcc   Ingo Molnar   sched: cleanup: s...
2828
  	cfs_rq->curr = NULL;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2829
  }
8f4d37ec0   Peter Zijlstra   sched: high-res p...
2830
2831
  static void
  entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2832
  {
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2833
  	/*
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2834
  	 * Update run-time statistics of the 'current'.
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2835
  	 */
30cfdcfc5   Dmitry Adamushko   sched: do not kee...
2836
  	update_curr(cfs_rq);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2837

43365bd7f   Paul Turner   sched: Move perio...
2838
  	/*
9d85f21c9   Paul Turner   sched: Track the ...
2839
2840
  	 * Ensure that runnable average is periodically updated.
  	 */
9ee474f55   Paul Turner   sched: Maintain t...
2841
  	update_entity_load_avg(curr, 1);
aff3e4988   Paul Turner   sched: Account fo...
2842
  	update_cfs_rq_blocked_load(cfs_rq, 1);
bf0bd948d   Peter Zijlstra   sched: Ensure upd...
2843
  	update_cfs_shares(cfs_rq);
9d85f21c9   Paul Turner   sched: Track the ...
2844

8f4d37ec0   Peter Zijlstra   sched: high-res p...
2845
2846
2847
2848
2849
  #ifdef CONFIG_SCHED_HRTICK
  	/*
  	 * queued ticks are scheduled to match the slice, so don't bother
  	 * validating it and just reschedule.
  	 */
983ed7a66   Harvey Harrison   sched: add static...
2850
2851
2852
2853
  	if (queued) {
  		resched_task(rq_of(cfs_rq)->curr);
  		return;
  	}
8f4d37ec0   Peter Zijlstra   sched: high-res p...
2854
2855
2856
2857
2858
2859
2860
  	/*
  	 * don't let the period tick interfere with the hrtick preemption
  	 */
  	if (!sched_feat(DOUBLE_TICK) &&
  			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
  		return;
  #endif
2c2efaed9   Yong Zhang   sched: Kill WAKEU...
2861
  	if (cfs_rq->nr_running > 1)
2e09bf556   Ingo Molnar   sched: wakeup gra...
2862
  		check_preempt_tick(cfs_rq, curr);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
2863
  }
ab84d31e1   Paul Turner   sched: Introduce ...
2864
2865
2866
2867
2868
2869
  
  /**************************************************
   * CFS bandwidth control machinery
   */
  
  #ifdef CONFIG_CFS_BANDWIDTH
029632fbb   Peter Zijlstra   sched: Make separ...
2870
2871
  
  #ifdef HAVE_JUMP_LABEL
c5905afb0   Ingo Molnar   static keys: Intr...
2872
  static struct static_key __cfs_bandwidth_used;
029632fbb   Peter Zijlstra   sched: Make separ...
2873
2874
2875
  
  static inline bool cfs_bandwidth_used(void)
  {
c5905afb0   Ingo Molnar   static keys: Intr...
2876
  	return static_key_false(&__cfs_bandwidth_used);
029632fbb   Peter Zijlstra   sched: Make separ...
2877
  }
1ee14e6c8   Ben Segall   sched: Fix race o...
2878
  void cfs_bandwidth_usage_inc(void)
029632fbb   Peter Zijlstra   sched: Make separ...
2879
  {
1ee14e6c8   Ben Segall   sched: Fix race o...
2880
2881
2882
2883
2884
2885
  	static_key_slow_inc(&__cfs_bandwidth_used);
  }
  
  void cfs_bandwidth_usage_dec(void)
  {
  	static_key_slow_dec(&__cfs_bandwidth_used);
029632fbb   Peter Zijlstra   sched: Make separ...
2886
2887
2888
2889
2890
2891
  }
  #else /* HAVE_JUMP_LABEL */
  static bool cfs_bandwidth_used(void)
  {
  	return true;
  }
1ee14e6c8   Ben Segall   sched: Fix race o...
2892
2893
  void cfs_bandwidth_usage_inc(void) {}
  void cfs_bandwidth_usage_dec(void) {}
029632fbb   Peter Zijlstra   sched: Make separ...
2894
  #endif /* HAVE_JUMP_LABEL */
ab84d31e1   Paul Turner   sched: Introduce ...
2895
2896
2897
2898
2899
2900
2901
2902
  /*
   * default period for cfs group bandwidth.
   * default: 0.1s, units: nanoseconds
   */
  static inline u64 default_cfs_period(void)
  {
  	return 100000000ULL;
  }
ec12cb7f3   Paul Turner   sched: Accumulate...
2903
2904
2905
2906
2907
  
  static inline u64 sched_cfs_bandwidth_slice(void)
  {
  	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
  }
a9cf55b28   Paul Turner   sched: Expire inv...
2908
2909
2910
2911
2912
2913
2914
  /*
   * Replenish runtime according to assigned quota and update expiration time.
   * We use sched_clock_cpu directly instead of rq->clock to avoid adding
   * additional synchronization around rq->lock.
   *
   * requires cfs_b->lock
   */
029632fbb   Peter Zijlstra   sched: Make separ...
2915
  void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b28   Paul Turner   sched: Expire inv...
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
  {
  	u64 now;
  
  	if (cfs_b->quota == RUNTIME_INF)
  		return;
  
  	now = sched_clock_cpu(smp_processor_id());
  	cfs_b->runtime = cfs_b->quota;
  	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
  }
029632fbb   Peter Zijlstra   sched: Make separ...
2926
2927
2928
2929
  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  {
  	return &tg->cfs_bandwidth;
  }
f1b17280e   Paul Turner   sched: Maintain r...
2930
2931
2932
2933
2934
  /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
  	if (unlikely(cfs_rq->throttle_count))
  		return cfs_rq->throttled_clock_task;
78becc270   Frederic Weisbecker   sched: Use an acc...
2935
  	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
f1b17280e   Paul Turner   sched: Maintain r...
2936
  }
85dac906b   Paul Turner   sched: Add suppor...
2937
2938
  /* returns 0 on failure to allocate runtime */
  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f3   Paul Turner   sched: Accumulate...
2939
2940
2941
  {
  	struct task_group *tg = cfs_rq->tg;
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
a9cf55b28   Paul Turner   sched: Expire inv...
2942
  	u64 amount = 0, min_amount, expires;
ec12cb7f3   Paul Turner   sched: Accumulate...
2943
2944
2945
2946
2947
2948
2949
  
  	/* note: this is a positive sum as runtime_remaining <= 0 */
  	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
  
  	raw_spin_lock(&cfs_b->lock);
  	if (cfs_b->quota == RUNTIME_INF)
  		amount = min_amount;
58088ad01   Paul Turner   sched: Add a time...
2950
  	else {
a9cf55b28   Paul Turner   sched: Expire inv...
2951
2952
2953
2954
2955
2956
2957
2958
  		/*
  		 * If the bandwidth pool has become inactive, then at least one
  		 * period must have elapsed since the last consumption.
  		 * Refresh the global state and ensure bandwidth timer becomes
  		 * active.
  		 */
  		if (!cfs_b->timer_active) {
  			__refill_cfs_bandwidth_runtime(cfs_b);
09dc4ab03   Roman Gushchin   sched/fair: Fix t...
2959
  			__start_cfs_bandwidth(cfs_b, false);
a9cf55b28   Paul Turner   sched: Expire inv...
2960
  		}
58088ad01   Paul Turner   sched: Add a time...
2961
2962
2963
2964
2965
2966
  
  		if (cfs_b->runtime > 0) {
  			amount = min(cfs_b->runtime, min_amount);
  			cfs_b->runtime -= amount;
  			cfs_b->idle = 0;
  		}
ec12cb7f3   Paul Turner   sched: Accumulate...
2967
  	}
a9cf55b28   Paul Turner   sched: Expire inv...
2968
  	expires = cfs_b->runtime_expires;
ec12cb7f3   Paul Turner   sched: Accumulate...
2969
2970
2971
  	raw_spin_unlock(&cfs_b->lock);
  
  	cfs_rq->runtime_remaining += amount;
a9cf55b28   Paul Turner   sched: Expire inv...
2972
2973
2974
2975
2976
2977
2978
  	/*
  	 * we may have advanced our local expiration to account for allowed
  	 * spread between our sched_clock and the one on which runtime was
  	 * issued.
  	 */
  	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
  		cfs_rq->runtime_expires = expires;
85dac906b   Paul Turner   sched: Add suppor...
2979
2980
  
  	return cfs_rq->runtime_remaining > 0;
ec12cb7f3   Paul Turner   sched: Accumulate...
2981
  }
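
/*
 * Editor's illustration (not part of fair.c): the slice-borrowing above in
 * miniature.  A cfs_rq that has exhausted its local runtime asks the global
 * pool for one bandwidth slice plus its current deficit, and must throttle if
 * the pool cannot cover the deficit.  SLICE_NS is an assumed default and
 * expiration handling is omitted.
 */
#include <stdio.h>
#include <stdint.h>

#define RUNTIME_INF	((uint64_t)~0ULL)
#define SLICE_NS	5000000ULL	/* assumed default bandwidth slice (5 ms) */

struct demo_cfs_b  { uint64_t quota, runtime; };
struct demo_cfs_rq { int64_t runtime_remaining; };

/* returns 0 when the caller should throttle */
static int demo_assign_runtime(struct demo_cfs_b *b, struct demo_cfs_rq *rq)
{
	/* runtime_remaining <= 0 on entry, so this is a positive request */
	uint64_t min_amount = SLICE_NS - rq->runtime_remaining;
	uint64_t amount = 0;

	if (b->quota == RUNTIME_INF) {
		amount = min_amount;
	} else if (b->runtime > 0) {
		amount = b->runtime < min_amount ? b->runtime : min_amount;
		b->runtime -= amount;
	}

	rq->runtime_remaining += amount;
	return rq->runtime_remaining > 0;
}

int main(void)
{
	struct demo_cfs_b  b  = { .quota = 20000000, .runtime = 500000 };
	struct demo_cfs_rq rq = { .runtime_remaining = -1000000 };

	/* the pool has only 0.5 ms left against a 1 ms deficit: throttle */
	printf("runnable=%d remaining=%lld ns\n",
	       demo_assign_runtime(&b, &rq), (long long)rq.runtime_remaining);
	return 0;
}
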
a9cf55b28   Paul Turner   sched: Expire inv...
2982
2983
2984
2985
2986
  /*
   * Note: This depends on the synchronization provided by sched_clock and the
   * fact that rq->clock snapshots this value.
   */
  static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f3   Paul Turner   sched: Accumulate...
2987
  {
a9cf55b28   Paul Turner   sched: Expire inv...
2988
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
a9cf55b28   Paul Turner   sched: Expire inv...
2989
2990
  
  	/* if the deadline is ahead of our clock, nothing to do */
78becc270   Frederic Weisbecker   sched: Use an acc...
2991
  	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
ec12cb7f3   Paul Turner   sched: Accumulate...
2992
  		return;
a9cf55b28   Paul Turner   sched: Expire inv...
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
  	if (cfs_rq->runtime_remaining < 0)
  		return;
  
  	/*
  	 * If the local deadline has passed we have to consider the
  	 * possibility that our sched_clock is 'fast' and the global deadline
  	 * has not truly expired.
  	 *
  	 * Fortunately we can determine whether this is the case by checking
  	 * whether the global deadline has advanced.
  	 */
  
  	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
  		/* extend local deadline, drift is bounded above by 2 ticks */
  		cfs_rq->runtime_expires += TICK_NSEC;
  	} else {
  		/* global deadline is ahead, expiration has passed */
  		cfs_rq->runtime_remaining = 0;
  	}
  }
9dbdb1555   Peter Zijlstra   sched/fair: Rewor...
3013
  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
a9cf55b28   Paul Turner   sched: Expire inv...
3014
3015
  {
  	/* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f3   Paul Turner   sched: Accumulate...
3016
  	cfs_rq->runtime_remaining -= delta_exec;
a9cf55b28   Paul Turner   sched: Expire inv...
3017
3018
3019
  	expire_cfs_rq_runtime(cfs_rq);
  
  	if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f3   Paul Turner   sched: Accumulate...
3020
  		return;
85dac906b   Paul Turner   sched: Add suppor...
3021
3022
3023
3024
3025
3026
  	/*
  	 * if we're unable to extend our runtime we resched so that the active
  	 * hierarchy can be throttled
  	 */
  	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
  		resched_task(rq_of(cfs_rq)->curr);
ec12cb7f3   Paul Turner   sched: Accumulate...
3027
  }
6c16a6dcb   Peter Zijlstra   sched: Fix compil...
3028
  static __always_inline
9dbdb1555   Peter Zijlstra   sched/fair: Rewor...
3029
  void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
ec12cb7f3   Paul Turner   sched: Accumulate...
3030
  {
56f570e51   Paul Turner   sched: Use jump l...
3031
  	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f3   Paul Turner   sched: Accumulate...
3032
3033
3034
3035
  		return;
  
  	__account_cfs_rq_runtime(cfs_rq, delta_exec);
  }
85dac906b   Paul Turner   sched: Add suppor...
3036
3037
  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
  {
56f570e51   Paul Turner   sched: Use jump l...
3038
  	return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906b   Paul Turner   sched: Add suppor...
3039
  }
64660c864   Paul Turner   sched: Prevent in...
3040
3041
3042
  /* check whether cfs_rq, or any parent, is throttled */
  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  {
56f570e51   Paul Turner   sched: Use jump l...
3043
  	return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c864   Paul Turner   sched: Prevent in...
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
  }
  
  /*
   * Ensure that neither of the group entities corresponding to src_cpu or
   * dest_cpu are members of a throttled hierarchy when performing group
   * load-balance operations.
   */
  static inline int throttled_lb_pair(struct task_group *tg,
  				    int src_cpu, int dest_cpu)
  {
  	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
  
  	src_cfs_rq = tg->cfs_rq[src_cpu];
  	dest_cfs_rq = tg->cfs_rq[dest_cpu];
  
  	return throttled_hierarchy(src_cfs_rq) ||
  	       throttled_hierarchy(dest_cfs_rq);
  }
  
  /* updated child weight may affect parent so we have to do this bottom up */
  static int tg_unthrottle_up(struct task_group *tg, void *data)
  {
  	struct rq *rq = data;
  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
  	cfs_rq->throttle_count--;
  #ifdef CONFIG_SMP
  	if (!cfs_rq->throttle_count) {
f1b17280e   Paul Turner   sched: Maintain r...
3072
  		/* adjust cfs_rq_clock_task() */
78becc270   Frederic Weisbecker   sched: Use an acc...
3073
  		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
f1b17280e   Paul Turner   sched: Maintain r...
3074
  					     cfs_rq->throttled_clock_task;
64660c864   Paul Turner   sched: Prevent in...
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
  	}
  #endif
  
  	return 0;
  }
  
  static int tg_throttle_down(struct task_group *tg, void *data)
  {
  	struct rq *rq = data;
  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
82958366c   Paul Turner   sched: Replace up...
3085
3086
  	/* group is entering throttled state, stop time */
  	if (!cfs_rq->throttle_count)
78becc270   Frederic Weisbecker   sched: Use an acc...
3087
  		cfs_rq->throttled_clock_task = rq_clock_task(rq);
64660c864   Paul Turner   sched: Prevent in...
3088
3089
3090
3091
  	cfs_rq->throttle_count++;
  
  	return 0;
  }
d3d9dc330   Paul Turner   sched: Throttle e...
3092
  static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906b   Paul Turner   sched: Add suppor...
3093
3094
3095
3096
3097
3098
3099
  {
  	struct rq *rq = rq_of(cfs_rq);
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  	struct sched_entity *se;
  	long task_delta, dequeue = 1;
  
  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
f1b17280e   Paul Turner   sched: Maintain r...
3100
  	/* freeze hierarchy runnable averages while throttled */
64660c864   Paul Turner   sched: Prevent in...
3101
3102
3103
  	rcu_read_lock();
  	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
  	rcu_read_unlock();
85dac906b   Paul Turner   sched: Add suppor...
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
  
  	task_delta = cfs_rq->h_nr_running;
  	for_each_sched_entity(se) {
  		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  		/* throttled entity or throttle-on-deactivate */
  		if (!se->on_rq)
  			break;
  
  		if (dequeue)
  			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
  		qcfs_rq->h_nr_running -= task_delta;
  
  		if (qcfs_rq->load.weight)
  			dequeue = 0;
  	}
  
  	if (!se)
  		rq->nr_running -= task_delta;
  
  	cfs_rq->throttled = 1;
78becc270   Frederic Weisbecker   sched: Use an acc...
3124
  	cfs_rq->throttled_clock = rq_clock(rq);
85dac906b   Paul Turner   sched: Add suppor...
3125
3126
  	raw_spin_lock(&cfs_b->lock);
  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
f9f9ffc23   Ben Segall   sched: Avoid thro...
3127
  	if (!cfs_b->timer_active)
09dc4ab03   Roman Gushchin   sched/fair: Fix t...
3128
  		__start_cfs_bandwidth(cfs_b, false);
85dac906b   Paul Turner   sched: Add suppor...
3129
3130
  	raw_spin_unlock(&cfs_b->lock);
  }
029632fbb   Peter Zijlstra   sched: Make separ...
3131
  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9dab   Paul Turner   sched: Add suppor...
3132
3133
3134
3135
3136
3137
  {
  	struct rq *rq = rq_of(cfs_rq);
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  	struct sched_entity *se;
  	int enqueue = 1;
  	long task_delta;
22b958d8c   Michael Wang   sched: Refine the...
3138
  	se = cfs_rq->tg->se[cpu_of(rq)];
671fd9dab   Paul Turner   sched: Add suppor...
3139
3140
  
  	cfs_rq->throttled = 0;
1a55af2e4   Frederic Weisbecker   sched: Update rq ...
3141
3142
  
  	update_rq_clock(rq);
671fd9dab   Paul Turner   sched: Add suppor...
3143
  	raw_spin_lock(&cfs_b->lock);
78becc270   Frederic Weisbecker   sched: Use an acc...
3144
  	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
671fd9dab   Paul Turner   sched: Add suppor...
3145
3146
  	list_del_rcu(&cfs_rq->throttled_list);
  	raw_spin_unlock(&cfs_b->lock);
64660c864   Paul Turner   sched: Prevent in...
3147
3148
  	/* update hierarchical throttle state */
  	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
671fd9dab   Paul Turner   sched: Add suppor...
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
  	if (!cfs_rq->load.weight)
  		return;
  
  	task_delta = cfs_rq->h_nr_running;
  	for_each_sched_entity(se) {
  		if (se->on_rq)
  			enqueue = 0;
  
  		cfs_rq = cfs_rq_of(se);
  		if (enqueue)
  			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
  		cfs_rq->h_nr_running += task_delta;
  
  		if (cfs_rq_throttled(cfs_rq))
  			break;
  	}
  
  	if (!se)
  		rq->nr_running += task_delta;
  
  	/* determine whether we need to wake up potentially idle cpu */
  	if (rq->curr == rq->idle && rq->cfs.nr_running)
  		resched_task(rq->curr);
  }
  
  static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
  		u64 remaining, u64 expires)
  {
  	struct cfs_rq *cfs_rq;
  	u64 runtime = remaining;
  
  	rcu_read_lock();
  	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
  				throttled_list) {
  		struct rq *rq = rq_of(cfs_rq);
  
  		raw_spin_lock(&rq->lock);
  		if (!cfs_rq_throttled(cfs_rq))
  			goto next;
  
  		runtime = -cfs_rq->runtime_remaining + 1;
  		if (runtime > remaining)
  			runtime = remaining;
  		remaining -= runtime;
  
  		cfs_rq->runtime_remaining += runtime;
  		cfs_rq->runtime_expires = expires;
  
  		/* we check whether we're throttled above */
  		if (cfs_rq->runtime_remaining > 0)
  			unthrottle_cfs_rq(cfs_rq);
  
  next:
  		raw_spin_unlock(&rq->lock);
  
  		if (!remaining)
  			break;
  	}
  	rcu_read_unlock();
  
  	return remaining;
  }
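
/*
 * Editor's illustration (not part of fair.c): a simplified model of the
 * distribution loop above, with an array of per-cfs_rq runtime deficits
 * standing in for the throttled list and with all locking omitted.
 */
#include <stdio.h>
#include <stdint.h>

/* hand out 'remaining' ns of runtime, giving each runqueue just enough
 * to climb back above zero */
static uint64_t demo_distribute(int64_t *deficits, int n, uint64_t remaining)
{
	int i;

	for (i = 0; i < n && remaining; i++) {
		uint64_t want = (uint64_t)(-deficits[i]) + 1;	/* reach +1 ns */
		uint64_t give = want < remaining ? want : remaining;

		deficits[i] += give;
		remaining -= give;

		if (deficits[i] > 0)
			printf("cfs_rq %d unthrottled (%lld ns in hand)\n",
			       i, (long long)deficits[i]);
	}
	return remaining;
}

int main(void)
{
	/* two throttled groups owing 2 ms and 4 ms; 5 ms of fresh runtime */
	int64_t deficits[] = { -2000000, -4000000 };
	uint64_t left = demo_distribute(deficits, 2, 5000000);

	printf("left in the global pool: %llu ns\n", (unsigned long long)left);
	return 0;
}
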
58088ad01   Paul Turner   sched: Add a time...
3211
3212
3213
3214
3215
3216
3217
3218
  /*
   * Responsible for refilling a task_group's bandwidth and unthrottling its
   * cfs_rqs as appropriate. If there has been no activity within the last
   * period the timer is deactivated until scheduling resumes; cfs_b->idle is
   * used to track this state.
   */
  static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
  {
671fd9dab   Paul Turner   sched: Add suppor...
3219
3220
  	u64 runtime, runtime_expires;
  	int idle = 1, throttled;
58088ad01   Paul Turner   sched: Add a time...
3221
3222
3223
3224
3225
  
  	raw_spin_lock(&cfs_b->lock);
  	/* no need to continue the timer with no bandwidth constraint */
  	if (cfs_b->quota == RUNTIME_INF)
  		goto out_unlock;
671fd9dab   Paul Turner   sched: Add suppor...
3226
3227
3228
  	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  	/* idle depends on !throttled (for the case of a large deficit) */
  	idle = cfs_b->idle && !throttled;
e8da1b18b   Nikhil Rao   sched: Add export...
3229
  	cfs_b->nr_periods += overrun;
671fd9dab   Paul Turner   sched: Add suppor...
3230

a9cf55b28   Paul Turner   sched: Expire inv...
3231
3232
3233
  	/* if we're going inactive then everything else can be deferred */
  	if (idle)
  		goto out_unlock;
927b54fcc   Ben Segall   sched: Fix hrtime...
3234
3235
3236
3237
3238
3239
  	/*
  	 * if we have relooped after returning idle once, we need to update our
  	 * status as actually running, so that other cpus doing
  	 * __start_cfs_bandwidth will stop trying to cancel us.
  	 */
  	cfs_b->timer_active = 1;
a9cf55b28   Paul Turner   sched: Expire inv...
3240
  	__refill_cfs_bandwidth_runtime(cfs_b);
671fd9dab   Paul Turner   sched: Add suppor...
3241
3242
3243
3244
3245
  	if (!throttled) {
  		/* mark as potentially idle for the upcoming period */
  		cfs_b->idle = 1;
  		goto out_unlock;
  	}
e8da1b18b   Nikhil Rao   sched: Add export...
3246
3247
  	/* account preceding periods in which throttling occurred */
  	cfs_b->nr_throttled += overrun;
671fd9dab   Paul Turner   sched: Add suppor...
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
  	/*
  	 * There are throttled entities so we must first use the new bandwidth
  	 * to unthrottle them before making it generally available.  This
  	 * ensures that all existing debts will be paid before a new cfs_rq is
  	 * allowed to run.
  	 */
  	runtime = cfs_b->runtime;
  	runtime_expires = cfs_b->runtime_expires;
  	cfs_b->runtime = 0;
  
  	/*
  	 * This check is repeated as we are holding onto the new bandwidth
  	 * while we unthrottle.  This can potentially race with an unthrottled
  	 * group trying to acquire new bandwidth from the global pool.
  	 */
  	while (throttled && runtime > 0) {
  		raw_spin_unlock(&cfs_b->lock);
  		/* we can't nest cfs_b->lock while distributing bandwidth */
  		runtime = distribute_cfs_runtime(cfs_b, runtime,
  						 runtime_expires);
  		raw_spin_lock(&cfs_b->lock);
  
  		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  	}
58088ad01   Paul Turner   sched: Add a time...
3272

671fd9dab   Paul Turner   sched: Add suppor...
3273
3274
3275
3276
3277
3278
3279
3280
3281
  	/* return (any) remaining runtime */
  	cfs_b->runtime = runtime;
  	/*
  	 * While we are ensured activity in the period following an
  	 * unthrottle, this also covers the case in which the new bandwidth is
  	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
  	 * timer to remain active while there are any throttled entities.)
  	 */
  	cfs_b->idle = 0;
58088ad01   Paul Turner   sched: Add a time...
3282
3283
3284
3285
3286
3287
3288
  out_unlock:
  	if (idle)
  		cfs_b->timer_active = 0;
  	raw_spin_unlock(&cfs_b->lock);
  
  	return idle;
  }
d3d9dc330   Paul Turner   sched: Throttle e...
3289

d8b4986d3   Paul Turner   sched: Return unu...
3290
3291
3292
3293
3294
3295
  /* a cfs_rq won't donate quota below this amount */
  static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
  /* minimum remaining period time to redistribute slack quota */
  static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
  /* how long we wait to gather additional slack before distributing */
  static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
db06e78cc   Ben Segall   sched: Fix cfs_ba...
3296
3297
3298
3299
3300
3301
3302
  /*
   * Are we near the end of the current quota period?
   *
   * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
   * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
   * migrate_hrtimers, base is never cleared, so we are fine.
   */
d8b4986d3   Paul Turner   sched: Return unu...
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
  {
  	struct hrtimer *refresh_timer = &cfs_b->period_timer;
  	u64 remaining;
  
  	/* if the call-back is running a quota refresh is already occurring */
  	if (hrtimer_callback_running(refresh_timer))
  		return 1;
  
  	/* is a quota refresh about to occur? */
  	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
  	if (remaining < min_expire)
  		return 1;
  
  	return 0;
  }
  
  static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
  {
  	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
  
  	/* if there's a quota refresh soon don't bother with slack */
  	if (runtime_refresh_within(cfs_b, min_left))
  		return;
  
  	start_bandwidth_timer(&cfs_b->slack_timer,
  				ns_to_ktime(cfs_bandwidth_slack_period));
  }
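
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the
 * threshold used above.  With the constants defined earlier (5ms slack
 * period, 2ms minimum expiration) the slack timer is only armed when the
 * next quota refresh is more than 7ms away.  The "remaining" value below is
 * an assumed example.
 */
#include <stdio.h>

int main(void)
{
	const long long nsec_per_msec = 1000000LL;
	long long slack_period = 5 * nsec_per_msec;
	long long min_expiration = 2 * nsec_per_msec;
	long long min_left = slack_period + min_expiration;	/* 7ms */
	long long remaining = 4 * nsec_per_msec;	/* hypothetical */

	if (remaining < min_left)
		printf("refresh in %lldns: skip the slack timer\n", remaining);
	else
		printf("refresh in %lldns: arm slack timer for %lldns\n",
		       remaining, slack_period);
	return 0;
}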
  
  /* we know any runtime found here is valid as update_curr() precedes return */
  static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
  
  	if (slack_runtime <= 0)
  		return;
  
  	raw_spin_lock(&cfs_b->lock);
  	if (cfs_b->quota != RUNTIME_INF &&
  	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
  		cfs_b->runtime += slack_runtime;
  
  		/* we are under rq->lock, defer unthrottling using a timer */
  		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
  		    !list_empty(&cfs_b->throttled_cfs_rq))
  			start_cfs_slack_bandwidth(cfs_b);
  	}
  	raw_spin_unlock(&cfs_b->lock);
  
  	/* even if it's not valid for return we don't want to try again */
  	cfs_rq->runtime_remaining -= slack_runtime;
  }
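
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the slack
 * arithmetic above: a cfs_rq keeps min_cfs_rq_runtime (1ms) for itself and
 * donates anything beyond that back to the global pool.  The local
 * runtime_remaining value is an assumed example.
 */
#include <stdio.h>

int main(void)
{
	const long long nsec_per_msec = 1000000LL;
	long long min_cfs_rq_runtime = 1 * nsec_per_msec;
	long long runtime_remaining = 3 * nsec_per_msec;	/* hypothetical */
	long long slack = runtime_remaining - min_cfs_rq_runtime;

	if (slack > 0) {
		runtime_remaining -= slack;	/* donate 2ms to cfs_b->runtime */
		printf("donated %lldns, kept %lldns locally\n",
		       slack, runtime_remaining);
	}
	return 0;
}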
  
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
56f570e51   Paul Turner   sched: Use jump l...
3359
3360
  	if (!cfs_bandwidth_used())
  		return;
fccfdc6f0   Paul Turner   sched: Fix buglet...
3361
  	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d3   Paul Turner   sched: Return unu...
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
  		return;
  
  	__return_cfs_rq_runtime(cfs_rq);
  }
  
  /*
   * This is done with a timer (instead of inline with bandwidth return) since
   * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
   */
  static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  {
  	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
  	u64 expires;
  
  	/* confirm we're still not at a refresh boundary */
db06e78cc   Ben Segall   sched: Fix cfs_ba...
3377
3378
3379
  	raw_spin_lock(&cfs_b->lock);
  	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
  		raw_spin_unlock(&cfs_b->lock);
d8b4986d3   Paul Turner   sched: Return unu...
3380
  		return;
db06e78cc   Ben Segall   sched: Fix cfs_ba...
3381
  	}
d8b4986d3   Paul Turner   sched: Return unu...
3382

d8b4986d3   Paul Turner   sched: Return unu...
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
  	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
  		runtime = cfs_b->runtime;
  		cfs_b->runtime = 0;
  	}
  	expires = cfs_b->runtime_expires;
  	raw_spin_unlock(&cfs_b->lock);
  
  	if (!runtime)
  		return;
  
  	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
  
  	raw_spin_lock(&cfs_b->lock);
  	if (expires == cfs_b->runtime_expires)
  		cfs_b->runtime = runtime;
  	raw_spin_unlock(&cfs_b->lock);
  }
d3d9dc330   Paul Turner   sched: Throttle e...
3400
3401
3402
3403
3404
3405
3406
  /*
   * When a group wakes up we want to make sure that its quota is not already
   * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 * runtime as update_curr() throttling cannot trigger until it's on-rq.
   */
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
  {
56f570e51   Paul Turner   sched: Use jump l...
3407
3408
  	if (!cfs_bandwidth_used())
  		return;
d3d9dc330   Paul Turner   sched: Throttle e...
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
  	/* an active group must be handled by the update_curr()->put() path */
  	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
  		return;
  
  	/* ensure the group is not already throttled */
  	if (cfs_rq_throttled(cfs_rq))
  		return;
  
  	/* update runtime allocation */
  	account_cfs_rq_runtime(cfs_rq, 0);
  	if (cfs_rq->runtime_remaining <= 0)
  		throttle_cfs_rq(cfs_rq);
  }
  
  /* conditionally throttle active cfs_rq's from put_prev_entity() */
678d5718d   Peter Zijlstra   sched/fair: Optim...
3424
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
d3d9dc330   Paul Turner   sched: Throttle e...
3425
  {
56f570e51   Paul Turner   sched: Use jump l...
3426
  	if (!cfs_bandwidth_used())
678d5718d   Peter Zijlstra   sched/fair: Optim...
3427
  		return false;
56f570e51   Paul Turner   sched: Use jump l...
3428

d3d9dc330   Paul Turner   sched: Throttle e...
3429
  	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
678d5718d   Peter Zijlstra   sched/fair: Optim...
3430
  		return false;
d3d9dc330   Paul Turner   sched: Throttle e...
3431
3432
3433
3434
3435
3436
  
  	/*
  	 * it's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task); in this case we're finished.
  	 */
  	if (cfs_rq_throttled(cfs_rq))
678d5718d   Peter Zijlstra   sched/fair: Optim...
3437
  		return true;
d3d9dc330   Paul Turner   sched: Throttle e...
3438
3439
  
  	throttle_cfs_rq(cfs_rq);
678d5718d   Peter Zijlstra   sched/fair: Optim...
3440
  	return true;
d3d9dc330   Paul Turner   sched: Throttle e...
3441
  }
029632fbb   Peter Zijlstra   sched: Make separ...
3442

029632fbb   Peter Zijlstra   sched: Make separ...
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
  {
  	struct cfs_bandwidth *cfs_b =
  		container_of(timer, struct cfs_bandwidth, slack_timer);
  	do_sched_cfs_slack_timer(cfs_b);
  
  	return HRTIMER_NORESTART;
  }
  
  static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
  {
  	struct cfs_bandwidth *cfs_b =
  		container_of(timer, struct cfs_bandwidth, period_timer);
  	ktime_t now;
  	int overrun;
  	int idle = 0;
  
  	for (;;) {
  		now = hrtimer_cb_get_time(timer);
  		overrun = hrtimer_forward(timer, now, cfs_b->period);
  
  		if (!overrun)
  			break;
  
  		idle = do_sched_cfs_period_timer(cfs_b, overrun);
  	}
  
  	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  }
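
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the overrun
 * accounting performed by hrtimer_forward() in the loop above, reduced to
 * plain arithmetic: the expiry is pushed forward by whole periods until it
 * lies in the future, and the number of skipped periods is reported.  The
 * "now" and "expires" timestamps are assumed examples; 100ms is the default
 * cfs period.
 */
#include <stdio.h>

int main(void)
{
	long long period = 100000000LL;		/* 100ms */
	long long expires = 1000000000LL;	/* hypothetical: was due at 1s */
	long long now = 1250000000LL;		/* hypothetical: it is now 1.25s */
	long long overrun = 0;

	while (expires <= now) {		/* forward by whole periods */
		expires += period;
		overrun++;
	}
	printf("forwarded %lld period(s), next expiry at %lldns\n",
	       overrun, expires);
	return 0;
}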
  
  void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  {
  	raw_spin_lock_init(&cfs_b->lock);
  	cfs_b->runtime = 0;
  	cfs_b->quota = RUNTIME_INF;
  	cfs_b->period = ns_to_ktime(default_cfs_period());
  
  	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
  	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  	cfs_b->period_timer.function = sched_cfs_period_timer;
  	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  	cfs_b->slack_timer.function = sched_cfs_slack_timer;
  }
  
  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
  	cfs_rq->runtime_enabled = 0;
  	INIT_LIST_HEAD(&cfs_rq->throttled_list);
  }
  
  /* requires cfs_b->lock, may release to reprogram timer */
09dc4ab03   Roman Gushchin   sched/fair: Fix t...
3494
  void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
029632fbb   Peter Zijlstra   sched: Make separ...
3495
3496
3497
3498
3499
3500
3501
  {
  	/*
  	 * The timer may be active because we're trying to set a new bandwidth
  	 * period or because we're racing with the tear-down path
  	 * (timer_active==0 becomes visible before the hrtimer call-back
  	 * terminates).  In either case we ensure that it's re-programmed
  	 */
927b54fcc   Ben Segall   sched: Fix hrtime...
3502
3503
3504
  	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
  	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
  		/* bounce the lock to allow do_sched_cfs_period_timer to run */
029632fbb   Peter Zijlstra   sched: Make separ...
3505
  		raw_spin_unlock(&cfs_b->lock);
927b54fcc   Ben Segall   sched: Fix hrtime...
3506
  		cpu_relax();
029632fbb   Peter Zijlstra   sched: Make separ...
3507
3508
  		raw_spin_lock(&cfs_b->lock);
  		/* if someone else restarted the timer then we're done */
09dc4ab03   Roman Gushchin   sched/fair: Fix t...
3509
  		if (!force && cfs_b->timer_active)
029632fbb   Peter Zijlstra   sched: Make separ...
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
  			return;
  	}
  
  	cfs_b->timer_active = 1;
  	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
  }
  
  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
  {
  	hrtimer_cancel(&cfs_b->period_timer);
  	hrtimer_cancel(&cfs_b->slack_timer);
  }
38dc3348e   Arnd Bergmann   sched: Fix warnin...
3522
  static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fbb   Peter Zijlstra   sched: Make separ...
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
  {
  	struct cfs_rq *cfs_rq;
  
  	for_each_leaf_cfs_rq(rq, cfs_rq) {
  		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
  
  		if (!cfs_rq->runtime_enabled)
  			continue;
  
  		/*
  		 * clock_task is not advancing so we just need to make sure
  		 * there's some valid quota amount
  		 */
  		cfs_rq->runtime_remaining = cfs_b->quota;
  		if (cfs_rq_throttled(cfs_rq))
  			unthrottle_cfs_rq(cfs_rq);
  	}
  }
  
  #else /* CONFIG_CFS_BANDWIDTH */
f1b17280e   Paul Turner   sched: Maintain r...
3543
3544
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
  {
78becc270   Frederic Weisbecker   sched: Use an acc...
3545
  	return rq_clock_task(rq_of(cfs_rq));
f1b17280e   Paul Turner   sched: Maintain r...
3546
  }
9dbdb1555   Peter Zijlstra   sched/fair: Rewor...
3547
  static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
678d5718d   Peter Zijlstra   sched/fair: Optim...
3548
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
d3d9dc330   Paul Turner   sched: Throttle e...
3549
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6c16a6dcb   Peter Zijlstra   sched: Fix compil...
3550
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906b   Paul Turner   sched: Add suppor...
3551
3552
3553
3554
3555
  
  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
  {
  	return 0;
  }
64660c864   Paul Turner   sched: Prevent in...
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
  
  static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  {
  	return 0;
  }
  
  static inline int throttled_lb_pair(struct task_group *tg,
  				    int src_cpu, int dest_cpu)
  {
  	return 0;
  }
029632fbb   Peter Zijlstra   sched: Make separ...
3567
3568
3569
3570
3571
  
  void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e1   Paul Turner   sched: Introduce ...
3572
  #endif
029632fbb   Peter Zijlstra   sched: Make separ...
3573
3574
3575
3576
3577
  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
  {
  	return NULL;
  }
  static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
a4c96ae31   Peter Boonstoppel   sched: Unthrottle...
3578
  static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fbb   Peter Zijlstra   sched: Make separ...
3579
3580
  
  #endif /* CONFIG_CFS_BANDWIDTH */
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3581
3582
3583
  /**************************************************
   * CFS operations on tasks:
   */
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3584
3585
3586
  #ifdef CONFIG_SCHED_HRTICK
  static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3587
3588
3589
3590
  	struct sched_entity *se = &p->se;
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
  	WARN_ON(task_rq(p) != rq);
b39e66eaf   Mike Galbraith   sched: Save some ...
3591
  	if (cfs_rq->nr_running > 1) {
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
  		u64 slice = sched_slice(cfs_rq, se);
  		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
  		s64 delta = slice - ran;
  
  		if (delta < 0) {
  			if (rq->curr == p)
  				resched_task(p);
  			return;
  		}
  
  		/*
  		 * Don't schedule slices shorter than 10000ns, that just
  		 * doesn't make sense. Rely on vruntime for fairness.
  		 */
31656519e   Peter Zijlstra   sched, x86: clean...
3606
  		if (rq->curr != p)
157124c11   Peter Zijlstra   sched: fix warnin...
3607
  			delta = max_t(s64, 10000LL, delta);
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3608

31656519e   Peter Zijlstra   sched, x86: clean...
3609
  		hrtick_start(rq, delta);
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3610
3611
  	}
  }
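
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the
 * remaining-slice arithmetic above: given an assumed slice and the time
 * already run, either a reschedule is due or the hrtick is programmed for
 * the remainder, clamped to at least 10000ns when the task is not current.
 */
#include <stdio.h>

int main(void)
{
	long long slice = 4000000LL;	/* hypothetical 4ms slice */
	long long ran = 1500000LL;	/* hypothetical: ran 1.5ms so far */
	long long delta = slice - ran;
	int task_is_current = 0;	/* hypothetical */

	if (delta < 0) {
		printf("slice exhausted: reschedule now\n");
		return 0;
	}
	if (!task_is_current && delta < 10000LL)
		delta = 10000LL;	/* don't bother with <10us slices */
	printf("program the hrtick %lldns from now\n", delta);
	return 0;
}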
a4c2f00f5   Peter Zijlstra   sched: fair sched...
3612
3613
3614
3615
3616
3617
3618
3619
3620
  
  /*
   * called from enqueue/dequeue and updates the hrtick when the
   * current task is from our class and nr_running is low enough
   * to matter.
   */
  static void hrtick_update(struct rq *rq)
  {
  	struct task_struct *curr = rq->curr;
b39e66eaf   Mike Galbraith   sched: Save some ...
3621
  	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f5   Peter Zijlstra   sched: fair sched...
3622
3623
3624
3625
3626
  		return;
  
  	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
  		hrtick_start_fair(rq, curr);
  }
55e12e5e7   Dhaval Giani   sched: make sched...
3627
  #else /* !CONFIG_SCHED_HRTICK */
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3628
3629
3630
3631
  static inline void
  hrtick_start_fair(struct rq *rq, struct task_struct *p)
  {
  }
a4c2f00f5   Peter Zijlstra   sched: fair sched...
3632
3633
3634
3635
  
  static inline void hrtick_update(struct rq *rq)
  {
  }
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3636
  #endif
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3637
3638
3639
3640
3641
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
   * then put the task into the rbtree:
   */
ea87bb785   Thomas Gleixner   sched: Extend enq...
3642
  static void
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
3643
  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3644
3645
  {
  	struct cfs_rq *cfs_rq;
62fb18513   Peter Zijlstra   sched: revert loa...
3646
  	struct sched_entity *se = &p->se;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3647
3648
  
  	for_each_sched_entity(se) {
62fb18513   Peter Zijlstra   sched: revert loa...
3649
  		if (se->on_rq)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3650
3651
  			break;
  		cfs_rq = cfs_rq_of(se);
88ec22d3e   Peter Zijlstra   sched: Remove the...
3652
  		enqueue_entity(cfs_rq, se, flags);
85dac906b   Paul Turner   sched: Add suppor...
3653
3654
3655
3656
3657
3658
3659
3660
3661
  
  		/*
  		 * end evaluation on encountering a throttled cfs_rq
  		 *
  		 * note: in the case of encountering a throttled cfs_rq we will
  		 * post the final h_nr_running increment below.
  		*/
  		if (cfs_rq_throttled(cfs_rq))
  			break;
953bfcd10   Paul Turner   sched: Implement ...
3662
  		cfs_rq->h_nr_running++;
85dac906b   Paul Turner   sched: Add suppor...
3663

88ec22d3e   Peter Zijlstra   sched: Remove the...
3664
  		flags = ENQUEUE_WAKEUP;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3665
  	}
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3666

2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3667
  	for_each_sched_entity(se) {
0f3171438   Lin Ming   sched: Cleanup du...
3668
  		cfs_rq = cfs_rq_of(se);
953bfcd10   Paul Turner   sched: Implement ...
3669
  		cfs_rq->h_nr_running++;
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3670

85dac906b   Paul Turner   sched: Add suppor...
3671
3672
  		if (cfs_rq_throttled(cfs_rq))
  			break;
17bc14b76   Linus Torvalds   Revert "sched: Up...
3673
  		update_cfs_shares(cfs_rq);
9ee474f55   Paul Turner   sched: Maintain t...
3674
  		update_entity_load_avg(se, 1);
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3675
  	}
18bf2805d   Ben Segall   sched: Maintain p...
3676
3677
  	if (!se) {
  		update_rq_runnable_avg(rq, rq->nr_running);
85dac906b   Paul Turner   sched: Add suppor...
3678
  		inc_nr_running(rq);
18bf2805d   Ben Segall   sched: Maintain p...
3679
  	}
a4c2f00f5   Peter Zijlstra   sched: fair sched...
3680
  	hrtick_update(rq);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3681
  }
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
3682
  static void set_next_buddy(struct sched_entity *se);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3683
3684
3685
3686
3687
  /*
   * The dequeue_task method is called before nr_running is
   * decreased. We remove the task from the rbtree and
   * update the fair scheduling stats:
   */
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
3688
  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3689
3690
  {
  	struct cfs_rq *cfs_rq;
62fb18513   Peter Zijlstra   sched: revert loa...
3691
  	struct sched_entity *se = &p->se;
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
3692
  	int task_sleep = flags & DEQUEUE_SLEEP;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3693
3694
3695
  
  	for_each_sched_entity(se) {
  		cfs_rq = cfs_rq_of(se);
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
3696
  		dequeue_entity(cfs_rq, se, flags);
85dac906b   Paul Turner   sched: Add suppor...
3697
3698
3699
3700
3701
3702
3703
3704
3705
  
  		/*
  		 * end evaluation on encountering a throttled cfs_rq
  		 *
  		 * note: in the case of encountering a throttled cfs_rq we will
  		 * post the final h_nr_running decrement below.
  		*/
  		if (cfs_rq_throttled(cfs_rq))
  			break;
953bfcd10   Paul Turner   sched: Implement ...
3706
  		cfs_rq->h_nr_running--;
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3707

bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3708
  		/* Don't dequeue parent if it has other entities besides us */
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
3709
3710
3711
3712
3713
3714
3715
  		if (cfs_rq->load.weight) {
  			/*
  			 * Bias pick_next to pick a task from this cfs_rq, as
  			 * p is sleeping when it is within its sched_slice.
  			 */
  			if (task_sleep && parent_entity(se))
  				set_next_buddy(parent_entity(se));
9598c82dc   Paul Turner   sched: Don't upda...
3716
3717
3718
  
  			/* avoid re-evaluating load for this entity */
  			se = parent_entity(se);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3719
  			break;
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
3720
  		}
371fd7e7a   Peter Zijlstra   sched: Add enqueu...
3721
  		flags |= DEQUEUE_SLEEP;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3722
  	}
8f4d37ec0   Peter Zijlstra   sched: high-res p...
3723

2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3724
  	for_each_sched_entity(se) {
0f3171438   Lin Ming   sched: Cleanup du...
3725
  		cfs_rq = cfs_rq_of(se);
953bfcd10   Paul Turner   sched: Implement ...
3726
  		cfs_rq->h_nr_running--;
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3727

85dac906b   Paul Turner   sched: Add suppor...
3728
3729
  		if (cfs_rq_throttled(cfs_rq))
  			break;
17bc14b76   Linus Torvalds   Revert "sched: Up...
3730
  		update_cfs_shares(cfs_rq);
9ee474f55   Paul Turner   sched: Maintain t...
3731
  		update_entity_load_avg(se, 1);
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3732
  	}
18bf2805d   Ben Segall   sched: Maintain p...
3733
  	if (!se) {
85dac906b   Paul Turner   sched: Add suppor...
3734
  		dec_nr_running(rq);
18bf2805d   Ben Segall   sched: Maintain p...
3735
3736
  		update_rq_runnable_avg(rq, 1);
  	}
a4c2f00f5   Peter Zijlstra   sched: fair sched...
3737
  	hrtick_update(rq);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
3738
  }
e7693a362   Gregory Haskins   sched: de-SCHED_O...
3739
  #ifdef CONFIG_SMP
029632fbb   Peter Zijlstra   sched: Make separ...
3740
3741
3742
  /* Used instead of source_load when we know the type == 0 */
  static unsigned long weighted_cpuload(const int cpu)
  {
b92486cbf   Alex Shi   sched: Compute ru...
3743
  	return cpu_rq(cpu)->cfs.runnable_load_avg;
029632fbb   Peter Zijlstra   sched: Make separ...
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
  }
  
  /*
   * Return a low guess at the load of a migration-source cpu weighted
   * according to the scheduling class and "nice" value.
   *
   * We want to under-estimate the load of migration sources, to
   * balance conservatively.
   */
  static unsigned long source_load(int cpu, int type)
  {
  	struct rq *rq = cpu_rq(cpu);
  	unsigned long total = weighted_cpuload(cpu);
  
  	if (type == 0 || !sched_feat(LB_BIAS))
  		return total;
  
  	return min(rq->cpu_load[type-1], total);
  }
  
  /*
   * Return a high guess at the load of a migration-target cpu weighted
   * according to the scheduling class and "nice" value.
   */
  static unsigned long target_load(int cpu, int type)
  {
  	struct rq *rq = cpu_rq(cpu);
  	unsigned long total = weighted_cpuload(cpu);
  
  	if (type == 0 || !sched_feat(LB_BIAS))
  		return total;
  
  	return max(rq->cpu_load[type-1], total);
  }
  
  static unsigned long power_of(int cpu)
  {
  	return cpu_rq(cpu)->cpu_power;
  }
  
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
  	struct rq *rq = cpu_rq(cpu);
  	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
b92486cbf   Alex Shi   sched: Compute ru...
3788
  	unsigned long load_avg = rq->cfs.runnable_load_avg;
029632fbb   Peter Zijlstra   sched: Make separ...
3789
3790
  
  	if (nr_running)
b92486cbf   Alex Shi   sched: Compute ru...
3791
  		return load_avg / nr_running;
029632fbb   Peter Zijlstra   sched: Make separ...
3792
3793
3794
  
  	return 0;
  }
62470419e   Michael Wang   sched: Implement ...
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
  static void record_wakee(struct task_struct *p)
  {
	/*
	 * Rough decay (wiping) for cost saving; don't worry about the
	 * boundary, a really active task won't care about the loss.
	 */
  	if (jiffies > current->wakee_flip_decay_ts + HZ) {
  		current->wakee_flips = 0;
  		current->wakee_flip_decay_ts = jiffies;
  	}
  
  	if (current->last_wakee != p) {
  		current->last_wakee = p;
  		current->wakee_flips++;
  	}
  }
098fb9db2   Ingo Molnar   sched: clean up w...
3812

74f8e4b23   Peter Zijlstra   sched: Remove rq ...
3813
  static void task_waking_fair(struct task_struct *p)
88ec22d3e   Peter Zijlstra   sched: Remove the...
3814
3815
3816
  {
  	struct sched_entity *se = &p->se;
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3fe1698b7   Peter Zijlstra   sched: Deal with ...
3817
3818
3819
3820
  	u64 min_vruntime;
  
  #ifndef CONFIG_64BIT
  	u64 min_vruntime_copy;
88ec22d3e   Peter Zijlstra   sched: Remove the...
3821

3fe1698b7   Peter Zijlstra   sched: Deal with ...
3822
3823
3824
3825
3826
3827
3828
3829
  	do {
  		min_vruntime_copy = cfs_rq->min_vruntime_copy;
  		smp_rmb();
  		min_vruntime = cfs_rq->min_vruntime;
  	} while (min_vruntime != min_vruntime_copy);
  #else
  	min_vruntime = cfs_rq->min_vruntime;
  #endif
88ec22d3e   Peter Zijlstra   sched: Remove the...
3830

3fe1698b7   Peter Zijlstra   sched: Deal with ...
3831
  	se->vruntime -= min_vruntime;
62470419e   Michael Wang   sched: Implement ...
3832
  	record_wakee(p);
88ec22d3e   Peter Zijlstra   sched: Remove the...
3833
  }
bb3469ac9   Peter Zijlstra   sched: hierarchic...
3834
  #ifdef CONFIG_FAIR_GROUP_SCHED
f5bfb7d9f   Peter Zijlstra   sched: bias effec...
3835
3836
3837
3838
3839
3840
  /*
   * effective_load() calculates the load change as seen from the root_task_group
   *
   * Adding load to a group doesn't make a group heavier, but can cause movement
   * of group shares between cpus. Assuming the shares were perfectly aligned one
   * can calculate the shift in shares.
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
   *
   * Calculate the effective load difference if @wl is added (subtracted) to @tg
   * on this @cpu and results in a total addition (subtraction) of @wg to the
   * total group weight.
   *
   * Given a runqueue weight distribution (rw_i) we can compute a shares
   * distribution (s_i) using:
   *
   *   s_i = rw_i / \Sum rw_j						(1)
   *
   * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
   * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
   * shares distribution (s_i):
   *
   *   rw_i = {   2,   4,   1,   0 }
   *   s_i  = { 2/7, 4/7, 1/7,   0 }
   *
   * As per wake_affine() we're interested in the load of two CPUs (the CPU the
   * task used to run on and the CPU the waker is running on), we need to
   * compute the effect of waking a task on either CPU and, in case of a sync
   * wakeup, compute the effect of the current task going to sleep.
   *
   * So for a change of @wl to the local @cpu with an overall group weight change
   * of @wl we can compute the new shares distribution (s'_i) using:
   *
   *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
   *
   * Suppose we're interested in CPUs 0 and 1, and want to compute the load
   * differences in waking a task to CPU 0. The additional task changes the
   * weight and shares distributions like:
   *
   *   rw'_i = {   3,   4,   1,   0 }
   *   s'_i  = { 3/8, 4/8, 1/8,   0 }
   *
   * We can then compute the difference in effective weight by using:
   *
   *   dw_i = S * (s'_i - s_i)						(3)
   *
   * Where 'S' is the group weight as seen by its parent.
   *
   * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
   * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
   * 4/7) times the weight of the group.
f5bfb7d9f   Peter Zijlstra   sched: bias effec...
3884
   */
2069dd75c   Peter Zijlstra   sched: Rewrite tg...
3885
  static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
bb3469ac9   Peter Zijlstra   sched: hierarchic...
3886
  {
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3887
  	struct sched_entity *se = tg->se[cpu];
f1d239f73   Peter Zijlstra   sched: incrementa...
3888

9722c2dac   Rik van Riel   sched: Calculate ...
3889
  	if (!tg->parent)	/* the trivial, non-cgroup case */
f1d239f73   Peter Zijlstra   sched: incrementa...
3890
  		return wl;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3891
  	for_each_sched_entity(se) {
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3892
  		long w, W;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3893

977dda7c9   Paul Turner   sched: Update eff...
3894
  		tg = se->my_q->tg;
bb3469ac9   Peter Zijlstra   sched: hierarchic...
3895

cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3896
3897
3898
3899
  		/*
  		 * W = @wg + \Sum rw_j
  		 */
  		W = wg + calc_tg_weight(tg, se->my_q);
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3900

cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3901
3902
3903
3904
  		/*
  		 * w = rw_i + @wl
  		 */
  		w = se->my_q->load.weight + wl;
940959e93   Peter Zijlstra   sched: fixlet for...
3905

cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3906
3907
3908
3909
3910
  		/*
  		 * wl = S * s'_i; see (2)
  		 */
  		if (W > 0 && w < W)
  			wl = (w * tg->shares) / W;
977dda7c9   Paul Turner   sched: Update eff...
3911
3912
  		else
  			wl = tg->shares;
940959e93   Peter Zijlstra   sched: fixlet for...
3913

cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3914
3915
3916
3917
3918
  		/*
  		 * Per the above, wl is the new se->load.weight value; since
  		 * those are clipped to [MIN_SHARES, ...) do so now. See
  		 * calc_cfs_shares().
  		 */
977dda7c9   Paul Turner   sched: Update eff...
3919
3920
  		if (wl < MIN_SHARES)
  			wl = MIN_SHARES;
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3921
3922
3923
3924
  
  		/*
  		 * wl = dw_i = S * (s'_i - s_i); see (3)
  		 */
977dda7c9   Paul Turner   sched: Update eff...
3925
  		wl -= se->load.weight;
cf5f0acf3   Peter Zijlstra   sched: Add a comm...
3926
3927
3928
3929
3930
3931
3932
3933
  
  		/*
  		 * Recursively apply this logic to all parent groups to compute
  		 * the final effective load change on the root group. Since
  		 * only the @tg group gets extra weight, all parent groups can
  		 * only redistribute existing shares. @wl is the shift in shares
  		 * resulting from this level per the above.
  		 */
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3934
  		wg = 0;
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3935
  	}
bb3469ac9   Peter Zijlstra   sched: hierarchic...
3936

4be9daaa1   Peter Zijlstra   sched: fix task_h...
3937
  	return wl;
bb3469ac9   Peter Zijlstra   sched: hierarchic...
3938
3939
  }
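
/*
 * Illustrative, standalone userspace sketch (not kernel code): a numeric
 * check of the worked example in the comment above.  With per-cpu runqueue
 * weights {2, 4, 1, 0} and one extra task of weight 1 woken on CPU 0, the
 * share of CPU 0 moves from 2/7 to 3/8 and the share of CPU 1 from 4/7 to
 * 4/8.  tg_shares (1024) is an assumed stand-in for the group weight S seen
 * by the parent.
 */
#include <stdio.h>

int main(void)
{
	double rw[4] = { 2.0, 4.0, 1.0, 0.0 };	/* example rw_i */
	double wl = 1.0, wg = 1.0;		/* one extra task on CPU 0 */
	double tg_shares = 1024.0;		/* assumed group weight S */
	double sum = rw[0] + rw[1] + rw[2] + rw[3];
	int cpu;

	for (cpu = 0; cpu < 2; cpu++) {
		double s_old = rw[cpu] / sum;				/* (1) */
		double s_new = (rw[cpu] + (cpu == 0 ? wl : 0.0)) /
			       (sum + wg);				/* (2) */
		double dw = tg_shares * (s_new - s_old);		/* (3) */

		printf("cpu%d: %.4f -> %.4f, dw = %+.2f (%+.4f of S)\n",
		       cpu, s_old, s_new, dw, s_new - s_old);
	}
	/* expected: cpu0 ~ +5/56 of S, cpu1 ~ -4/56 of S */
	return 0;
}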
  #else
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3940

58d081b50   Mel Gorman   sched/numa: Avoid...
3941
  static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3942
  {
83378269a   Peter Zijlstra   sched: correct wa...
3943
  	return wl;
bb3469ac9   Peter Zijlstra   sched: hierarchic...
3944
  }
4be9daaa1   Peter Zijlstra   sched: fix task_h...
3945

bb3469ac9   Peter Zijlstra   sched: hierarchic...
3946
  #endif
62470419e   Michael Wang   sched: Implement ...
3947
3948
  static int wake_wide(struct task_struct *p)
  {
7d9ffa896   Peter Zijlstra   sched: Micro-opti...
3949
  	int factor = this_cpu_read(sd_llc_size);
62470419e   Michael Wang   sched: Implement ...
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
  
	/*
	 * This is the switching frequency: a high value can mean many wakees
	 * or rapid partner switching.  Using the LLC size as the factor
	 * automatically adjusts the threshold, so a bigger node leads to
	 * more pull.
	 */
  	if (p->wakee_flips > factor) {
  		/*
		 * The wakee is somewhat hot and needs a certain amount of
		 * cpu resources, so if the waker is far hotter, prefer to
		 * leave the wakee alone.
  		 */
  		if (current->wakee_flips > (factor * p->wakee_flips))
  			return 1;
  	}
  
  	return 0;
  }
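
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the
 * wakee_flips heuristic above, with assumed numbers: with an LLC of 8 CPUs,
 * a wakee that flips partners more than "factor" times is only left alone
 * (no affine pull) when the waker flips more than factor times as often
 * still.
 */
#include <stdio.h>

static int wake_wide_demo(int factor, int waker_flips, int wakee_flips)
{
	if (wakee_flips > factor && waker_flips > factor * wakee_flips)
		return 1;	/* spread out: skip the affine wakeup */
	return 0;
}

int main(void)
{
	int factor = 8;		/* hypothetical sd_llc_size */

	printf("waker=200 wakee=20 -> wide=%d\n",
	       wake_wide_demo(factor, 200, 20));
	printf("waker=200 wakee=40 -> wide=%d\n",
	       wake_wide_demo(factor, 200, 40));
	return 0;
}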
c88d59108   Peter Zijlstra   sched: Merge sele...
3968
  static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
098fb9db2   Ingo Molnar   sched: clean up w...
3969
  {
e37b6a7b2   Paul Turner   sched: Fix sign u...
3970
  	s64 this_load, load;
c88d59108   Peter Zijlstra   sched: Merge sele...
3971
  	int idx, this_cpu, prev_cpu;
098fb9db2   Ingo Molnar   sched: clean up w...
3972
  	unsigned long tl_per_task;
c88d59108   Peter Zijlstra   sched: Merge sele...
3973
  	struct task_group *tg;
83378269a   Peter Zijlstra   sched: correct wa...
3974
  	unsigned long weight;
b3137bc8e   Mike Galbraith   sched: stop wake_...
3975
  	int balanced;
098fb9db2   Ingo Molnar   sched: clean up w...
3976

62470419e   Michael Wang   sched: Implement ...
3977
3978
3979
3980
3981
3982
  	/*
  	 * If we wake multiple tasks be careful to not bounce
  	 * ourselves around too much.
  	 */
  	if (wake_wide(p))
  		return 0;
c88d59108   Peter Zijlstra   sched: Merge sele...
3983
3984
3985
3986
3987
  	idx	  = sd->wake_idx;
  	this_cpu  = smp_processor_id();
  	prev_cpu  = task_cpu(p);
  	load	  = source_load(prev_cpu, idx);
  	this_load = target_load(this_cpu, idx);
098fb9db2   Ingo Molnar   sched: clean up w...
3988
3989
  
  	/*
b3137bc8e   Mike Galbraith   sched: stop wake_...
3990
3991
3992
3993
  	 * If sync wakeup then subtract the (maximum possible)
  	 * effect of the currently running task from the load
  	 * of the current CPU:
  	 */
83378269a   Peter Zijlstra   sched: correct wa...
3994
3995
3996
  	if (sync) {
  		tg = task_group(current);
  		weight = current->se.load.weight;
c88d59108   Peter Zijlstra   sched: Merge sele...
3997
  		this_load += effective_load(tg, this_cpu, -weight, -weight);
83378269a   Peter Zijlstra   sched: correct wa...
3998
3999
  		load += effective_load(tg, prev_cpu, 0, -weight);
  	}
b3137bc8e   Mike Galbraith   sched: stop wake_...
4000

83378269a   Peter Zijlstra   sched: correct wa...
4001
4002
  	tg = task_group(p);
  	weight = p->se.load.weight;
b3137bc8e   Mike Galbraith   sched: stop wake_...
4003

71a29aa7b   Peter Zijlstra   sched: Deal with ...
4004
4005
  	/*
  	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
c88d59108   Peter Zijlstra   sched: Merge sele...
4006
4007
4008
  	 * due to the sync cause above having dropped this_load to 0, we'll
  	 * always have an imbalance, but there's really nothing you can do
  	 * about that, so that's good too.
71a29aa7b   Peter Zijlstra   sched: Deal with ...
4009
4010
4011
4012
  	 *
	 * Otherwise check if the two cpus are near enough in load to allow this
  	 * task to be woken on this_cpu.
  	 */
e37b6a7b2   Paul Turner   sched: Fix sign u...
4013
4014
  	if (this_load > 0) {
  		s64 this_eff_load, prev_eff_load;
e51fd5e22   Peter Zijlstra   sched: Fix wake_a...
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
  
  		this_eff_load = 100;
  		this_eff_load *= power_of(prev_cpu);
  		this_eff_load *= this_load +
  			effective_load(tg, this_cpu, weight, weight);
  
  		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
  		prev_eff_load *= power_of(this_cpu);
  		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
  
  		balanced = this_eff_load <= prev_eff_load;
  	} else
  		balanced = true;
b3137bc8e   Mike Galbraith   sched: stop wake_...
4028
4029
  
  	/*
4ae7d5cef   Ingo Molnar   sched: improve af...
4030
4031
4032
  	 * If the currently running task will sleep within
  	 * a reasonable amount of time then attract this newly
  	 * woken task:
098fb9db2   Ingo Molnar   sched: clean up w...
4033
  	 */
2fb7635c4   Peter Zijlstra   sched: sync wakeu...
4034
4035
  	if (sync && balanced)
  		return 1;
098fb9db2   Ingo Molnar   sched: clean up w...
4036

41acab885   Lucas De Marchi   sched: Implement ...
4037
  	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
098fb9db2   Ingo Molnar   sched: clean up w...
4038
  	tl_per_task = cpu_avg_load_per_task(this_cpu);
c88d59108   Peter Zijlstra   sched: Merge sele...
4039
4040
4041
  	if (balanced ||
  	    (this_load <= load &&
  	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
098fb9db2   Ingo Molnar   sched: clean up w...
4042
4043
4044
4045
4046
  		/*
  		 * This domain has SD_WAKE_AFFINE and
  		 * p is cache cold in this domain, and
  		 * there is no bad imbalance.
  		 */
c88d59108   Peter Zijlstra   sched: Merge sele...
4047
  		schedstat_inc(sd, ttwu_move_affine);
41acab885   Lucas De Marchi   sched: Implement ...
4048
  		schedstat_inc(p, se.statistics.nr_wakeups_affine);
098fb9db2   Ingo Molnar   sched: clean up w...
4049
4050
4051
4052
4053
  
  		return 1;
  	}
  	return 0;
  }
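
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the
 * weighted comparison above, using assumed loads, equal cpu power and a
 * typical imbalance_pct of 125; the effective_load() group correction is
 * left out.  The previous cpu's side is scaled by 112 vs 100, so the waking
 * cpu must look roughly 12% cheaper before the wakeup is treated as
 * balanced.
 */
#include <stdio.h>

int main(void)
{
	long long imbalance_pct = 125;			/* typical domain value */
	long long power_prev = 1024, power_this = 1024;	/* SCHED_POWER_SCALE */
	long long this_load = 900, prev_load = 1024;	/* hypothetical loads */

	long long this_eff = 100 * power_prev * this_load;
	long long prev_eff = (100 + (imbalance_pct - 100) / 2) *
			     power_this * prev_load;

	printf("this_eff=%lld prev_eff=%lld -> %s\n", this_eff, prev_eff,
	       this_eff <= prev_eff ? "balanced, allow affine wakeup"
				    : "not balanced");
	return 0;
}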
aaee1203c   Peter Zijlstra   sched: Move sched...
4054
4055
4056
4057
4058
  /*
   * find_idlest_group finds and returns the least busy CPU group within the
   * domain.
   */
  static struct sched_group *
78e7ed53c   Peter Zijlstra   sched: Tweak wake...
4059
  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
c44f2a020   Vincent Guittot   sched/fair: Move ...
4060
  		  int this_cpu, int sd_flag)
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4061
  {
b3bd3de66   Andi Kleen   gcc-4.6: kernel/*...
4062
  	struct sched_group *idlest = NULL, *group = sd->groups;
aaee1203c   Peter Zijlstra   sched: Move sched...
4063
  	unsigned long min_load = ULONG_MAX, this_load = 0;
c44f2a020   Vincent Guittot   sched/fair: Move ...
4064
  	int load_idx = sd->forkexec_idx;
aaee1203c   Peter Zijlstra   sched: Move sched...
4065
  	int imbalance = 100 + (sd->imbalance_pct-100)/2;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4066

c44f2a020   Vincent Guittot   sched/fair: Move ...
4067
4068
  	if (sd_flag & SD_BALANCE_WAKE)
  		load_idx = sd->wake_idx;
aaee1203c   Peter Zijlstra   sched: Move sched...
4069
4070
4071
4072
  	do {
  		unsigned long load, avg_load;
  		int local_group;
  		int i;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4073

aaee1203c   Peter Zijlstra   sched: Move sched...
4074
4075
  		/* Skip over this group if it has no CPUs allowed */
  		if (!cpumask_intersects(sched_group_cpus(group),
fa17b507f   Peter Zijlstra   sched: Wrap sched...
4076
  					tsk_cpus_allowed(p)))
aaee1203c   Peter Zijlstra   sched: Move sched...
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
  			continue;
  
  		local_group = cpumask_test_cpu(this_cpu,
  					       sched_group_cpus(group));
  
  		/* Tally up the load of all CPUs in the group */
  		avg_load = 0;
  
  		for_each_cpu(i, sched_group_cpus(group)) {
  			/* Bias balancing toward cpus of our domain */
  			if (local_group)
  				load = source_load(i, load_idx);
  			else
  				load = target_load(i, load_idx);
  
  			avg_load += load;
  		}
  
  		/* Adjust by relative CPU power of the group */
9c3f75cbd   Peter Zijlstra   sched: Break out ...
4096
  		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
aaee1203c   Peter Zijlstra   sched: Move sched...
4097
4098
4099
  
  		if (local_group) {
  			this_load = avg_load;
aaee1203c   Peter Zijlstra   sched: Move sched...
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
  		} else if (avg_load < min_load) {
  			min_load = avg_load;
  			idlest = group;
  		}
  	} while (group = group->next, group != sd->groups);
  
  	if (!idlest || 100*this_load < imbalance*min_load)
  		return NULL;
  	return idlest;
  }
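
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the final
 * comparison above, with assumed average loads.  With a typical
 * imbalance_pct of 125 the threshold is 112, so a remote group is only
 * returned when the local group's load is at least ~12% higher than the
 * remote minimum.
 */
#include <stdio.h>

int main(void)
{
	unsigned long imbalance = 100 + (125 - 100) / 2;	/* 112 */
	unsigned long this_load = 2048;	/* hypothetical local group average */
	unsigned long min_load = 1900;	/* hypothetical idlest remote average */

	if (100 * this_load < imbalance * min_load)
		printf("stay local (%lu < %lu)\n",
		       100 * this_load, imbalance * min_load);
	else
		printf("use the idlest remote group\n");
	return 0;
}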
  
  /*
   * find_idlest_cpu - find the idlest cpu among the cpus in group.
   */
  static int
  find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
  	unsigned long load, min_load = ULONG_MAX;
  	int idlest = -1;
  	int i;
  
  	/* Traverse only the allowed CPUs */
fa17b507f   Peter Zijlstra   sched: Wrap sched...
4122
  	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
aaee1203c   Peter Zijlstra   sched: Move sched...
4123
4124
4125
4126
4127
  		load = weighted_cpuload(i);
  
  		if (load < min_load || (load == min_load && i == this_cpu)) {
  			min_load = load;
  			idlest = i;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4128
4129
  		}
  	}
aaee1203c   Peter Zijlstra   sched: Move sched...
4130
4131
  	return idlest;
  }
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4132

aaee1203c   Peter Zijlstra   sched: Move sched...
4133
  /*
a50bde513   Peter Zijlstra   sched: Cleanup se...
4134
4135
   * Try and locate an idle CPU in the sched_domain.
   */
99bd5e2f2   Suresh Siddha   sched: Fix select...
4136
  static int select_idle_sibling(struct task_struct *p, int target)
a50bde513   Peter Zijlstra   sched: Cleanup se...
4137
  {
99bd5e2f2   Suresh Siddha   sched: Fix select...
4138
  	struct sched_domain *sd;
37407ea7f   Linus Torvalds   Revert "sched: Im...
4139
  	struct sched_group *sg;
e0a79f529   Mike Galbraith   sched: Fix select...
4140
  	int i = task_cpu(p);
a50bde513   Peter Zijlstra   sched: Cleanup se...
4141

e0a79f529   Mike Galbraith   sched: Fix select...
4142
4143
  	if (idle_cpu(target))
  		return target;
99bd5e2f2   Suresh Siddha   sched: Fix select...
4144
4145
  
  	/*
e0a79f529   Mike Galbraith   sched: Fix select...
4146
	 * If the previous cpu is cache affine and idle, don't be stupid.
99bd5e2f2   Suresh Siddha   sched: Fix select...
4147
  	 */
e0a79f529   Mike Galbraith   sched: Fix select...
4148
4149
  	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
  		return i;
a50bde513   Peter Zijlstra   sched: Cleanup se...
4150
4151
  
  	/*
37407ea7f   Linus Torvalds   Revert "sched: Im...
4152
	 * Otherwise, iterate the domains and find an eligible idle cpu.
a50bde513   Peter Zijlstra   sched: Cleanup se...
4153
  	 */
518cd6234   Peter Zijlstra   sched: Only queue...
4154
  	sd = rcu_dereference(per_cpu(sd_llc, target));
970e17898   Mike Galbraith   sched: Improve sc...
4155
  	for_each_lower_domain(sd) {
37407ea7f   Linus Torvalds   Revert "sched: Im...
4156
4157
4158
4159
4160
4161
4162
  		sg = sd->groups;
  		do {
  			if (!cpumask_intersects(sched_group_cpus(sg),
  						tsk_cpus_allowed(p)))
  				goto next;
  
  			for_each_cpu(i, sched_group_cpus(sg)) {
e0a79f529   Mike Galbraith   sched: Fix select...
4163
  				if (i == target || !idle_cpu(i))
37407ea7f   Linus Torvalds   Revert "sched: Im...
4164
4165
  					goto next;
  			}
970e17898   Mike Galbraith   sched: Improve sc...
4166

37407ea7f   Linus Torvalds   Revert "sched: Im...
4167
4168
4169
4170
4171
4172
4173
4174
  			target = cpumask_first_and(sched_group_cpus(sg),
  					tsk_cpus_allowed(p));
  			goto done;
  next:
  			sg = sg->next;
  		} while (sg != sd->groups);
  	}
  done:
a50bde513   Peter Zijlstra   sched: Cleanup se...
4175
4176
4177
4178
  	return target;
  }
  
  /*
de91b9cb9   Morten Rasmussen   sched: Fix select...
4179
4180
4181
   * select_task_rq_fair: Select target runqueue for the waking task in domains
   * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
   * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
aaee1203c   Peter Zijlstra   sched: Move sched...
4182
   *
de91b9cb9   Morten Rasmussen   sched: Fix select...
4183
4184
   * Balances load by selecting the idlest cpu in the idlest group, or under
   * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
aaee1203c   Peter Zijlstra   sched: Move sched...
4185
   *
de91b9cb9   Morten Rasmussen   sched: Fix select...
4186
   * Returns the target cpu number.
aaee1203c   Peter Zijlstra   sched: Move sched...
4187
4188
4189
   *
   * preempt must be disabled.
   */
0017d7350   Peter Zijlstra   sched: Fix TASK_W...
4190
  static int
ac66f5477   Peter Zijlstra   sched/numa: Intro...
4191
  select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
aaee1203c   Peter Zijlstra   sched: Move sched...
4192
  {
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
4193
  	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
c88d59108   Peter Zijlstra   sched: Merge sele...
4194
  	int cpu = smp_processor_id();
c88d59108   Peter Zijlstra   sched: Merge sele...
4195
  	int new_cpu = cpu;
99bd5e2f2   Suresh Siddha   sched: Fix select...
4196
  	int want_affine = 0;
5158f4e44   Peter Zijlstra   sched: Clean up t...
4197
  	int sync = wake_flags & WF_SYNC;
c88d59108   Peter Zijlstra   sched: Merge sele...
4198

29baa7478   Peter Zijlstra   sched: Move nr_cp...
4199
  	if (p->nr_cpus_allowed == 1)
76854c7e8   Mike Galbraith   sched: Use rt.nr_...
4200
  		return prev_cpu;
0763a660a   Peter Zijlstra   sched: Rename sel...
4201
  	if (sd_flag & SD_BALANCE_WAKE) {
fa17b507f   Peter Zijlstra   sched: Wrap sched...
4202
  		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
c88d59108   Peter Zijlstra   sched: Merge sele...
4203
4204
4205
  			want_affine = 1;
  		new_cpu = prev_cpu;
  	}
aaee1203c   Peter Zijlstra   sched: Move sched...
4206

dce840a08   Peter Zijlstra   sched: Dynamicall...
4207
  	rcu_read_lock();
aaee1203c   Peter Zijlstra   sched: Move sched...
4208
  	for_each_domain(cpu, tmp) {
e4f428884   Peter Zijlstra   sched: Select_tas...
4209
4210
  		if (!(tmp->flags & SD_LOAD_BALANCE))
  			continue;
aaee1203c   Peter Zijlstra   sched: Move sched...
4211
  		/*
99bd5e2f2   Suresh Siddha   sched: Fix select...
4212
4213
  		 * If both cpu and prev_cpu are part of this domain,
  		 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1f   Peter Zijlstra   sched: More gener...
4214
  		 */
99bd5e2f2   Suresh Siddha   sched: Fix select...
4215
4216
4217
  		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
  		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
  			affine_sd = tmp;
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
4218
  			break;
f03542a70   Alex Shi   sched: recover SD...
4219
  		}
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
4220

f03542a70   Alex Shi   sched: recover SD...
4221
  		if (tmp->flags & sd_flag)
29cd8bae3   Peter Zijlstra   sched: Fix SD_POW...
4222
4223
  			sd = tmp;
  	}
8b911acdf   Mike Galbraith   sched: Fix select...
4224
  	if (affine_sd) {
f03542a70   Alex Shi   sched: recover SD...
4225
  		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
dce840a08   Peter Zijlstra   sched: Dynamicall...
4226
4227
4228
4229
  			prev_cpu = cpu;
  
  		new_cpu = select_idle_sibling(p, prev_cpu);
  		goto unlock;
8b911acdf   Mike Galbraith   sched: Fix select...
4230
  	}
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4231

aaee1203c   Peter Zijlstra   sched: Move sched...
4232
4233
  	while (sd) {
  		struct sched_group *group;
c88d59108   Peter Zijlstra   sched: Merge sele...
4234
  		int weight;
098fb9db2   Ingo Molnar   sched: clean up w...
4235

0763a660a   Peter Zijlstra   sched: Rename sel...
4236
  		if (!(sd->flags & sd_flag)) {
aaee1203c   Peter Zijlstra   sched: Move sched...
4237
4238
4239
  			sd = sd->child;
  			continue;
  		}
098fb9db2   Ingo Molnar   sched: clean up w...
4240

c44f2a020   Vincent Guittot   sched/fair: Move ...
4241
  		group = find_idlest_group(sd, p, cpu, sd_flag);
aaee1203c   Peter Zijlstra   sched: Move sched...
4242
4243
4244
4245
  		if (!group) {
  			sd = sd->child;
  			continue;
  		}
4ae7d5cef   Ingo Molnar   sched: improve af...
4246

d7c33c493   Peter Zijlstra   sched: Fix task a...
4247
  		new_cpu = find_idlest_cpu(group, p, cpu);
aaee1203c   Peter Zijlstra   sched: Move sched...
4248
4249
4250
4251
  		if (new_cpu == -1 || new_cpu == cpu) {
  			/* Now try balancing at a lower domain level of cpu */
  			sd = sd->child;
  			continue;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4252
  		}
aaee1203c   Peter Zijlstra   sched: Move sched...
4253
4254
4255
  
  		/* Now try balancing at a lower domain level of new_cpu */
  		cpu = new_cpu;
669c55e9f   Peter Zijlstra   sched: Pre-comput...
4256
  		weight = sd->span_weight;
aaee1203c   Peter Zijlstra   sched: Move sched...
4257
4258
  		sd = NULL;
  		for_each_domain(cpu, tmp) {
669c55e9f   Peter Zijlstra   sched: Pre-comput...
4259
  			if (weight <= tmp->span_weight)
aaee1203c   Peter Zijlstra   sched: Move sched...
4260
  				break;
0763a660a   Peter Zijlstra   sched: Rename sel...
4261
  			if (tmp->flags & sd_flag)
aaee1203c   Peter Zijlstra   sched: Move sched...
4262
4263
4264
  				sd = tmp;
  		}
  		/* while loop will break here if sd == NULL */
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4265
  	}
dce840a08   Peter Zijlstra   sched: Dynamicall...
4266
4267
  unlock:
  	rcu_read_unlock();
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4268

c88d59108   Peter Zijlstra   sched: Merge sele...
4269
  	return new_cpu;
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4270
  }
0a74bef8b   Paul Turner   sched: Add an rq ...
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
  
  /*
   * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
   * cfs_rq_of(p) references at time of call are still valid and identify the
   * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
   * other assumptions, including the state of rq->lock, should be made.
   */
  static void
  migrate_task_rq_fair(struct task_struct *p, int next_cpu)
  {
aff3e4988   Paul Turner   sched: Account fo...
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
  	struct sched_entity *se = &p->se;
  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  
  	/*
  	 * Load tracking: accumulate removed load so that it can be processed
  	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
  	 * to blocked load iff they have a positive decay-count.  It can never
  	 * be negative here since on-rq tasks have decay-count == 0.
  	 */
  	if (se->avg.decay_count) {
  		se->avg.decay_count = -__synchronize_entity_decay(se);
2509940fd   Alex Shi   sched/cfs_rq: Cha...
4292
4293
  		atomic_long_add(se->avg.load_avg_contrib,
  						&cfs_rq->removed_load);
aff3e4988   Paul Turner   sched: Account fo...
4294
  	}
0a74bef8b   Paul Turner   sched: Add an rq ...
4295
  }
e7693a362   Gregory Haskins   sched: de-SCHED_O...
4296
  #endif /* CONFIG_SMP */
e52fb7c09   Peter Zijlstra   sched: prefer wakers
4297
4298
  static unsigned long
  wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
0bbd3336e   Peter Zijlstra   sched: fix wakeup...
4299
4300
4301
4302
  {
  	unsigned long gran = sysctl_sched_wakeup_granularity;
  
  	/*
e52fb7c09   Peter Zijlstra   sched: prefer wakers
4303
4304
	 * Since it is curr that is running now, convert the gran from
	 * real-time to virtual-time in its units.
13814d42e   Mike Galbraith   sched: Remove ASY...
4305
4306
4307
4308
4309
4310
4311
4312
4313
  	 *
  	 * By using 'se' instead of 'curr' we penalize light tasks, so
	 * they get preempted easier. That is, if 'se' < 'curr' then
	 * the resulting gran will be larger, therefore penalizing the
	 * lighter task; if OTOH 'se' > 'curr' then the resulting gran
	 * will be smaller, again penalizing the lighter task.
  	 *
  	 * This is especially important for buddies when the leftmost
  	 * task is higher priority than the buddy.
0bbd3336e   Peter Zijlstra   sched: fix wakeup...
4314
  	 */
f4ad9bd20   Shaohua Li   sched: Eliminate ...
4315
  	return calc_delta_fair(gran, se);
0bbd3336e   Peter Zijlstra   sched: fix wakeup...
4316
4317
4318
  }
  
  /*
464b75273   Peter Zijlstra   sched: re-instate...
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
   * Should 'se' preempt 'curr'.
   *
   *             |s1
   *        |s2
   *   |s3
   *         g
   *      |<--->|c
   *
   *  w(c, s1) = -1
   *  w(c, s2) =  0
   *  w(c, s3) =  1
   *
   */
  static int
  wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
  {
  	s64 gran, vdiff = curr->vruntime - se->vruntime;
  
  	if (vdiff <= 0)
  		return -1;
e52fb7c09   Peter Zijlstra   sched: prefer wakers
4339
  	gran = wakeup_gran(curr, se);
464b75273   Peter Zijlstra   sched: re-instate...
4340
4341
4342
4343
4344
  	if (vdiff > gran)
  		return 1;
  
  	return 0;
  }
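
/*
 * Illustrative, standalone userspace sketch (not kernel code) of the
 * comparison above, with assumed numbers: the wakeup granularity (1ms by
 * default, before CPU scaling) is converted to the wakee's virtual time by
 * scaling with the NICE_0 weight over the wakee's weight, and preemption
 * only happens when the vruntime gap exceeds that virtual granularity.
 */
#include <stdio.h>

int main(void)
{
	double gran_ns = 1000000.0;	/* wakeup granularity, 1ms */
	double nice0_weight = 1024.0;
	double se_weight = 2048.0;	/* hypothetical heavier-than-default wakee */
	double vdiff = 700000.0;	/* hypothetical curr - se vruntime gap */

	double vgran = gran_ns * nice0_weight / se_weight;	/* 0.5ms */

	if (vdiff <= 0)
		printf("curr is behind: no preemption (-1)\n");
	else if (vdiff > vgran)
		printf("gap %.0fns > vgran %.0fns: preempt (1)\n", vdiff, vgran);
	else
		printf("gap %.0fns <= vgran %.0fns: don't preempt (0)\n",
		       vdiff, vgran);
	return 0;
}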
02479099c   Peter Zijlstra   sched: fix buddie...
4345
4346
  static void set_last_buddy(struct sched_entity *se)
  {
69c80f3e9   Venkatesh Pallipadi   sched: Make set_*...
4347
4348
4349
4350
4351
  	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
  		return;
  
  	for_each_sched_entity(se)
  		cfs_rq_of(se)->last = se;
02479099c   Peter Zijlstra   sched: fix buddie...
4352
4353
4354
4355
  }
  
  static void set_next_buddy(struct sched_entity *se)
  {
69c80f3e9   Venkatesh Pallipadi   sched: Make set_*...
4356
4357
4358
4359
4360
  	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
  		return;
  
  	for_each_sched_entity(se)
  		cfs_rq_of(se)->next = se;
02479099c   Peter Zijlstra   sched: fix buddie...
4361
  }
ac53db596   Rik van Riel   sched: Use a budd...
4362
4363
  static void set_skip_buddy(struct sched_entity *se)
  {
69c80f3e9   Venkatesh Pallipadi   sched: Make set_*...
4364
4365
  	for_each_sched_entity(se)
  		cfs_rq_of(se)->skip = se;
ac53db596   Rik van Riel   sched: Use a budd...
4366
  }
464b75273   Peter Zijlstra   sched: re-instate...
4367
  /*
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
4368
4369
   * Preempt the current task with a newly woken task if needed:
   */
5a9b86f64   Peter Zijlstra   sched: Rename fla...
4370
  static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
4371
4372
  {
  	struct task_struct *curr = rq->curr;
8651a86c3   Srivatsa Vaddagiri   sched: group sche...
4373
  	struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e457   Mike Galbraith   sched: fix wakeup...
4374
  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceaca   Mike Galbraith   sched: Strengthen...
4375
  	int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
4376
  	int next_buddy_marked = 0;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
4377

4ae7d5cef   Ingo Molnar   sched: improve af...
4378
4379
  	if (unlikely(se == pse))
  		return;
5238cdd38   Paul Turner   sched: Prevent bu...
4380
  	/*
ddcdf6e7d   Peter Zijlstra   sched: Rename loa...
4381
  	 * This is possible from callers such as move_task(), in which we
5238cdd38   Paul Turner   sched: Prevent bu...
4382
4383
4384
4385
4386
4387
	 * unconditionally check_preempt_curr() after an enqueue (which may have
	 * led to a throttle).  This both saves work and prevents false
  	 * next-buddy nomination below.
  	 */
  	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
  		return;
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
4388
  	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d527   Mike Galbraith   sched: Complete b...
4389
  		set_next_buddy(pse);
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
4390
4391
  		next_buddy_marked = 1;
  	}
57fdc26d4   Peter Zijlstra   sched: fixup budd...
4392

aec0a5142   Bharata B Rao   sched: call resch...
4393
4394
4395
  	/*
  	 * We can come here with TIF_NEED_RESCHED already set from new task
  	 * wake up path.
5238cdd38   Paul Turner   sched: Prevent bu...
4396
4397
4398
4399
4400
4401
  	 *
  	 * Note: this also catches the edge-case of curr being in a throttled
  	 * group (e.g. via set_curr_task), since update_curr() (in the
  	 * enqueue of curr) will have resulted in resched being set.  This
  	 * prevents us from potentially nominating it as a false LAST_BUDDY
  	 * below.
aec0a5142   Bharata B Rao   sched: call resch...
4402
4403
4404
  	 */
  	if (test_tsk_need_resched(curr))
  		return;
a2f5c9ab7   Darren Hart   sched: Allow SCHE...
4405
4406
4407
4408
  	/* Idle tasks are by definition preempted by non-idle tasks. */
  	if (unlikely(curr->policy == SCHED_IDLE) &&
  	    likely(p->policy != SCHED_IDLE))
  		goto preempt;
91c234b4e   Ingo Molnar   sched: do not wak...
4409
  	/*
a2f5c9ab7   Darren Hart   sched: Allow SCHE...
4410
4411
  	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
  	 * is driven by the tick):
91c234b4e   Ingo Molnar   sched: do not wak...
4412
  	 */
8ed92e51f   Ingo Molnar   sched: Add WAKEUP...
4413
  	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4e   Ingo Molnar   sched: do not wak...
4414
  		return;
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
4415

464b75273   Peter Zijlstra   sched: re-instate...
4416
  	find_matching_se(&se, &pse);
9bbd73743   Paul Turner   sched: update cor...
4417
  	update_curr(cfs_rq_of(se));
002f128b4   Paul Turner   sched: remove red...
4418
  	BUG_ON(!pse);
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
4419
4420
4421
4422
4423
4424
4425
  	if (wakeup_preempt_entity(se, pse) == 1) {
  		/*
  		 * Bias pick_next to pick the sched entity that is
  		 * triggering this preemption.
  		 */
  		if (!next_buddy_marked)
  			set_next_buddy(pse);
3a7e73a2e   Peter Zijlstra   sched: Clean up c...
4426
  		goto preempt;
2f36825b1   Venkatesh Pallipadi   sched: Next buddy...
4427
  	}
464b75273   Peter Zijlstra   sched: re-instate...
4428

3a7e73a2e   Peter Zijlstra   sched: Clean up c...
4429
  	return;
a65ac745e   Jupyung Lee   sched: Move updat...
4430

3a7e73a2e   Peter Zijlstra   sched: Clean up c...
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
  preempt:
  	resched_task(curr);
  	/*
  	 * Only set the backward buddy when the current task is still
  	 * on the rq. This can happen when a wakeup gets interleaved
  	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons it's a bad idea to schedule back to it.
  	 */
  	if (unlikely(!se->on_rq || curr == rq->idle))
  		return;
  
  	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
  		set_last_buddy(se);
bf0f6f24a   Ingo Molnar   sched: cfs core, ...
4447
  }

static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;
	struct task_struct *p;
	int new_tasks;

again:
#ifdef CONFIG_FAIR_GROUP_SCHED
	if (!cfs_rq->nr_running)
		goto idle;

	if (prev->sched_class != &fair_sched_class)
  		goto simple;
  
  	/*
  	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
  	 * likely that a next task is from the same cgroup as the current.
  	 *
  	 * Therefore attempt to avoid putting and setting the entire cgroup
  	 * hierarchy, only change the part that actually changes.
  	 */
  
  	do {
  		struct sched_entity *curr = cfs_rq->curr;
  
  		/*
  		 * Since we got here without doing put_prev_entity() we also
  		 * have to consider cfs_rq->curr. If it is still a runnable
  		 * entity, update_curr() will update its vruntime, otherwise
  		 * forget we've ever seen it.
  		 */
  		if (curr && curr->on_rq)
  			update_curr(cfs_rq);
  		else
  			curr = NULL;
  
  		/*
  		 * This call to check_cfs_rq_runtime() will do the throttle and
  		 * dequeue its entity in the parent(s). Therefore the 'simple'
  		 * nr_running test will indeed be correct.
  		 */
  		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
  			goto simple;
  
  		se = pick_next_entity(cfs_rq, curr);
  		cfs_rq = group_cfs_rq(se);
  	} while (cfs_rq);
  
  	p = task_of(se);
  
  	/*
  	 * Since we haven't yet done put_prev_entity and if the selected task
  	 * is a different task than we started out with, try and touch the
  	 * least amount of cfs_rqs.
  	 */
  	if (prev != p) {
  		struct sched_entity *pse = &prev->se;
  
  		while (!(cfs_rq = is_same_group(se, pse))) {
  			int se_depth = se->depth;
  			int pse_depth = pse->depth;
  
  			if (se_depth <= pse_depth) {
  				put_prev_entity(cfs_rq_of(pse), pse);
  				pse = parent_entity(pse);
  			}
  			if (se_depth >= pse_depth) {
  				set_next_entity(cfs_rq_of(se), se);
  				se = parent_entity(se);
  			}
  		}
  
  		put_prev_entity(cfs_rq, pse);
  		set_next_entity(cfs_rq, se);
  	}
  
  	if (hrtick_enabled(rq))
  		hrtick_start_fair(rq, p);
  
  	return p;
  simple:
  	cfs_rq = &rq->cfs;
  #endif

	if (!cfs_rq->nr_running)
		goto idle;

	put_prev_task(rq, prev);

	do {
		se = pick_next_entity(cfs_rq, NULL);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);

	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;

idle:
	new_tasks = idle_balance(rq);
	/*
	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
	 * possible for any higher priority task to appear. In that case we
	 * must re-start the pick_next_entity() loop.
	 */
	if (new_tasks < 0)
		return RETRY_TASK;

	if (new_tasks > 0)
		goto again;

	return NULL;
}
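
/*
 * Illustrative sketch, not kernel code: the group-scheduling fast path above
 * equalises the hierarchy depths of 'prev' and the newly picked task and
 * stops at their first common cfs_rq, so only the differing part of the
 * cgroup tree gets a put_prev_entity()/set_next_entity() pair.  Modelled
 * here with plain depth counters only (the final climb to a shared cfs_rq
 * is omitted for brevity).
 */
static int example_common_ancestor_steps(int se_depth, int pse_depth)
{
	int steps = 0;

	while (se_depth != pse_depth) {
		if (se_depth > pse_depth)
			se_depth--;	/* set_next_entity(), then go to parent */
		else
			pse_depth--;	/* put_prev_entity(), then go to parent */
		steps++;
	}
	return steps;	/* entities touched before the final put/set pair */
}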
  
  /*
   * Account for a descheduled task:
   */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}

  /*
   * sched_yield() is very simple
   *
   * The magic of dealing with the ->skip buddy is in pick_next_entity.
   */
  static void yield_task_fair(struct rq *rq)
  {
  	struct task_struct *curr = rq->curr;
  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  	struct sched_entity *se = &curr->se;
  
  	/*
  	 * Are we the only task in the tree?
  	 */
  	if (unlikely(rq->nr_running == 1))
  		return;
  
  	clear_buddies(cfs_rq, se);
  
  	if (curr->policy != SCHED_BATCH) {
  		update_rq_clock(rq);
  		/*
  		 * Update run-time statistics of the 'current'.
  		 */
  		update_curr(cfs_rq);
  		/*
  		 * Tell update_rq_clock() that we've just updated,
  		 * so we don't do microscopic update in schedule()
  		 * and double the fastpath cost.
  		 */
  		 rq->skip_clock_update = 1;
  	}
  
  	set_skip_buddy(se);
  }
  static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
  {
  	struct sched_entity *se = &p->se;
  	/* throttled hierarchies are not runnable */
  	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
  		return false;
  
  	/* Tell the scheduler that we'd really like pse to run next. */
  	set_next_buddy(se);
  	yield_task_fair(rq);
  
  	return true;
  }
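
/*
 * Illustrative sketch, not kernel code: yield_task_fair() marks the yielding
 * entity with the 'skip' buddy hint, and yield_to_task_fair() additionally
 * marks the target with the 'next' buddy hint before yielding.  A toy model
 * of how such hints could bias a subsequent pick (the real preference logic
 * lives in pick_next_entity()):
 */
struct example_pick_hints {
	int candidate_is_next_buddy;
	int candidate_is_skip_buddy;
};

static int example_buddy_score(const struct example_pick_hints *h)
{
	if (h->candidate_is_next_buddy)
		return 2;	/* prefer an explicitly nominated next buddy */
	if (h->candidate_is_skip_buddy)
		return 0;	/* avoid the yielding entity when possible */
	return 1;		/* otherwise vruntime ordering decides */
}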
  #ifdef CONFIG_SMP
  /**************************************************
   * Fair scheduling class load-balancing methods.
   *
   * BASICS
   *
   * The purpose of load-balancing is to achieve the same basic fairness the
   * per-cpu scheduler provides, namely provide a proportional amount of compute
   * time to each task. This is expressed in the following equation:
   *
   *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
   *
   * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
   * W_i,0 is defined as:
   *
   *   W_i,0 = \Sum_j w_i,j                                             (2)
   *
   * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
   * is derived from the nice value as per prio_to_weight[].
   *
   * The weight average is an exponential decay average of the instantaneous
   * weight:
   *
   *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
   *
   * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
   * fraction of 'recent' time available for SCHED_OTHER task execution. But it
   * can also include other factors [XXX].
   *
   * To achieve this balance we define a measure of imbalance which follows
   * directly from (1):
   *
   *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
   *
 * We then move tasks around to minimize the imbalance. In the continuous
   * function space it is obvious this converges, in the discrete case we get
   * a few fun cases generally called infeasible weight scenarios.
   *
   * [XXX expand on:
   *     - infeasible weights;
   *     - local vs global optima in the discrete case. ]
   *
   *
   * SCHED DOMAINS
   *
   * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
   * for all i,j solution, we create a tree of cpus that follows the hardware
   * topology where each level pairs two lower groups (or better). This results
   * in O(log n) layers. Furthermore we reduce the number of cpus going up the
   * tree to only the first of the previous level and we decrease the frequency
   * of load-balance at each level inv. proportional to the number of cpus in
   * the groups.
   *
   * This yields:
   *
   *     log_2 n     1     n
   *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
   *     i = 0      2^i   2^i
   *                               `- size of each group
   *         |         |     `- number of cpus doing load-balance
   *         |         `- freq
   *         `- sum over all levels
   *
   * Coupled with a limit on how many tasks we can migrate every balance pass,
   * this makes (5) the runtime complexity of the balancer.
   *
   * An important property here is that each CPU is still (indirectly) connected
   * to every other cpu in at most O(log n) steps:
   *
   * The adjacency matrix of the resulting graph is given by:
   *
   *             log_2 n     
   *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
   *             k = 0
   *
   * And you'll find that:
   *
   *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
   *
   * Showing there's indeed a path between every cpu in at most O(log n) steps.
   * The task movement gives a factor of O(m), giving a convergence complexity
   * of:
   *
   *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
   *
   *
   * WORK CONSERVING
   *
   * In order to avoid CPUs going idle while there's still work to do, new idle
   * balancing is more aggressive and has the newly idle cpu iterate up the domain
   * tree itself instead of relying on other CPUs to bring it work.
   *
   * This adds some complexity to both (5) and (8) but it reduces the total idle
   * time.
   *
   * [XXX more?]
   *
   *
   * CGROUPS
   *
   * Cgroups make a horror show out of (2), instead of a simple sum we get:
   *
   *                                s_k,i
   *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
   *                                 S_k
   *
   * Where
   *
   *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
   *
   * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
   *
   * The big problem is S_k, its a global sum needed to compute a local (W_i)
   * property.
   *
   * [XXX write more on how we solve this.. _after_ merging pjt's patches that
   *      rewrite all of this once again.]
   */ 
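
/*
 * Worked example for equation (4) above (illustrative only, not kernel
 * code).  With two CPUs of equal capacity P = 1024 and instantaneous
 * weights W_0 = 3072 and W_1 = 1024, the W/P ratios are 3 and 1, their
 * average is 2, and imb_0,1 = max{2, 3} - min{2, 1} = 2: weight worth two
 * capacity units should move from cpu 0 towards cpu 1.
 */
static long example_imbalance(long w_i, long p_i, long w_j, long p_j)
{
	long wpi = w_i / p_i;			/* W_i / P_i */
	long wpj = w_j / p_j;			/* W_j / P_j */
	long avg = (wpi + wpj) / 2;		/* avg(W/P) over the two cpus */
	long hi  = (avg > wpi) ? avg : wpi;
	long lo  = (avg < wpj) ? avg : wpj;

	return hi - lo;				/* imb_i,j as in (4) */
}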

static unsigned long __read_mostly max_load_balance_interval = HZ/10;

enum fbq_type { regular, remote, all };

#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED  0x04
#define LBF_SOME_PINNED	0x08

struct lb_env {
	struct sched_domain	*sd;

	struct rq		*src_rq;
	int			src_cpu;

	int			dst_cpu;
	struct rq		*dst_rq;

	struct cpumask		*dst_grpmask;
	int			new_dst_cpu;
	enum cpu_idle_type	idle;
	long			imbalance;
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask		*cpus;

	unsigned int		flags;

	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;

	enum fbq_type		fbq_type;
};

/*
 * move_task - move a task from one runqueue to another runqueue.
 * Both runqueues must be locked.
 */
static void move_task(struct task_struct *p, struct lb_env *env)
{
	deactivate_task(env->src_rq, p, 0);
	set_task_cpu(p, env->dst_cpu);
	activate_task(env->dst_rq, p, 0);
	check_preempt_curr(env->dst_rq, p, 0);
}
  
  /*
   * Is this task likely cache-hot:
   */
  static int
  task_hot(struct task_struct *p, u64 now)
  {
  	s64 delta;
  
  	if (p->sched_class != &fair_sched_class)
  		return 0;
  
  	if (unlikely(p->policy == SCHED_IDLE))
  		return 0;
  
  	/*
  	 * Buddy candidates are cache hot:
  	 */
  	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
  			(&p->se == cfs_rq_of(&p->se)->next ||
  			 &p->se == cfs_rq_of(&p->se)->last))
  		return 1;
  
  	if (sysctl_sched_migration_cost == -1)
  		return 1;
  	if (sysctl_sched_migration_cost == 0)
  		return 0;
  
  	delta = now - p->se.exec_start;
  
  	return delta < (s64)sysctl_sched_migration_cost;
  }
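
/*
 * Illustrative sketch, not kernel code: the core of task_hot() is an age
 * check of the task's last exec_start against sysctl_sched_migration_cost
 * (nanoseconds; -1 means "always hot", 0 means "never hot", and the default
 * is on the order of half a millisecond).  Stand-alone rendering:
 */
static int example_task_is_cache_hot(long long now_ns, long long exec_start_ns,
				     long long migration_cost_ns)
{
	long long delta;

	if (migration_cost_ns == -1)	/* sysctl pins everything hot */
		return 1;
	if (migration_cost_ns == 0)	/* cache-hotness tracking disabled */
		return 0;

	delta = now_ns - exec_start_ns;
	return delta < migration_cost_ns;
}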

#ifdef CONFIG_NUMA_BALANCING
/* Returns true if the destination node has incurred more faults */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
	int src_nid, dst_nid;

	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
	    !(env->sd->flags & SD_NUMA)) {
		return false;
	}

	src_nid = cpu_to_node(env->src_cpu);
	dst_nid = cpu_to_node(env->dst_cpu);

	if (src_nid == dst_nid)
		return false;

	/* Always encourage migration to the preferred node. */
	if (dst_nid == p->numa_preferred_nid)
		return true;

	/* If both task and group weight improve, this move is a winner. */
	if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
	    group_weight(p, dst_nid) > group_weight(p, src_nid))
		return true;

	return false;
}

static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	int src_nid, dst_nid;

	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
		return false;

	if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
		return false;

	src_nid = cpu_to_node(env->src_cpu);
	dst_nid = cpu_to_node(env->dst_cpu);

	if (src_nid == dst_nid)
		return false;

	/* Migrating away from the preferred node is always bad. */
	if (src_nid == p->numa_preferred_nid)
		return true;

	/* If either task or group weight get worse, don't do it. */
	if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
	    group_weight(p, dst_nid) < group_weight(p, src_nid))
		return true;

	return false;
}
#else
static inline bool migrate_improves_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return false;
}

static inline bool migrate_degrades_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return false;
}
#endif
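
/*
 * Illustrative sketch, not kernel code: the two NUMA helpers above reduce to
 * a comparison of per-node fault weights.  Toy version with plain integers
 * standing in for task_weight()/group_weight():
 */
static int example_move_improves_locality(int task_w_src, int task_w_dst,
					  int group_w_src, int group_w_dst,
					  int dst_is_preferred_node)
{
	if (dst_is_preferred_node)		/* always encourage */
		return 1;
	return task_w_dst > task_w_src && group_w_dst > group_w_src;
}

static int example_move_degrades_locality(int task_w_src, int task_w_dst,
					  int group_w_src, int group_w_dst,
					  int src_is_preferred_node)
{
	if (src_is_preferred_node)		/* leaving home is always bad */
		return 1;
	return task_w_dst < task_w_src || group_w_dst < group_w_src;
}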

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot = 0;
	/*
	 * We do not migrate tasks that are:
	 * 1) throttled_lb_pair, or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) running (obviously), or
	 * 4) are cache-hot on their current CPU.
	 */
	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
		return 0;

	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
		int cpu;

		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

		env->flags |= LBF_SOME_PINNED;

		/*
		 * Remember if this task can be migrated to any other cpu in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Also avoid computing new_dst_cpu if we have already computed
		 * one in current iteration.
		 */
		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
			return 0;

		/* Prevent dst_cpu from being re-selected via env's cpus */
		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
				env->flags |= LBF_DST_PINNED;
				env->new_dst_cpu = cpu;
				break;
			}
		}

		return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) destination numa is preferred
	 * 2) task is cache cold, or
	 * 3) too many balance attempts have failed.
	 */
	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
	if (!tsk_cache_hot)
		tsk_cache_hot = migrate_degrades_locality(p, env);

	if (migrate_improves_locality(p, env)) {
#ifdef CONFIG_SCHEDSTATS
		if (tsk_cache_hot) {
			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
			schedstat_inc(p, se.statistics.nr_forced_migrations);
		}
#endif
		return 1;
	}

	if (!tsk_cache_hot ||
		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {

		if (tsk_cache_hot) {
			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
			schedstat_inc(p, se.statistics.nr_forced_migrations);
		}

		return 1;
	}

	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
	return 0;
}
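
/*
 * Illustrative sketch, not kernel code: can_migrate_task() above is a veto
 * pipeline.  Collapsed into one expression over pre-computed predicates
 * (names are stand-ins, not scheduler symbols):
 */
static int example_can_migrate(int throttled_pair, int allowed_on_dst,
			       int currently_running, int cache_hot,
			       int improves_locality, int too_many_failures)
{
	if (throttled_pair || !allowed_on_dst || currently_running)
		return 0;
	if (improves_locality)			/* NUMA pull wins even if hot */
		return 1;
	return !cache_hot || too_many_failures;	/* get aggressive after failures */
}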

/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_one_task(struct lb_env *env)
{
	struct task_struct *p, *n;

	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
		if (!can_migrate_task(p, env))
			continue;

		move_task(p, env);
		/*
		 * Right now, this is only the second place move_task()
		 * is called, so we can safely collect move_task()
		 * stats here rather than inside move_task().
		 */
		schedstat_inc(env->sd, lb_gained[env->idle]);
		return 1;
	}

	return 0;
}

static const unsigned int sched_nr_migrate_break = 32;

/*
 * move_tasks tries to move up to imbalance weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int pulled = 0;

	if (env->imbalance <= 0)
		return 0;

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK;
			break;
		}

		if (!can_migrate_task(p, env))
			goto next;

		load = task_h_load(p);

		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
			goto next;

		if ((load / 2) > env->imbalance)
			goto next;

		move_task(p, env);
		pulled++;
		env->imbalance -= load;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		list_move_tail(&p->se.group_node, tasks);
	}

	/*
	 * Right now, this is one of only two places move_task() is called,
	 * so we can safely collect move_task() stats here rather than
	 * inside move_task().
	 */
	schedstat_add(env->sd, lb_gained[env->idle], pulled);

	return pulled;
}
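
/*
 * Illustrative sketch, not kernel code: the essence of move_tasks() is
 * "keep pulling runnable weight until the requested imbalance is covered",
 * with an iteration cap as a latency bound and a skip for tasks that would
 * overshoot the imbalance.  Toy version over an array of task loads:
 */
static int example_pull_until_balanced(const long *task_load, int nr_tasks,
				       long imbalance, int loop_max)
{
	int i, pulled = 0;

	for (i = 0; i < nr_tasks && i < loop_max && imbalance > 0; i++) {
		if ((task_load[i] / 2) > imbalance)	/* would overshoot */
			continue;
		imbalance -= task_load[i];
		pulled++;
	}
	return pulled;
}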

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * update tg->load_weight by folding this cpu's load_avg
 */
static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
{
	struct sched_entity *se = tg->se[cpu];
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

	/* throttled entities do not contribute to load */
	if (throttled_hierarchy(cfs_rq))
		return;

	update_cfs_rq_blocked_load(cfs_rq, 1);

	if (se) {
		update_entity_load_avg(se, 1);
		/*
		 * We pivot on our runnable average having decayed to zero for
		 * list removal.  This generally implies that all our children
		 * have also been removed (modulo rounding error or bandwidth
		 * control); however, such cases are rare and we can fix these
		 * at enqueue.
		 *
		 * TODO: fix up out-of-order children on enqueue.
		 */
		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
			list_del_leaf_cfs_rq(cfs_rq);
	} else {
		struct rq *rq = rq_of(cfs_rq);
		update_rq_runnable_avg(rq, rq->nr_running);
	}
}

static void update_blocked_averages(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct cfs_rq *cfs_rq;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	update_rq_clock(rq);
	/*
	 * Iterates the task_group tree in a bottom up fashion, see
	 * list_add_leaf_cfs_rq() for details.
	 */
	for_each_leaf_cfs_rq(rq, cfs_rq) {
		/*
		 * Note: We may want to consider periodically releasing
		 * rq->lock around these updates so that creating many task
		 * groups does not result in continually extending hold time.
		 */
		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
	}

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

/*
 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parent's load.
 */
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
	unsigned long now = jiffies;
	unsigned long load;

	if (cfs_rq->last_h_load_update == now)
		return;

	cfs_rq->h_load_next = NULL;
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_load_next = se;
		if (cfs_rq->last_h_load_update == now)
			break;
	}

	if (!se) {
		cfs_rq->h_load = cfs_rq->runnable_load_avg;
		cfs_rq->last_h_load_update = now;
	}

	while ((se = cfs_rq->h_load_next) != NULL) {
		load = cfs_rq->h_load;
		load = div64_ul(load * se->avg.load_avg_contrib,
				cfs_rq->runnable_load_avg + 1);
		cfs_rq = group_cfs_rq(se);
		cfs_rq->h_load = load;
		cfs_rq->last_h_load_update = now;
	}
}

static unsigned long task_h_load(struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);

	update_cfs_rq_h_load(cfs_rq);
	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
			cfs_rq->runnable_load_avg + 1);
}
#else
static inline void update_blocked_averages(int cpu)
{
}

static unsigned long task_h_load(struct task_struct *p)
{
	return p->se.avg.load_avg_contrib;
}
#endif
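
/*
 * Worked example (illustrative only, not kernel code): h_load propagates
 * top-down, each level multiplying by the entity's share of its cfs_rq's
 * runnable load.  With a group entity contributing 512 out of its parent
 * cfs_rq's 1024 runnable load and a parent h_load of 2048, the child cfs_rq
 * ends up with h_load ~ 2048 * 512 / 1024 = 1024; a task contributing half
 * of that child's runnable load then reports task_h_load() ~ 512.
 */
static unsigned long long example_child_h_load(unsigned long long parent_h_load,
					       unsigned long long se_contrib,
					       unsigned long long parent_runnable)
{
	/* Mirrors: load * se->avg.load_avg_contrib / (runnable_load_avg + 1) */
	return parent_h_load * se_contrib / (parent_runnable + 1);
}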

/********** Helpers for find_busiest_group ************************/
/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
	unsigned long avg_load; /* Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long load_per_task;
	unsigned long group_power;
	unsigned int sum_nr_running; /* Nr tasks running in the group */
	unsigned int group_capacity;
	unsigned int idle_cpus;
	unsigned int group_weight;
	int group_imb; /* Is there an imbalance in the group? */
	int group_has_capacity; /* Is there extra capacity in the group? */
#ifdef CONFIG_NUMA_BALANCING
	unsigned int nr_numa_running;
	unsigned int nr_preferred_running;
#endif
};

/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *		 during load balancing.
 */
struct sd_lb_stats {
	struct sched_group *busiest;	/* Busiest group in this sd */
	struct sched_group *local;	/* Local group in this sd */
	unsigned long total_load;	/* Total load of all groups in sd */
	unsigned long total_pwr;	/* Total power of all groups in sd */
	unsigned long avg_load;	/* Average load across all groups in sd */

	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
	struct sg_lb_stats local_stat;	/* Statistics of the local group */
};

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{
	/*
	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
	 * We must however clear busiest_stat::avg_load because
	 * update_sd_pick_busiest() reads this before assignment.
	 */
	*sds = (struct sd_lb_stats){
		.busiest = NULL,
		.local = NULL,
		.total_load = 0UL,
		.total_pwr = 0UL,
		.busiest_stat = {
			.avg_load = 0UL,
		},
	};
}

/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
 *
 * Return: The load index.
 */
static inline int get_sd_load_idx(struct sched_domain *sd,
					enum cpu_idle_type idle)
{
	int load_idx;

	switch (idle) {
	case CPU_NOT_IDLE:
		load_idx = sd->busy_idx;
		break;

	case CPU_NEWLY_IDLE:
		load_idx = sd->newidle_idx;
		break;
	default:
		load_idx = sd->idle_idx;
		break;
	}

	return load_idx;
}

static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_POWER_SCALE;
}

unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}

static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = sd->span_weight;
	unsigned long smt_gain = sd->smt_gain;

	smt_gain /= weight;

	return smt_gain;
}

unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}

static unsigned long scale_rt_power(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 total, available, age_stamp, avg;

	/*
	 * Since we're reading these variables without serialization make sure
	 * we read them once before doing sanity checks on them.
	 */
	age_stamp = ACCESS_ONCE(rq->age_stamp);
	avg = ACCESS_ONCE(rq->rt_avg);

	total = sched_avg_period() + (rq_clock(rq) - age_stamp);

	if (unlikely(total < avg)) {
		/* Ensures that power won't end up being negative */
		available = 0;
	} else {
		available = total - avg;
	}

	if (unlikely((s64)total < SCHED_POWER_SCALE))
		total = SCHED_POWER_SCALE;

	total >>= SCHED_POWER_SHIFT;

	return div_u64(available, total);
}
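
/*
 * Worked example (illustrative only, not kernel code): if the averaging
 * window covers 10ms and 2.5ms of it went to RT/IRQ pressure (rq->rt_avg),
 * the remaining fraction is 0.75, so a 1024-unit CPU is reported as ~768.
 * Stand-alone arithmetic mirroring "available / (total >> SHIFT)":
 */
static unsigned long long example_scale_rt_power(unsigned long long total_ns,
						 unsigned long long rt_avg_ns,
						 unsigned int power_scale)
{
	unsigned long long available = (total_ns > rt_avg_ns) ?
						total_ns - rt_avg_ns : 0;

	if (total_ns < power_scale)	/* avoid dividing by zero below */
		total_ns = power_scale;

	/* total is pre-scaled so the result lands in 0..power_scale */
	return available / (total_ns / power_scale);
}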
  
  static void update_cpu_power(struct sched_domain *sd, int cpu)
  {
	unsigned long weight = sd->span_weight;
	unsigned long power = SCHED_POWER_SCALE;
	struct sched_group *sdg = sd->groups;

	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
		if (sched_feat(ARCH_POWER))
			power *= arch_scale_smt_power(sd, cpu);
		else
			power *= default_scale_smt_power(sd, cpu);

		power >>= SCHED_POWER_SHIFT;
	}

	sdg->sgp->power_orig = power;

	if (sched_feat(ARCH_POWER))
		power *= arch_scale_freq_power(sd, cpu);
	else
		power *= default_scale_freq_power(sd, cpu);

	power >>= SCHED_POWER_SHIFT;

	power *= scale_rt_power(cpu);
	power >>= SCHED_POWER_SHIFT;

	if (!power)
		power = 1;

	cpu_rq(cpu)->cpu_power = power;
	sdg->sgp->power = power;
}

void update_group_power(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long power, power_orig;
	unsigned long interval;

	interval = msecs_to_jiffies(sd->balance_interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);
	sdg->sgp->next_update = jiffies + interval;

	if (!child) {
		update_cpu_power(sd, cpu);
		return;
	}

	power_orig = power = 0;

	if (child->flags & SD_OVERLAP) {
		/*
		 * SD_OVERLAP domains cannot assume that child groups
		 * span the current group.
		 */
		for_each_cpu(cpu, sched_group_cpus(sdg)) {
			struct sched_group_power *sgp;
			struct rq *rq = cpu_rq(cpu);

			/*
			 * build_sched_domains() -> init_sched_groups_power()
			 * gets here before we've attached the domains to the
			 * runqueues.
			 *
			 * Use power_of(), which is set irrespective of domains
			 * in update_cpu_power().
			 *
			 * This avoids power/power_orig from being 0 and
			 * causing divide-by-zero issues on boot.
			 *
			 * Runtime updates will correct power_orig.
			 */
			if (unlikely(!rq->sd)) {
				power_orig += power_of(cpu);
				power += power_of(cpu);
				continue;
			}

			sgp = rq->sd->groups->sgp;
			power_orig += sgp->power_orig;
			power += sgp->power;
		}
	} else {
		/*
		 * !SD_OVERLAP domains can assume that child groups
		 * span the current group.
		 */

		group = child->groups;
		do {
			power_orig += group->sgp->power_orig;
			power += group->sgp->power;
			group = group->next;
		} while (group != child->groups);
	}

	sdg->sgp->power_orig = power_orig;
	sdg->sgp->power = power;
}

/*
 * Try and fix up capacity for tiny siblings; this is needed when
 * things like SD_ASYM_PACKING need f_b_g to select another sibling
 * which on its own isn't powerful enough.
 *
 * See update_sd_pick_busiest() and check_asym_packing().
 */
static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
	/*
	 * Only siblings can have significantly less than SCHED_POWER_SCALE
	 */
	if (!(sd->flags & SD_SHARE_CPUPOWER))
		return 0;

	/*
	 * If ~90% of the cpu_power is still there, we're good.
	 */
	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
		return 1;

	return 0;
}

/*
 * Group imbalance indicates (and tries to solve) the problem where balancing
 * groups is inadequate due to tsk_cpus_allowed() constraints.
 *
 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
 * Something like:
 *
 * 	{ 0 1 2 3 } { 4 5 6 7 }
 * 	        *     * * *
 *
 * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
 * cpu 3 and leave one of the cpus in the second group unused.
 *
 * The current solution to this issue is detecting the skew in the first group
 * by noticing the lower domain failed to reach balance and had difficulty
 * moving tasks due to affinity constraints.
 *
 * When this is so detected, this group becomes a candidate for busiest; see
 * update_sd_pick_busiest(). And calculate_imbalance() and
 * find_busiest_group() avoid some of the usual balance conditions to allow it
 * to create an effective group imbalance.
 *
 * This is a somewhat tricky proposition since the next run might not find the
 * group imbalance and decide the groups need to be balanced again. A most
 * subtle and fragile situation.
 */
static inline int sg_imbalanced(struct sched_group *group)
{
	return group->sgp->imbalance;
}

/*
 * Compute the group capacity.
 *
 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
 * first dividing out the smt factor and computing the actual number of cores
 * and limit power unit capacity with that.
 */
static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
{
	unsigned int capacity, smt, cpus;
	unsigned int power, power_orig;

	power = group->sgp->power;
	power_orig = group->sgp->power_orig;
	cpus = group->group_weight;

	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
	capacity = cpus / smt; /* cores */

	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
	if (!capacity)
		capacity = fix_small_capacity(env->sd, group);

	return capacity;
}
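
/*
 * Worked example (illustrative only, not kernel code): with 8 SMT siblings
 * of ~589 units each, power_orig = 4712.  Rounding 4712/1024 to the nearest
 * integer would claim 5 cores (a 'phantom' core); the computation above
 * instead gets
 *   smt      = DIV_ROUND_UP(1024 * 8, 4712) = 2
 *   capacity = 8 / 2                        = 4 cores
 * (the min_t() clamp against power is left out of this sketch).
 */
static unsigned int example_sg_capacity(unsigned int cpus,
					unsigned int power_orig,
					unsigned int power_scale)
{
	/* smt := ceil(power_scale * cpus / power_orig) */
	unsigned int smt = (power_scale * cpus + power_orig - 1) / power_orig;

	return cpus / smt;
}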

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @sgs: variable to hold the statistics for this group.
 */
static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
			int local_group, struct sg_lb_stats *sgs)
{
	unsigned long load;
	int i;

	memset(sgs, 0, sizeof(*sgs));
	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		/* Bias balancing toward cpus of our domain */
		if (local_group)
			load = target_load(i, load_idx);
		else
			load = source_load(i, load_idx);

		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
#ifdef CONFIG_NUMA_BALANCING
		sgs->nr_numa_running += rq->nr_numa_running;
		sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
		sgs->sum_weighted_load += weighted_cpuload(i);
		if (idle_cpu(i))
			sgs->idle_cpus++;
	}

	/* Adjust by relative CPU power of the group */
	sgs->group_power = group->sgp->power;
	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;

	if (sgs->sum_nr_running)
		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	sgs->group_weight = group->group_weight;

	sgs->group_imb = sg_imbalanced(group);
	sgs->group_capacity = sg_capacity(env, group);

	if (sgs->group_capacity > sgs->sum_nr_running)
		sgs->group_has_capacity = 1;
}
  
  /**
532cb4c40   Michael Neuling   sched: Add asymme...
5532
   * update_sd_pick_busiest - return 1 on busiest group
cd96891d4   Randy Dunlap   sched/fair: fix l...
5533
   * @env: The load balancing environment.
532cb4c40   Michael Neuling   sched: Add asymme...
5534
5535
   * @sds: sched_domain statistics
   * @sg: sched_group candidate to be checked for being the busiest
b6b122944   Michael Neuling   sched: Fix commen...
5536
   * @sgs: sched_group statistics
532cb4c40   Michael Neuling   sched: Add asymme...
5537
5538
5539
   *
   * Determine if @sg is a busier group than the previously selected
   * busiest group.
e69f61862   Yacine Belkadi   sched: Fix some k...
5540
5541
5542
   *
   * Return: %true if @sg is a busier group than the previously selected
   * busiest group. %false otherwise.
532cb4c40   Michael Neuling   sched: Add asymme...
5543
   */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5544
  static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c40   Michael Neuling   sched: Add asymme...
5545
5546
  				   struct sd_lb_stats *sds,
  				   struct sched_group *sg,
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5547
  				   struct sg_lb_stats *sgs)
532cb4c40   Michael Neuling   sched: Add asymme...
5548
  {
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5549
  	if (sgs->avg_load <= sds->busiest_stat.avg_load)
532cb4c40   Michael Neuling   sched: Add asymme...
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
  		return false;
  
  	if (sgs->sum_nr_running > sgs->group_capacity)
  		return true;
  
  	if (sgs->group_imb)
  		return true;
  
  	/*
  	 * ASYM_PACKING needs to move all the work to the lowest
  	 * numbered CPUs in the group, therefore mark all groups
  	 * higher than ourselves as busy.
  	 */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5563
5564
  	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
  	    env->dst_cpu < group_first_cpu(sg)) {
532cb4c40   Michael Neuling   sched: Add asymme...
5565
5566
5567
5568
5569
5570
5571
5572
5573
  		if (!sds->busiest)
  			return true;
  
  		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
  			return true;
  	}
  
  	return false;
  }
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
  #ifdef CONFIG_NUMA_BALANCING
  static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
  {
  	if (sgs->sum_nr_running > sgs->nr_numa_running)
  		return regular;
  	if (sgs->sum_nr_running > sgs->nr_preferred_running)
  		return remote;
  	return all;
  }
  
  static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  {
  	if (rq->nr_running > rq->nr_numa_running)
  		return regular;
  	if (rq->nr_running > rq->nr_preferred_running)
  		return remote;
  	return all;
  }
  #else
  static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
  {
  	return all;
  }
  
  static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  {
  	return regular;
  }
  #endif /* CONFIG_NUMA_BALANCING */
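  /*
   * Illustrative sketch, not part of fair.c: the fbq_type classification
   * above, restated with plain counters instead of an rq/sg_lb_stats.  The
   * enum and function names are made up for the example.
   *
   *   nr_running > nr_numa_running      -> regular (some tasks are !numa)
   *   nr_running > nr_preferred_running -> remote  (numa tasks on wrong node)
   *   otherwise                         -> all     (no useful distinction)
   */
  enum example_fbq_type { example_regular, example_remote, example_all };

  static enum example_fbq_type
  example_classify(unsigned int nr_running, unsigned int nr_numa_running,
  		 unsigned int nr_preferred_running)
  {
  	if (nr_running > nr_numa_running)
  		return example_regular;
  	if (nr_running > nr_preferred_running)
  		return example_remote;
  	return example_all;
  }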
532cb4c40   Michael Neuling   sched: Add asymme...
5603
  /**
461819ac8   Hui Kang   sched_fair: Fix a...
5604
   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d4   Randy Dunlap   sched/fair: fix l...
5605
   * @env: The load balancing environment.
1e3c88bde   Peter Zijlstra   sched: Move load ...
5606
5607
   * @sds: variable to hold the statistics for this sched_domain.
   */
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
5608
  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5609
  {
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5610
5611
  	struct sched_domain *child = env->sd->child;
  	struct sched_group *sg = env->sd->groups;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5612
  	struct sg_lb_stats tmp_sgs;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5613
5614
5615
5616
  	int load_idx, prefer_sibling = 0;
  
  	if (child && child->flags & SD_PREFER_SIBLING)
  		prefer_sibling = 1;
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5617
  	load_idx = get_sd_load_idx(env->sd, env->idle);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5618
5619
  
  	do {
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5620
  		struct sg_lb_stats *sgs = &tmp_sgs;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5621
  		int local_group;
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5622
  		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5623
5624
5625
  		if (local_group) {
  			sds->local = sg;
  			sgs = &sds->local_stat;
b72ff13ce   Peter Zijlstra   sched/fair: Reduc...
5626
5627
5628
5629
  
  			if (env->idle != CPU_NEWLY_IDLE ||
  			    time_after_eq(jiffies, sg->sgp->next_update))
  				update_group_power(env->sd, env->dst_cpu);
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5630
  		}
1e3c88bde   Peter Zijlstra   sched: Move load ...
5631

56cf515b4   Joonsoo Kim   sched: Clean-up s...
5632
  		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5633

b72ff13ce   Peter Zijlstra   sched/fair: Reduc...
5634
5635
  		if (local_group)
  			goto next_group;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5636
5637
  		/*
  		 * In case the child domain prefers tasks go to siblings
532cb4c40   Michael Neuling   sched: Add asymme...
5638
  		 * first, lower the sg capacity to one so that we'll try
75dd321d7   Nikhil Rao   sched: Drop group...
5639
5640
5641
5642
5643
5644
  		 * to move all the excess tasks away. We lower the capacity
  		 * of a group only if the local group has the capacity to fit
  		 * these excess tasks, i.e. nr_running < group_capacity. The
  		 * extra check prevents the case where you always pull from the
  		 * heaviest group when it is already under-utilized (possible
  		 * when a large-weight task outweighs the tasks on the system).
1e3c88bde   Peter Zijlstra   sched: Move load ...
5645
  		 */
b72ff13ce   Peter Zijlstra   sched/fair: Reduc...
5646
5647
  		if (prefer_sibling && sds->local &&
  		    sds->local_stat.group_has_capacity)
147c5fc2b   Peter Zijlstra   sched/fair: Shrin...
5648
  			sgs->group_capacity = min(sgs->group_capacity, 1U);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5649

b72ff13ce   Peter Zijlstra   sched/fair: Reduc...
5650
  		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
532cb4c40   Michael Neuling   sched: Add asymme...
5651
  			sds->busiest = sg;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5652
  			sds->busiest_stat = *sgs;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5653
  		}
b72ff13ce   Peter Zijlstra   sched/fair: Reduc...
5654
5655
5656
5657
  next_group:
  		/* Now, start updating sd_lb_stats */
  		sds->total_load += sgs->group_load;
  		sds->total_pwr += sgs->group_power;
532cb4c40   Michael Neuling   sched: Add asymme...
5658
  		sg = sg->next;
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5659
  	} while (sg != env->sd->groups);
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
5660
5661
5662
  
  	if (env->sd->flags & SD_NUMA)
  		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
532cb4c40   Michael Neuling   sched: Add asymme...
5663
  }
532cb4c40   Michael Neuling   sched: Add asymme...
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
  /**
   * check_asym_packing - Check to see if the group is packed into the
   *			sched domain.
   *
   * This is primarily intended to be used at the sibling level.  Some
   * cores like POWER7 prefer to use lower numbered SMT threads.  In the
   * case of POWER7, it can move to lower SMT modes only when higher
   * threads are idle.  When in lower SMT modes, the threads will
   * perform better since they share fewer core resources.  Hence when we
   * have idle threads, we want them to be the higher ones.
   *
   * This packing function is run on idle threads.  It checks to see if
   * the busiest CPU in this domain (core in the P7 case) has a higher
   * CPU number than the packing function is being run on.  Here we are
   * assuming a lower CPU number will be equivalent to a lower SMT thread
   * number.
   *
e69f61862   Yacine Belkadi   sched: Fix some k...
5681
   * Return: 1 when packing is required and a task should be moved to
b6b122944   Michael Neuling   sched: Fix commen...
5682
5683
   * this CPU.  The amount of the imbalance is returned in env->imbalance.
   *
cd96891d4   Randy Dunlap   sched/fair: fix l...
5684
   * @env: The load balancing environment.
532cb4c40   Michael Neuling   sched: Add asymme...
5685
   * @sds: Statistics of the sched_domain which is to be packed
532cb4c40   Michael Neuling   sched: Add asymme...
5686
   */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5687
  static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
532cb4c40   Michael Neuling   sched: Add asymme...
5688
5689
  {
  	int busiest_cpu;
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5690
  	if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c40   Michael Neuling   sched: Add asymme...
5691
5692
5693
5694
5695
5696
  		return 0;
  
  	if (!sds->busiest)
  		return 0;
  
  	busiest_cpu = group_first_cpu(sds->busiest);
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5697
  	if (env->dst_cpu > busiest_cpu)
532cb4c40   Michael Neuling   sched: Add asymme...
5698
  		return 0;
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5699
  	env->imbalance = DIV_ROUND_CLOSEST(
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5700
5701
  		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
  		SCHED_POWER_SCALE);
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5702

532cb4c40   Michael Neuling   sched: Add asymme...
5703
  	return 1;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5704
5705
5706
5707
5708
5709
  }
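  /*
   * Illustrative sketch, not part of fair.c: the imbalance set by
   * check_asym_packing() above is the busiest group's avg_load converted
   * back from the scaled domain into absolute load, rounded to the nearest
   * value.  The helper below mirrors that arithmetic with made-up names;
   * 1024 stands in for SCHED_POWER_SCALE.
   */
  static unsigned long example_asym_imbalance(unsigned long busiest_avg_load,
  					    unsigned long busiest_group_power)
  {
  	unsigned long scale = 1024UL;

  	/* DIV_ROUND_CLOSEST() for unsigned operands: add half the divisor. */
  	return (busiest_avg_load * busiest_group_power + scale / 2) / scale;
  }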
  
  /**
   * fix_small_imbalance - Calculate the minor imbalance that exists
   *			amongst the groups of a sched_domain, during
   *			load balancing.
cd96891d4   Randy Dunlap   sched/fair: fix l...
5710
   * @env: The load balancing environment.
1e3c88bde   Peter Zijlstra   sched: Move load ...
5711
   * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bde   Peter Zijlstra   sched: Move load ...
5712
   */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5713
5714
  static inline
  void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5715
5716
5717
  {
  	unsigned long tmp, pwr_now = 0, pwr_move = 0;
  	unsigned int imbn = 2;
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5718
  	unsigned long scaled_busy_load_per_task;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5719
  	struct sg_lb_stats *local, *busiest;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5720

56cf515b4   Joonsoo Kim   sched: Clean-up s...
5721
5722
  	local = &sds->local_stat;
  	busiest = &sds->busiest_stat;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5723

56cf515b4   Joonsoo Kim   sched: Clean-up s...
5724
5725
5726
5727
  	if (!local->sum_nr_running)
  		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
  	else if (busiest->load_per_task > local->load_per_task)
  		imbn = 1;
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5728

56cf515b4   Joonsoo Kim   sched: Clean-up s...
5729
5730
  	scaled_busy_load_per_task =
  		(busiest->load_per_task * SCHED_POWER_SCALE) /
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5731
  		busiest->group_power;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5732

3029ede39   Vladimir Davydov   sched/balancing: ...
5733
5734
  	if (busiest->avg_load + scaled_busy_load_per_task >=
  	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5735
  		env->imbalance = busiest->load_per_task;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5736
5737
5738
5739
5740
5741
5742
5743
  		return;
  	}
  
  	/*
  	 * OK, we don't have enough imbalance to justify moving tasks;
  	 * however we may be able to increase total CPU power used by
  	 * moving them.
  	 */
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5744
  	pwr_now += busiest->group_power *
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5745
  			min(busiest->load_per_task, busiest->avg_load);
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5746
  	pwr_now += local->group_power *
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5747
  			min(local->load_per_task, local->avg_load);
1399fa780   Nikhil Rao   sched: Introduce ...
5748
  	pwr_now /= SCHED_POWER_SCALE;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5749
5750
  
  	/* Amount of load we'd subtract */
a2cd42601   Vincent Guittot   sched: Remove dou...
5751
  	if (busiest->avg_load > scaled_busy_load_per_task) {
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5752
  		pwr_move += busiest->group_power *
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5753
  			    min(busiest->load_per_task,
a2cd42601   Vincent Guittot   sched: Remove dou...
5754
  				busiest->avg_load - scaled_busy_load_per_task);
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5755
  	}
1e3c88bde   Peter Zijlstra   sched: Move load ...
5756
5757
  
  	/* Amount of load we'd add */
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5758
  	if (busiest->avg_load * busiest->group_power <
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5759
  	    busiest->load_per_task * SCHED_POWER_SCALE) {
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5760
5761
  		tmp = (busiest->avg_load * busiest->group_power) /
  		      local->group_power;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5762
5763
  	} else {
  		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5764
  		      local->group_power;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5765
  	}
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5766
5767
  	pwr_move += local->group_power *
  		    min(local->load_per_task, local->avg_load + tmp);
1399fa780   Nikhil Rao   sched: Introduce ...
5768
  	pwr_move /= SCHED_POWER_SCALE;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5769
5770
5771
  
  	/* Move if we gain throughput */
  	if (pwr_move > pwr_now)
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5772
  		env->imbalance = busiest->load_per_task;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5773
5774
5775
5776
5777
  }
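  /*
   * Illustrative sketch, not part of fair.c: fix_small_imbalance() above
   * boils down to comparing the throughput available now (pwr_now) with the
   * throughput available after hypothetically moving one task's worth of
   * load (pwr_move), and only setting a small imbalance when the move is a
   * net win.  The helper below shows just that final decision with made-up
   * names; it is not the full calculation.
   */
  static unsigned long example_small_imbalance(unsigned long pwr_now,
  					     unsigned long pwr_move,
  					     unsigned long busiest_load_per_task)
  {
  	/* Move one task's worth of load only if we gain throughput. */
  	return (pwr_move > pwr_now) ? busiest_load_per_task : 0;
  }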
  
  /**
   * calculate_imbalance - Calculate the amount of imbalance present within the
   *			 groups of a given sched_domain during load balance.
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5778
   * @env: load balance environment
1e3c88bde   Peter Zijlstra   sched: Move load ...
5779
   * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bde   Peter Zijlstra   sched: Move load ...
5780
   */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5781
  static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5782
  {
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5783
  	unsigned long max_pull, load_above_capacity = ~0UL;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5784
5785
5786
  	struct sg_lb_stats *local, *busiest;
  
  	local = &sds->local_stat;
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5787
  	busiest = &sds->busiest_stat;
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5788

56cf515b4   Joonsoo Kim   sched: Clean-up s...
5789
  	if (busiest->group_imb) {
30ce5dabc   Peter Zijlstra   sched/fair: Rewor...
5790
5791
5792
5793
  		/*
  		 * In the group_imb case we cannot rely on group-wide averages
  		 * to ensure cpu-load equilibrium; look at wider averages. XXX
  		 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5794
5795
  		busiest->load_per_task =
  			min(busiest->load_per_task, sds->avg_load);
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5796
  	}
1e3c88bde   Peter Zijlstra   sched: Move load ...
5797
5798
5799
5800
5801
  	/*
  	 * In the presence of smp nice balancing, certain scenarios can have
  	 * max load less than avg load (as we skip the groups at or below
  	 * their cpu_power while calculating max_load).
  	 */
b18855500   Vladimir Davydov   sched/balancing: ...
5802
5803
  	if (busiest->avg_load <= sds->avg_load ||
  	    local->avg_load >= sds->avg_load) {
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5804
5805
  		env->imbalance = 0;
  		return fix_small_imbalance(env, sds);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5806
  	}
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5807
  	if (!busiest->group_imb) {
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5808
5809
  		/*
  		 * Don't want to pull so many tasks that a group would go idle.
30ce5dabc   Peter Zijlstra   sched/fair: Rewor...
5810
5811
  		 * Except of course for the group_imb case, since then we might
  		 * have to drop below capacity to reach cpu-load equilibrium.
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5812
  		 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5813
5814
  		load_above_capacity =
  			(busiest->sum_nr_running - busiest->group_capacity);
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5815

1399fa780   Nikhil Rao   sched: Introduce ...
5816
  		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5817
  		load_above_capacity /= busiest->group_power;
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5818
5819
5820
5821
5822
5823
5824
5825
5826
  	}
  
  	/*
  	 * We're trying to get all the cpus to the average_load, so we don't
  	 * want to push ourselves above the average load, nor do we wish to
  	 * reduce the max loaded cpu below the average load. At the same time,
  	 * we also don't want to reduce the group load below the group capacity
  	 * (so that we can implement power-savings policies etc). Thus we look
  	 * for the minimum possible imbalance.
dd5feea14   Suresh Siddha   sched: Fix SCHED_...
5827
  	 */
30ce5dabc   Peter Zijlstra   sched/fair: Rewor...
5828
  	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5829
5830
  
  	/* How much load to actually move to equalise the imbalance */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5831
  	env->imbalance = min(
3ae11c90f   Peter Zijlstra   sched/fair: Make ...
5832
5833
  		max_pull * busiest->group_power,
  		(sds->avg_load - local->avg_load) * local->group_power
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5834
  	) / SCHED_POWER_SCALE;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5835
5836
5837
  
  	/*
  	 * if env->imbalance is less than the average load per runnable task
25985edce   Lucas De Marchi   Fix common misspe...
5838
  	 * there is no guarantee that any tasks will be moved, so we have
1e3c88bde   Peter Zijlstra   sched: Move load ...
5839
5840
5841
  	 * to think about bumping its value to force at least one task to be
  	 * moved
  	 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5842
  	if (env->imbalance < busiest->load_per_task)
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5843
  		return fix_small_imbalance(env, sds);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5844
  }
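  /*
   * Illustrative sketch, not part of fair.c: the imbalance computed above is
   * the smaller of "how far the busiest group sits above the domain average"
   * (capped by load_above_capacity) and "how far the local group sits below
   * it", converted back from the scaled avg_load domain into absolute load.
   * Names are made up, 1024 stands in for SCHED_POWER_SCALE, and the helper
   * assumes busiest_avg >= domain_avg >= local_avg, as on the path that
   * reaches this computation.
   */
  static unsigned long example_imbalance(unsigned long busiest_avg,
  				       unsigned long local_avg,
  				       unsigned long domain_avg,
  				       unsigned long busiest_power,
  				       unsigned long local_power,
  				       unsigned long load_above_capacity)
  {
  	unsigned long max_pull, pull_side, push_side;

  	/* Don't pull the busiest group below the domain average, nor below
  	 * its capacity limit. */
  	max_pull = busiest_avg - domain_avg;
  	if (load_above_capacity < max_pull)
  		max_pull = load_above_capacity;

  	pull_side = max_pull * busiest_power;
  	push_side = (domain_avg - local_avg) * local_power;

  	return ((pull_side < push_side) ? pull_side : push_side) / 1024UL;
  }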
fab476228   Nikhil Rao   sched: Force bala...
5845

1e3c88bde   Peter Zijlstra   sched: Move load ...
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
  /******* find_busiest_group() helpers end here *********************/
  
  /**
   * find_busiest_group - Returns the busiest group within the sched_domain
   * if there is an imbalance. If there isn't an imbalance, and
   * the user has opted for power-savings, it returns a group whose
   * CPUs can be put to idle by rebalancing those tasks elsewhere, if
   * such a group exists.
   *
   * Also calculates the amount of weighted load which should be moved
   * to restore balance.
   *
cd96891d4   Randy Dunlap   sched/fair: fix l...
5858
   * @env: The load balancing environment.
1e3c88bde   Peter Zijlstra   sched: Move load ...
5859
   *
e69f61862   Yacine Belkadi   sched: Fix some k...
5860
   * Return:	- The busiest group if imbalance exists.
1e3c88bde   Peter Zijlstra   sched: Move load ...
5861
5862
5863
5864
   *		- If no imbalance and user has opted for power-savings balance,
   *		   return the least loaded group whose CPUs can be
   *		   put to idle by rebalancing its tasks onto our group.
   */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5865
  static struct sched_group *find_busiest_group(struct lb_env *env)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5866
  {
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5867
  	struct sg_lb_stats *local, *busiest;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5868
  	struct sd_lb_stats sds;
147c5fc2b   Peter Zijlstra   sched/fair: Shrin...
5869
  	init_sd_lb_stats(&sds);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5870
5871
5872
5873
5874
  
  	/*
  	 * Compute the various statistics relevant for load balancing at
  	 * this level.
  	 */
23f0d2093   Joonsoo Kim   sched: Factor out...
5875
  	update_sd_lb_stats(env, &sds);
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5876
5877
  	local = &sds.local_stat;
  	busiest = &sds.busiest_stat;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5878

bd939f45d   Peter Zijlstra   sched/fair: Propa...
5879
5880
  	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
  	    check_asym_packing(env, &sds))
532cb4c40   Michael Neuling   sched: Add asymme...
5881
  		return sds.busiest;
cc57aa8f4   Peter Zijlstra   sched: Clean up s...
5882
  	/* There is no busy sibling group to pull tasks from */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5883
  	if (!sds.busiest || busiest->sum_nr_running == 0)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5884
  		goto out_balanced;
1399fa780   Nikhil Rao   sched: Introduce ...
5885
  	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
b0432d8f1   Ken Chen   sched: Fix sched-...
5886

866ab43ef   Peter Zijlstra   sched: Fix the gr...
5887
5888
  	/*
  	 * If the busiest group is imbalanced the below checks don't
30ce5dabc   Peter Zijlstra   sched/fair: Rewor...
5889
  	 * work because they assume all things are equal, which typically
866ab43ef   Peter Zijlstra   sched: Fix the gr...
5890
5891
  	 * isn't true due to cpus_allowed constraints and the like.
  	 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5892
  	if (busiest->group_imb)
866ab43ef   Peter Zijlstra   sched: Fix the gr...
5893
  		goto force_balance;
cc57aa8f4   Peter Zijlstra   sched: Clean up s...
5894
  	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5895
5896
  	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
  	    !busiest->group_has_capacity)
fab476228   Nikhil Rao   sched: Force bala...
5897
  		goto force_balance;
cc57aa8f4   Peter Zijlstra   sched: Clean up s...
5898
5899
5900
5901
  	/*
  	 * If the local group is more busy than the selected busiest group
  	 * don't try and pull any tasks.
  	 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5902
  	if (local->avg_load >= busiest->avg_load)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5903
  		goto out_balanced;
cc57aa8f4   Peter Zijlstra   sched: Clean up s...
5904
5905
5906
5907
  	/*
  	 * Don't pull any tasks if this group is already above the domain
  	 * average load.
  	 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5908
  	if (local->avg_load >= sds.avg_load)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5909
  		goto out_balanced;
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5910
  	if (env->idle == CPU_IDLE) {
aae6d3ddd   Suresh Siddha   sched: Use group ...
5911
5912
5913
5914
5915
5916
  		/*
  		 * This CPU is idle. If the busiest group doesn't have
  		 * more tasks than the number of available CPUs and there
  		 * is no imbalance between this group and the busiest group
  		 * with respect to idle CPUs, it is balanced.
  		 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5917
5918
  		if ((local->idle_cpus < busiest->idle_cpus) &&
  		    busiest->sum_nr_running <= busiest->group_weight)
aae6d3ddd   Suresh Siddha   sched: Use group ...
5919
  			goto out_balanced;
c186fafe9   Peter Zijlstra   sched: Clean up r...
5920
5921
5922
5923
5924
  	} else {
  		/*
  		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
  		 * imbalance_pct to be conservative.
  		 */
56cf515b4   Joonsoo Kim   sched: Clean-up s...
5925
5926
  		if (100 * busiest->avg_load <=
  				env->sd->imbalance_pct * local->avg_load)
c186fafe9   Peter Zijlstra   sched: Clean up r...
5927
  			goto out_balanced;
aae6d3ddd   Suresh Siddha   sched: Use group ...
5928
  	}
1e3c88bde   Peter Zijlstra   sched: Move load ...
5929

fab476228   Nikhil Rao   sched: Force bala...
5930
  force_balance:
1e3c88bde   Peter Zijlstra   sched: Move load ...
5931
  	/* Looks like there is an imbalance. Compute it */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5932
  	calculate_imbalance(env, &sds);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5933
5934
5935
  	return sds.busiest;
  
  out_balanced:
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5936
  	env->imbalance = 0;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5937
5938
5939
5940
5941
5942
  	return NULL;
  }
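  /*
   * Illustrative sketch, not part of fair.c: the "conservative" test used in
   * find_busiest_group() above when this CPU is not idle.  With an
   * imbalance_pct of 125 (a common value), pulling only happens when the
   * busiest group's avg_load exceeds the local group's by more than 25%.
   * For example, busiest = 1200 vs local = 1000 gives
   * 100 * 1200 = 120000 <= 125 * 1000 = 125000, so the domain is treated as
   * balanced.  The names below are made up for the example.
   */
  static int example_busy_enough(unsigned long busiest_avg_load,
  			       unsigned long local_avg_load,
  			       unsigned int imbalance_pct)
  {
  	return 100 * busiest_avg_load > imbalance_pct * local_avg_load;
  }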
  
  /*
   * find_busiest_queue - find the busiest runqueue among the cpus in group.
   */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5943
  static struct rq *find_busiest_queue(struct lb_env *env,
b9403130a   Michael Wang   sched/cleanups: A...
5944
  				     struct sched_group *group)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5945
5946
  {
  	struct rq *busiest = NULL, *rq;
95a79b805   Joonsoo Kim   sched: Remove one...
5947
  	unsigned long busiest_load = 0, busiest_power = 1;
1e3c88bde   Peter Zijlstra   sched: Move load ...
5948
  	int i;
6906a4083   Peter Zijlstra   sched/fair: Optim...
5949
  	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
5950
5951
5952
5953
5954
  		unsigned long power, capacity, wl;
  		enum fbq_type rt;
  
  		rq = cpu_rq(i);
  		rt = fbq_classify_rq(rq);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5955

0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
  		/*
  		 * We classify groups/runqueues into three groups:
  		 *  - regular: there are !numa tasks
  		 *  - remote:  there are numa tasks that run on the 'wrong' node
  		 *  - all:     there is no distinction
  		 *
  		 * In order to avoid migrating ideally placed numa tasks,
  		 * ignore those when there are better options.
  		 *
  		 * If we ignore the actual busiest queue to migrate another
  		 * task, the next balance pass can still reduce the busiest
  		 * queue by moving tasks around inside the node.
  		 *
  		 * If we cannot move enough load due to this classification
  		 * the next pass will adjust the group classification and
  		 * allow migration of more tasks.
  		 *
  		 * Both cases only affect the total convergence complexity.
  		 */
  		if (rt > env->fbq_type)
  			continue;
  
  		power = power_of(i);
  		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
9d5efe05e   Srivatsa Vaddagiri   sched: Fix capaci...
5980
  		if (!capacity)
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5981
  			capacity = fix_small_capacity(env->sd, group);
9d5efe05e   Srivatsa Vaddagiri   sched: Fix capaci...
5982

6e40f5bbb   Thomas Gleixner   Merge branch 'sch...
5983
  		wl = weighted_cpuload(i);
1e3c88bde   Peter Zijlstra   sched: Move load ...
5984

6e40f5bbb   Thomas Gleixner   Merge branch 'sch...
5985
5986
5987
5988
  		/*
  		 * When comparing with imbalance, use weighted_cpuload()
  		 * which is not scaled with the cpu power.
  		 */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
5989
  		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
1e3c88bde   Peter Zijlstra   sched: Move load ...
5990
  			continue;
6e40f5bbb   Thomas Gleixner   Merge branch 'sch...
5991
5992
5993
5994
5995
  		/*
  		 * For the load comparisons with the other CPUs, consider
  		 * the weighted_cpuload() scaled with the cpu power, so that
  		 * the load can be moved away from the cpu that is potentially
  		 * running at a lower capacity.
95a79b805   Joonsoo Kim   sched: Remove one...
5996
5997
5998
5999
6000
  		 *
  		 * Thus we're looking for max(wl_i / power_i), crosswise
  		 * multiplication to rid ourselves of the division works out
  		 * to: wl_i * power_j > wl_j * power_i;  where j is our
  		 * previous maximum.
6e40f5bbb   Thomas Gleixner   Merge branch 'sch...
6001
  		 */
95a79b805   Joonsoo Kim   sched: Remove one...
6002
6003
6004
  		if (wl * busiest_power > busiest_load * power) {
  			busiest_load = wl;
  			busiest_power = power;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
  			busiest = rq;
  		}
  	}
  
  	return busiest;
  }
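  /*
   * Illustrative sketch, not part of fair.c: the busiest-queue search above
   * wants max(wl_i / power_i) but avoids the division by comparing
   * crosswise: candidate i beats the current maximum j iff
   * wl_i * power_j > wl_j * power_i.  The helper below is that comparison
   * with made-up names.
   */
  static int example_is_busier(unsigned long wl_i, unsigned long power_i,
  			     unsigned long wl_j, unsigned long power_j)
  {
  	/* Equivalent to wl_i / power_i > wl_j / power_j, without dividing. */
  	return wl_i * power_j > wl_j * power_i;
  }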
  
  /*
   * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
   * any value works so long as it is large enough.
   */
  #define MAX_PINNED_INTERVAL	512
  
  /* Working cpumask for load_balance and load_balance_newidle. */
e6252c3ef   Joonsoo Kim   sched: Rename loa...
6019
  DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6020

bd939f45d   Peter Zijlstra   sched/fair: Propa...
6021
  static int need_active_balance(struct lb_env *env)
1af3ed3dd   Peter Zijlstra   sched: Unify load...
6022
  {
bd939f45d   Peter Zijlstra   sched/fair: Propa...
6023
6024
6025
  	struct sched_domain *sd = env->sd;
  
  	if (env->idle == CPU_NEWLY_IDLE) {
532cb4c40   Michael Neuling   sched: Add asymme...
6026
6027
6028
6029
6030
6031
  
  		/*
  		 * ASYM_PACKING needs to force migrate tasks from busy but
  		 * higher numbered CPUs in order to pack all tasks in the
  		 * lowest numbered CPUs.
  		 */
bd939f45d   Peter Zijlstra   sched/fair: Propa...
6032
  		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
532cb4c40   Michael Neuling   sched: Add asymme...
6033
  			return 1;
1af3ed3dd   Peter Zijlstra   sched: Unify load...
6034
6035
6036
6037
  	}
  
  	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
  }
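  /*
   * Illustrative sketch, not part of fair.c: outside the ASYM_PACKING case,
   * need_active_balance() above only asks for active balancing once regular
   * balancing has failed repeatedly, i.e. when nr_balance_failed exceeds
   * cache_nice_tries + 2.  The helper restates that threshold with made-up
   * names.
   */
  static int example_want_active_balance(unsigned int nr_balance_failed,
  				       unsigned int cache_nice_tries)
  {
  	return nr_balance_failed > cache_nice_tries + 2;
  }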
969c79215   Tejun Heo   sched: replace mi...
6038
  static int active_load_balance_cpu_stop(void *data);
23f0d2093   Joonsoo Kim   sched: Factor out...
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
  static int should_we_balance(struct lb_env *env)
  {
  	struct sched_group *sg = env->sd->groups;
  	struct cpumask *sg_cpus, *sg_mask;
  	int cpu, balance_cpu = -1;
  
  	/*
  	 * In the newly idle case, we will allow all the CPUs
  	 * to do the newly idle load balance.
  	 */
  	if (env->idle == CPU_NEWLY_IDLE)
  		return 1;
  
  	sg_cpus = sched_group_cpus(sg);
  	sg_mask = sched_group_mask(sg);
  	/* Try to find first idle cpu */
  	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
  		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
  			continue;
  
  		balance_cpu = cpu;
  		break;
  	}
  
  	if (balance_cpu == -1)
  		balance_cpu = group_balance_cpu(sg);
  
  	/*
  	 * First idle cpu or the first cpu(busiest) in this sched group
  	 * is eligible for doing load balancing at this and above domains.
  	 */
b0cff9d88   Joonsoo Kim   sched: Fix load b...
6070
  	return balance_cpu == env->dst_cpu;
23f0d2093   Joonsoo Kim   sched: Factor out...
6071
  }
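  /*
   * Illustrative sketch, not part of fair.c: should_we_balance() above elects
   * one CPU per group (the first idle one, or a designated fallback) so that
   * only a single CPU runs the balancing pass for the group.  The helper
   * below shows the same election over a plain array of idle flags; the
   * names and the fallback parameter are made up for the example.
   */
  static int example_elect_balance_cpu(const int *cpu_is_idle, int nr_cpus,
  				     int fallback_cpu)
  {
  	int cpu;

  	for (cpu = 0; cpu < nr_cpus; cpu++) {
  		if (cpu_is_idle[cpu])
  			return cpu;	/* first idle CPU wins */
  	}
  	return fallback_cpu;		/* e.g. group_balance_cpu(sg) */
  }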
1e3c88bde   Peter Zijlstra   sched: Move load ...
6072
6073
6074
6075
6076
6077
  /*
   * Check this_cpu to ensure it is balanced within domain. Attempt to move
   * tasks if there is an imbalance.
   */
  static int load_balance(int this_cpu, struct rq *this_rq,
  			struct sched_domain *sd, enum cpu_idle_type idle,
23f0d2093   Joonsoo Kim   sched: Factor out...
6078
  			int *continue_balancing)
1e3c88bde   Peter Zijlstra   sched: Move load ...
6079
  {
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6080
  	int ld_moved, cur_ld_moved, active_balance = 0;
6263322c5   Peter Zijlstra   sched/fair: Rewri...
6081
  	struct sched_domain *sd_parent = sd->parent;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6082
  	struct sched_group *group;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6083
6084
  	struct rq *busiest;
  	unsigned long flags;
e6252c3ef   Joonsoo Kim   sched: Rename loa...
6085
  	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6086

8e45cb545   Peter Zijlstra   sched: Move load-...
6087
6088
  	struct lb_env env = {
  		.sd		= sd,
ddcdf6e7d   Peter Zijlstra   sched: Rename loa...
6089
6090
  		.dst_cpu	= this_cpu,
  		.dst_rq		= this_rq,
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6091
  		.dst_grpmask    = sched_group_cpus(sd->groups),
8e45cb545   Peter Zijlstra   sched: Move load-...
6092
  		.idle		= idle,
eb95308ee   Peter Zijlstra   sched: Fix more l...
6093
  		.loop_break	= sched_nr_migrate_break,
b9403130a   Michael Wang   sched/cleanups: A...
6094
  		.cpus		= cpus,
0ec8aa00f   Peter Zijlstra   sched/numa: Avoid...
6095
  		.fbq_type	= all,
8e45cb545   Peter Zijlstra   sched: Move load-...
6096
  	};
cfc031180   Joonsoo Kim   sched: Don't cons...
6097
6098
6099
6100
  	/*
  	 * For NEWLY_IDLE load_balancing, we don't need to consider
  	 * other cpus in our group
  	 */
e02e60c10   Joonsoo Kim   sched: Prevent to...
6101
  	if (idle == CPU_NEWLY_IDLE)
cfc031180   Joonsoo Kim   sched: Don't cons...
6102
  		env.dst_grpmask = NULL;
cfc031180   Joonsoo Kim   sched: Don't cons...
6103

1e3c88bde   Peter Zijlstra   sched: Move load ...
6104
  	cpumask_copy(cpus, cpu_active_mask);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6105
6106
6107
  	schedstat_inc(sd, lb_count[idle]);
  
  redo:
23f0d2093   Joonsoo Kim   sched: Factor out...
6108
6109
  	if (!should_we_balance(&env)) {
  		*continue_balancing = 0;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6110
  		goto out_balanced;
23f0d2093   Joonsoo Kim   sched: Factor out...
6111
  	}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6112

23f0d2093   Joonsoo Kim   sched: Factor out...
6113
  	group = find_busiest_group(&env);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6114
6115
6116
6117
  	if (!group) {
  		schedstat_inc(sd, lb_nobusyg[idle]);
  		goto out_balanced;
  	}
b9403130a   Michael Wang   sched/cleanups: A...
6118
  	busiest = find_busiest_queue(&env, group);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6119
6120
6121
6122
  	if (!busiest) {
  		schedstat_inc(sd, lb_nobusyq[idle]);
  		goto out_balanced;
  	}
78feefc51   Michael Wang   sched: using dst_...
6123
  	BUG_ON(busiest == env.dst_rq);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6124

bd939f45d   Peter Zijlstra   sched/fair: Propa...
6125
  	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6126
6127
6128
6129
6130
6131
6132
6133
6134
  
  	ld_moved = 0;
  	if (busiest->nr_running > 1) {
  		/*
  		 * Attempt to move tasks. If find_busiest_group has found
  		 * an imbalance but busiest->nr_running <= 1, the group is
  		 * still unbalanced. ld_moved simply stays zero, so it is
  		 * correctly treated as an imbalance.
  		 */
8e45cb545   Peter Zijlstra   sched: Move load-...
6135
  		env.flags |= LBF_ALL_PINNED;
c82513e51   Peter Zijlstra   sched: Change rq-...
6136
6137
6138
  		env.src_cpu   = busiest->cpu;
  		env.src_rq    = busiest;
  		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
8e45cb545   Peter Zijlstra   sched: Move load-...
6139

5d6523ebd   Peter Zijlstra   sched: Fix load-b...
6140
  more_balance:
1e3c88bde   Peter Zijlstra   sched: Move load ...
6141
  		local_irq_save(flags);
78feefc51   Michael Wang   sched: using dst_...
6142
  		double_rq_lock(env.dst_rq, busiest);
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6143
6144
6145
6146
6147
6148
6149
  
  		/*
  		 * cur_ld_moved - load moved in current iteration
  		 * ld_moved     - cumulative load moved across iterations
  		 */
  		cur_ld_moved = move_tasks(&env);
  		ld_moved += cur_ld_moved;
78feefc51   Michael Wang   sched: using dst_...
6150
  		double_rq_unlock(env.dst_rq, busiest);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6151
6152
6153
6154
6155
  		local_irq_restore(flags);
  
  		/*
  		 * some other cpu did the load balance for us.
  		 */
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6156
6157
  		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
  			resched_cpu(env.dst_cpu);
f1cd08581   Joonsoo Kim   sched: Change pos...
6158
6159
6160
6161
  		if (env.flags & LBF_NEED_BREAK) {
  			env.flags &= ~LBF_NEED_BREAK;
  			goto more_balance;
  		}
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
  		/*
  		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
  		 * us and move them to an alternate dst_cpu in our sched_group
  		 * where they can run. The upper limit on how many times we
  		 * iterate on the same src_cpu depends on the number of cpus in our
  		 * sched_group.
  		 *
  		 * This changes load balance semantics a bit on who can move
  		 * load to a given_cpu. In addition to the given_cpu itself
  		 * (or an ilb_cpu acting on its behalf where given_cpu is
  		 * nohz-idle), we now have balance_cpu in a position to move
  		 * load to given_cpu. In rare situations, this may cause
  		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
  		 * _independently_ and at the _same_ time to move some load to
  		 * given_cpu), causing excess load to be moved to given_cpu.
  		 * This, however, should not happen often in practice and
  		 * moreover subsequent load balance cycles should correct the
  		 * excess load moved.
  		 */
6263322c5   Peter Zijlstra   sched/fair: Rewri...
6181
  		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6182

7aff2e3a5   Vladimir Davydov   sched/balancing: ...
6183
6184
  			/* Prevent to re-select dst_cpu via env's cpus */
  			cpumask_clear_cpu(env.dst_cpu, env.cpus);
78feefc51   Michael Wang   sched: using dst_...
6185
  			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6186
  			env.dst_cpu	 = env.new_dst_cpu;
6263322c5   Peter Zijlstra   sched/fair: Rewri...
6187
  			env.flags	&= ~LBF_DST_PINNED;
88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6188
6189
  			env.loop	 = 0;
  			env.loop_break	 = sched_nr_migrate_break;
e02e60c10   Joonsoo Kim   sched: Prevent to...
6190

88b8dac0a   Srivatsa Vaddagiri   sched: Improve ba...
6191
6192
6193
6194
6195
6196
  			/*
  			 * Go back to "more_balance" rather than "redo" since we
  			 * need to continue with same src_cpu.
  			 */
  			goto more_balance;
  		}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6197

6263322c5   Peter Zijlstra   sched/fair: Rewri...
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
  		/*
  		 * We failed to reach balance because of affinity.
  		 */
  		if (sd_parent) {
  			int *group_imbalance = &sd_parent->groups->sgp->imbalance;
  
  			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
  				*group_imbalance = 1;
  			} else if (*group_imbalance)
  				*group_imbalance = 0;
  		}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6209
  		/* All tasks on this runqueue were pinned by CPU affinity */
8e45cb545   Peter Zijlstra   sched: Move load-...
6210
  		if (unlikely(env.flags & LBF_ALL_PINNED)) {
1e3c88bde   Peter Zijlstra   sched: Move load ...
6211
  			cpumask_clear_cpu(cpu_of(busiest), cpus);
bbf18b194   Prashanth Nageshappa   sched: Reset loop...
6212
6213
6214
  			if (!cpumask_empty(cpus)) {
  				env.loop = 0;
  				env.loop_break = sched_nr_migrate_break;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6215
  				goto redo;
bbf18b194   Prashanth Nageshappa   sched: Reset loop...
6216
  			}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6217
6218
6219
6220
6221
6222
  			goto out_balanced;
  		}
  	}
  
  	if (!ld_moved) {
  		schedstat_inc(sd, lb_failed[idle]);
58b26c4c0   Venkatesh Pallipadi   sched: Increment ...
6223
6224
6225
6226
6227
6228
6229
6230
  		/*
  		 * Increment the failure counter only on periodic balance.
  		 * We do not want newidle balance, which can be very
  		 * frequent, to pollute the failure counter, causing
  		 * excessive cache_hot migrations and active balances.
  		 */
  		if (idle != CPU_NEWLY_IDLE)
  			sd->nr_balance_failed++;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6231

bd939f45d   Peter Zijlstra   sched/fair: Propa...
6232
  		if (need_active_balance(&env)) {
1e3c88bde   Peter Zijlstra   sched: Move load ...
6233
  			raw_spin_lock_irqsave(&busiest->lock, flags);
969c79215   Tejun Heo   sched: replace mi...
6234
6235
6236
  			/* don't kick the active_load_balance_cpu_stop,
  			 * if the curr task on busiest cpu can't be
  			 * moved to this_cpu
1e3c88bde   Peter Zijlstra   sched: Move load ...
6237
6238
  			 */
  			if (!cpumask_test_cpu(this_cpu,
fa17b507f   Peter Zijlstra   sched: Wrap sched...
6239
  					tsk_cpus_allowed(busiest->curr))) {
1e3c88bde   Peter Zijlstra   sched: Move load ...
6240
6241
  				raw_spin_unlock_irqrestore(&busiest->lock,
  							    flags);
8e45cb545   Peter Zijlstra   sched: Move load-...
6242
  				env.flags |= LBF_ALL_PINNED;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6243
6244
  				goto out_one_pinned;
  			}
969c79215   Tejun Heo   sched: replace mi...
6245
6246
6247
6248
6249
  			/*
  			 * ->active_balance synchronizes accesses to
  			 * ->active_balance_work.  Once set, it's cleared
  			 * only after active load balance is finished.
  			 */
1e3c88bde   Peter Zijlstra   sched: Move load ...
6250
6251
6252
6253
6254
6255
  			if (!busiest->active_balance) {
  				busiest->active_balance = 1;
  				busiest->push_cpu = this_cpu;
  				active_balance = 1;
  			}
  			raw_spin_unlock_irqrestore(&busiest->lock, flags);
969c79215   Tejun Heo   sched: replace mi...
6256

bd939f45d   Peter Zijlstra   sched/fair: Propa...
6257
  			if (active_balance) {
969c79215   Tejun Heo   sched: replace mi...
6258
6259
6260
  				stop_one_cpu_nowait(cpu_of(busiest),
  					active_load_balance_cpu_stop, busiest,
  					&busiest->active_balance_work);
bd939f45d   Peter Zijlstra   sched/fair: Propa...
6261
  			}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
  
  			/*
  			 * We've kicked active balancing, reset the failure
  			 * counter.
  			 */
  			sd->nr_balance_failed = sd->cache_nice_tries+1;
  		}
  	} else
  		sd->nr_balance_failed = 0;
  
  	if (likely(!active_balance)) {
  		/* We were unbalanced, so reset the balancing interval */
  		sd->balance_interval = sd->min_interval;
  	} else {
  		/*
  		 * If we've begun active balancing, start to back off. This
  		 * case may not be covered by the all_pinned logic if there
  		 * is only 1 task on the busy runqueue (because we don't call
  		 * move_tasks).
  		 */
  		if (sd->balance_interval < sd->max_interval)
  			sd->balance_interval *= 2;
  	}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6285
6286
6287
6288
6289
6290
6291
6292
6293
  	goto out;
  
  out_balanced:
  	schedstat_inc(sd, lb_balanced[idle]);
  
  	sd->nr_balance_failed = 0;
  
  out_one_pinned:
  	/* tune up the balancing interval */
8e45cb545   Peter Zijlstra   sched: Move load-...
6294
  	if (((env.flags & LBF_ALL_PINNED) &&
5b54b56be   Peter Zijlstra   sched: Replace al...
6295
  			sd->balance_interval < MAX_PINNED_INTERVAL) ||
1e3c88bde   Peter Zijlstra   sched: Move load ...
6296
6297
  			(sd->balance_interval < sd->max_interval))
  		sd->balance_interval *= 2;
46e49b383   Venkatesh Pallipadi   sched: Wholesale ...
6298
  	ld_moved = 0;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6299
  out:
1e3c88bde   Peter Zijlstra   sched: Move load ...
6300
6301
6302
6303
  	return ld_moved;
  }
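  /*
   * Illustrative sketch, not part of fair.c: when balancing hits pinned tasks
   * or kicks an active balance, load_balance() above backs off by doubling
   * sd->balance_interval while it is still below its cap (sd->max_interval,
   * or MAX_PINNED_INTERVAL for the all-pinned case).  The helper restates
   * that doubling with made-up names.
   */
  static unsigned long example_backoff_interval(unsigned long interval,
  					      unsigned long cap)
  {
  	/* Double only while still under the cap, as load_balance() does. */
  	if (interval < cap)
  		interval *= 2;
  	return interval;
  }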
  
  /*
1e3c88bde   Peter Zijlstra   sched: Move load ...
6304
6305
6306
   * idle_balance is called by schedule() if this_cpu is about to become
   * idle. Attempts to pull tasks from other CPUs.
   */
6e83125c6   Peter Zijlstra   sched/fair: Remov...
6307
  static int idle_balance(struct rq *this_rq)
1e3c88bde   Peter Zijlstra   sched: Move load ...
6308
6309
6310
6311
  {
  	struct sched_domain *sd;
  	int pulled_task = 0;
  	unsigned long next_balance = jiffies + HZ;
9bd721c55   Jason Low   sched/balancing: ...
6312
  	u64 curr_cost = 0;
b4f2ab436   Daniel Lezcano   sched: Remove 'cp...
6313
  	int this_cpu = this_rq->cpu;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6314

6e83125c6   Peter Zijlstra   sched/fair: Remov...
6315
  	idle_enter_fair(this_rq);
0e5b5337f   Jason Low   sched: Fix updati...
6316

6e83125c6   Peter Zijlstra   sched/fair: Remov...
6317
6318
6319
6320
6321
  	/*
  	 * We must set idle_stamp _before_ calling idle_balance(), such that we
  	 * measure the duration of idle_balance() as idle time.
  	 */
  	this_rq->idle_stamp = rq_clock(this_rq);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6322
  	if (this_rq->avg_idle < sysctl_sched_migration_cost)
6e83125c6   Peter Zijlstra   sched/fair: Remov...
6323
  		goto out;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6324

f492e12ef   Peter Zijlstra   sched: Remove loa...
6325
6326
6327
6328
  	/*
  	 * Drop the rq->lock, but keep IRQ/preempt disabled.
  	 */
  	raw_spin_unlock(&this_rq->lock);
48a167532   Paul Turner   sched: Refactor u...
6329
  	update_blocked_averages(this_cpu);
dce840a08   Peter Zijlstra   sched: Dynamicall...
6330
  	rcu_read_lock();
1e3c88bde   Peter Zijlstra   sched: Move load ...
6331
6332
  	for_each_domain(this_cpu, sd) {
  		unsigned long interval;
23f0d2093   Joonsoo Kim   sched: Factor out...
6333
  		int continue_balancing = 1;
9bd721c55   Jason Low   sched/balancing: ...
6334
  		u64 t0, domain_cost;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6335
6336
6337
  
  		if (!(sd->flags & SD_LOAD_BALANCE))
  			continue;
9bd721c55   Jason Low   sched/balancing: ...
6338
6339
  		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
  			break;
f492e12ef   Peter Zijlstra   sched: Remove loa...
6340
  		if (sd->flags & SD_BALANCE_NEWIDLE) {
9bd721c55   Jason Low   sched/balancing: ...
6341
  			t0 = sched_clock_cpu(this_cpu);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6342
  			/* If we've pulled tasks over stop searching: */
f492e12ef   Peter Zijlstra   sched: Remove loa...
6343
  			pulled_task = load_balance(this_cpu, this_rq,
23f0d2093   Joonsoo Kim   sched: Factor out...
6344
6345
  						   sd, CPU_NEWLY_IDLE,
  						   &continue_balancing);
9bd721c55   Jason Low   sched/balancing: ...
6346
6347
6348
6349
6350
6351
  
  			domain_cost = sched_clock_cpu(this_cpu) - t0;
  			if (domain_cost > sd->max_newidle_lb_cost)
  				sd->max_newidle_lb_cost = domain_cost;
  
  			curr_cost += domain_cost;
f492e12ef   Peter Zijlstra   sched: Remove loa...
6352
  		}
1e3c88bde   Peter Zijlstra   sched: Move load ...
6353
6354
6355
6356
  
  		interval = msecs_to_jiffies(sd->balance_interval);
  		if (time_after(next_balance, sd->last_balance + interval))
  			next_balance = sd->last_balance + interval;
3c4017c13   Daniel Lezcano   sched: Move rq->i...
6357
  		if (pulled_task)
1e3c88bde   Peter Zijlstra   sched: Move load ...
6358
  			break;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6359
  	}
dce840a08   Peter Zijlstra   sched: Dynamicall...
6360
  	rcu_read_unlock();
f492e12ef   Peter Zijlstra   sched: Remove loa...
6361
6362
  
  	raw_spin_lock(&this_rq->lock);
0e5b5337f   Jason Low   sched: Fix updati...
6363
6364
  	if (curr_cost > this_rq->max_idle_balance_cost)
  		this_rq->max_idle_balance_cost = curr_cost;
e5fc66119   Daniel Lezcano   sched: Fix race i...
6365
  	/*
0e5b5337f   Jason Low   sched: Fix updati...
6366
6367
6368
  	 * While browsing the domains, we released the rq lock; a task could
  	 * have been enqueued in the meantime. Since we're not going idle,
  	 * pretend we pulled a task.
e5fc66119   Daniel Lezcano   sched: Fix race i...
6369
  	 */
0e5b5337f   Jason Low   sched: Fix updati...
6370
  	if (this_rq->cfs.h_nr_running && !pulled_task)
6e83125c6   Peter Zijlstra   sched/fair: Remov...
6371
  		pulled_task = 1;
e5fc66119   Daniel Lezcano   sched: Fix race i...
6372

1e3c88bde   Peter Zijlstra   sched: Move load ...
6373
6374
6375
6376
6377
6378
6379
  	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
  		/*
  		 * We are going idle. next_balance may be set based on
  		 * a busy processor. So reset next_balance.
  		 */
  		this_rq->next_balance = next_balance;
  	}
9bd721c55   Jason Low   sched/balancing: ...
6380

6e83125c6   Peter Zijlstra   sched/fair: Remov...
6381
  out:
e4aa358b6   Kirill Tkhai   sched/fair: Push ...
6382
  	/* Is there a task of a high priority class? */
4c6c4e38c   Kirill Tkhai   sched/core: Fix e...
6383
  	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
a1d9a3231   Kirill Tkhai   sched: Check for ...
6384
6385
  	    ((this_rq->stop && this_rq->stop->on_rq) ||
  	     this_rq->dl.dl_nr_running ||
4c6c4e38c   Kirill Tkhai   sched/core: Fix e...
6386
  	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
e4aa358b6   Kirill Tkhai   sched/fair: Push ...
6387
6388
6389
6390
  		pulled_task = -1;
  
  	if (pulled_task) {
  		idle_exit_fair(this_rq);
6e83125c6   Peter Zijlstra   sched/fair: Remov...
6391
  		this_rq->idle_stamp = 0;
e4aa358b6   Kirill Tkhai   sched/fair: Push ...
6392
  	}
6e83125c6   Peter Zijlstra   sched/fair: Remov...
6393

3c4017c13   Daniel Lezcano   sched: Move rq->i...
6394
  	return pulled_task;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6395
6396
6397
  }
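  /*
   * Illustrative sketch, not part of fair.c: idle_balance() above skips the
   * remaining domains once the expected idle time no longer covers the cost
   * already spent plus the next domain's worst-case cost
   * (max_newidle_lb_cost).  The helper restates that cut-off with made-up
   * names; all values are in nanoseconds, like avg_idle.
   */
  static int example_worth_balancing(unsigned long long avg_idle_ns,
  				   unsigned long long cost_so_far_ns,
  				   unsigned long long domain_max_cost_ns)
  {
  	return avg_idle_ns >= cost_so_far_ns + domain_max_cost_ns;
  }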
  
  /*
969c79215   Tejun Heo   sched: replace mi...
6398
6399
6400
6401
   * active_load_balance_cpu_stop is run by cpu stopper. It pushes
   * running tasks off the busiest CPU onto idle CPUs. It requires at
   * least 1 task to be running on each physical CPU where possible, and
   * avoids physical / logical imbalances.
1e3c88bde   Peter Zijlstra   sched: Move load ...
6402
   */
969c79215   Tejun Heo   sched: replace mi...
6403
  static int active_load_balance_cpu_stop(void *data)
1e3c88bde   Peter Zijlstra   sched: Move load ...
6404
  {
969c79215   Tejun Heo   sched: replace mi...
6405
6406
  	struct rq *busiest_rq = data;
  	int busiest_cpu = cpu_of(busiest_rq);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6407
  	int target_cpu = busiest_rq->push_cpu;
969c79215   Tejun Heo   sched: replace mi...
6408
  	struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6409
  	struct sched_domain *sd;
969c79215   Tejun Heo   sched: replace mi...
6410
6411
6412
6413
6414
6415
6416
  
  	raw_spin_lock_irq(&busiest_rq->lock);
  
  	/* make sure the requested cpu hasn't gone down in the meantime */
  	if (unlikely(busiest_cpu != smp_processor_id() ||
  		     !busiest_rq->active_balance))
  		goto out_unlock;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6417
6418
6419
  
  	/* Is there any task to move? */
  	if (busiest_rq->nr_running <= 1)
969c79215   Tejun Heo   sched: replace mi...
6420
  		goto out_unlock;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
  
  	/*
  	 * This condition is "impossible"; if it occurs
  	 * we need to fix it. Originally reported by
  	 * Bjorn Helgaas on a 128-cpu setup.
  	 */
  	BUG_ON(busiest_rq == target_rq);
  
  	/* move a task from busiest_rq to target_rq */
  	double_lock_balance(busiest_rq, target_rq);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6431
6432
  
  	/* Search for an sd spanning us and the target CPU. */
dce840a08   Peter Zijlstra   sched: Dynamicall...
6433
  	rcu_read_lock();
1e3c88bde   Peter Zijlstra   sched: Move load ...
6434
6435
6436
6437
6438
6439
6440
  	for_each_domain(target_cpu, sd) {
  		if ((sd->flags & SD_LOAD_BALANCE) &&
  		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
  				break;
  	}
  
  	if (likely(sd)) {
8e45cb545   Peter Zijlstra   sched: Move load-...
6441
6442
  		struct lb_env env = {
  			.sd		= sd,
ddcdf6e7d   Peter Zijlstra   sched: Rename loa...
6443
6444
6445
6446
  			.dst_cpu	= target_cpu,
  			.dst_rq		= target_rq,
  			.src_cpu	= busiest_rq->cpu,
  			.src_rq		= busiest_rq,
8e45cb545   Peter Zijlstra   sched: Move load-...
6447
6448
  			.idle		= CPU_IDLE,
  		};
1e3c88bde   Peter Zijlstra   sched: Move load ...
6449
  		schedstat_inc(sd, alb_count);
8e45cb545   Peter Zijlstra   sched: Move load-...
6450
  		if (move_one_task(&env))
1e3c88bde   Peter Zijlstra   sched: Move load ...
6451
6452
6453
6454
  			schedstat_inc(sd, alb_pushed);
  		else
  			schedstat_inc(sd, alb_failed);
  	}
dce840a08   Peter Zijlstra   sched: Dynamicall...
6455
  	rcu_read_unlock();
1e3c88bde   Peter Zijlstra   sched: Move load ...
6456
  	double_unlock_balance(busiest_rq, target_rq);
969c79215   Tejun Heo   sched: replace mi...
6457
6458
6459
6460
  out_unlock:
  	busiest_rq->active_balance = 0;
  	raw_spin_unlock_irq(&busiest_rq->lock);
  	return 0;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6461
  }
d987fc7f3   Mike Galbraith   sched, nohz: Excl...
6462
6463
6464
6465
  static inline int on_null_domain(struct rq *rq)
  {
  	return unlikely(!rcu_dereference_sched(rq->sd));
  }
3451d0243   Frederic Weisbecker   nohz: Rename CONF...
6466
  #ifdef CONFIG_NO_HZ_COMMON
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6467
6468
  /*
   * idle load balancing details
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6469
6470
6471
6472
   * - When one of the busy CPUs notices that idle rebalancing may be
   *   needed, it kicks the idle load balancer, which then does idle
   *   load balancing for all the idle CPUs.
   */
1e3c88bde   Peter Zijlstra   sched: Move load ...
6473
  static struct {
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6474
  	cpumask_var_t idle_cpus_mask;
0b005cf54   Suresh Siddha   sched, nohz: Impl...
6475
  	atomic_t nr_cpus;
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6476
6477
  	unsigned long next_balance;     /* in jiffy units */
  } nohz ____cacheline_aligned;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6478

3dd0337d6   Daniel Lezcano   sched: Remove unu...
6479
  static inline int find_new_ilb(void)
1e3c88bde   Peter Zijlstra   sched: Move load ...
6480
  {
0b005cf54   Suresh Siddha   sched, nohz: Impl...
6481
  	int ilb = cpumask_first(nohz.idle_cpus_mask);
1e3c88bde   Peter Zijlstra   sched: Move load ...
6482

786d6dc7a   Suresh Siddha   sched, nohz: Clea...
6483
6484
6485
6486
  	if (ilb < nr_cpu_ids && idle_cpu(ilb))
  		return ilb;
  
  	return nr_cpu_ids;
1e3c88bde   Peter Zijlstra   sched: Move load ...
6487
  }
1e3c88bde   Peter Zijlstra   sched: Move load ...
6488
6489
  
  /*
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6490
6491
6492
6493
   * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
   * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
   * CPU (if there is one).
   */
0aeeeebac   Daniel Lezcano   sched: Remove unu...
6494
  static void nohz_balancer_kick(void)
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6495
6496
6497
6498
  {
  	int ilb_cpu;
  
  	nohz.next_balance++;
3dd0337d6   Daniel Lezcano   sched: Remove unu...
6499
  	ilb_cpu = find_new_ilb();
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6500

0b005cf54   Suresh Siddha   sched, nohz: Impl...
6501
6502
  	if (ilb_cpu >= nr_cpu_ids)
  		return;
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6503

cd490c5b2   Suresh Siddha   sched, nohz: Set ...
6504
  	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
1c792db7f   Suresh Siddha   sched, nohz: Intr...
6505
6506
6507
6508
6509
6510
6511
6512
  		return;
  	/*
  	 * Use smp_send_reschedule() instead of resched_cpu().
  	 * This way we generate a sched IPI on the target cpu which
  	 * is idle. And the softirq performing nohz idle load balance
  	 * will be run before returning from the IPI.
  	 */
  	smp_send_reschedule(ilb_cpu);
83cd4fe27   Venkatesh Pallipadi   sched: Change noh...
6513
6514
  	return;
  }
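  /*
   * Illustrative sketch, not part of fair.c: nohz_balancer_kick() above
   * relies on an atomic test-and-set of NOHZ_BALANCE_KICK so that, when
   * several busy CPUs decide to kick at the same time, only the first one
   * sends the reschedule IPI.  The fragment below shows the same "kick once"
   * pattern with a made-up flag word and a GCC __sync builtin instead of the
   * kernel's bitops.
   */
  static int example_kick_once(int *pending_kick)
  {
  	/* Returns 1 for the caller that wins the race, 0 for everyone else. */
  	return __sync_lock_test_and_set(pending_kick, 1) == 0;
  }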
static inline void nohz_balance_exit_idle(int cpu)
{
	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
		/*
		 * Completely isolated CPUs don't ever set, so we must test.
		 */
		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
			atomic_dec(&nohz.nr_cpus);
		}
		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
	}
}
static inline void set_cpu_sd_state_busy(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_busy, cpu));

	if (!sd || !sd->nohz_idle)
		goto unlock;
	sd->nohz_idle = 0;
	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}

void set_cpu_sd_state_idle(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_busy, cpu));

	if (!sd || sd->nohz_idle)
		goto unlock;
	sd->nohz_idle = 1;
	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
unlock:
	rcu_read_unlock();
}
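/*
 * Editor's illustration (not part of fair.c): sd->nohz_idle is a per-domain
 * guard so that repeated busy/idle notifications change nr_busy_cpus at most
 * once per transition, keeping the counter equal to the number of non-idle
 * CPUs in the group. A minimal user-space sketch of that guarded toggle,
 * with hypothetical types and names:
 */
#include <stdatomic.h>

struct example_group {
	atomic_int nr_busy_cpus;
};

struct example_cpu {
	int nohz_idle;			/* mirrors sd->nohz_idle */
	struct example_group *grp;
};

static void example_set_busy(struct example_cpu *c)
{
	if (!c->nohz_idle)
		return;			/* already counted as busy */
	c->nohz_idle = 0;
	atomic_fetch_add(&c->grp->nr_busy_cpus, 1);
}

static void example_set_idle(struct example_cpu *c)
{
	if (c->nohz_idle)
		return;			/* already counted as idle */
	c->nohz_idle = 1;
	atomic_fetch_sub(&c->grp->nr_busy_cpus, 1);
}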
/*
 * This routine records that the cpu is going idle with tick stopped.
 * This info will be used when performing idle load balancing in the future.
 */
void nohz_balance_enter_idle(int cpu)
{
	/*
	 * If this cpu is going down, then nothing needs to be done.
	 */
	if (!cpu_active(cpu))
		return;
	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
		return;

	/*
	 * If we're a completely isolated CPU, we don't play.
	 */
	if (on_null_domain(cpu_rq(cpu)))
		return;
	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
	atomic_inc(&nohz.nr_cpus);
	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}

static int sched_ilb_notifier(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DYING:
		nohz_balance_exit_idle(smp_processor_id());
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif

static DEFINE_SPINLOCK(balancing);
/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.
 */
void update_max_interval(void)
{
	max_load_balance_interval = HZ*num_online_cpus()/10;
}
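/*
 * Editor's illustration (not part of fair.c): with the formula above, a
 * hypothetical HZ=1000 machine with 8 CPUs online gets
 * max_load_balance_interval = 1000 * 8 / 10 = 800 jiffies (~800ms), while a
 * 64-CPU machine gets 6400 jiffies. A user-space sketch of the same
 * arithmetic, with HZ and the online CPU count passed in as assumptions:
 */
static unsigned long example_max_interval(unsigned long hz, unsigned int cpus)
{
	return hz * cpus / 10;	/* same formula as update_max_interval() */
}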
/*
 * Check each scheduling domain to see if it is due to be balanced,
 * and initiate a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
	int continue_balancing = 1;
	int cpu = rq->cpu;
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize, need_decay = 0;
	u64 max_cost = 0;

	update_blocked_averages(cpu);

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a regular
		 * visit to all the domains. Decay ~1% per second.
		 */
		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
			sd->max_newidle_lb_cost =
				(sd->max_newidle_lb_cost * 253) / 256;
			sd->next_decay_max_lb_cost = jiffies + HZ;
			need_decay = 1;
		}
		max_cost += sd->max_newidle_lb_cost;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!continue_balancing) {
			if (need_decay)
				continue;
			break;
		}

		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		interval = clamp(interval, 1UL, max_load_balance_interval);

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}
	if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
		rq->max_idle_balance_cost =
			max((u64)sysctl_sched_migration_cost, max_cost);
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to a null domain, for example, it will
	 * not be updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
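/*
 * Editor's illustration (not part of fair.c): two bits of arithmetic from
 * the function above, spelled out. (1) The per-domain newidle cost keeps
 * 253/256 of its old value on each regular visit (at most once per second),
 * i.e. roughly 1% is shaved off per second. (2) When the CPU is not idle,
 * the balance interval is multiplied by sd->busy_factor and then clamped to
 * max_load_balance_interval. A user-space sketch, with hypothetical names
 * (the ms-to-jiffies conversion is elided):
 */
static unsigned long long example_decay_cost(unsigned long long cost)
{
	/* keep 253/256 (~98.8%) of the old value, as above */
	return cost * 253 / 256;
}

static unsigned long example_busy_interval(unsigned long interval,
					   unsigned int busy_factor,
					   unsigned long max_interval)
{
	interval *= busy_factor;

	/* mirrors clamp(interval, 1UL, max_load_balance_interval) */
	if (interval < 1)
		interval = 1;
	if (interval > max_interval)
		interval = max_interval;
	return interval;
}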
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
	int this_cpu = this_rq->cpu;
	struct rq *rq;
	int balance_cpu;

	if (idle != CPU_IDLE ||
	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
		goto end;

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/*
		 * If this cpu gets work to do, stop the load balancing
		 * work being done for other cpus. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched())
			break;

		rq = cpu_rq(balance_cpu);

		raw_spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		update_idle_cpu_load(rq);
		raw_spin_unlock_irq(&rq->lock);

		rebalance_domains(rq, CPU_IDLE);

		if (time_after(this_rq->next_balance, rq->next_balance))
			this_rq->next_balance = rq->next_balance;
	}
	nohz.next_balance = this_rq->next_balance;
end:
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
  
  /*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle cpu in the system.
 *   - This rq has more than one task.
 *   - At any scheduler domain level, this cpu's scheduler group has multiple
 *     busy cpu's exceeding the group's power.
 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
 *     domain span are idle.
 */
static inline int nohz_kick_needed(struct rq *rq)
{
	unsigned long now = jiffies;
	struct sched_domain *sd;
	struct sched_group_power *sgp;
	int nr_busy, cpu = rq->cpu;

	if (unlikely(rq->idle_balance))
		return 0;

	/*
	 * We may have been in ticked or tickless idle mode recently.
	 * At the first busy tick after returning from idle, we will
	 * update the busy stats.
	 */
	set_cpu_sd_state_busy();
	nohz_balance_exit_idle(cpu);

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
	if (likely(!atomic_read(&nohz.nr_cpus)))
		return 0;

	if (time_before(now, nohz.next_balance))
		return 0;

	if (rq->nr_running >= 2)
		goto need_kick;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_busy, cpu));

	if (sd) {
		sgp = sd->groups->sgp;
		nr_busy = atomic_read(&sgp->nr_busy_cpus);

		if (nr_busy > 1)
			goto need_kick_unlock;
	}

	sd = rcu_dereference(per_cpu(sd_asym, cpu));

	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
				  sched_domain_span(sd)) < cpu))
		goto need_kick_unlock;

	rcu_read_unlock();
	return 0;

need_kick_unlock:
	rcu_read_unlock();
need_kick:
	return 1;
}
  #else
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
#endif

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *this_rq = this_rq();
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;
	rebalance_domains(this_rq, idle);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	nohz_idle_balance(this_rq, idle);
}
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
	if (nohz_kick_needed(rq))
		nohz_balancer_kick();
#endif
}
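/*
 * Editor's note (summary, not part of fair.c): the periodic path visible in
 * this file is scheduler tick -> trigger_load_balance() ->
 * raise_softirq(SCHED_SOFTIRQ) -> run_rebalance_domains() ->
 * rebalance_domains(); when this CPU was kicked on behalf of tickless CPUs,
 * nohz_idle_balance() additionally walks nohz.idle_cpus_mask and rebalances
 * each of those runqueues as well.
 */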
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}

static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (numabalancing_enabled)
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}
  
  /*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);
	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/*
	 * Not only the cpu but also the task_group of the parent might have
	 * been changed after parent->se.parent,cfs_rq were copied to
	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
	 * of child point to valid ones.
	 */
	rcu_read_lock();
	__set_task_cpu(p, this_cpu);
	rcu_read_unlock();

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	se->vruntime -= cfs_rq->min_vruntime;
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->se.on_rq)
		return;

	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */
	if (rq->curr == p) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * Ensure the task's vruntime is normalized, so that when it's
	 * switched back to the fair class the enqueue_entity(.flags=0) will
	 * do the right thing.
	 *
	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
	 * have normalized the vruntime; if it's !on_rq, then only when
	 * the task is sleeping will it still have non-normalized vruntime.
	 */
	if (!p->on_rq && p->state != TASK_RUNNING) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}

#ifdef CONFIG_SMP
	/*
	 * Remove our load from contribution when we leave sched_fair
	 * and ensure we don't carry in an old decay_count if we
	 * switch back.
	 */
	if (se->avg.decay_count) {
		__synchronize_entity_decay(se);
		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
	}
#endif
}
/*
 * We switched to the sched_fair class.
 */
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * Since the real depth could have been changed (only the FAIR
	 * class maintains a depth value), reset depth properly.
	 */
	se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
	if (!se->on_rq)
		return;

	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (rq->curr == p)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}
/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
	atomic64_set(&cfs_rq->decay_counter, 1);
	atomic_long_set(&cfs_rq->removed_load, 0);
#endif
}
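/*
 * Editor's note (illustration, not part of fair.c): min_vruntime starts at
 * (u64)(-(1LL << 20)), i.e. about a million below the 64-bit wrap point.
 * The usual reading is that this makes the unsigned counter wrap early in a
 * runqueue's life, so any comparison that is not wrap-safe shows up quickly.
 * vruntime comparisons elsewhere in this file are therefore written as
 * signed differences; a minimal sketch of that comparison style:
 */
#include <stdint.h>
#include <stdbool.h>

/* Wrap-safe "a is earlier than b", in the style of entity_before(). */
static bool example_vruntime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}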
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq;

	/*
	 * If the task was not on the rq at the time of this cgroup movement
	 * it must have been asleep; sleeping tasks keep their ->vruntime
	 * absolute on their old rq until wakeup (needed for the fair sleeper
	 * bonus in place_entity()).
	 *
	 * If it was on the rq, we've just 'preempted' it, which does convert
	 * ->vruntime to a relative base.
	 *
	 * Make sure both cases convert their relative position when migrating
	 * to another cgroup's rq. This does somewhat interfere with the
	 * fair sleeper stuff for the first placement, but who cares.
	 */
	/*
	 * When !on_rq, vruntime of the task has usually NOT been normalized.
	 * But there are some cases where it has already been normalized:
	 *
	 * - Moving a forked child which is waiting to be woken up by
	 *   wake_up_new_task().
	 * - Moving a task which has been woken up by try_to_wake_up() and
	 *   is waiting to actually be woken up by sched_ttwu_pending().
	 *
	 * To prevent boost or penalty in the new cfs_rq caused by delta
	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
	 */
	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
		on_rq = 1;

	if (!on_rq)
		se->vruntime -= cfs_rq_of(se)->min_vruntime;
	set_task_rq(p, task_cpu(p));
	se->depth = se->parent ? se->parent->depth + 1 : 0;
	if (!on_rq) {
		cfs_rq = cfs_rq_of(se);
		se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
		/*
		 * migrate_task_rq_fair() will have removed our previous
		 * contribution, but we must synchronize for ongoing future
		 * decay.
		 */
		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
#endif
	}
}
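/*
 * Editor's illustration (not part of fair.c): both task_fork_fair() and
 * task_move_group_fair() use the same two-step re-baselining: subtract the
 * old queue's min_vruntime to get a queue-relative lag, then add the
 * destination queue's min_vruntime when the task is placed there. That way
 * a task moved between cfs_rqs with very different min_vruntime neither
 * gains a sleep-style bonus nor pays a penalty. A sketch with made-up
 * numbers and a hypothetical helper name:
 */
#include <stdint.h>

static uint64_t example_move_vruntime(uint64_t vruntime,
				      uint64_t old_min, uint64_t new_min)
{
	uint64_t relative = vruntime - old_min;	/* se->vruntime -= old min */

	return new_min + relative;		/* se->vruntime += new min */
}
/*
 * e.g. vruntime=1000500 on a queue with min_vruntime=1000000 is a lag of
 * 500; placed on a queue with min_vruntime=40000000 it becomes 40000500
 * rather than keeping the stale absolute value.
 */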
  
  void free_fair_sched_group(struct task_group *tg)
  {
  	int i;
  
  	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
  
  	for_each_possible_cpu(i) {
  		if (tg->cfs_rq)
  			kfree(tg->cfs_rq[i]);
  		if (tg->se)
  			kfree(tg->se[i]);
  	}
  
  	kfree(tg->cfs_rq);
  	kfree(tg->se);
  }
  
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
  	struct cfs_rq *cfs_rq;
  	struct sched_entity *se;
  	int i;
  
  	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
  	if (!tg->cfs_rq)
  		goto err;
  	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
  	if (!tg->se)
  		goto err;
  
  	tg->shares = NICE_0_LOAD;
  
  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  
  	for_each_possible_cpu(i) {
  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
  				      GFP_KERNEL, cpu_to_node(i));
  		if (!cfs_rq)
  			goto err;
  
  		se = kzalloc_node(sizeof(struct sched_entity),
  				  GFP_KERNEL, cpu_to_node(i));
  		if (!se)
  			goto err_free_rq;
  
  		init_cfs_rq(cfs_rq);
  		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
  	}
  
  	return 1;
  
  err_free_rq:
  	kfree(cfs_rq);
  err:
  	return 0;
  }
  
  void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
  	struct rq *rq = cpu_rq(cpu);
  	unsigned long flags;
  
  	/*
  	* Only empty task groups can be destroyed; so we can speculatively
  	* check on_list without danger of it being re-added.
  	*/
  	if (!tg->cfs_rq[cpu]->on_list)
  		return;
  
  	raw_spin_lock_irqsave(&rq->lock, flags);
  	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
  	raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
  void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
  			struct sched_entity *se, int cpu,
  			struct sched_entity *parent)
  {
  	struct rq *rq = cpu_rq(cpu);
  
  	cfs_rq->tg = tg;
  	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent) {
		se->cfs_rq = &rq->cfs;
		se->depth = 0;
	} else {
		se->cfs_rq = parent->my_q;
		se->depth = parent->depth + 1;
	}

	se->my_q = cfs_rq;
	/* guarantee group entities always have weight */
	update_load_set(&se->load, NICE_0_LOAD);
  	se->parent = parent;
  }
  
  static DEFINE_MUTEX(shares_mutex);
  
  int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
  	int i;
  	unsigned long flags;
  
  	/*
  	 * We can't change the weight of the root cgroup.
  	 */
  	if (!tg->se[0])
  		return -EINVAL;
  
  	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  
  	mutex_lock(&shares_mutex);
  	if (tg->shares == shares)
  		goto done;
  
  	tg->shares = shares;
  	for_each_possible_cpu(i) {
  		struct rq *rq = cpu_rq(i);
  		struct sched_entity *se;
  
  		se = tg->se[i];
  		/* Propagate contribution to hierarchy */
  		raw_spin_lock_irqsave(&rq->lock, flags);
		/* Possible calls to update_curr() need rq clock */
		update_rq_clock(rq);
		for_each_sched_entity(se)
  			update_cfs_shares(group_cfs_rq(se));
  		raw_spin_unlock_irqrestore(&rq->lock, flags);
  	}
  
  done:
  	mutex_unlock(&shares_mutex);
  	return 0;
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
  
  void free_fair_sched_group(struct task_group *tg) { }
  
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
  	return 1;
  }
  
  void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
  
  #endif /* CONFIG_FAIR_GROUP_SCHED */
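/*
 * Editor's illustration (not part of fair.c): sched_group_set_shares()
 * clamps the new weight into [scale_load(MIN_SHARES), scale_load(MAX_SHARES)]
 * before propagating it per CPU, so userspace cannot set a zero or absurdly
 * large group weight. The limits below are assumptions for the sketch (the
 * real values live in the scheduler headers), as is the helper name:
 */
#define EXAMPLE_MIN_SHARES	2UL
#define EXAMPLE_MAX_SHARES	(1UL << 18)

static unsigned long example_clamp_shares(unsigned long shares)
{
	if (shares < EXAMPLE_MIN_SHARES)
		return EXAMPLE_MIN_SHARES;
	if (shares > EXAMPLE_MAX_SHARES)
		return EXAMPLE_MAX_SHARES;
	return shares;
}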

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

	return rr_interval;
}
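/*
 * Editor's illustration (not part of fair.c): the reported interval is the
 * would-be CFS slice converted from nanoseconds to jiffies. For example, a
 * 6ms slice is 6,000,000ns, which is 6 jiffies on a hypothetical HZ=1000
 * kernel and rounds down to 1 jiffy on HZ=250. A user-space sketch of that
 * conversion (same idea as NS_TO_JIFFIES; names are hypothetical):
 */
static unsigned long example_ns_to_jiffies(unsigned long long ns,
					   unsigned long hz)
{
	/* one jiffy lasts (10^9 / HZ) nanoseconds */
	return (unsigned long)(ns / (1000000000ULL / hz));
}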
/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,
#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
  
  #ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}