Eric Lee / smarc-ti-linux-kernel

1

/*

1

/*

2

* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)

2

* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)

3

*

3

*

4

5

*

5

*

6

* Interactivity improvements by Mike Galbraith

6

* Interactivity improvements by Mike Galbraith

7

8

*

8

*

9

* Various enhancements by Dmitry Adamushko.

9

* Various enhancements by Dmitry Adamushko.

10

11

*

11

*

12

* Group scheduling enhancements by Srivatsa Vaddagiri

12

* Group scheduling enhancements by Srivatsa Vaddagiri

13

* Copyright IBM Corporation, 2007

13

* Copyright IBM Corporation, 2007

14

* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>

14

* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>

15

*

15

*

16

* Scaled math optimizations by Thomas Gleixner

16

* Scaled math optimizations by Thomas Gleixner

17

18

*

18

*

19

* Adaptive scheduling granularity, math enhancements by Peter Zijlstra

19

* Adaptive scheduling granularity, math enhancements by Peter Zijlstra

20

21

*/

21

*/

22

23

#include <linux/latencytop.h>

23

#include <linux/latencytop.h>

24

#include <linux/sched.h>

24

#include <linux/sched.h>

25

#include <linux/cpumask.h>

25

#include <linux/cpumask.h>

26

#include <linux/slab.h>

26

#include <linux/slab.h>

27

#include <linux/profile.h>

27

#include <linux/profile.h>

28

#include <linux/interrupt.h>

28

#include <linux/interrupt.h>

29

#include <linux/mempolicy.h>

29

#include <linux/mempolicy.h>

30

#include <linux/migrate.h>

30

#include <linux/migrate.h>

31

#include <linux/task_work.h>

31

#include <linux/task_work.h>

32

33

#include <trace/events/sched.h>

33

#include <trace/events/sched.h>

34

35

#include "sched.h"

35

#include "sched.h"

36

37

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * The normalized_ variant holds the unscaled value; update_sysctl()
 * multiplies it by the CPU scaling factor to produce the live value.
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

51

52

/*

52

/*

53

* The initial- and re-scaling of tunables is configurable

53

* The initial- and re-scaling of tunables is configurable

54

* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))

54

* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))

55

*

55

*

56

* Options are:

56

* Options are:

57

* SCHED_TUNABLESCALING_NONE - unscaled, always *1

57

* SCHED_TUNABLESCALING_NONE - unscaled, always *1

58

* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)

58

* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)

59

* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus

59

* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus

60

*/

60

*/

61

enum sched_tunable_scaling sysctl_sched_tunable_scaling

61

enum sched_tunable_scaling sysctl_sched_tunable_scaling

62

= SCHED_TUNABLESCALING_LOG;

62

= SCHED_TUNABLESCALING_LOG;

63

64

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * The normalized_ variant holds the unscaled value; update_sysctl()
 * multiplies it by the CPU scaling factor to produce the live value.
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

70

71

/*
 * sched_nr_latency is kept at
 * sysctl_sched_latency / sysctl_sched_min_granularity
 * (6000000 / 750000 == 8 with the default values).
 */
static unsigned int sched_nr_latency = 8;

75

76

/*

76

/*

77

* After fork, child runs first. If set to 0 (default) then

77

* After fork, child runs first. If set to 0 (default) then

78

* parent will (try to) run first.

78

* parent will (try to) run first.

79

*/

79

*/

80

unsigned int sysctl_sched_child_runs_first __read_mostly;

80

unsigned int sysctl_sched_child_runs_first __read_mostly;

81

82

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * The normalized_ variant holds the unscaled value; update_sysctl()
 * multiplies it by the CPU scaling factor to produce the live value.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

92

93

/* Initialised to 500000; NOTE(review): presumably nanoseconds like the tunables above - confirm. */
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

94

95

/*

95

/*

96

* The exponential sliding window over which load is averaged for shares

96

* The exponential sliding window over which load is averaged for shares

97

* distribution.

97

* distribution.

98

* (default: 10msec)

98

* (default: 10msec)

99

*/

99

*/

100

unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

100

unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

101

102

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

115

116

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

116

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

117

{

117

{

118

lw->weight += inc;

118

lw->weight += inc;

119

lw->inv_weight = 0;

119

lw->inv_weight = 0;

120

}

120

}

121

122

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

122

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

123

{

123

{

124

lw->weight -= dec;

124

lw->weight -= dec;

125

lw->inv_weight = 0;

125

lw->inv_weight = 0;

126

}

126

}

127

128

static inline void update_load_set(struct load_weight *lw, unsigned long w)

128

static inline void update_load_set(struct load_weight *lw, unsigned long w)

129

{

129

{

130

lw->weight = w;

130

lw->weight = w;

131

lw->inv_weight = 0;

131

lw->inv_weight = 0;

132

}

132

}

133

134

/*

134

/*

135

* Increase the granularity value when there are more CPUs,

135

* Increase the granularity value when there are more CPUs,

136

* because with more CPUs the 'effective latency' as visible

136

* because with more CPUs the 'effective latency' as visible

137

* to users decreases. But the relationship is not linear,

137

* to users decreases. But the relationship is not linear,

138

* so pick a second-best guess by going with the log2 of the

138

* so pick a second-best guess by going with the log2 of the

139

* number of CPUs.

139

* number of CPUs.

140

*

140

*

141

* This idea comes from the SD scheduler of Con Kolivas:

141

* This idea comes from the SD scheduler of Con Kolivas:

142

*/

142

*/

143

static int get_update_sysctl_factor(void)

143

static int get_update_sysctl_factor(void)

144

{

144

{

145

unsigned int cpus = min_t(int, num_online_cpus(), 8);

145

unsigned int cpus = min_t(int, num_online_cpus(), 8);

146

unsigned int factor;

146

unsigned int factor;

147

148

switch (sysctl_sched_tunable_scaling) {

148

switch (sysctl_sched_tunable_scaling) {

149

case SCHED_TUNABLESCALING_NONE:

149

case SCHED_TUNABLESCALING_NONE:

150

factor = 1;

150

factor = 1;

151

break;

151

break;

152

case SCHED_TUNABLESCALING_LINEAR:

152

case SCHED_TUNABLESCALING_LINEAR:

153

factor = cpus;

153

factor = cpus;

154

break;

154

break;

155

case SCHED_TUNABLESCALING_LOG:

155

case SCHED_TUNABLESCALING_LOG:

156

default:

156

default:

157

factor = 1 + ilog2(cpus);

157

factor = 1 + ilog2(cpus);

158

break;

158

break;

159

}

159

}

160

161

return factor;

161

return factor;

162

}

162

}

163

164

static void update_sysctl(void)

164

static void update_sysctl(void)

165

{

165

{

166

unsigned int factor = get_update_sysctl_factor();

166

unsigned int factor = get_update_sysctl_factor();

167

168

#define SET_SYSCTL(name) \

168

#define SET_SYSCTL(name) \

169

(sysctl_##name = (factor) * normalized_sysctl_##name)

169

(sysctl_##name = (factor) * normalized_sysctl_##name)

170

SET_SYSCTL(sched_min_granularity);

170

SET_SYSCTL(sched_min_granularity);

171

SET_SYSCTL(sched_latency);

171

SET_SYSCTL(sched_latency);

172

SET_SYSCTL(sched_wakeup_granularity);

172

SET_SYSCTL(sched_wakeup_granularity);

173

#undef SET_SYSCTL

173

#undef SET_SYSCTL

174

}

174

}

175

176

/* Apply the CPU-count scaling to the granularity tunables via update_sysctl(). */
void sched_init_granularity(void)
{
	update_sysctl();
}

180

181

/*
 * Fixed-point parameters for the inverse-weight arithmetic below:
 * inv_weight is WMULT_CONST / weight, and products are shifted right by
 * WMULT_SHIFT.
 */
#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

183

184

static void __update_inv_weight(struct load_weight *lw)

184

static void __update_inv_weight(struct load_weight *lw)

185

{

185

{

186

unsigned long w;

186

unsigned long w;

187

188

if (likely(lw->inv_weight))

188

if (likely(lw->inv_weight))

189

return;

189

return;

190

191

w = scale_load_down(lw->weight);

191

w = scale_load_down(lw->weight);

192

193

if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))

193

if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))

194

lw->inv_weight = 1;

194

lw->inv_weight = 1;

195

else if (unlikely(!w))

195

else if (unlikely(!w))

196

lw->inv_weight = WMULT_CONST;

196

lw->inv_weight = WMULT_CONST;

197

else

197

else

198

lw->inv_weight = WMULT_CONST / w;

198

lw->inv_weight = WMULT_CONST / w;

199

}

199

}

200

201

/*

201

/*

202

* delta_exec * weight / lw.weight

202

* delta_exec * weight / lw.weight

203

* OR

203

* OR

204

* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT

204

* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT

205

*

205

*

206

* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case

206

* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case

207

* we're guaranteed shift stays positive because inv_weight is guaranteed to

207

* we're guaranteed shift stays positive because inv_weight is guaranteed to

208

* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.

208

* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.

209

*

209

*

210

* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus

210

* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus

211

* weight/lw.weight <= 1, and therefore our shift will also be positive.

211

* weight/lw.weight <= 1, and therefore our shift will also be positive.

212

*/

212

*/

213

static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)

213

static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)

214

{

214

{

215

u64 fact = scale_load_down(weight);

215

u64 fact = scale_load_down(weight);

216

int shift = WMULT_SHIFT;

216

int shift = WMULT_SHIFT;

217

218

__update_inv_weight(lw);

218

__update_inv_weight(lw);

219

220

if (unlikely(fact >> 32)) {

220

if (unlikely(fact >> 32)) {

221

while (fact >> 32) {

221

while (fact >> 32) {

222

fact >>= 1;

222

fact >>= 1;

223

shift--;

223

shift--;

224

}

224

}

225

}

225

}

226

227

/* hint to use a 32x32->64 mul */

227

/* hint to use a 32x32->64 mul */

228

fact = (u64)(u32)fact * lw->inv_weight;

228

fact = (u64)(u32)fact * lw->inv_weight;

229

230

while (fact >> 32) {

230

while (fact >> 32) {

231

fact >>= 1;

231

fact >>= 1;

232

shift--;

232

shift--;

233

}

233

}

234

235

return mul_u64_u32_shr(delta_exec, fact, shift);

235

return mul_u64_u32_shr(delta_exec, fact, shift);

236

}

236

}

237

238

239

const struct sched_class fair_sched_class;

239

const struct sched_class fair_sched_class;

240

241

/**************************************************************

241

/**************************************************************

242

* CFS operations on generic schedulable entities:

242

* CFS operations on generic schedulable entities:

243

*/

243

*/

244

245

#ifdef CONFIG_FAIR_GROUP_SCHED

245

#ifdef CONFIG_FAIR_GROUP_SCHED

246

247

/* cpu runqueue to which this cfs_rq is attached */

247

/* cpu runqueue to which this cfs_rq is attached */

248

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

248

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

249

{

249

{

250

return cfs_rq->rq;

250

return cfs_rq->rq;

251

}

251

}

252

253

/*
 * An entity is a task if it doesn't "own" a runqueue.
 * The argument is parenthesized so that expressions such as &foo or
 * *ptr expand correctly.
 */
#define entity_is_task(se)	(!(se)->my_q)

255

256

static inline struct task_struct *task_of(struct sched_entity *se)

256

static inline struct task_struct *task_of(struct sched_entity *se)

257

{

257

{

258

#ifdef CONFIG_SCHED_DEBUG

258

#ifdef CONFIG_SCHED_DEBUG

259

WARN_ON_ONCE(!entity_is_task(se));

259

WARN_ON_ONCE(!entity_is_task(se));

260

#endif

260

#endif

261

return container_of(se, struct task_struct, se);

261

return container_of(se, struct task_struct, se);

262

}

262

}

263

264

/*
 * Walk up scheduling entities hierarchy.
 * @se must be an lvalue; it is advanced to se->parent each iteration and
 * is NULL when the loop runs to completion.
 */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

267

268

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

268

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

269

{

269

{

270

return p->se.cfs_rq;

270

return p->se.cfs_rq;

271

}

271

}

272

273

/* runqueue on which this entity is (to be) queued */

273

/* runqueue on which this entity is (to be) queued */

274

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

274

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

275

{

275

{

276

return se->cfs_rq;

276

return se->cfs_rq;

277

}

277

}

278

279

/* runqueue "owned" by this group */

279

/* runqueue "owned" by this group */

280

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

280

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

281

{

281

{

282

return grp->my_q;

282

return grp->my_q;

283

}

283

}

284

285

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,

285

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,

286

int force_update);

286

int force_update);

287

288

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)

288

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)

289

{

289

{

290

if (!cfs_rq->on_list) {

290

if (!cfs_rq->on_list) {

291

/*

291

/*

292

* Ensure we either appear before our parent (if already

292

* Ensure we either appear before our parent (if already

293

* enqueued) or force our parent to appear after us when it is

293

* enqueued) or force our parent to appear after us when it is

294

* enqueued. The fact that we always enqueue bottom-up

294

* enqueued. The fact that we always enqueue bottom-up

295

* reduces this to two cases.

295

* reduces this to two cases.

296

*/

296

*/

297

if (cfs_rq->tg->parent &&

297

if (cfs_rq->tg->parent &&

298

cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {

298

cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {

299

list_add_rcu(&cfs_rq->leaf_cfs_rq_list,

299

list_add_rcu(&cfs_rq->leaf_cfs_rq_list,

300

&rq_of(cfs_rq)->leaf_cfs_rq_list);

300

&rq_of(cfs_rq)->leaf_cfs_rq_list);

301

} else {

301

} else {

302

list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,

302

list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,

303

&rq_of(cfs_rq)->leaf_cfs_rq_list);

303

&rq_of(cfs_rq)->leaf_cfs_rq_list);

304

}

304

}

305

306

cfs_rq->on_list = 1;

306

cfs_rq->on_list = 1;

307

/* We should have no load, but we need to update last_decay. */

307

/* We should have no load, but we need to update last_decay. */

308

update_cfs_rq_blocked_load(cfs_rq, 0);

308

update_cfs_rq_blocked_load(cfs_rq, 0);

309

}

309

}

310

}

310

}

311

312

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)

312

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)

313

{

313

{

314

if (cfs_rq->on_list) {

314

if (cfs_rq->on_list) {

315

list_del_rcu(&cfs_rq->leaf_cfs_rq_list);

315

list_del_rcu(&cfs_rq->leaf_cfs_rq_list);

316

cfs_rq->on_list = 0;

316

cfs_rq->on_list = 0;

317

}

317

}

318

}

318

}

319

320

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)

323

324

/* Do the two (enqueued) entities belong to the same group ? */

324

/* Do the two (enqueued) entities belong to the same group ? */

325

static inline struct cfs_rq *

325

static inline struct cfs_rq *

326

is_same_group(struct sched_entity *se, struct sched_entity *pse)

326

is_same_group(struct sched_entity *se, struct sched_entity *pse)

327

{

327

{

328

if (se->cfs_rq == pse->cfs_rq)

328

if (se->cfs_rq == pse->cfs_rq)

329

return se->cfs_rq;

329

return se->cfs_rq;

330

331

return NULL;

331

return NULL;

332

}

332

}

333

334

static inline struct sched_entity *parent_entity(struct sched_entity *se)

334

static inline struct sched_entity *parent_entity(struct sched_entity *se)

335

{

335

{

336

return se->parent;

336

return se->parent;

337

}

337

}

338

339

static void

339

static void

340

find_matching_se(struct sched_entity **se, struct sched_entity **pse)

340

find_matching_se(struct sched_entity **se, struct sched_entity **pse)

341

{

341

{

342

int se_depth, pse_depth;

342

int se_depth, pse_depth;

343

344

/*

344

/*

345

* preemption test can be made between sibling entities who are in the

345

* preemption test can be made between sibling entities who are in the

346

* same cfs_rq i.e who have a common parent. Walk up the hierarchy of

346

* same cfs_rq i.e who have a common parent. Walk up the hierarchy of

347

* both tasks until we find their ancestors who are siblings of common

347

* both tasks until we find their ancestors who are siblings of common

348

* parent.

348

* parent.

349

*/

349

*/

350

351

/* First walk up until both entities are at same depth */

351

/* First walk up until both entities are at same depth */

352

se_depth = (*se)->depth;

352

se_depth = (*se)->depth;

353

pse_depth = (*pse)->depth;

353

pse_depth = (*pse)->depth;

354

355

while (se_depth > pse_depth) {

355

while (se_depth > pse_depth) {

356

se_depth--;

356

se_depth--;

357

*se = parent_entity(*se);

357

*se = parent_entity(*se);

358

}

358

}

359

360

while (pse_depth > se_depth) {

360

while (pse_depth > se_depth) {

361

pse_depth--;

361

pse_depth--;

362

*pse = parent_entity(*pse);

362

*pse = parent_entity(*pse);

363

}

363

}

364

365

while (!is_same_group(*se, *pse)) {

365

while (!is_same_group(*se, *pse)) {

366

*se = parent_entity(*se);

366

*se = parent_entity(*se);

367

*pse = parent_entity(*pse);

367

*pse = parent_entity(*pse);

368

}

368

}

369

}

369

}

370

371

#else /* !CONFIG_FAIR_GROUP_SCHED */

371

#else /* !CONFIG_FAIR_GROUP_SCHED */

372

373

static inline struct task_struct *task_of(struct sched_entity *se)

373

static inline struct task_struct *task_of(struct sched_entity *se)

374

{

374

{

375

return container_of(se, struct task_struct, se);

375

return container_of(se, struct task_struct, se);

376

}

376

}

377

378

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

378

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

379

{

379

{

380

return container_of(cfs_rq, struct rq, cfs);

380

return container_of(cfs_rq, struct rq, cfs);

381

}

381

}

382

383

/* Without group scheduling every entity is a task. */
#define entity_is_task(se)	1

384

385

/*
 * Without group scheduling there is no hierarchy: the body runs once for
 * a non-NULL @se, after which @se is set to NULL.
 */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)

387

388

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

388

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

389

{

389

{

390

return &task_rq(p)->cfs;

390

return &task_rq(p)->cfs;

391

}

391

}

392

393

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

393

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

394

{

394

{

395

struct task_struct *p = task_of(se);

395

struct task_struct *p = task_of(se);

396

struct rq *rq = task_rq(p);

396

struct rq *rq = task_rq(p);

397

398

return &rq->cfs;

398

return &rq->cfs;

399

}

399

}

400

401

/* runqueue "owned" by this group */

401

/* runqueue "owned" by this group */

402

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

402

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

403

{

403

{

404

return NULL;

404

return NULL;

405

}

405

}

406

407

/* No-op stub for the !CONFIG_FAIR_GROUP_SCHED build. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

410

411

/* No-op stub for the !CONFIG_FAIR_GROUP_SCHED build. */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

414

415

/*
 * Without group scheduling the only cfs_rq is the runqueue's own: the
 * body runs once with cfs_rq == &rq->cfs, then cfs_rq is set to NULL.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &(rq)->cfs; cfs_rq; cfs_rq = NULL)

417

418

static inline struct sched_entity *parent_entity(struct sched_entity *se)

418

static inline struct sched_entity *parent_entity(struct sched_entity *se)

419

{

419

{

420

return NULL;

420

return NULL;

421

}

421

}

422

423

/* No-op stub for the !CONFIG_FAIR_GROUP_SCHED build; *se and *pse are left untouched. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

427

428

#endif /* CONFIG_FAIR_GROUP_SCHED */

428

#endif /* CONFIG_FAIR_GROUP_SCHED */

429

430

static __always_inline

430

static __always_inline

431

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

431

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

432

433

/**************************************************************

433

/**************************************************************

434

* Scheduling class tree data structure manipulation methods:

434

* Scheduling class tree data structure manipulation methods:

435

*/

435

*/

436

437

/*
 * Return the later of two vruntime values.
 *
 * The comparison is made on the signed difference, so it stays correct
 * when the u64 counters wrap around.
 */
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

445

446

/*
 * Return the earlier of two vruntime values.
 *
 * The comparison is made on the signed difference, so it stays correct
 * when the u64 counters wrap around.
 */
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

454

455

static inline int entity_before(struct sched_entity *a,

455

static inline int entity_before(struct sched_entity *a,

456

struct sched_entity *b)

456

struct sched_entity *b)

457

{

457

{

458

return (s64)(a->vruntime - b->vruntime) < 0;

458

return (s64)(a->vruntime - b->vruntime) < 0;

459

}

459

}

460

461

static void update_min_vruntime(struct cfs_rq *cfs_rq)

461

static void update_min_vruntime(struct cfs_rq *cfs_rq)

462

{

462

{

463

u64 vruntime = cfs_rq->min_vruntime;

463

u64 vruntime = cfs_rq->min_vruntime;

464

465

if (cfs_rq->curr)

465

if (cfs_rq->curr)

466

vruntime = cfs_rq->curr->vruntime;

466

vruntime = cfs_rq->curr->vruntime;

467

468

if (cfs_rq->rb_leftmost) {

468

if (cfs_rq->rb_leftmost) {

469

struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,

469

struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,

470

struct sched_entity,

470

struct sched_entity,

471

run_node);

471

run_node);

472

473

if (!cfs_rq->curr)

473

if (!cfs_rq->curr)

474

vruntime = se->vruntime;

474

vruntime = se->vruntime;

475

else

475

else

476

vruntime = min_vruntime(vruntime, se->vruntime);

476

vruntime = min_vruntime(vruntime, se->vruntime);

477

}

477

}

478

479

/* ensure we never gain time by being placed backwards. */

479

/* ensure we never gain time by being placed backwards. */

480

cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);

480

cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);

481

#ifndef CONFIG_64BIT

481

#ifndef CONFIG_64BIT

482

smp_wmb();

482

smp_wmb();

483

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

483

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

484

#endif

484

#endif

485

}

485

}

486

487

/*

487

/*

488

* Enqueue an entity into the rb-tree:

488

* Enqueue an entity into the rb-tree:

489

*/

489

*/

490

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

490

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

491

{

491

{

492

struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;

492

struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;

493

struct rb_node *parent = NULL;

493

struct rb_node *parent = NULL;

494

struct sched_entity *entry;

494

struct sched_entity *entry;

495

int leftmost = 1;

495

int leftmost = 1;

496

497

/*

497

/*

498

* Find the right place in the rbtree:

498

* Find the right place in the rbtree:

499

*/

499

*/

500

while (*link) {

500

while (*link) {

501

parent = *link;

501

parent = *link;

502

entry = rb_entry(parent, struct sched_entity, run_node);

502

entry = rb_entry(parent, struct sched_entity, run_node);

503

/*

503

/*

504

* We dont care about collisions. Nodes with

504

* We dont care about collisions. Nodes with

505

* the same key stay together.

505

* the same key stay together.

506

*/

506

*/

507

if (entity_before(se, entry)) {

507

if (entity_before(se, entry)) {

508

link = &parent->rb_left;

508

link = &parent->rb_left;

509

} else {

509

} else {

510

link = &parent->rb_right;

510

link = &parent->rb_right;

511

leftmost = 0;

511

leftmost = 0;

512

}

512

}

513

}

513

}

514

515

/*

515

/*

516

* Maintain a cache of leftmost tree entries (it is frequently

516

* Maintain a cache of leftmost tree entries (it is frequently

517

* used):

517

* used):

518

*/

518

*/

519

if (leftmost)

519

if (leftmost)

520

cfs_rq->rb_leftmost = &se->run_node;

520

cfs_rq->rb_leftmost = &se->run_node;

521

522

rb_link_node(&se->run_node, parent, link);

522

rb_link_node(&se->run_node, parent, link);

523

rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);

523

rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);

524

}

524

}

525

526

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

526

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

527

{

527

{

528

if (cfs_rq->rb_leftmost == &se->run_node) {

528

if (cfs_rq->rb_leftmost == &se->run_node) {

529

struct rb_node *next_node;

529

struct rb_node *next_node;

530

531

next_node = rb_next(&se->run_node);

531

next_node = rb_next(&se->run_node);

532

cfs_rq->rb_leftmost = next_node;

532

cfs_rq->rb_leftmost = next_node;

533

}

533

}

534

535

rb_erase(&se->run_node, &cfs_rq->tasks_timeline);

535

rb_erase(&se->run_node, &cfs_rq->tasks_timeline);

536

}

536

}

537

538

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)

538

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)

539

{

539

{

540

struct rb_node *left = cfs_rq->rb_leftmost;

540

struct rb_node *left = cfs_rq->rb_leftmost;

541

542

if (!left)

542

if (!left)

543

return NULL;

543

return NULL;

544

545

return rb_entry(left, struct sched_entity, run_node);

545

return rb_entry(left, struct sched_entity, run_node);

546

}

546

}

547

548

static struct sched_entity *__pick_next_entity(struct sched_entity *se)

548

static struct sched_entity *__pick_next_entity(struct sched_entity *se)

549

{

549

{

550

struct rb_node *next = rb_next(&se->run_node);

550

struct rb_node *next = rb_next(&se->run_node);

551

552

if (!next)

552

if (!next)

553

return NULL;

553

return NULL;

554

555

return rb_entry(next, struct sched_entity, run_node);

555

return rb_entry(next, struct sched_entity, run_node);

556

}

556

}

557

558

#ifdef CONFIG_SCHED_DEBUG

558

#ifdef CONFIG_SCHED_DEBUG

559

struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)

559

struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)

560

{

560

{

561

struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

561

struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

562

563

if (!last)

563

if (!last)

564

return NULL;

564

return NULL;

565

566

return rb_entry(last, struct sched_entity, run_node);

566

return rb_entry(last, struct sched_entity, run_node);

567

}

567

}

568

569

/**************************************************************

569

/**************************************************************

570

* Scheduling class statistics methods:

570

* Scheduling class statistics methods:

571

*/

571

*/

572

573

int sched_proc_update_handler(struct ctl_table *table, int write,

573

int sched_proc_update_handler(struct ctl_table *table, int write,

574

void __user *buffer, size_t *lenp,

574

void __user *buffer, size_t *lenp,

575

loff_t *ppos)

575

loff_t *ppos)

576

{

576

{

577

int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

577

int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

578

int factor = get_update_sysctl_factor();

578

int factor = get_update_sysctl_factor();

579

580

if (ret || !write)

580

if (ret || !write)

581

return ret;

581

return ret;

582

583

sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,

583

sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,

584

sysctl_sched_min_granularity);

584

sysctl_sched_min_granularity);

585

586

#define WRT_SYSCTL(name) \

586

#define WRT_SYSCTL(name) \

587

(normalized_sysctl_##name = sysctl_##name / (factor))

587

(normalized_sysctl_##name = sysctl_##name / (factor))

588

WRT_SYSCTL(sched_min_granularity);

588

WRT_SYSCTL(sched_min_granularity);

589

WRT_SYSCTL(sched_latency);

589

WRT_SYSCTL(sched_latency);

590

WRT_SYSCTL(sched_wakeup_granularity);

590

WRT_SYSCTL(sched_wakeup_granularity);

591

#undef WRT_SYSCTL

591

#undef WRT_SYSCTL

592

593

return 0;

593

return 0;

594

}

594

}

595

#endif

595

#endif

596

597

/*

597

/*

598

* delta /= w

598

* delta /= w

599

*/

599

*/

600

static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)

600

static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)

601

{

601

{

602

if (unlikely(se->load.weight != NICE_0_LOAD))

602

if (unlikely(se->load.weight != NICE_0_LOAD))

603

delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

603

delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

604

605

return delta;

605

return delta;

606

}

606

}

607

608

/*

608

/*

609

* The idea is to set a period in which each task runs once.

609

* The idea is to set a period in which each task runs once.

610

*

610

*

611

* When there are too many tasks (sched_nr_latency) we have to stretch

611

* When there are too many tasks (sched_nr_latency) we have to stretch

612

* this period because otherwise the slices get too small.

612

* this period because otherwise the slices get too small.

613

*

613

*

614

* p = (nr <= nl) ? l : l*nr/nl

614

* p = (nr <= nl) ? l : l*nr/nl

615

*/

615

*/

616

static u64 __sched_period(unsigned long nr_running)

616

static u64 __sched_period(unsigned long nr_running)

617

{

617

{

618

u64 period = sysctl_sched_latency;

618

u64 period = sysctl_sched_latency;

619

unsigned long nr_latency = sched_nr_latency;

619

unsigned long nr_latency = sched_nr_latency;

620

621

if (unlikely(nr_running > nr_latency)) {

621

if (unlikely(nr_running > nr_latency)) {

622

period = sysctl_sched_min_granularity;

622

period = sysctl_sched_min_granularity;

623

period *= nr_running;

623

period *= nr_running;

624

}

624

}

625

626

return period;

626

return period;

627

}

627

}

628

629

/*

629

/*

630

* We calculate the wall-time slice from the period by taking a part

630

* We calculate the wall-time slice from the period by taking a part

631

* proportional to the weight.

631

* proportional to the weight.

632

*

632

*

633

* s = p*P[w/rw]

633

* s = p*P[w/rw]

634

*/

634

*/

635

static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)

635

static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)

636

{

636

{

637

u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

637

u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

638

639

for_each_sched_entity(se) {

639

for_each_sched_entity(se) {

640

struct load_weight *load;

640

struct load_weight *load;

641

struct load_weight lw;

641

struct load_weight lw;

642

643

cfs_rq = cfs_rq_of(se);

643

cfs_rq = cfs_rq_of(se);

644

load = &cfs_rq->load;

644

load = &cfs_rq->load;

645

646

if (unlikely(!se->on_rq)) {

646

if (unlikely(!se->on_rq)) {

647

lw = cfs_rq->load;

647

lw = cfs_rq->load;

648

649

update_load_add(&lw, se->load.weight);

649

update_load_add(&lw, se->load.weight);

650

load = &lw;

650

load = &lw;

651

}

651

}

652

slice = __calc_delta(slice, se->load.weight, load);

652

slice = __calc_delta(slice, se->load.weight, load);

653

}

653

}

654

return slice;

654

return slice;

655

}

655

}

656

657

/*

657

/*

658

* We calculate the vruntime slice of a to-be-inserted task.

658

* We calculate the vruntime slice of a to-be-inserted task.

659

*

659

*

660

* vs = s/w

660

* vs = s/w

661

*/

661

*/

662

static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)

662

static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)

663

{

663

{

664

return calc_delta_fair(sched_slice(cfs_rq, se), se);

664

return calc_delta_fair(sched_slice(cfs_rq, se), se);

665

}

665

}

666

667

#ifdef CONFIG_SMP

667

#ifdef CONFIG_SMP

668

static unsigned long task_h_load(struct task_struct *p);

668

static unsigned long task_h_load(struct task_struct *p);

669

670

static inline void __update_task_entity_contrib(struct sched_entity *se);

670

static inline void __update_task_entity_contrib(struct sched_entity *se);

671

672

/* Give new task start runnable values to heavy its load in infant time */

672

/* Give new task start runnable values to heavy its load in infant time */

673

void init_task_runnable_average(struct task_struct *p)

673

void init_task_runnable_average(struct task_struct *p)

674

{

674

{

675

u32 slice;

675

u32 slice;

676

677

p->se.avg.decay_count = 0;

677

p->se.avg.decay_count = 0;

678

slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;

678

slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;

679

p->se.avg.runnable_avg_sum = slice;

679

p->se.avg.runnable_avg_sum = slice;

680

p->se.avg.runnable_avg_period = slice;

680

p->se.avg.runnable_avg_period = slice;

681

__update_task_entity_contrib(&p->se);

681

__update_task_entity_contrib(&p->se);

682

}

682

}

683

#else

683

#else

684

void init_task_runnable_average(struct task_struct *p)

684

void init_task_runnable_average(struct task_struct *p)

685

{

685

{

686

}

686

}

687

#endif

687

#endif

688

689

/*

689

/*

690

* Update the current task's runtime statistics.

690

* Update the current task's runtime statistics.

691

*/

691

*/

692

static void update_curr(struct cfs_rq *cfs_rq)

692

static void update_curr(struct cfs_rq *cfs_rq)

693

{

693

{

694

struct sched_entity *curr = cfs_rq->curr;

694

struct sched_entity *curr = cfs_rq->curr;

695

u64 now = rq_clock_task(rq_of(cfs_rq));

695

u64 now = rq_clock_task(rq_of(cfs_rq));

696

u64 delta_exec;

696

u64 delta_exec;

697

698

if (unlikely(!curr))

698

if (unlikely(!curr))

699

return;

699

return;

700

701

delta_exec = now - curr->exec_start;

701

delta_exec = now - curr->exec_start;

702

if (unlikely((s64)delta_exec <= 0))

702

if (unlikely((s64)delta_exec <= 0))

703

return;

703

return;

704

705

curr->exec_start = now;

705

curr->exec_start = now;

706

707

schedstat_set(curr->statistics.exec_max,

707

schedstat_set(curr->statistics.exec_max,

708

max(delta_exec, curr->statistics.exec_max));

708

max(delta_exec, curr->statistics.exec_max));

709

710

curr->sum_exec_runtime += delta_exec;

710

curr->sum_exec_runtime += delta_exec;

711

schedstat_add(cfs_rq, exec_clock, delta_exec);

711

schedstat_add(cfs_rq, exec_clock, delta_exec);

712

713

curr->vruntime += calc_delta_fair(delta_exec, curr);

713

curr->vruntime += calc_delta_fair(delta_exec, curr);

714

update_min_vruntime(cfs_rq);

714

update_min_vruntime(cfs_rq);

715

716

if (entity_is_task(curr)) {

716

if (entity_is_task(curr)) {

717

struct task_struct *curtask = task_of(curr);

717

struct task_struct *curtask = task_of(curr);

718

719

trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);

719

trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);

720

cpuacct_charge(curtask, delta_exec);

720

cpuacct_charge(curtask, delta_exec);

721

account_group_exec_runtime(curtask, delta_exec);

721

account_group_exec_runtime(curtask, delta_exec);

722

}

722

}

723

724

account_cfs_rq_runtime(cfs_rq, delta_exec);

724

account_cfs_rq_runtime(cfs_rq, delta_exec);

725

}

725

}

726

727

static inline void

727

static inline void

728

update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

728

update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

729

{

729

{

730

schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));

730

schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));

731

}

731

}

732

733

/*

733

/*

734

* Task is being enqueued - update stats:

734

* Task is being enqueued - update stats:

735

*/

735

*/

736

static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

736

static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

737

{

737

{

738

/*

738

/*

739

* Are we enqueueing a waiting task? (for current tasks

739

* Are we enqueueing a waiting task? (for current tasks

740

* a dequeue/enqueue event is a NOP)

740

* a dequeue/enqueue event is a NOP)

741

*/

741

*/

742

if (se != cfs_rq->curr)

742

if (se != cfs_rq->curr)

743

update_stats_wait_start(cfs_rq, se);

743

update_stats_wait_start(cfs_rq, se);

744

}

744

}

745

746

static void

746

static void

747

update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)

747

update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)

748

{

748

{

749

schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,

749

schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,

750

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));

750

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));

751

schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);

751

schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);

752

schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +

752

schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +

753

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

753

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

754

#ifdef CONFIG_SCHEDSTATS

754

#ifdef CONFIG_SCHEDSTATS

755

if (entity_is_task(se)) {

755

if (entity_is_task(se)) {

756

trace_sched_stat_wait(task_of(se),

756

trace_sched_stat_wait(task_of(se),

757

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

757

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

758

}

758

}

759

#endif

759

#endif

760

schedstat_set(se->statistics.wait_start, 0);

760

schedstat_set(se->statistics.wait_start, 0);

761

}

761

}

762

763

static inline void

763

static inline void

764

update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

764

update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

765

{

765

{

766

/*

766

/*

767

* Mark the end of the wait period if dequeueing a

767

* Mark the end of the wait period if dequeueing a

768

* waiting task:

768

* waiting task:

769

*/

769

*/

770

if (se != cfs_rq->curr)

770

if (se != cfs_rq->curr)

771

update_stats_wait_end(cfs_rq, se);

771

update_stats_wait_end(cfs_rq, se);

772

}

772

}

773

774

/*

774

/*

775

* We are picking a new current task - update its stats:

775

* We are picking a new current task - update its stats:

776

*/

776

*/

777

static inline void

777

static inline void

778

update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

778

update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

779

{

779

{

780

/*

780

/*

781

* We are starting a new run period:

781

* We are starting a new run period:

782

*/

782

*/

783

se->exec_start = rq_clock_task(rq_of(cfs_rq));

783

se->exec_start = rq_clock_task(rq_of(cfs_rq));

784

}

784

}

785

786

/**************************************************

786

/**************************************************

787

* Scheduling class queueing methods:

787

* Scheduling class queueing methods:

788

*/

788

*/

789

790

#ifdef CONFIG_NUMA_BALANCING

790

#ifdef CONFIG_NUMA_BALANCING

791

/*

791

/*

792

* Approximate time to scan a full NUMA task in ms. The task scan period is

792

* Approximate time to scan a full NUMA task in ms. The task scan period is

793

* calculated based on the tasks virtual memory size and

793

* calculated based on the tasks virtual memory size and

794

* numa_balancing_scan_size.

794

* numa_balancing_scan_size.

795

*/

795

*/

796

unsigned int sysctl_numa_balancing_scan_period_min = 1000;

796

unsigned int sysctl_numa_balancing_scan_period_min = 1000;

797

unsigned int sysctl_numa_balancing_scan_period_max = 60000;

797

unsigned int sysctl_numa_balancing_scan_period_max = 60000;

798

799

/* Portion of address space to scan in MB */

799

/* Portion of address space to scan in MB */

800

unsigned int sysctl_numa_balancing_scan_size = 256;

800

unsigned int sysctl_numa_balancing_scan_size = 256;

801

802

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */

802

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */

803

unsigned int sysctl_numa_balancing_scan_delay = 1000;

803

unsigned int sysctl_numa_balancing_scan_delay = 1000;

804

805

static unsigned int task_nr_scan_windows(struct task_struct *p)

805

static unsigned int task_nr_scan_windows(struct task_struct *p)

806

{

806

{

807

unsigned long rss = 0;

807

unsigned long rss = 0;

808

unsigned long nr_scan_pages;

808

unsigned long nr_scan_pages;

809

810

/*

810

/*

811

* Calculations based on RSS as non-present and empty pages are skipped

811

* Calculations based on RSS as non-present and empty pages are skipped

812

* by the PTE scanner and NUMA hinting faults should be trapped based

812

* by the PTE scanner and NUMA hinting faults should be trapped based

813

* on resident pages

813

* on resident pages

814

*/

814

*/

815

nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);

815

nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);

816

rss = get_mm_rss(p->mm);

816

rss = get_mm_rss(p->mm);

817

if (!rss)

817

if (!rss)

818

rss = nr_scan_pages;

818

rss = nr_scan_pages;

819

820

rss = round_up(rss, nr_scan_pages);

820

rss = round_up(rss, nr_scan_pages);

821

return rss / nr_scan_pages;

821

return rss / nr_scan_pages;

822

}

822

}

823

824

/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */

824

/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */

825

#define MAX_SCAN_WINDOW 2560

825

#define MAX_SCAN_WINDOW 2560

826

827

static unsigned int task_scan_min(struct task_struct *p)

827

static unsigned int task_scan_min(struct task_struct *p)

828

{

828

{

829

unsigned int scan, floor;

829

unsigned int scan, floor;

830

unsigned int windows = 1;

830

unsigned int windows = 1;

831

832

if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)

832

if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)

833

windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;

833

windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;

834

floor = 1000 / windows;

834

floor = 1000 / windows;

835

836

scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);

836

scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);

837

return max_t(unsigned int, floor, scan);

837

return max_t(unsigned int, floor, scan);

838

}

838

}

839

840

static unsigned int task_scan_max(struct task_struct *p)

840

static unsigned int task_scan_max(struct task_struct *p)

841

{

841

{

842

unsigned int smin = task_scan_min(p);

842

unsigned int smin = task_scan_min(p);

843

unsigned int smax;

843

unsigned int smax;

844

845

/* Watch for min being lower than max due to floor calculations */

845

/* Watch for min being lower than max due to floor calculations */

846

smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

846

smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

847

return max(smin, smax);

847

return max(smin, smax);

848

}

848

}

849

850

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)

850

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)

851

{

851

{

852

rq->nr_numa_running += (p->numa_preferred_nid != -1);

852

rq->nr_numa_running += (p->numa_preferred_nid != -1);

853

rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));

853

rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));

854

}

854

}

855

856

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)

856

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)

857

{

857

{

858

rq->nr_numa_running -= (p->numa_preferred_nid != -1);

858

rq->nr_numa_running -= (p->numa_preferred_nid != -1);

859

rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));

859

rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));

860

}

860

}

861

862

struct numa_group {

862

struct numa_group {

863

atomic_t refcount;

863

atomic_t refcount;

864

865

spinlock_t lock; /* nr_tasks, tasks */

865

spinlock_t lock; /* nr_tasks, tasks */

866

int nr_tasks;

866

int nr_tasks;

867

pid_t gid;

867

pid_t gid;

868

struct list_head task_list;

868

struct list_head task_list;

869

870

struct rcu_head rcu;

870

struct rcu_head rcu;

871

nodemask_t active_nodes;

871

nodemask_t active_nodes;

872

unsigned long total_faults;

872

unsigned long total_faults;

873

/*

873

/*

874

* Faults_cpu is used to decide whether memory should move

874

* Faults_cpu is used to decide whether memory should move

875

* towards the CPU. As a consequence, these stats are weighted

875

* towards the CPU. As a consequence, these stats are weighted

876

* more by CPU use than by memory faults.

876

* more by CPU use than by memory faults.

877

*/

877

*/

878

unsigned long *faults_cpu;

878

unsigned long *faults_cpu;

879

unsigned long faults[0];

879

unsigned long faults[0];

880

};

880

};

881

882

/* Shared or private faults. */

882

/* Shared or private faults. */

883

#define NR_NUMA_HINT_FAULT_TYPES 2

883

#define NR_NUMA_HINT_FAULT_TYPES 2

884

885

/* Memory and CPU locality */

885

/* Memory and CPU locality */

886

#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

886

#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

887

888

/* Averaged statistics, and temporary buffers. */

888

/* Averaged statistics, and temporary buffers. */

889

#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

889

#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

890

891

pid_t task_numa_group_id(struct task_struct *p)

891

pid_t task_numa_group_id(struct task_struct *p)

892

{

892

{

893

return p->numa_group ? p->numa_group->gid : 0;

893

return p->numa_group ? p->numa_group->gid : 0;

894

}

894

}

895

896

static inline int task_faults_idx(int nid, int priv)

896

static inline int task_faults_idx(int nid, int priv)

897

{

897

{

898

return NR_NUMA_HINT_FAULT_TYPES * nid + priv;

898

return NR_NUMA_HINT_FAULT_TYPES * nid + priv;

899

}

899

}

900

901

static inline unsigned long task_faults(struct task_struct *p, int nid)

901

static inline unsigned long task_faults(struct task_struct *p, int nid)

902

{

902

{

903

if (!p->numa_faults_memory)

903

if (!p->numa_faults_memory)

904

return 0;

904

return 0;

905

906

return p->numa_faults_memory[task_faults_idx(nid, 0)] +

906

return p->numa_faults_memory[task_faults_idx(nid, 0)] +

907

p->numa_faults_memory[task_faults_idx(nid, 1)];

907

p->numa_faults_memory[task_faults_idx(nid, 1)];

908

}

908

}

909

910

static inline unsigned long group_faults(struct task_struct *p, int nid)

910

static inline unsigned long group_faults(struct task_struct *p, int nid)

911

{

911

{

912

if (!p->numa_group)

912

if (!p->numa_group)

913

return 0;

913

return 0;

914

915

return p->numa_group->faults[task_faults_idx(nid, 0)] +

915

return p->numa_group->faults[task_faults_idx(nid, 0)] +

916

p->numa_group->faults[task_faults_idx(nid, 1)];

916

p->numa_group->faults[task_faults_idx(nid, 1)];

917

}

917

}

918

919

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)

919

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)

920

{

920

{

921

return group->faults_cpu[task_faults_idx(nid, 0)] +

921

return group->faults_cpu[task_faults_idx(nid, 0)] +

922

group->faults_cpu[task_faults_idx(nid, 1)];

922

group->faults_cpu[task_faults_idx(nid, 1)];

923

}

923

}

924

925

/*

925

/*

926

* These return the fraction of accesses done by a particular task, or

926

* These return the fraction of accesses done by a particular task, or

927

* task group, on a particular numa node. The group weight is given a

927

* task group, on a particular numa node. The group weight is given a

928

* larger multiplier, in order to group tasks together that are almost

928

* larger multiplier, in order to group tasks together that are almost

929

* evenly spread out between numa nodes.

929

* evenly spread out between numa nodes.

930

*/

930

*/

931

static inline unsigned long task_weight(struct task_struct *p, int nid)

931

static inline unsigned long task_weight(struct task_struct *p, int nid)

932

{

932

{

933

unsigned long total_faults;

933

unsigned long total_faults;

934

935

if (!p->numa_faults_memory)

935

if (!p->numa_faults_memory)

936

return 0;

936

return 0;

937

938

total_faults = p->total_numa_faults;

938

total_faults = p->total_numa_faults;

939

940

if (!total_faults)

940

if (!total_faults)

941

return 0;

941

return 0;

942

943

return 1000 * task_faults(p, nid) / total_faults;

943

return 1000 * task_faults(p, nid) / total_faults;

944

}

944

}

945

946

static inline unsigned long group_weight(struct task_struct *p, int nid)

946

static inline unsigned long group_weight(struct task_struct *p, int nid)

947

{

947

{

948

if (!p->numa_group || !p->numa_group->total_faults)

948

if (!p->numa_group || !p->numa_group->total_faults)

949

return 0;

949

return 0;

950

951

return 1000 * group_faults(p, nid) / p->numa_group->total_faults;

951

return 1000 * group_faults(p, nid) / p->numa_group->total_faults;

952

}

952

}

953

954

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,

954

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,

955

int src_nid, int dst_cpu)

955

int src_nid, int dst_cpu)

956

{

956

{

957

struct numa_group *ng = p->numa_group;

957

struct numa_group *ng = p->numa_group;

958

int dst_nid = cpu_to_node(dst_cpu);

958

int dst_nid = cpu_to_node(dst_cpu);

959

int last_cpupid, this_cpupid;

959

int last_cpupid, this_cpupid;

960

961

this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);

961

this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);

962

963

/*

963

/*

964

* Multi-stage node selection is used in conjunction with a periodic

964

* Multi-stage node selection is used in conjunction with a periodic

965

* migration fault to build a temporal task<->page relation. By using

965

* migration fault to build a temporal task<->page relation. By using

966

* a two-stage filter we remove short/unlikely relations.

966

* a two-stage filter we remove short/unlikely relations.

967

*

967

*

968

* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate

968

* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate

969

* a task's usage of a particular page (n_p) per total usage of this

969

* a task's usage of a particular page (n_p) per total usage of this

970

* page (n_t) (in a given time-span) to a probability.

970

* page (n_t) (in a given time-span) to a probability.

971

*

971

*

972

* Our periodic faults will sample this probability and getting the

972

* Our periodic faults will sample this probability and getting the

973

* same result twice in a row, given these samples are fully

973

* same result twice in a row, given these samples are fully

974

* independent, is then given by P(n)^2, provided our sample period

974

* independent, is then given by P(n)^2, provided our sample period

975

* is sufficiently short compared to the usage pattern.

975

* is sufficiently short compared to the usage pattern.

976

*

976

*

977

* This quadric squishes small probabilities, making it less likely we

977

* This quadric squishes small probabilities, making it less likely we

978

* act on an unlikely task<->page relation.

978

* act on an unlikely task<->page relation.

979

*/

979

*/

980

last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

980

last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

981

if (!cpupid_pid_unset(last_cpupid) &&

981

if (!cpupid_pid_unset(last_cpupid) &&

982

cpupid_to_nid(last_cpupid) != dst_nid)

982

cpupid_to_nid(last_cpupid) != dst_nid)

983

return false;

983

return false;

984

985

/* Always allow migrate on private faults */

985

/* Always allow migrate on private faults */

986

if (cpupid_match_pid(p, last_cpupid))

986

if (cpupid_match_pid(p, last_cpupid))

987

return true;

987

return true;

988

989

/* A shared fault, but p->numa_group has not been set up yet. */

989

/* A shared fault, but p->numa_group has not been set up yet. */

990

if (!ng)

990

if (!ng)

991

return true;

991

return true;

992

993

/*

993

/*

994

* Do not migrate if the destination is not a node that

994

* Do not migrate if the destination is not a node that

995

* is actively used by this numa group.

995

* is actively used by this numa group.

996

*/

996

*/

997

if (!node_isset(dst_nid, ng->active_nodes))

997

if (!node_isset(dst_nid, ng->active_nodes))

998

return false;

998

return false;

999

1000

/*

1000

/*

1001

* Source is a node that is not actively used by this

1001

* Source is a node that is not actively used by this

1002

* numa group, while the destination is. Migrate.

1002

* numa group, while the destination is. Migrate.

1003

*/

1003

*/

1004

if (!node_isset(src_nid, ng->active_nodes))

1004

if (!node_isset(src_nid, ng->active_nodes))

1005

return true;

1005

return true;

1006

1007

/*

1007

/*

1008

* Both source and destination are nodes in active

1008

* Both source and destination are nodes in active

1009

* use by this numa group. Maximize memory bandwidth

1009

* use by this numa group. Maximize memory bandwidth

1010

* by migrating from more heavily used groups, to less

1010

* by migrating from more heavily used groups, to less

1011

* heavily used ones, spreading the load around.

1011

* heavily used ones, spreading the load around.

1012

* Use a 1/4 hysteresis to avoid spurious page movement.

1012

* Use a 1/4 hysteresis to avoid spurious page movement.

1013

*/

1013

*/

1014

return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);

1014

return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);

1015

}

1015

}

1016

1017

static unsigned long weighted_cpuload(const int cpu);

1017

static unsigned long weighted_cpuload(const int cpu);

1018

static unsigned long source_load(int cpu, int type);

1018

static unsigned long source_load(int cpu, int type);

1019

static unsigned long target_load(int cpu, int type);

1019

static unsigned long target_load(int cpu, int type);

1020

static unsigned long power_of(int cpu);

1020

static unsigned long power_of(int cpu);

1021

static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

1021

static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

1022

1023

/* Cached statistics for all CPUs within a node */

1023

/* Cached statistics for all CPUs within a node */

1024

struct numa_stats {

1024

struct numa_stats {

1025

unsigned long nr_running;

1025

unsigned long nr_running;

1026

unsigned long load;

1026

unsigned long load;

1027

1028

/* Total compute capacity of CPUs on a node */

1028

/* Total compute capacity of CPUs on a node */

1029

unsigned long power;

1029

unsigned long power;

1030

1031

/* Approximate capacity in terms of runnable tasks on a node */

1031

/* Approximate capacity in terms of runnable tasks on a node */

1032

unsigned long capacity;

1032

unsigned long capacity;

1033

int has_capacity;

1033

int has_capacity;

1034

};

1034

};

1035

1036

/*

1036

/*

1037

* XXX borrowed from update_sg_lb_stats

1037

* XXX borrowed from update_sg_lb_stats

1038

*/

1038

*/

1039

static void update_numa_stats(struct numa_stats *ns, int nid)

1039

static void update_numa_stats(struct numa_stats *ns, int nid)

1040

{

1040

{

1041

int cpu, cpus = 0;

1041

int cpu, cpus = 0;

1042

1043

memset(ns, 0, sizeof(*ns));

1043

memset(ns, 0, sizeof(*ns));

1044

for_each_cpu(cpu, cpumask_of_node(nid)) {

1044

for_each_cpu(cpu, cpumask_of_node(nid)) {

1045

struct rq *rq = cpu_rq(cpu);

1045

struct rq *rq = cpu_rq(cpu);

1046

1047

ns->nr_running += rq->nr_running;

1047

ns->nr_running += rq->nr_running;

1048

ns->load += weighted_cpuload(cpu);

1048

ns->load += weighted_cpuload(cpu);

1049

ns->power += power_of(cpu);

1049

ns->power += power_of(cpu);

1050

1051

cpus++;

1051

cpus++;

1052

}

1052

}

1053

1054

/*

1054

/*

1055

* If we raced with hotplug and there are no CPUs left in our mask

1055

* If we raced with hotplug and there are no CPUs left in our mask

1056

* the @ns structure is NULL'ed and task_numa_compare() will

1056

* the @ns structure is NULL'ed and task_numa_compare() will

1057

* not find this node attractive.

1057

* not find this node attractive.

1058

*

1058

*

1059

* We'll either bail at !has_capacity, or we'll detect a huge imbalance

1059

* We'll either bail at !has_capacity, or we'll detect a huge imbalance

1060

* and bail there.

1060

* and bail there.

1061

*/

1061

*/

1062

if (!cpus)

1062

if (!cpus)

1063

return;

1063

return;

1064

1065

ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;

1065

ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;

1066

ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);

1066

ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);

1067

ns->has_capacity = (ns->nr_running < ns->capacity);

1067

ns->has_capacity = (ns->nr_running < ns->capacity);

1068

}

1068

}

1069

1070

struct task_numa_env {

1070

struct task_numa_env {

1071

struct task_struct *p;

1071

struct task_struct *p;

1072

1073

int src_cpu, src_nid;

1073

int src_cpu, src_nid;

1074

int dst_cpu, dst_nid;

1074

int dst_cpu, dst_nid;

1075

1076

struct numa_stats src_stats, dst_stats;

1076

struct numa_stats src_stats, dst_stats;

1077

1078

int imbalance_pct;

1078

int imbalance_pct;

1079

1080

struct task_struct *best_task;

1080

struct task_struct *best_task;

1081

long best_imp;

1081

long best_imp;

1082

int best_cpu;

1082

int best_cpu;

1083

};

1083

};

1084

1085

static void task_numa_assign(struct task_numa_env *env,

1085

static void task_numa_assign(struct task_numa_env *env,

1086

struct task_struct *p, long imp)

1086

struct task_struct *p, long imp)

1087

{

1087

{

1088

if (env->best_task)

1088

if (env->best_task)

1089

put_task_struct(env->best_task);

1089

put_task_struct(env->best_task);

1090

if (p)

1090

if (p)

1091

get_task_struct(p);

1091

get_task_struct(p);

1092

1093

env->best_task = p;

1093

env->best_task = p;

1094

env->best_imp = imp;

1094

env->best_imp = imp;

1095

env->best_cpu = env->dst_cpu;

1095

env->best_cpu = env->dst_cpu;

1096

}

1096

}

1097

1098

/*

1098

/*

1099

* This checks if the overall compute and NUMA accesses of the system would

1099

* This checks if the overall compute and NUMA accesses of the system would

1100

* be improved if the source tasks was migrated to the target dst_cpu taking

1100

* be improved if the source tasks was migrated to the target dst_cpu taking

1101

* into account that it might be best if task running on the dst_cpu should

1101

* into account that it might be best if task running on the dst_cpu should

1102

* be exchanged with the source task

1102

* be exchanged with the source task

1103

*/

1103

*/

1104

static void task_numa_compare(struct task_numa_env *env,

1104

static void task_numa_compare(struct task_numa_env *env,

1105

long taskimp, long groupimp)

1105

long taskimp, long groupimp)

1106

{

1106

{

1107

struct rq *src_rq = cpu_rq(env->src_cpu);

1107

struct rq *src_rq = cpu_rq(env->src_cpu);

1108

struct rq *dst_rq = cpu_rq(env->dst_cpu);

1108

struct rq *dst_rq = cpu_rq(env->dst_cpu);

1109

struct task_struct *cur;

1109

struct task_struct *cur;

1110

long dst_load, src_load;

1110

long dst_load, src_load;

1111

long load;

1111

long load;

1112

long imp = (groupimp > 0) ? groupimp : taskimp;

1112

long imp = (groupimp > 0) ? groupimp : taskimp;

1113

1114

rcu_read_lock();

1114

rcu_read_lock();

1115

cur = ACCESS_ONCE(dst_rq->curr);

1115

cur = ACCESS_ONCE(dst_rq->curr);

1116

if (cur->pid == 0) /* idle */

1116

if (cur->pid == 0) /* idle */

1117

cur = NULL;

1117

cur = NULL;

1118

1119

/*

1119

/*

1120

* "imp" is the fault differential for the source task between the

1120

* "imp" is the fault differential for the source task between the

1121

* source and destination node. Calculate the total differential for

1121

* source and destination node. Calculate the total differential for

1122

* the source task and potential destination task. The more negative

1122

* the source task and potential destination task. The more negative

1123

* the value is, the more rmeote accesses that would be expected to

1123

* the value is, the more rmeote accesses that would be expected to

1124

* be incurred if the tasks were swapped.

1124

* be incurred if the tasks were swapped.

1125

*/

1125

*/

1126

if (cur) {

1126

if (cur) {

1127

/* Skip this swap candidate if cannot move to the source cpu */

1127

/* Skip this swap candidate if cannot move to the source cpu */

1128

if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))

1128

if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))

1129

goto unlock;

1129

goto unlock;

1130

1131

/*

1131

/*

1132

* If dst and source tasks are in the same NUMA group, or not

1132

* If dst and source tasks are in the same NUMA group, or not

1133

* in any group then look only at task weights.

1133

* in any group then look only at task weights.

1134

*/

1134

*/

1135

if (cur->numa_group == env->p->numa_group) {

1135

if (cur->numa_group == env->p->numa_group) {

1136

imp = taskimp + task_weight(cur, env->src_nid) -

1136

imp = taskimp + task_weight(cur, env->src_nid) -

1137

task_weight(cur, env->dst_nid);

1137

task_weight(cur, env->dst_nid);

1138

/*

1138

/*

1139

* Add some hysteresis to prevent swapping the

1139

* Add some hysteresis to prevent swapping the

1140

* tasks within a group over tiny differences.

1140

* tasks within a group over tiny differences.

1141

*/

1141

*/

1142

if (cur->numa_group)

1142

if (cur->numa_group)

1143

imp -= imp/16;

1143

imp -= imp/16;

1144

} else {

1144

} else {

1145

/*

1145

/*

1146

* Compare the group weights. If a task is all by

1146

* Compare the group weights. If a task is all by

1147

* itself (not part of a group), use the task weight

1147

* itself (not part of a group), use the task weight

1148

* instead.

1148

* instead.

1149

*/

1149

*/

1150

if (env->p->numa_group)

1150

if (env->p->numa_group)

1151

imp = groupimp;

1151

imp = groupimp;

1152

else

1152

else

1153

imp = taskimp;

1153

imp = taskimp;

1154

1155

if (cur->numa_group)

1155

if (cur->numa_group)

1156

imp += group_weight(cur, env->src_nid) -

1156

imp += group_weight(cur, env->src_nid) -

1157

group_weight(cur, env->dst_nid);

1157

group_weight(cur, env->dst_nid);

1158

else

1158

else

1159

imp += task_weight(cur, env->src_nid) -

1159

imp += task_weight(cur, env->src_nid) -

1160

task_weight(cur, env->dst_nid);

1160

task_weight(cur, env->dst_nid);

1161

}

1161

}

1162

}

1162

}

1163

1164

if (imp < env->best_imp)

1164

if (imp < env->best_imp)

1165

goto unlock;

1165

goto unlock;

1166

1167

if (!cur) {

1167

if (!cur) {

1168

/* Is there capacity at our destination? */

1168

/* Is there capacity at our destination? */

1169

if (env->src_stats.has_capacity &&

1169

if (env->src_stats.has_capacity &&

1170

!env->dst_stats.has_capacity)

1170

!env->dst_stats.has_capacity)

1171

goto unlock;

1171

goto unlock;

1172

1173

goto balance;

1173

goto balance;

1174

}

1174

}

1175

1176

/* Balance doesn't matter much if we're running a task per cpu */

1176

/* Balance doesn't matter much if we're running a task per cpu */

1177

if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)

1177

if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)

1178

goto assign;

1178

goto assign;

1179

1180

/*

1180

/*

1181

* In the overloaded case, try and keep the load balanced.

1181

* In the overloaded case, try and keep the load balanced.

1182

*/

1182

*/

1183

balance:

1183

balance:

1184

dst_load = env->dst_stats.load;

1184

dst_load = env->dst_stats.load;

1185

src_load = env->src_stats.load;

1185

src_load = env->src_stats.load;

1186

1187

/* XXX missing power terms */

1187

/* XXX missing power terms */

1188

load = task_h_load(env->p);

1188

load = task_h_load(env->p);

1189

dst_load += load;

1189

dst_load += load;

1190

src_load -= load;

1190

src_load -= load;

1191

1192

if (cur) {

1192

if (cur) {

1193

load = task_h_load(cur);

1193

load = task_h_load(cur);

1194

dst_load -= load;

1194

dst_load -= load;

1195

src_load += load;

1195

src_load += load;

1196

}

1196

}

1197

1198

/* make src_load the smaller */

1198

/* make src_load the smaller */

1199

if (dst_load < src_load)

1199

if (dst_load < src_load)

1200

swap(dst_load, src_load);

1200

swap(dst_load, src_load);

1201

1202

if (src_load * env->imbalance_pct < dst_load * 100)

1202

if (src_load * env->imbalance_pct < dst_load * 100)

1203

goto unlock;

1203

goto unlock;

1204

1205

assign:

1205

assign:

1206

task_numa_assign(env, cur, imp);

1206

task_numa_assign(env, cur, imp);

1207

unlock:

1207

unlock:

1208

rcu_read_unlock();

1208

rcu_read_unlock();

1209

}

1209

}

1210

1211

static void task_numa_find_cpu(struct task_numa_env *env,

1211

static void task_numa_find_cpu(struct task_numa_env *env,

1212

long taskimp, long groupimp)

1212

long taskimp, long groupimp)

1213

{

1213

{

1214

int cpu;

1214

int cpu;

1215

1216

for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {

1216

for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {

1217

/* Skip this CPU if the source task cannot migrate */

1217

/* Skip this CPU if the source task cannot migrate */

1218

if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))

1218

if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))

1219

continue;

1219

continue;

1220

1221

env->dst_cpu = cpu;

1221

env->dst_cpu = cpu;

1222

task_numa_compare(env, taskimp, groupimp);

1222

task_numa_compare(env, taskimp, groupimp);

1223

}

1223

}

1224

}

1224

}

1225

1226

static int task_numa_migrate(struct task_struct *p)

1226

static int task_numa_migrate(struct task_struct *p)

1227

{

1227

{

1228

struct task_numa_env env = {

1228

struct task_numa_env env = {

1229

.p = p,

1229

.p = p,

1230

1231

.src_cpu = task_cpu(p),

1231

.src_cpu = task_cpu(p),

1232

.src_nid = task_node(p),

1232

.src_nid = task_node(p),

1233

1234

.imbalance_pct = 112,

1234

.imbalance_pct = 112,

1235

1236

.best_task = NULL,

1236

.best_task = NULL,

1237

.best_imp = 0,

1237

.best_imp = 0,

1238

.best_cpu = -1

1238

.best_cpu = -1

1239

};

1239

};

1240

struct sched_domain *sd;

1240

struct sched_domain *sd;

1241

unsigned long taskweight, groupweight;

1241

unsigned long taskweight, groupweight;

1242

int nid, ret;

1242

int nid, ret;

1243

long taskimp, groupimp;

1243

long taskimp, groupimp;

1244

1245

/*

1245

/*

1246

* Pick the lowest SD_NUMA domain, as that would have the smallest

1246

* Pick the lowest SD_NUMA domain, as that would have the smallest

1247

* imbalance and would be the first to start moving tasks about.

1247

* imbalance and would be the first to start moving tasks about.

1248

*

1248

*

1249

* And we want to avoid any moving of tasks about, as that would create

1249

* And we want to avoid any moving of tasks about, as that would create

1250

* random movement of tasks -- counter the numa conditions we're trying

1250

* random movement of tasks -- counter the numa conditions we're trying

1251

* to satisfy here.

1251

* to satisfy here.

1252

*/

1252

*/

1253

rcu_read_lock();

1253

rcu_read_lock();

1254

sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));

1254

sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));

1255

if (sd)

1255

if (sd)

1256

env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

1256

env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

1257

rcu_read_unlock();

1257

rcu_read_unlock();

1258

1259

/*

1259

/*

1260

* Cpusets can break the scheduler domain tree into smaller

1260

* Cpusets can break the scheduler domain tree into smaller

1261

* balance domains, some of which do not cross NUMA boundaries.

1261

* balance domains, some of which do not cross NUMA boundaries.

1262

* Tasks that are "trapped" in such domains cannot be migrated

1262

* Tasks that are "trapped" in such domains cannot be migrated

1263

* elsewhere, so there is no point in (re)trying.

1263

* elsewhere, so there is no point in (re)trying.

1264

*/

1264

*/

1265

if (unlikely(!sd)) {

1265

if (unlikely(!sd)) {

1266

p->numa_preferred_nid = task_node(p);

1266

p->numa_preferred_nid = task_node(p);

1267

return -EINVAL;

1267

return -EINVAL;

1268

}

1268

}

1269

1270

taskweight = task_weight(p, env.src_nid);

1270

taskweight = task_weight(p, env.src_nid);

1271

groupweight = group_weight(p, env.src_nid);

1271

groupweight = group_weight(p, env.src_nid);

1272

update_numa_stats(&env.src_stats, env.src_nid);

1272

update_numa_stats(&env.src_stats, env.src_nid);

1273

env.dst_nid = p->numa_preferred_nid;

1273

env.dst_nid = p->numa_preferred_nid;

1274

taskimp = task_weight(p, env.dst_nid) - taskweight;

1274

taskimp = task_weight(p, env.dst_nid) - taskweight;

1275

groupimp = group_weight(p, env.dst_nid) - groupweight;

1275

groupimp = group_weight(p, env.dst_nid) - groupweight;

1276

update_numa_stats(&env.dst_stats, env.dst_nid);

1276

update_numa_stats(&env.dst_stats, env.dst_nid);

1277

1278

/* If the preferred nid has capacity, try to use it. */

1278

/* If the preferred nid has capacity, try to use it. */

1279

if (env.dst_stats.has_capacity)

1279

if (env.dst_stats.has_capacity)

1280

task_numa_find_cpu(&env, taskimp, groupimp);

1280

task_numa_find_cpu(&env, taskimp, groupimp);

1281

1282

/* No space available on the preferred nid. Look elsewhere. */

1282

/* No space available on the preferred nid. Look elsewhere. */

1283

if (env.best_cpu == -1) {

1283

if (env.best_cpu == -1) {

1284

for_each_online_node(nid) {

1284

for_each_online_node(nid) {

1285

if (nid == env.src_nid || nid == p->numa_preferred_nid)

1285

if (nid == env.src_nid || nid == p->numa_preferred_nid)

1286

continue;

1286

continue;

1287

1288

/* Only consider nodes where both task and groups benefit */

1288

/* Only consider nodes where both task and groups benefit */

1289

taskimp = task_weight(p, nid) - taskweight;

1289

taskimp = task_weight(p, nid) - taskweight;

1290

groupimp = group_weight(p, nid) - groupweight;

1290

groupimp = group_weight(p, nid) - groupweight;

1291

if (taskimp < 0 && groupimp < 0)

1291

if (taskimp < 0 && groupimp < 0)

1292

continue;

1292

continue;

1293

1294

env.dst_nid = nid;

1294

env.dst_nid = nid;

1295

update_numa_stats(&env.dst_stats, env.dst_nid);

1295

update_numa_stats(&env.dst_stats, env.dst_nid);

1296

task_numa_find_cpu(&env, taskimp, groupimp);

1296

task_numa_find_cpu(&env, taskimp, groupimp);

1297

}

1297

}

1298

}

1298

}

1299

1300

/* No better CPU than the current one was found. */

1300

/* No better CPU than the current one was found. */

1301

if (env.best_cpu == -1)

1301

if (env.best_cpu == -1)

1302

return -EAGAIN;

1302

return -EAGAIN;

1303

1304

sched_setnuma(p, env.dst_nid);

1304

sched_setnuma(p, env.dst_nid);

1305

1306

/*

1306

/*

1307

* Reset the scan period if the task is being rescheduled on an

1307

* Reset the scan period if the task is being rescheduled on an

1308

* alternative node to recheck if the tasks is now properly placed.

1308

* alternative node to recheck if the tasks is now properly placed.

1309

*/

1309

*/

1310

p->numa_scan_period = task_scan_min(p);

1310

p->numa_scan_period = task_scan_min(p);

1311

1312

if (env.best_task == NULL) {

1312

if (env.best_task == NULL) {

1313

ret = migrate_task_to(p, env.best_cpu);

1313

ret = migrate_task_to(p, env.best_cpu);

1314

if (ret != 0)

1314

if (ret != 0)

1315

trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);

1315

trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);

1316

return ret;

1316

return ret;

1317

}

1317

}

1318

1319

ret = migrate_swap(p, env.best_task);

1319

ret = migrate_swap(p, env.best_task);

1320

if (ret != 0)

1320

if (ret != 0)

1321

trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));

1321

trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));

1322

put_task_struct(env.best_task);

1322

put_task_struct(env.best_task);

1323

return ret;

1323

return ret;

1324

}

1324

}

1325

1326

/* Attempt to migrate a task to a CPU on the preferred node. */

1326

/* Attempt to migrate a task to a CPU on the preferred node. */

1327

static void numa_migrate_preferred(struct task_struct *p)

1327

static void numa_migrate_preferred(struct task_struct *p)

1328

{

1328

{

1329

/* This task has no NUMA fault statistics yet */

1329

/* This task has no NUMA fault statistics yet */

1330

if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))

1330

if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))

1331

return;

1331

return;

1332

1333

/* Periodically retry migrating the task to the preferred node */

1333

/* Periodically retry migrating the task to the preferred node */

1334

p->numa_migrate_retry = jiffies + HZ;

1334

p->numa_migrate_retry = jiffies + HZ;

1335

1336

/* Success if task is already running on preferred CPU */

1336

/* Success if task is already running on preferred CPU */

1337

if (task_node(p) == p->numa_preferred_nid)

1337

if (task_node(p) == p->numa_preferred_nid)

1338

return;

1338

return;

1339

1340

/* Otherwise, try migrate to a CPU on the preferred node */

1340

/* Otherwise, try migrate to a CPU on the preferred node */

1341

task_numa_migrate(p);

1341

task_numa_migrate(p);

1342

}

1342

}

1343

1344

/*

1344

/*

1345

* Find the nodes on which the workload is actively running. We do this by

1345

* Find the nodes on which the workload is actively running. We do this by

1346

* tracking the nodes from which NUMA hinting faults are triggered. This can

1346

* tracking the nodes from which NUMA hinting faults are triggered. This can

1347

* be different from the set of nodes where the workload's memory is currently

1347

* be different from the set of nodes where the workload's memory is currently

1348

* located.

1348

* located.

1349

*

1349

*

1350

* The bitmask is used to make smarter decisions on when to do NUMA page

1350

* The bitmask is used to make smarter decisions on when to do NUMA page

1351

* migrations, To prevent flip-flopping, and excessive page migrations, nodes

1351

* migrations, To prevent flip-flopping, and excessive page migrations, nodes

1352

* are added when they cause over 6/16 of the maximum number of faults, but

1352

* are added when they cause over 6/16 of the maximum number of faults, but

1353

* only removed when they drop below 3/16.

1353

* only removed when they drop below 3/16.

1354

*/

1354

*/

1355

static void update_numa_active_node_mask(struct numa_group *numa_group)

1355

static void update_numa_active_node_mask(struct numa_group *numa_group)

1356

{

1356

{

1357

unsigned long faults, max_faults = 0;

1357

unsigned long faults, max_faults = 0;

1358

int nid;

1358

int nid;

1359

1360

for_each_online_node(nid) {

1360

for_each_online_node(nid) {

1361

faults = group_faults_cpu(numa_group, nid);

1361

faults = group_faults_cpu(numa_group, nid);

1362

if (faults > max_faults)

1362

if (faults > max_faults)

1363

max_faults = faults;

1363

max_faults = faults;

1364

}

1364

}

1365

1366

for_each_online_node(nid) {

1366

for_each_online_node(nid) {

1367

faults = group_faults_cpu(numa_group, nid);

1367

faults = group_faults_cpu(numa_group, nid);

1368

if (!node_isset(nid, numa_group->active_nodes)) {

1368

if (!node_isset(nid, numa_group->active_nodes)) {

1369

if (faults > max_faults * 6 / 16)

1369

if (faults > max_faults * 6 / 16)

1370

node_set(nid, numa_group->active_nodes);

1370

node_set(nid, numa_group->active_nodes);

1371

} else if (faults < max_faults * 3 / 16)

1371

} else if (faults < max_faults * 3 / 16)

1372

node_clear(nid, numa_group->active_nodes);

1372

node_clear(nid, numa_group->active_nodes);

1373

}

1373

}

1374

}

1374

}

1375

1376

/*
 * Scan-rate adaptation works in steps of 1/NUMA_PERIOD_SLOTS of the current
 * scan period.  The share of local faults is expressed on a scale of
 * 0..NUMA_PERIOD_SLOTS: the more local the fault statistics are, the longer
 * the scan period for the next window.  When that share is below
 * NUMA_PERIOD_THRESHOLD the scan period is shortened instead.
 */
#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3

1385

1386

/*

1386

/*

1387

* Increase the scan period (slow down scanning) if the majority of

1387

* Increase the scan period (slow down scanning) if the majority of

1388

* our memory is already on our local node, or if the majority of

1388

* our memory is already on our local node, or if the majority of

1389

* the page accesses are shared with other processes.

1389

* the page accesses are shared with other processes.

1390

* Otherwise, decrease the scan period.

1390

* Otherwise, decrease the scan period.

1391

*/

1391

*/

1392

static void update_task_scan_period(struct task_struct *p,

1392

static void update_task_scan_period(struct task_struct *p,

1393

unsigned long shared, unsigned long private)

1393

unsigned long shared, unsigned long private)

1394

{

1394

{

1395

unsigned int period_slot;

1395

unsigned int period_slot;

1396

int ratio;

1396

int ratio;

1397

int diff;

1397

int diff;

1398

1399

unsigned long remote = p->numa_faults_locality[0];

1399

unsigned long remote = p->numa_faults_locality[0];

1400

unsigned long local = p->numa_faults_locality[1];

1400

unsigned long local = p->numa_faults_locality[1];

1401

1402

/*

1402

/*

1403

* If there were no record hinting faults then either the task is

1403

* If there were no record hinting faults then either the task is

1404

* completely idle or all activity is areas that are not of interest

1404

* completely idle or all activity is areas that are not of interest

1405

* to automatic numa balancing. Scan slower

1405

* to automatic numa balancing. Scan slower

1406

*/

1406

*/

1407

if (local + shared == 0) {

1407

if (local + shared == 0) {

1408

p->numa_scan_period = min(p->numa_scan_period_max,

1408

p->numa_scan_period = min(p->numa_scan_period_max,

1409

p->numa_scan_period << 1);

1409

p->numa_scan_period << 1);

1410

1411

p->mm->numa_next_scan = jiffies +

1411

p->mm->numa_next_scan = jiffies +

1412

msecs_to_jiffies(p->numa_scan_period);

1412

msecs_to_jiffies(p->numa_scan_period);

1413

1414

return;

1414

return;

1415

}

1415

}

1416

1417

/*

1417

/*

1418

* Prepare to scale scan period relative to the current period.

1418

* Prepare to scale scan period relative to the current period.

1419

* == NUMA_PERIOD_THRESHOLD scan period stays the same

1419

* == NUMA_PERIOD_THRESHOLD scan period stays the same

1420

* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)

1420

* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)

1421

* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)

1421

* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)

1422

*/

1422

*/

1423

period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);

1423

period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);

1424

ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);

1424

ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);

1425

if (ratio >= NUMA_PERIOD_THRESHOLD) {

1425

if (ratio >= NUMA_PERIOD_THRESHOLD) {

1426

int slot = ratio - NUMA_PERIOD_THRESHOLD;

1426

int slot = ratio - NUMA_PERIOD_THRESHOLD;

1427

if (!slot)

1427

if (!slot)

1428

slot = 1;

1428

slot = 1;

1429

diff = slot * period_slot;

1429

diff = slot * period_slot;

1430

} else {

1430

} else {

1431

diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;

1431

diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;

1432

1433

/*

1433

/*

1434

* Scale scan rate increases based on sharing. There is an

1434

* Scale scan rate increases based on sharing. There is an

1435

* inverse relationship between the degree of sharing and

1435

* inverse relationship between the degree of sharing and

1436

* the adjustment made to the scanning period. Broadly

1436

* the adjustment made to the scanning period. Broadly

1437

* speaking the intent is that there is little point

1437

* speaking the intent is that there is little point

1438

* scanning faster if shared accesses dominate as it may

1438

* scanning faster if shared accesses dominate as it may

1439

* simply bounce migrations uselessly

1439

* simply bounce migrations uselessly

1440

*/

1440

*/

1441

ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));

1441

ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));

1442

diff = (diff * ratio) / NUMA_PERIOD_SLOTS;

1442

diff = (diff * ratio) / NUMA_PERIOD_SLOTS;

1443

}

1443

}

1444

1445

p->numa_scan_period = clamp(p->numa_scan_period + diff,

1445

p->numa_scan_period = clamp(p->numa_scan_period + diff,

1446

task_scan_min(p), task_scan_max(p));

1446

task_scan_min(p), task_scan_max(p));

1447

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

1447

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

1448

}

1448

}

1449

1450

/*

1450

/*

1451

* Get the fraction of time the task has been running since the last

1451

* Get the fraction of time the task has been running since the last

1452

* NUMA placement cycle. The scheduler keeps similar statistics, but

1452

* NUMA placement cycle. The scheduler keeps similar statistics, but

1453

* decays those on a 32ms period, which is orders of magnitude off

1453

* decays those on a 32ms period, which is orders of magnitude off

1454

* from the dozens-of-seconds NUMA balancing period. Use the scheduler

1454

* from the dozens-of-seconds NUMA balancing period. Use the scheduler

1455

* stats only if the task is so new there are no NUMA statistics yet.

1455

* stats only if the task is so new there are no NUMA statistics yet.

1456

*/

1456

*/

1457

static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)

1457

static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)

1458

{

1458

{

1459

u64 runtime, delta, now;

1459

u64 runtime, delta, now;

1460

/* Use the start of this time slice to avoid calculations. */

1460

/* Use the start of this time slice to avoid calculations. */

1461

now = p->se.exec_start;

1461

now = p->se.exec_start;

1462

runtime = p->se.sum_exec_runtime;

1462

runtime = p->se.sum_exec_runtime;

1463

1464

if (p->last_task_numa_placement) {

1464

if (p->last_task_numa_placement) {

1465

delta = runtime - p->last_sum_exec_runtime;

1465

delta = runtime - p->last_sum_exec_runtime;

1466

*period = now - p->last_task_numa_placement;

1466

*period = now - p->last_task_numa_placement;

1467

} else {

1467

} else {

1468

delta = p->se.avg.runnable_avg_sum;

1468

delta = p->se.avg.runnable_avg_sum;

1469

*period = p->se.avg.runnable_avg_period;

1469

*period = p->se.avg.runnable_avg_period;

1470

}

1470

}

1471

1472

p->last_sum_exec_runtime = runtime;

1472

p->last_sum_exec_runtime = runtime;

1473

p->last_task_numa_placement = now;

1473

p->last_task_numa_placement = now;

1474

1475

return delta;

1475

return delta;

1476

}

1476

}

1477

1478

static void task_numa_placement(struct task_struct *p)

1478

static void task_numa_placement(struct task_struct *p)

1479

{

1479

{

1480

int seq, nid, max_nid = -1, max_group_nid = -1;

1480

int seq, nid, max_nid = -1, max_group_nid = -1;

1481

unsigned long max_faults = 0, max_group_faults = 0;

1481

unsigned long max_faults = 0, max_group_faults = 0;

1482

unsigned long fault_types[2] = { 0, 0 };

1482

unsigned long fault_types[2] = { 0, 0 };

1483

unsigned long total_faults;

1483

unsigned long total_faults;

1484

u64 runtime, period;

1484

u64 runtime, period;

1485

spinlock_t *group_lock = NULL;

1485

spinlock_t *group_lock = NULL;

1486

1487

seq = ACCESS_ONCE(p->mm->numa_scan_seq);

1487

seq = ACCESS_ONCE(p->mm->numa_scan_seq);

1488

if (p->numa_scan_seq == seq)

1488

if (p->numa_scan_seq == seq)

1489

return;

1489

return;

1490

p->numa_scan_seq = seq;

1490

p->numa_scan_seq = seq;

1491

p->numa_scan_period_max = task_scan_max(p);

1491

p->numa_scan_period_max = task_scan_max(p);

1492

1493

total_faults = p->numa_faults_locality[0] +

1493

total_faults = p->numa_faults_locality[0] +

1494

p->numa_faults_locality[1];

1494

p->numa_faults_locality[1];

1495

runtime = numa_get_avg_runtime(p, &period);

1495

runtime = numa_get_avg_runtime(p, &period);

1496

1497

/* If the task is part of a group prevent parallel updates to group stats */

1497

/* If the task is part of a group prevent parallel updates to group stats */

1498

if (p->numa_group) {

1498

if (p->numa_group) {

1499

group_lock = &p->numa_group->lock;

1499

group_lock = &p->numa_group->lock;

1500

spin_lock_irq(group_lock);

1500

spin_lock_irq(group_lock);

1501

}

1501

}

1502

1503

/* Find the node with the highest number of faults */

1503

/* Find the node with the highest number of faults */

1504

for_each_online_node(nid) {

1504

for_each_online_node(nid) {

1505

unsigned long faults = 0, group_faults = 0;

1505

unsigned long faults = 0, group_faults = 0;

1506

int priv, i;

1506

int priv, i;

1507

1508

for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {

1508

for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {

1509

long diff, f_diff, f_weight;

1509

long diff, f_diff, f_weight;

1510

1511

i = task_faults_idx(nid, priv);

1511

i = task_faults_idx(nid, priv);

1512

1513

/* Decay existing window, copy faults since last scan */

1513

/* Decay existing window, copy faults since last scan */

1514

diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;

1514

diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;

1515

fault_types[priv] += p->numa_faults_buffer_memory[i];

1515

fault_types[priv] += p->numa_faults_buffer_memory[i];

1516

p->numa_faults_buffer_memory[i] = 0;

1516

p->numa_faults_buffer_memory[i] = 0;

1517

1518

/*

1518

/*

1519

* Normalize the faults_from, so all tasks in a group

1519

* Normalize the faults_from, so all tasks in a group

1520

* count according to CPU use, instead of by the raw

1520

* count according to CPU use, instead of by the raw

1521

* number of faults. Tasks with little runtime have

1521

* number of faults. Tasks with little runtime have

1522

* little over-all impact on throughput, and thus their

1522

* little over-all impact on throughput, and thus their

1523

* faults are less important.

1523

* faults are less important.

1524

*/

1524

*/

1525

f_weight = div64_u64(runtime << 16, period + 1);

1525

f_weight = div64_u64(runtime << 16, period + 1);

1526

f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /

1526

f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /

1527

(total_faults + 1);

1527

(total_faults + 1);

1528

f_diff = f_weight - p->numa_faults_cpu[i] / 2;

1528

f_diff = f_weight - p->numa_faults_cpu[i] / 2;

1529

p->numa_faults_buffer_cpu[i] = 0;

1529

p->numa_faults_buffer_cpu[i] = 0;

1530

1531

p->numa_faults_memory[i] += diff;

1531

p->numa_faults_memory[i] += diff;

1532

p->numa_faults_cpu[i] += f_diff;

1532

p->numa_faults_cpu[i] += f_diff;

1533

faults += p->numa_faults_memory[i];

1533

faults += p->numa_faults_memory[i];

1534

p->total_numa_faults += diff;

1534

p->total_numa_faults += diff;

1535

if (p->numa_group) {

1535

if (p->numa_group) {

1536

/* safe because we can only change our own group */

1536

/* safe because we can only change our own group */

1537

p->numa_group->faults[i] += diff;

1537

p->numa_group->faults[i] += diff;

1538

p->numa_group->faults_cpu[i] += f_diff;

1538

p->numa_group->faults_cpu[i] += f_diff;

1539

p->numa_group->total_faults += diff;

1539

p->numa_group->total_faults += diff;

1540

group_faults += p->numa_group->faults[i];

1540

group_faults += p->numa_group->faults[i];

1541

}

1541

}

1542

}

1542

}

1543

1544

if (faults > max_faults) {

1544

if (faults > max_faults) {

1545

max_faults = faults;

1545

max_faults = faults;

1546

max_nid = nid;

1546

max_nid = nid;

1547

}

1547

}

1548

1549

if (group_faults > max_group_faults) {

1549

if (group_faults > max_group_faults) {

1550

max_group_faults = group_faults;

1550

max_group_faults = group_faults;

1551

max_group_nid = nid;

1551

max_group_nid = nid;

1552

}

1552

}

1553

}

1553

}

1554

1555

update_task_scan_period(p, fault_types[0], fault_types[1]);

1555

update_task_scan_period(p, fault_types[0], fault_types[1]);

1556

1557

if (p->numa_group) {

1557

if (p->numa_group) {

1558

update_numa_active_node_mask(p->numa_group);

1558

update_numa_active_node_mask(p->numa_group);

1559

/*

1559

/*

1560

* If the preferred task and group nids are different,

1560

* If the preferred task and group nids are different,

1561

* iterate over the nodes again to find the best place.

1561

* iterate over the nodes again to find the best place.

1562

*/

1562

*/

1563

if (max_nid != max_group_nid) {

1563

if (max_nid != max_group_nid) {

1564

unsigned long weight, max_weight = 0;

1564

unsigned long weight, max_weight = 0;

1565

1566

for_each_online_node(nid) {

1566

for_each_online_node(nid) {

1567

weight = task_weight(p, nid) + group_weight(p, nid);

1567

weight = task_weight(p, nid) + group_weight(p, nid);

1568

if (weight > max_weight) {

1568

if (weight > max_weight) {

1569

max_weight = weight;

1569

max_weight = weight;

1570

max_nid = nid;

1570

max_nid = nid;

1571

}

1571

}

1572

}

1572

}

1573

}

1573

}

1574

1575

spin_unlock_irq(group_lock);

1575

spin_unlock_irq(group_lock);

1576

}

1576

}

1577

1578

/* Preferred node as the node with the most faults */

1578

/* Preferred node as the node with the most faults */

1579

if (max_faults && max_nid != p->numa_preferred_nid) {

1579

if (max_faults && max_nid != p->numa_preferred_nid) {

1580

/* Update the preferred nid and migrate task if possible */

1580

/* Update the preferred nid and migrate task if possible */

1581

sched_setnuma(p, max_nid);

1581

sched_setnuma(p, max_nid);

1582

numa_migrate_preferred(p);

1582

numa_migrate_preferred(p);

1583

}

1583

}

1584

}

1584

}

1585

1586

static inline int get_numa_group(struct numa_group *grp)

1586

static inline int get_numa_group(struct numa_group *grp)

1587

{

1587

{

1588

return atomic_inc_not_zero(&grp->refcount);

1588

return atomic_inc_not_zero(&grp->refcount);

1589

}

1589

}

1590

1591

static inline void put_numa_group(struct numa_group *grp)

1591

static inline void put_numa_group(struct numa_group *grp)

1592

{

1592

{

1593

if (atomic_dec_and_test(&grp->refcount))

1593

if (atomic_dec_and_test(&grp->refcount))

1594

kfree_rcu(grp, rcu);

1594

kfree_rcu(grp, rcu);

1595

}

1595

}

1596

1597

static void task_numa_group(struct task_struct *p, int cpupid, int flags,

1597

static void task_numa_group(struct task_struct *p, int cpupid, int flags,

1598

int *priv)

1598

int *priv)

1599

{

1599

{

1600

struct numa_group *grp, *my_grp;

1600

struct numa_group *grp, *my_grp;

1601

struct task_struct *tsk;

1601

struct task_struct *tsk;

1602

bool join = false;

1602

bool join = false;

1603

int cpu = cpupid_to_cpu(cpupid);

1603

int cpu = cpupid_to_cpu(cpupid);

1604

int i;

1604

int i;

1605

1606

if (unlikely(!p->numa_group)) {

1606

if (unlikely(!p->numa_group)) {

1607

unsigned int size = sizeof(struct numa_group) +

1607

unsigned int size = sizeof(struct numa_group) +

1608

4*nr_node_ids*sizeof(unsigned long);

1608

4*nr_node_ids*sizeof(unsigned long);

1609

1610

grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

1610

grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

1611

if (!grp)

1611

if (!grp)

1612

return;

1612

return;

1613

1614

atomic_set(&grp->refcount, 1);

1614

atomic_set(&grp->refcount, 1);

1615

spin_lock_init(&grp->lock);

1615

spin_lock_init(&grp->lock);

1616

INIT_LIST_HEAD(&grp->task_list);

1616

INIT_LIST_HEAD(&grp->task_list);

1617

grp->gid = p->pid;

1617

grp->gid = p->pid;

1618

/* Second half of the array tracks nids where faults happen */

1618

/* Second half of the array tracks nids where faults happen */

1619

grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *

1619

grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *

1620

nr_node_ids;

1620

nr_node_ids;

1621

1622

node_set(task_node(current), grp->active_nodes);

1622

node_set(task_node(current), grp->active_nodes);

1623

1624

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

1624

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

1625

grp->faults[i] = p->numa_faults_memory[i];

1625

grp->faults[i] = p->numa_faults_memory[i];

1626

1627

grp->total_faults = p->total_numa_faults;

1627

grp->total_faults = p->total_numa_faults;

1628

1629

list_add(&p->numa_entry, &grp->task_list);

1629

list_add(&p->numa_entry, &grp->task_list);

1630

grp->nr_tasks++;

1630

grp->nr_tasks++;

1631

rcu_assign_pointer(p->numa_group, grp);

1631

rcu_assign_pointer(p->numa_group, grp);

1632

}

1632

}

1633

1634

rcu_read_lock();

1634

rcu_read_lock();

1635

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

1635

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

1636

1637

if (!cpupid_match_pid(tsk, cpupid))

1637

if (!cpupid_match_pid(tsk, cpupid))

1638

goto no_join;

1638

goto no_join;

1639

1640

grp = rcu_dereference(tsk->numa_group);

1640

grp = rcu_dereference(tsk->numa_group);

1641

if (!grp)

1641

if (!grp)

1642

goto no_join;

1642

goto no_join;

1643

1644

my_grp = p->numa_group;

1644

my_grp = p->numa_group;

1645

if (grp == my_grp)

1645

if (grp == my_grp)

1646

goto no_join;

1646

goto no_join;

1647

1648

/*

1648

/*

1649

* Only join the other group if its bigger; if we're the bigger group,

1649

* Only join the other group if its bigger; if we're the bigger group,

1650

* the other task will join us.

1650

* the other task will join us.

1651

*/

1651

*/

1652

if (my_grp->nr_tasks > grp->nr_tasks)

1652

if (my_grp->nr_tasks > grp->nr_tasks)

1653

goto no_join;

1653

goto no_join;

1654

1655

/*

1655

/*

1656

* Tie-break on the grp address.

1656

* Tie-break on the grp address.

1657

*/

1657

*/

1658

if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)

1658

if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)

1659

goto no_join;

1659

goto no_join;

1660

1661

/* Always join threads in the same process. */

1661

/* Always join threads in the same process. */

1662

if (tsk->mm == current->mm)

1662

if (tsk->mm == current->mm)

1663

join = true;

1663

join = true;

1664

1665

/* Simple filter to avoid false positives due to PID collisions */

1665

/* Simple filter to avoid false positives due to PID collisions */

1666

if (flags & TNF_SHARED)

1666

if (flags & TNF_SHARED)

1667

join = true;

1667

join = true;

1668

1669

/* Update priv based on whether false sharing was detected */

1669

/* Update priv based on whether false sharing was detected */

1670

*priv = !join;

1670

*priv = !join;

1671

1672

if (join && !get_numa_group(grp))

1672

if (join && !get_numa_group(grp))

1673

goto no_join;

1673

goto no_join;

1674

1675

rcu_read_unlock();

1675

rcu_read_unlock();

1676

1677

if (!join)

1677

if (!join)

1678

return;

1678

return;

1679

1680

BUG_ON(irqs_disabled());

1680

BUG_ON(irqs_disabled());

1681

double_lock_irq(&my_grp->lock, &grp->lock);

1681

double_lock_irq(&my_grp->lock, &grp->lock);

1682

1683

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {

1683

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {

1684

my_grp->faults[i] -= p->numa_faults_memory[i];

1684

my_grp->faults[i] -= p->numa_faults_memory[i];

1685

grp->faults[i] += p->numa_faults_memory[i];

1685

grp->faults[i] += p->numa_faults_memory[i];

1686

}

1686

}

1687

my_grp->total_faults -= p->total_numa_faults;

1687

my_grp->total_faults -= p->total_numa_faults;

1688

grp->total_faults += p->total_numa_faults;

1688

grp->total_faults += p->total_numa_faults;

1689

1690

list_move(&p->numa_entry, &grp->task_list);

1690

list_move(&p->numa_entry, &grp->task_list);

1691

my_grp->nr_tasks--;

1691

my_grp->nr_tasks--;

1692

grp->nr_tasks++;

1692

grp->nr_tasks++;

1693

1694

spin_unlock(&my_grp->lock);

1694

spin_unlock(&my_grp->lock);

1695

spin_unlock_irq(&grp->lock);

1695

spin_unlock_irq(&grp->lock);

1696

1697

rcu_assign_pointer(p->numa_group, grp);

1697

rcu_assign_pointer(p->numa_group, grp);

1698

1699

put_numa_group(my_grp);

1699

put_numa_group(my_grp);

1700

return;

1700

return;

1701

1702

no_join:

1702

no_join:

1703

rcu_read_unlock();

1703

rcu_read_unlock();

1704

return;

1704

return;

1705

}

1705

}

1706

1707

void task_numa_free(struct task_struct *p)

1707

void task_numa_free(struct task_struct *p)

1708

{

1708

{

1709

struct numa_group *grp = p->numa_group;

1709

struct numa_group *grp = p->numa_group;

1710

int i;

1711

void *numa_faults = p->numa_faults_memory;

1710

void *numa_faults = p->numa_faults_memory;

1711

unsigned long flags;

1712

int i;

1712

1713

if (grp) {

1714

if (grp) {

1714

spin_lock_irq(&grp->lock);

1715

spin_lock_irqsave(&grp->lock, flags);

1715

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

1716

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

1716

grp->faults[i] -= p->numa_faults_memory[i];

1717

grp->faults[i] -= p->numa_faults_memory[i];

1717

grp->total_faults -= p->total_numa_faults;

1718

grp->total_faults -= p->total_numa_faults;

1718

1719

list_del(&p->numa_entry);

1720

list_del(&p->numa_entry);

1720

grp->nr_tasks--;

1721

grp->nr_tasks--;

1721

spin_unlock_irq(&grp->lock);

1722

spin_unlock_irqrestore(&grp->lock, flags);

1722

rcu_assign_pointer(p->numa_group, NULL);

1723

rcu_assign_pointer(p->numa_group, NULL);

1723

put_numa_group(grp);

1724

put_numa_group(grp);

1724

}

1725

}

1725

1726

p->numa_faults_memory = NULL;

1727

p->numa_faults_memory = NULL;

1727

p->numa_faults_buffer_memory = NULL;

1728

p->numa_faults_buffer_memory = NULL;

1728

p->numa_faults_cpu= NULL;

1729

p->numa_faults_cpu= NULL;

1729

p->numa_faults_buffer_cpu = NULL;

1730

p->numa_faults_buffer_cpu = NULL;

1730

kfree(numa_faults);

1731

kfree(numa_faults);

1731

}

1732

}

1732

1733

/*

1734

/*

1734

* Got a PROT_NONE fault for a page on @node.

1735

* Got a PROT_NONE fault for a page on @node.

1735

*/

1736

*/

1736

void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

1737

void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

1737

{

1738

{

1738

struct task_struct *p = current;

1739

struct task_struct *p = current;

1739

bool migrated = flags & TNF_MIGRATED;

1740

bool migrated = flags & TNF_MIGRATED;

1740

int cpu_node = task_node(current);

1741

int cpu_node = task_node(current);

1741

int priv;

1742

int priv;

1742

1743

if (!numabalancing_enabled)

1744

if (!numabalancing_enabled)

1744

return;

1745

return;

1745

1746

/* for example, ksmd faulting in a user's mm */

1747

/* for example, ksmd faulting in a user's mm */

1747

if (!p->mm)

1748

if (!p->mm)

1748

return;

1749

return;

1749

1750

/* Do not worry about placement if exiting */

1751

/* Do not worry about placement if exiting */

1751

if (p->state == TASK_DEAD)

1752

if (p->state == TASK_DEAD)

1752

return;

1753

return;

1753

1754

/* Allocate buffer to track faults on a per-node basis */

1755

/* Allocate buffer to track faults on a per-node basis */

1755

if (unlikely(!p->numa_faults_memory)) {

1756

if (unlikely(!p->numa_faults_memory)) {

1756

int size = sizeof(*p->numa_faults_memory) *

1757

int size = sizeof(*p->numa_faults_memory) *

1757

NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

1758

NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

1758

1759

p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);

1760

p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);

1760

if (!p->numa_faults_memory)

1761

if (!p->numa_faults_memory)

1761

return;

1762

return;

1762

1763

BUG_ON(p->numa_faults_buffer_memory);

1764

BUG_ON(p->numa_faults_buffer_memory);

1764

/*

1765

/*

1765

* The averaged statistics, shared & private, memory & cpu,

1766

* The averaged statistics, shared & private, memory & cpu,

1766

* occupy the first half of the array. The second half of the

1767

* occupy the first half of the array. The second half of the

1767

* array is for current counters, which are averaged into the

1768

* array is for current counters, which are averaged into the

1768

* first set by task_numa_placement.

1769

* first set by task_numa_placement.

1769

*/

1770

*/

1770

p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);

1771

p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);

1771

p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);

1772

p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);

1772

p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);

1773

p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);

1773

p->total_numa_faults = 0;

1774

p->total_numa_faults = 0;

1774

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

1775

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

1775

}

1776

}

1776

1777

/*

1778

/*

1778

* First accesses are treated as private, otherwise consider accesses

1779

* First accesses are treated as private, otherwise consider accesses

1779

* to be private if the accessing pid has not changed

1780

* to be private if the accessing pid has not changed

1780

*/

1781

*/

1781

if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {

1782

if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {

1782

priv = 1;

1783

priv = 1;

1783

} else {

1784

} else {

1784

priv = cpupid_match_pid(p, last_cpupid);

1785

priv = cpupid_match_pid(p, last_cpupid);

1785

if (!priv && !(flags & TNF_NO_GROUP))

1786

if (!priv && !(flags & TNF_NO_GROUP))

1786

task_numa_group(p, last_cpupid, flags, &priv);

1787

task_numa_group(p, last_cpupid, flags, &priv);

1787

}

1788

}

1788

1789

task_numa_placement(p);

1790

task_numa_placement(p);

1790

1791

/*

1792

/*

1792

* Retry task to preferred node migration periodically, in case it

1793

* Retry task to preferred node migration periodically, in case it

1793

* case it previously failed, or the scheduler moved us.

1794

* case it previously failed, or the scheduler moved us.

1794

*/

1795

*/

1795

if (time_after(jiffies, p->numa_migrate_retry))

1796

if (time_after(jiffies, p->numa_migrate_retry))

1796

numa_migrate_preferred(p);

1797

numa_migrate_preferred(p);

1797

1798

if (migrated)

1799

if (migrated)

1799

p->numa_pages_migrated += pages;

1800

p->numa_pages_migrated += pages;

1800

1801

p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;

1802

p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;

1802

p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;

1803

p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;

1803

p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;

1804

p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;

1804

}

1805

}

1805

1806

static void reset_ptenuma_scan(struct task_struct *p)

1807

static void reset_ptenuma_scan(struct task_struct *p)

1807

{

1808

{

1808

ACCESS_ONCE(p->mm->numa_scan_seq)++;

1809

ACCESS_ONCE(p->mm->numa_scan_seq)++;

1809

p->mm->numa_scan_offset = 0;

1810

p->mm->numa_scan_offset = 0;

1810

}

1811

}

1811

1812

/*

1813

/*

1813

* The expensive part of numa migration is done from task_work context.

1814

* The expensive part of numa migration is done from task_work context.

1814

* Triggered from task_tick_numa().

1815

* Triggered from task_tick_numa().

1815

*/

1816

*/

1816

void task_numa_work(struct callback_head *work)

1817

void task_numa_work(struct callback_head *work)

1817

{

1818

{

1818

unsigned long migrate, next_scan, now = jiffies;

1819

unsigned long migrate, next_scan, now = jiffies;

1819

struct task_struct *p = current;

1820

struct task_struct *p = current;

1820

struct mm_struct *mm = p->mm;

1821

struct mm_struct *mm = p->mm;

1821

struct vm_area_struct *vma;

1822

struct vm_area_struct *vma;

1822

unsigned long start, end;

1823

unsigned long start, end;

1823

unsigned long nr_pte_updates = 0;

1824

unsigned long nr_pte_updates = 0;

1824

long pages;

1825

long pages;

1825

1826

WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

1827

WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

1827

1828

work->next = work; /* protect against double add */

1829

work->next = work; /* protect against double add */

1829

/*

1830

/*

1830

* Who cares about NUMA placement when they're dying.

1831

* Who cares about NUMA placement when they're dying.

1831

*

1832

*

1832

* NOTE: make sure not to dereference p->mm before this check,

1833

* NOTE: make sure not to dereference p->mm before this check,

1833

* exit_task_work() happens _after_ exit_mm() so we could be called

1834

* exit_task_work() happens _after_ exit_mm() so we could be called

1834

* without p->mm even though we still had it when we enqueued this

1835

* without p->mm even though we still had it when we enqueued this

1835

* work.

1836

* work.

1836

*/

1837

*/

1837

if (p->flags & PF_EXITING)

1838

if (p->flags & PF_EXITING)

1838

return;

1839

return;

1839

1840

if (!mm->numa_next_scan) {

1841

if (!mm->numa_next_scan) {

1841

mm->numa_next_scan = now +

1842

mm->numa_next_scan = now +

1842

msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

1843

msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

1843

}

1844

}

1844

1845

/*

1846

/*

1846

* Enforce maximal scan/migration frequency..

1847

* Enforce maximal scan/migration frequency..

1847

*/

1848

*/

1848

migrate = mm->numa_next_scan;

1849

migrate = mm->numa_next_scan;

1849

if (time_before(now, migrate))

1850

if (time_before(now, migrate))

1850

return;

1851

return;

1851

1852

if (p->numa_scan_period == 0) {

1853

if (p->numa_scan_period == 0) {

1853

p->numa_scan_period_max = task_scan_max(p);

1854

p->numa_scan_period_max = task_scan_max(p);

1854

p->numa_scan_period = task_scan_min(p);

1855

p->numa_scan_period = task_scan_min(p);

1855

}

1856

}

1856

1857

next_scan = now + msecs_to_jiffies(p->numa_scan_period);

1858

next_scan = now + msecs_to_jiffies(p->numa_scan_period);

1858

if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)

1859

if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)

1859

return;

1860

return;

1860

1861

/*

1862

/*

1862

* Delay this task enough that another task of this mm will likely win

1863

* Delay this task enough that another task of this mm will likely win

1863

* the next time around.

1864

* the next time around.

1864

*/

1865

*/

1865

p->node_stamp += 2 * TICK_NSEC;

1866

p->node_stamp += 2 * TICK_NSEC;

1866

1867

start = mm->numa_scan_offset;

1868

start = mm->numa_scan_offset;

1868

pages = sysctl_numa_balancing_scan_size;

1869

pages = sysctl_numa_balancing_scan_size;

1869

pages <<= 20 - PAGE_SHIFT; /* MB in pages */

1870

pages <<= 20 - PAGE_SHIFT; /* MB in pages */

1870

if (!pages)

1871

if (!pages)

1871

return;

1872

return;

1872

1873

down_read(&mm->mmap_sem);

1874

down_read(&mm->mmap_sem);

1874

vma = find_vma(mm, start);

1875

vma = find_vma(mm, start);

1875

if (!vma) {

1876

if (!vma) {

1876

reset_ptenuma_scan(p);

1877

reset_ptenuma_scan(p);

1877

start = 0;

1878

start = 0;

1878

vma = mm->mmap;

1879

vma = mm->mmap;

1879

}

1880

}

1880

for (; vma; vma = vma->vm_next) {

1881

for (; vma; vma = vma->vm_next) {

1881

if (!vma_migratable(vma) || !vma_policy_mof(p, vma))

1882

if (!vma_migratable(vma) || !vma_policy_mof(p, vma))

1882

continue;

1883

continue;

1883

1884

/*

1885

/*

1885

* Shared library pages mapped by multiple processes are not

1886

* Shared library pages mapped by multiple processes are not

1886

* migrated as it is expected they are cache replicated. Avoid

1887

* migrated as it is expected they are cache replicated. Avoid

1887

* hinting faults in read-only file-backed mappings or the vdso

1888

* hinting faults in read-only file-backed mappings or the vdso

1888

* as migrating the pages will be of marginal benefit.

1889

* as migrating the pages will be of marginal benefit.

1889

*/

1890

*/

1890

if (!vma->vm_mm ||

1891

if (!vma->vm_mm ||

1891

(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))

1892

(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))

1892

continue;

1893

continue;

1893

1894

/*

1895

/*

1895

* Skip inaccessible VMAs to avoid any confusion between

1896

* Skip inaccessible VMAs to avoid any confusion between

1896

* PROT_NONE and NUMA hinting ptes

1897

* PROT_NONE and NUMA hinting ptes

1897

*/

1898

*/

1898

if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))

1899

if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))

1899

continue;

1900

continue;

1900

1901

do {

1902

do {

1902

start = max(start, vma->vm_start);

1903

start = max(start, vma->vm_start);

1903

end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);

1904

end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);

1904

end = min(end, vma->vm_end);

1905

end = min(end, vma->vm_end);

1905

nr_pte_updates += change_prot_numa(vma, start, end);

1906

nr_pte_updates += change_prot_numa(vma, start, end);

1906

1907

/*

1908

/*

1908

* Scan sysctl_numa_balancing_scan_size but ensure that

1909

* Scan sysctl_numa_balancing_scan_size but ensure that

1909

* at least one PTE is updated so that unused virtual

1910

* at least one PTE is updated so that unused virtual

1910

* address space is quickly skipped.

1911

* address space is quickly skipped.

1911

*/

1912

*/

1912

if (nr_pte_updates)

1913

if (nr_pte_updates)

1913

pages -= (end - start) >> PAGE_SHIFT;

1914

pages -= (end - start) >> PAGE_SHIFT;

1914

1915

start = end;

1916

start = end;

1916

if (pages <= 0)

1917

if (pages <= 0)

1917

goto out;

1918

goto out;

1918

1919

cond_resched();

1920

cond_resched();

1920

} while (end != vma->vm_end);

1921

} while (end != vma->vm_end);

1921

}

1922

}

1922

1923

out:

1924

out:

1924

/*

1925

/*

1925

* It is possible to reach the end of the VMA list but the last few

1926

* It is possible to reach the end of the VMA list but the last few

1926

* VMAs are not guaranteed to the vma_migratable. If they are not, we

1927

* VMAs are not guaranteed to the vma_migratable. If they are not, we

1927

* would find the !migratable VMA on the next scan but not reset the

1928

* would find the !migratable VMA on the next scan but not reset the

1928

* scanner to the start so check it now.

1929

* scanner to the start so check it now.

1929

*/

1930

*/

1930

if (vma)

1931

if (vma)

1931

mm->numa_scan_offset = start;

1932

mm->numa_scan_offset = start;

1932

else

1933

else

1933

reset_ptenuma_scan(p);

1934

reset_ptenuma_scan(p);

1934

up_read(&mm->mmap_sem);

1935

up_read(&mm->mmap_sem);

1935

}

1936

}

1936

1937

/*

1938

/*

1938

* Drive the periodic memory faults..

1939

* Drive the periodic memory faults..

1939

*/

1940

*/

1940

void task_tick_numa(struct rq *rq, struct task_struct *curr)

1941

void task_tick_numa(struct rq *rq, struct task_struct *curr)

1941

{

1942

{

1942

struct callback_head *work = &curr->numa_work;

1943

struct callback_head *work = &curr->numa_work;

1943

u64 period, now;

1944

u64 period, now;

1944

1945

/*

1946

/*

1946

* We don't care about NUMA placement if we don't have memory.

1947

* We don't care about NUMA placement if we don't have memory.

1947

*/

1948

*/

1948

if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)

1949

if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)

1949

return;

1950

return;

1950

1951

/*

1952

/*

1952

* Using runtime rather than walltime has the dual advantage that

1953

* Using runtime rather than walltime has the dual advantage that

1953

* we (mostly) drive the selection from busy threads and that the

1954

* we (mostly) drive the selection from busy threads and that the

1954

* task needs to have done some actual work before we bother with

1955

* task needs to have done some actual work before we bother with

1955

* NUMA placement.

1956

* NUMA placement.

1956

*/

1957

*/

1957

now = curr->se.sum_exec_runtime;

1958

now = curr->se.sum_exec_runtime;

1958

period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

1959

period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

1959

1960

if (now - curr->node_stamp > period) {

1961

if (now - curr->node_stamp > period) {

1961

if (!curr->node_stamp)

1962

if (!curr->node_stamp)

1962

curr->numa_scan_period = task_scan_min(curr);

1963

curr->numa_scan_period = task_scan_min(curr);

1963

curr->node_stamp += period;

1964

curr->node_stamp += period;

1964

1965

if (!time_before(jiffies, curr->mm->numa_next_scan)) {

1966

if (!time_before(jiffies, curr->mm->numa_next_scan)) {

1966

init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */

1967

init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */

1967

task_work_add(curr, work, true);

1968

task_work_add(curr, work, true);

1968

}

1969

}

1969

}

1970

}

1970

}

1971

}

1971

#else

1972

#else

1972

/* No-op variant used when CONFIG_NUMA_BALANCING is not enabled. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}

1975

1976

/* No-op variant used when CONFIG_NUMA_BALANCING is not enabled. */
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}

1979

1980

/* No-op variant used when CONFIG_NUMA_BALANCING is not enabled. */
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}

1983

#endif /* CONFIG_NUMA_BALANCING */

1984

#endif /* CONFIG_NUMA_BALANCING */

1984

1985

static void

1986

static void

1986

account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

1987

account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

1987

{

1988

{

1988

update_load_add(&cfs_rq->load, se->load.weight);

1989

update_load_add(&cfs_rq->load, se->load.weight);

1989

if (!parent_entity(se))

1990

if (!parent_entity(se))

1990

update_load_add(&rq_of(cfs_rq)->load, se->load.weight);

1991

update_load_add(&rq_of(cfs_rq)->load, se->load.weight);

1991

#ifdef CONFIG_SMP

1992

#ifdef CONFIG_SMP

1992

if (entity_is_task(se)) {

1993

if (entity_is_task(se)) {

1993

struct rq *rq = rq_of(cfs_rq);

1994

struct rq *rq = rq_of(cfs_rq);

1994

1995

account_numa_enqueue(rq, task_of(se));

1996

account_numa_enqueue(rq, task_of(se));

1996

list_add(&se->group_node, &rq->cfs_tasks);

1997

list_add(&se->group_node, &rq->cfs_tasks);

1997

}

1998

}

1998

#endif

1999

#endif

1999

cfs_rq->nr_running++;

2000

cfs_rq->nr_running++;

2000

}

2001

}

2001

2002

static void

2003

static void

2003

account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

2004

account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

2004

{

2005

{

2005

update_load_sub(&cfs_rq->load, se->load.weight);

2006

update_load_sub(&cfs_rq->load, se->load.weight);

2006

if (!parent_entity(se))

2007

if (!parent_entity(se))

2007

update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);

2008

update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);

2008

if (entity_is_task(se)) {

2009

if (entity_is_task(se)) {

2009

account_numa_dequeue(rq_of(cfs_rq), task_of(se));

2010

account_numa_dequeue(rq_of(cfs_rq), task_of(se));

2010

list_del_init(&se->group_node);

2011

list_del_init(&se->group_node);

2011

}

2012

}

2012

cfs_rq->nr_running--;

2013

cfs_rq->nr_running--;

2013

}

2014

}

2014

2015

#ifdef CONFIG_FAIR_GROUP_SCHED

2016

#ifdef CONFIG_FAIR_GROUP_SCHED

2016

# ifdef CONFIG_SMP

2017

# ifdef CONFIG_SMP

2017

static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)

2018

static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)

2018

{

2019

{

2019

long tg_weight;

2020

long tg_weight;

2020

2021

/*

2022

/*

2022

* Use this CPU's actual weight instead of the last load_contribution

2023

* Use this CPU's actual weight instead of the last load_contribution

2023

* to gain a more accurate current total weight. See

2024

* to gain a more accurate current total weight. See

2024

* update_cfs_rq_load_contribution().

2025

* update_cfs_rq_load_contribution().

2025

*/

2026

*/

2026

tg_weight = atomic_long_read(&tg->load_avg);

2027

tg_weight = atomic_long_read(&tg->load_avg);

2027

tg_weight -= cfs_rq->tg_load_contrib;

2028

tg_weight -= cfs_rq->tg_load_contrib;

2028

tg_weight += cfs_rq->load.weight;

2029

tg_weight += cfs_rq->load.weight;

2029

2030

return tg_weight;

2031

return tg_weight;

2031

}

2032

}

2032

2033

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)

2034

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)

2034

{

2035

{

2035

long tg_weight, load, shares;

2036

long tg_weight, load, shares;

2036

2037

tg_weight = calc_tg_weight(tg, cfs_rq);

2038

tg_weight = calc_tg_weight(tg, cfs_rq);

2038

load = cfs_rq->load.weight;

2039

load = cfs_rq->load.weight;

2039

2040

shares = (tg->shares * load);

2041

shares = (tg->shares * load);

2041

if (tg_weight)

2042

if (tg_weight)

2042

shares /= tg_weight;

2043

shares /= tg_weight;

2043

2044

if (shares < MIN_SHARES)

2045

if (shares < MIN_SHARES)

2045

shares = MIN_SHARES;

2046

shares = MIN_SHARES;

2046

if (shares > tg->shares)

2047

if (shares > tg->shares)

2047

shares = tg->shares;

2048

shares = tg->shares;

2048

2049

return shares;

2050

return shares;

2050

}

2051

}

2051

# else /* CONFIG_SMP */

2052

# else /* CONFIG_SMP */

2052

static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)

2053

static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)

2053

{

2054

{

2054

return tg->shares;

2055

return tg->shares;

2055

}

2056

}

2056

# endif /* CONFIG_SMP */

2057

# endif /* CONFIG_SMP */

2057

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,

2058

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,

2058

unsigned long weight)

2059

unsigned long weight)

2059

{

2060

{

2060

if (se->on_rq) {

2061

if (se->on_rq) {

2061

/* commit outstanding execution time */

2062

/* commit outstanding execution time */

2062

if (cfs_rq->curr == se)

2063

if (cfs_rq->curr == se)

2063

update_curr(cfs_rq);

2064

update_curr(cfs_rq);

2064

account_entity_dequeue(cfs_rq, se);

2065

account_entity_dequeue(cfs_rq, se);

2065

}

2066

}

2066

2067

update_load_set(&se->load, weight);

2068

update_load_set(&se->load, weight);

2068

2069

if (se->on_rq)

2070

if (se->on_rq)

2070

account_entity_enqueue(cfs_rq, se);

2071

account_entity_enqueue(cfs_rq, se);

2071

}

2072

}

2072

2073

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

2074

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

2074

2075

static void update_cfs_shares(struct cfs_rq *cfs_rq)

2076

static void update_cfs_shares(struct cfs_rq *cfs_rq)

2076

{

2077

{

2077

struct task_group *tg;

2078

struct task_group *tg;

2078

struct sched_entity *se;

2079

struct sched_entity *se;

2079

long shares;

2080

long shares;

2080

2081

tg = cfs_rq->tg;

2082

tg = cfs_rq->tg;

2082

se = tg->se[cpu_of(rq_of(cfs_rq))];

2083

se = tg->se[cpu_of(rq_of(cfs_rq))];

2083

if (!se || throttled_hierarchy(cfs_rq))

2084

if (!se || throttled_hierarchy(cfs_rq))

2084

return;

2085

return;

2085

#ifndef CONFIG_SMP

2086

#ifndef CONFIG_SMP

2086

if (likely(se->load.weight == tg->shares))

2087

if (likely(se->load.weight == tg->shares))

2087

return;

2088

return;

2088

#endif

2089

#endif

2089

shares = calc_cfs_shares(cfs_rq, tg);

2090

shares = calc_cfs_shares(cfs_rq, tg);

2090

2091

reweight_entity(cfs_rq_of(se), se, shares);

2092

reweight_entity(cfs_rq_of(se), se, shares);

2092

}

2093

}

2093

#else /* CONFIG_FAIR_GROUP_SCHED */

2094

#else /* CONFIG_FAIR_GROUP_SCHED */

2094

/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}

2097

#endif /* CONFIG_FAIR_GROUP_SCHED */

2098

#endif /* CONFIG_FAIR_GROUP_SCHED */

2098

2099

#ifdef CONFIG_SMP

2100

#ifdef CONFIG_SMP

2100

/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32 /* number of periods after which a value is halved */
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */

2107

2108

/*
 * Precomputed fixed inverse multiplies for multiplication by y^n.
 *
 * The table has LOAD_AVG_PERIOD (32) entries, indexed by n.  decay_load()
 * multiplies by entry n and then shifts right by 32, so each entry stands
 * for y^n scaled by 2^32.
 */
static const u32 runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

2117

2118

/*
 * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
 * over-estimates when re-combining.
 *
 * The table has 33 entries so that it can be indexed with every n from 0 up
 * to and including LOAD_AVG_PERIOD.
 */
static const u32 runnable_avg_yN_sum[] = {
	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};

2127

2128

/*

2129

/*

2129

* Approximate:

2130

* Approximate:

2130

* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)

2131

* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)

2131

*/

2132

*/

2132

static __always_inline u64 decay_load(u64 val, u64 n)

2133

static __always_inline u64 decay_load(u64 val, u64 n)

2133

{

2134

{

2134

unsigned int local_n;

2135

unsigned int local_n;

2135

2136

if (!n)

2137

if (!n)

2137

return val;

2138

return val;

2138

else if (unlikely(n > LOAD_AVG_PERIOD * 63))

2139

else if (unlikely(n > LOAD_AVG_PERIOD * 63))

2139

return 0;

2140

return 0;

2140

2141

/* after bounds checking we can collapse to 32-bit */

2142

/* after bounds checking we can collapse to 32-bit */

2142

local_n = n;

2143

local_n = n;

2143

2144

/*

2145

/*

2145

* As y^PERIOD = 1/2, we can combine

2146

* As y^PERIOD = 1/2, we can combine

2146

* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)

2147

* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)

2147

* With a look-up table which covers k^n (n<PERIOD)

2148

* With a look-up table which covers k^n (n<PERIOD)

2148

*

2149

*

2149

* To achieve constant time decay_load.

2150

* To achieve constant time decay_load.

2150

*/

2151

*/

2151

if (unlikely(local_n >= LOAD_AVG_PERIOD)) {

2152

if (unlikely(local_n >= LOAD_AVG_PERIOD)) {

2152

val >>= local_n / LOAD_AVG_PERIOD;

2153

val >>= local_n / LOAD_AVG_PERIOD;

2153

local_n %= LOAD_AVG_PERIOD;

2154

local_n %= LOAD_AVG_PERIOD;

2154

}

2155

}

2155

2156

val *= runnable_avg_yN_inv[local_n];

2157

val *= runnable_avg_yN_inv[local_n];

2157

/* We don't use SRR here since we always want to round down. */

2158

/* We don't use SRR here since we always want to round down. */

2158

return val >> 32;

2159

return val >> 32;

2159

}

2160

}

2160

2161

/*

2162

/*

2162

* For updates fully spanning n periods, the contribution to runnable

2163

* For updates fully spanning n periods, the contribution to runnable

2163

* average will be: \Sum 1024*y^n

2164

* average will be: \Sum 1024*y^n

2164

*

2165

*

2165

* We can compute this reasonably efficiently by combining:

2166

* We can compute this reasonably efficiently by combining:

2166

* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}

2167

* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}

2167

*/

2168

*/

2168

static u32 __compute_runnable_contrib(u64 n)

2169

static u32 __compute_runnable_contrib(u64 n)

2169

{

2170

{

2170

u32 contrib = 0;

2171

u32 contrib = 0;

2171

2172

if (likely(n <= LOAD_AVG_PERIOD))

2173

if (likely(n <= LOAD_AVG_PERIOD))

2173

return runnable_avg_yN_sum[n];

2174

return runnable_avg_yN_sum[n];

2174

else if (unlikely(n >= LOAD_AVG_MAX_N))

2175

else if (unlikely(n >= LOAD_AVG_MAX_N))

2175

return LOAD_AVG_MAX;

2176

return LOAD_AVG_MAX;

2176

2177

/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */

2178

/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */

2178

do {

2179

do {

2179

contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */

2180

contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */

2180

contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

2181

contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

2181

2182

n -= LOAD_AVG_PERIOD;

2183

n -= LOAD_AVG_PERIOD;

2183

} while (n > LOAD_AVG_PERIOD);

2184

} while (n > LOAD_AVG_PERIOD);

2184

2185

contrib = decay_load(contrib, n);

2186

contrib = decay_load(contrib, n);

2186

return contrib + runnable_avg_yN_sum[n];

2187

return contrib + runnable_avg_yN_sum[n];

2187

}

2188

}

2188

2189

/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *      p0            p1           p2
 *     (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 *
 * @now:      current timestamp, compared against sa->last_runnable_update
 * @sa:       the average being updated
 * @runnable: non-zero if the elapsed time counts towards runnable_avg_sum;
 *            runnable_avg_period is advanced either way
 *
 * Returns 1 if the update crossed a period boundary (the sums were decayed),
 * 0 otherwise.
 */
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;

	delta = now - sa->last_runnable_update;
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
	/*
	 * Less than one unit has elapsed: leave last_runnable_update alone so
	 * that this time is still counted by a later update.
	 */
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

	/* delta_w is the amount already accumulated against our next period */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		/*
		 * Now that we know we're crossing a period boundary, figure
		 * out how much from delta we need to complete the current
		 * period and accrue it.
		 */
		delta_w = 1024 - delta_w;
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		/* Figure out how many additional periods this update spans */
		periods = delta / 1024;
		delta %= 1024;

		/*
		 * Age the existing sums: one step for the period that was just
		 * completed plus one per additional full period.
		 */
		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);

		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
		runnable_contrib = __compute_runnable_contrib(periods);
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		sa->runnable_avg_period += runnable_contrib;
	}

	/* Remainder of delta accrued against u_0` */
	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;

	return decayed;
}

2285

2286

/* Synchronize an entity's decay with its parenting cfs_rq.*/

2287

/* Synchronize an entity's decay with its parenting cfs_rq.*/

2287

static inline u64 __synchronize_entity_decay(struct sched_entity *se)

2288

static inline u64 __synchronize_entity_decay(struct sched_entity *se)

2288

{

2289

{

2289

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2290

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2290

u64 decays = atomic64_read(&cfs_rq->decay_counter);

2291

u64 decays = atomic64_read(&cfs_rq->decay_counter);

2291

2292

decays -= se->avg.decay_count;

2293

decays -= se->avg.decay_count;

2293

if (!decays)

2294

if (!decays)

2294

return 0;

2295

return 0;

2295

2296

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);

2297

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);

2297

se->avg.decay_count = 0;

2298

se->avg.decay_count = 0;

2298

2299

return decays;

2300

return decays;

2300

}

2301

}

2301

2302

#ifdef CONFIG_FAIR_GROUP_SCHED

2303

#ifdef CONFIG_FAIR_GROUP_SCHED

2303

static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,

2304

static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,

2304

int force_update)

2305

int force_update)

2305

{

2306

{

2306

struct task_group *tg = cfs_rq->tg;

2307

struct task_group *tg = cfs_rq->tg;

2307

long tg_contrib;

2308

long tg_contrib;

2308

2309

tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;

2310

tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;

2310

tg_contrib -= cfs_rq->tg_load_contrib;

2311

tg_contrib -= cfs_rq->tg_load_contrib;

2311

2312

if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {

2313

if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {

2313

atomic_long_add(tg_contrib, &tg->load_avg);

2314

atomic_long_add(tg_contrib, &tg->load_avg);

2314

cfs_rq->tg_load_contrib += tg_contrib;

2315

cfs_rq->tg_load_contrib += tg_contrib;

2315

}

2316

}

2316

}

2317

}

2317

2318

/*

2319

/*

2319

* Aggregate cfs_rq runnable averages into an equivalent task_group

2320

* Aggregate cfs_rq runnable averages into an equivalent task_group

2320

* representation for computing load contributions.

2321

* representation for computing load contributions.

2321

*/

2322

*/

2322

static inline void __update_tg_runnable_avg(struct sched_avg *sa,

2323

static inline void __update_tg_runnable_avg(struct sched_avg *sa,

2323

struct cfs_rq *cfs_rq)

2324

struct cfs_rq *cfs_rq)

2324

{

2325

{

2325

struct task_group *tg = cfs_rq->tg;

2326

struct task_group *tg = cfs_rq->tg;

2326

long contrib;

2327

long contrib;

2327

2328

/* The fraction of a cpu used by this cfs_rq */

2329

/* The fraction of a cpu used by this cfs_rq */

2329

contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,

2330

contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,

2330

sa->runnable_avg_period + 1);

2331

sa->runnable_avg_period + 1);

2331

contrib -= cfs_rq->tg_runnable_contrib;

2332

contrib -= cfs_rq->tg_runnable_contrib;

2332

2333

if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {

2334

if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {

2334

atomic_add(contrib, &tg->runnable_avg);

2335

atomic_add(contrib, &tg->runnable_avg);

2335

cfs_rq->tg_runnable_contrib += contrib;

2336

cfs_rq->tg_runnable_contrib += contrib;

2336

}

2337

}

2337

}

2338

}

2338

2339

static inline void __update_group_entity_contrib(struct sched_entity *se)

2340

static inline void __update_group_entity_contrib(struct sched_entity *se)

2340

{

2341

{

2341

struct cfs_rq *cfs_rq = group_cfs_rq(se);

2342

struct cfs_rq *cfs_rq = group_cfs_rq(se);

2342

struct task_group *tg = cfs_rq->tg;

2343

struct task_group *tg = cfs_rq->tg;

2343

int runnable_avg;

2344

int runnable_avg;

2344

2345

u64 contrib;

2346

u64 contrib;

2346

2347

contrib = cfs_rq->tg_load_contrib * tg->shares;

2348

contrib = cfs_rq->tg_load_contrib * tg->shares;

2348

se->avg.load_avg_contrib = div_u64(contrib,

2349

se->avg.load_avg_contrib = div_u64(contrib,

2349

atomic_long_read(&tg->load_avg) + 1);

2350

atomic_long_read(&tg->load_avg) + 1);

2350

2351

/*

2352

/*

2352

* For group entities we need to compute a correction term in the case

2353

* For group entities we need to compute a correction term in the case

2353

* that they are consuming <1 cpu so that we would contribute the same

2354

* that they are consuming <1 cpu so that we would contribute the same

2354

* load as a task of equal weight.

2355

* load as a task of equal weight.

2355

*

2356

*

2356

* Explicitly co-ordinating this measurement would be expensive, but

2357

* Explicitly co-ordinating this measurement would be expensive, but

2357

* fortunately the sum of each cpus contribution forms a usable

2358

* fortunately the sum of each cpus contribution forms a usable

2358

* lower-bound on the true value.

2359

* lower-bound on the true value.

2359

*

2360

*

2360

* Consider the aggregate of 2 contributions. Either they are disjoint

2361

* Consider the aggregate of 2 contributions. Either they are disjoint

2361

* (and the sum represents true value) or they are disjoint and we are

2362

* (and the sum represents true value) or they are disjoint and we are

2362

* understating by the aggregate of their overlap.

2363

* understating by the aggregate of their overlap.

2363

*

2364

*

2364

* Extending this to N cpus, for a given overlap, the maximum amount we

2365

* Extending this to N cpus, for a given overlap, the maximum amount we

2365

* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of

2366

* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of

2366

* cpus that overlap for this interval and w_i is the interval width.

2367

* cpus that overlap for this interval and w_i is the interval width.

2367

*

2368

*

2368

* On a small machine; the first term is well-bounded which bounds the

2369

* On a small machine; the first term is well-bounded which bounds the

2369

* total error since w_i is a subset of the period. Whereas on a

2370

* total error since w_i is a subset of the period. Whereas on a

2370

* larger machine, while this first term can be larger, if w_i is the

2371

* larger machine, while this first term can be larger, if w_i is the

2371

* of consequential size guaranteed to see n_i*w_i quickly converge to

2372

* of consequential size guaranteed to see n_i*w_i quickly converge to

2372

* our upper bound of 1-cpu.

2373

* our upper bound of 1-cpu.

2373

*/

2374

*/

2374

runnable_avg = atomic_read(&tg->runnable_avg);

2375

runnable_avg = atomic_read(&tg->runnable_avg);

2375

if (runnable_avg < NICE_0_LOAD) {

2376

if (runnable_avg < NICE_0_LOAD) {

2376

se->avg.load_avg_contrib *= runnable_avg;

2377

se->avg.load_avg_contrib *= runnable_avg;

2377

se->avg.load_avg_contrib >>= NICE_0_SHIFT;

2378

se->avg.load_avg_contrib >>= NICE_0_SHIFT;

2378

}

2379

}

2379

}

2380

}

2380

2381

/*
 * Update rq->avg up to rq_clock_task(rq), counting the elapsed time as
 * runnable when @runnable is non-zero, then pass the result on to the task
 * group of rq->cfs through __update_tg_runnable_avg().
 */
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}

2386

#else /* CONFIG_FAIR_GROUP_SCHED */

2387

#else /* CONFIG_FAIR_GROUP_SCHED */

2387

/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update) {}

2389

/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
						  struct cfs_rq *cfs_rq) {}

2391

/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void __update_group_entity_contrib(struct sched_entity *se) {}

2392

/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}

2393

#endif /* CONFIG_FAIR_GROUP_SCHED */

2394

#endif /* CONFIG_FAIR_GROUP_SCHED */

2394

2395

/*
 * Compute the load contribution of a task entity: its weight multiplied by
 * runnable_avg_sum / (runnable_avg_period + 1).
 *
 * The intermediate value is held in a u32, so the product is reduced to 32
 * bits before the division takes place.
 */
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
	u32 contrib;

	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
	/* the + 1 avoids a division by zero when runnable_avg_period is 0 */
	contrib /= (se->avg.runnable_avg_period + 1);
	se->avg.load_avg_contrib = scale_load(contrib);
}

2404

2405

/* Compute the current contribution to load_avg by se, return any delta */

2406

/* Compute the current contribution to load_avg by se, return any delta */

2406

static long __update_entity_load_avg_contrib(struct sched_entity *se)

2407

static long __update_entity_load_avg_contrib(struct sched_entity *se)

2407

{

2408

{

2408

long old_contrib = se->avg.load_avg_contrib;

2409

long old_contrib = se->avg.load_avg_contrib;

2409

2410

if (entity_is_task(se)) {

2411

if (entity_is_task(se)) {

2411

__update_task_entity_contrib(se);

2412

__update_task_entity_contrib(se);

2412

} else {

2413

} else {

2413

__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));

2414

__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));

2414

__update_group_entity_contrib(se);

2415

__update_group_entity_contrib(se);

2415

}

2416

}

2416

2417

return se->avg.load_avg_contrib - old_contrib;

2418

return se->avg.load_avg_contrib - old_contrib;

2418

}

2419

}

2419

2420

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,

2421

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,

2421

long load_contrib)

2422

long load_contrib)

2422

{

2423

{

2423

if (likely(load_contrib < cfs_rq->blocked_load_avg))

2424

if (likely(load_contrib < cfs_rq->blocked_load_avg))

2424

cfs_rq->blocked_load_avg -= load_contrib;

2425

cfs_rq->blocked_load_avg -= load_contrib;

2425

else

2426

else

2426

cfs_rq->blocked_load_avg = 0;

2427

cfs_rq->blocked_load_avg = 0;

2427

}

2428

}

2428

2429

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

2430

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

2430

2431

/* Update a sched_entity's runnable average */

2432

/* Update a sched_entity's runnable average */

2432

static inline void update_entity_load_avg(struct sched_entity *se,

2433

static inline void update_entity_load_avg(struct sched_entity *se,

2433

int update_cfs_rq)

2434

int update_cfs_rq)

2434

{

2435

{

2435

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2436

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2436

long contrib_delta;

2437

long contrib_delta;

2437

u64 now;

2438

u64 now;

2438

2439

/*

2440

/*

2440

* For a group entity we need to use their owned cfs_rq_clock_task() in

2441

* For a group entity we need to use their owned cfs_rq_clock_task() in

2441

* case they are the parent of a throttled hierarchy.

2442

* case they are the parent of a throttled hierarchy.

2442

*/

2443

*/

2443

if (entity_is_task(se))

2444

if (entity_is_task(se))

2444

now = cfs_rq_clock_task(cfs_rq);

2445

now = cfs_rq_clock_task(cfs_rq);

2445

else

2446

else

2446

now = cfs_rq_clock_task(group_cfs_rq(se));

2447

now = cfs_rq_clock_task(group_cfs_rq(se));

2447

2448

if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))

2449

if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))

2449

return;

2450

return;

2450

2451

contrib_delta = __update_entity_load_avg_contrib(se);

2452

contrib_delta = __update_entity_load_avg_contrib(se);

2452

2453

if (!update_cfs_rq)

2454

if (!update_cfs_rq)

2454

return;

2455

return;

2455

2456

if (se->on_rq)

2457

if (se->on_rq)

2457

cfs_rq->runnable_load_avg += contrib_delta;

2458

cfs_rq->runnable_load_avg += contrib_delta;

2458

else

2459

else

2459

subtract_blocked_load_contrib(cfs_rq, -contrib_delta);

2460

subtract_blocked_load_contrib(cfs_rq, -contrib_delta);

2460

}

2461

}

2461

2462

/*
 * Decay the load contributed by all blocked children and account this so that
 * their contribution may be appropriately discounted when they wake up.
 *
 * @force_update: when non-zero, run the update even if no decay period has
 *                elapsed since cfs_rq->last_decay; it is also passed on to
 *                __update_cfs_rq_tg_load_contrib().
 */
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
	/*
	 * Decay periods are counted in units of 2^20 of the task clock
	 * (presumably ns, i.e. roughly 1ms -- NOTE(review): confirm the unit
	 * of cfs_rq_clock_task()).
	 */
	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
	u64 decays;

	decays = now - cfs_rq->last_decay;
	if (!decays && !force_update)
		return;

	/* collect the load queued in removed_load and drop it from blocked load */
	if (atomic_long_read(&cfs_rq->removed_load)) {
		unsigned long removed_load;
		removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
		subtract_blocked_load_contrib(cfs_rq, removed_load);
	}

	if (decays) {
		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
						      decays);
		/* record the decays in decay_counter for later per-entity sync */
		atomic64_add(decays, &cfs_rq->decay_counter);
		cfs_rq->last_decay = now;
	}

	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}

2490

2491

/*
 * Add the load generated by se into cfs_rq's child load-average.
 *
 * @wakeup: non-zero when se is being enqueued because it woke up.  It is
 *          cleared internally when se->avg.decay_count <= 0, which this
 *          function treats as a migration (see below).
 */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int wakeup)
{
	/*
	 * We track migrations using entity decay_count <= 0, on a wake-up
	 * migration we use a negative decay count to track the remote decays
	 * accumulated while sleeping.
	 *
	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
	 * are seen by enqueue_entity_load_avg() as a migration with an already
	 * constructed load_avg_contrib.
	 */
	if (unlikely(se->avg.decay_count <= 0)) {
		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
		if (se->avg.decay_count) {
			/*
			 * In a wake-up migration we have to approximate the
			 * time sleeping.  This is because we can't synchronize
			 * clock_task between the two cpus, and it is not
			 * guaranteed to be read-safe.  Instead, we can
			 * approximate this using our carried decays, which are
			 * explicitly atomically readable.
			 */
			/* move the timestamp back by the carried decays, 2^20 clock units each */
			se->avg.last_runnable_update -= (-se->avg.decay_count)
							<< 20;
			update_entity_load_avg(se, 0);
			/* Indicate that we're now synchronized and on-rq */
			se->avg.decay_count = 0;
		}
		wakeup = 0;
	} else {
		__synchronize_entity_decay(se);
	}

	/* migrated tasks did not contribute to our blocked load */
	if (wakeup) {
		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
		update_entity_load_avg(se, 0);
	}

	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}

2537

2538

/*
 * Remove se's load from this cfs_rq child load-average, if the entity is
 * transitioning to a blocked state we track its projected decay using
 * blocked_load_avg.
 *
 * @sleep: non-zero when se is being dequeued because it is going to sleep;
 *         zero for other dequeues such as migrations.
 */
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int sleep)
{
	/* bring se and the cfs_rq up to date before moving se's contribution */
	update_entity_load_avg(se, 1);
	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !sleep);

	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
	if (sleep) {
		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
		/* snapshot the counter so decays missed while blocked can be applied later */
		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}

2557

2558

/*
 * Called before this_rq enters idle: fold the elapsed running time into
 * the rq's runnable average.  If the last scheduled task was not a CFS
 * task, this hook is the only place that statistic gets updated.
 */
void idle_enter_fair(struct rq *this_rq)
{
	/* Second argument is the 'runnable' flag: 1 on the way into idle. */
	update_rq_runnable_avg(this_rq, 1);
}

2567

2568

/*
 * Called before a task is scheduled on an idle this_rq: fold the elapsed
 * idle time into the rq's runnable average.  If the newly scheduled task
 * is not a CFS task, this hook is the only place that statistic gets
 * updated.
 */
void idle_exit_fair(struct rq *this_rq)
{
	/* Second argument is the 'runnable' flag: 0 on the way out of idle. */
	update_rq_runnable_avg(this_rq, 0);
}

2577

2578

static int idle_balance(struct rq *this_rq);

2579

static int idle_balance(struct rq *this_rq);

2579

2580

#else /* CONFIG_SMP */

2581

#else /* CONFIG_SMP */

2581

2582

/*
 * !CONFIG_SMP: the load-average hooks below are empty so that callers
 * need no #ifdefs of their own.
 */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq)
{
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
}

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup)
{
}

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep)
{
}

static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update)
{
}

2593

2594

/* !CONFIG_SMP stub: always returns 0. */
static inline int idle_balance(struct rq *rq)
{
	return 0;
}

2598

2599

#endif /* CONFIG_SMP */

2600

#endif /* CONFIG_SMP */

2600

2601

/*
 * Schedstats bookkeeping for an entity being enqueued after a sleep:
 * close any open sleep_start / block_start interval, update the
 * corresponding max and sum counters and, for task entities, emit the
 * tracepoints and latency accounting.
 *
 * The body is compiled only with CONFIG_SCHEDSTATS.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	/* NULL for non-task entities; per-task reporting is skipped then. */
	struct task_struct *tsk = entity_is_task(se) ? task_of(se) : NULL;

	if (se->statistics.sleep_start) {
		u64 slept = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

		/* Clamp an interval that came out negative. */
		if ((s64)slept < 0)
			slept = 0;

		if (unlikely(slept > se->statistics.sleep_max))
			se->statistics.sleep_max = slept;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += slept;

		if (tsk) {
			account_scheduler_latency(tsk, slept >> 10, 1);
			trace_sched_stat_sleep(tsk, slept);
		}
	}

	if (se->statistics.block_start) {
		u64 blocked = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

		/* Clamp an interval that came out negative. */
		if ((s64)blocked < 0)
			blocked = 0;

		if (unlikely(blocked > se->statistics.block_max))
			se->statistics.block_max = blocked;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += blocked;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += blocked;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, blocked);
			}

			trace_sched_stat_blocked(tsk, blocked);

			/*
			 * The interval is in nanoseconds; shifting right by
			 * 20 gives a milliseconds-range estimate for the
			 * sleep profiler.
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
					     (void *)get_wchan(tsk),
					     blocked >> 20);
			}
			account_scheduler_latency(tsk, blocked >> 10, 0);
		}
	}
#endif
}

2662

2663

/*
 * Debug statistic: bump nr_spread_over when se's vruntime is more than
 * three times sysctl_sched_latency away from cfs_rq->min_vruntime, in
 * either direction.  The body is compiled only with CONFIG_SCHED_DEBUG.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 spread = se->vruntime - cfs_rq->min_vruntime;

	/* Only the magnitude of the distance matters. */
	if (spread < 0)
		spread = -spread;

	if (d_exceeds_never_used_placeholder_guard_removed: 0, spread > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

2675

2676

static void

2677

static void

2677

place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

2678

place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

2678

{

2679

{

2679

u64 vruntime = cfs_rq->min_vruntime;

2680

u64 vruntime = cfs_rq->min_vruntime;

2680

2681

/*

2682

/*

2682

* The 'current' period is already promised to the current tasks,

2683

* The 'current' period is already promised to the current tasks,

2683

* however the extra weight of the new task will slow them down a

2684

* however the extra weight of the new task will slow them down a

2684

* little, place the new task so that it fits in the slot that

2685

* little, place the new task so that it fits in the slot that

2685

* stays open at the end.

2686

* stays open at the end.

2686

*/

2687

*/

2687

if (initial && sched_feat(START_DEBIT))

2688

if (initial && sched_feat(START_DEBIT))

2688

vruntime += sched_vslice(cfs_rq, se);

2689

vruntime += sched_vslice(cfs_rq, se);

2689

2690

/* sleeps up to a single latency don't count. */

2691

/* sleeps up to a single latency don't count. */

2691

if (!initial) {

2692

if (!initial) {

2692

unsigned long thresh = sysctl_sched_latency;

2693

unsigned long thresh = sysctl_sched_latency;

2693

2694

/*

2695

/*

2695

* Halve their sleep time's effect, to allow

2696

* Halve their sleep time's effect, to allow

2696

* for a gentler effect of sleepers:

2697

* for a gentler effect of sleepers:

2697

*/

2698

*/

2698

if (sched_feat(GENTLE_FAIR_SLEEPERS))

2699

if (sched_feat(GENTLE_FAIR_SLEEPERS))

2699

thresh >>= 1;

2700

thresh >>= 1;

2700

2701

vruntime -= thresh;

2702

vruntime -= thresh;

2702

}

2703

}

2703

2704

/* ensure we never gain time by being placed backwards. */

2705

/* ensure we never gain time by being placed backwards. */

2705

se->vruntime = max_vruntime(se->vruntime, vruntime);

2706

se->vruntime = max_vruntime(se->vruntime, vruntime);

2706

}

2707

}

2707

2708

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

2709

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

2709

2710

static void

2711

static void

2711

enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

2712

enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

2712

{

2713

{

2713

/*

2714

/*

2714

* Update the normalized vruntime before updating min_vruntime

2715

* Update the normalized vruntime before updating min_vruntime

2715

* through calling update_curr().

2716

* through calling update_curr().

2716

*/

2717

*/

2717

if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))

2718

if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))

2718

se->vruntime += cfs_rq->min_vruntime;

2719

se->vruntime += cfs_rq->min_vruntime;

2719

2720

/*

2721

/*

2721

* Update run-time statistics of the 'current'.

2722

* Update run-time statistics of the 'current'.

2722

*/

2723

*/

2723

update_curr(cfs_rq);

2724

update_curr(cfs_rq);

2724

enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);

2725

enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);

2725

account_entity_enqueue(cfs_rq, se);

2726

account_entity_enqueue(cfs_rq, se);

2726

update_cfs_shares(cfs_rq);

2727

update_cfs_shares(cfs_rq);

2727

2728

if (flags & ENQUEUE_WAKEUP) {

2729

if (flags & ENQUEUE_WAKEUP) {

2729

place_entity(cfs_rq, se, 0);

2730

place_entity(cfs_rq, se, 0);

2730

enqueue_sleeper(cfs_rq, se);

2731

enqueue_sleeper(cfs_rq, se);

2731

}

2732

}

2732

2733

update_stats_enqueue(cfs_rq, se);

2734

update_stats_enqueue(cfs_rq, se);

2734

check_spread(cfs_rq, se);

2735

check_spread(cfs_rq, se);

2735

if (se != cfs_rq->curr)

2736

if (se != cfs_rq->curr)

2736

__enqueue_entity(cfs_rq, se);

2737

__enqueue_entity(cfs_rq, se);

2737

se->on_rq = 1;

2738

se->on_rq = 1;

2738

2739

if (cfs_rq->nr_running == 1) {

2740

if (cfs_rq->nr_running == 1) {

2740

list_add_leaf_cfs_rq(cfs_rq);

2741

list_add_leaf_cfs_rq(cfs_rq);

2741

check_enqueue_throttle(cfs_rq);

2742

check_enqueue_throttle(cfs_rq);

2742

}

2743

}

2743

}

2744

}

2744

2745

static void __clear_buddies_last(struct sched_entity *se)

2746

static void __clear_buddies_last(struct sched_entity *se)

2746

{

2747

{

2747

for_each_sched_entity(se) {

2748

for_each_sched_entity(se) {

2748

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2749

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2749

if (cfs_rq->last != se)

2750

if (cfs_rq->last != se)

2750

break;

2751

break;

2751

2752

cfs_rq->last = NULL;

2753

cfs_rq->last = NULL;

2753

}

2754

}

2754

}

2755

}

2755

2756

static void __clear_buddies_next(struct sched_entity *se)

2757

static void __clear_buddies_next(struct sched_entity *se)

2757

{

2758

{

2758

for_each_sched_entity(se) {

2759

for_each_sched_entity(se) {

2759

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2760

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2760

if (cfs_rq->next != se)

2761

if (cfs_rq->next != se)

2761

break;

2762

break;

2762

2763

cfs_rq->next = NULL;

2764

cfs_rq->next = NULL;

2764

}

2765

}

2765

}

2766

}

2766

2767

static void __clear_buddies_skip(struct sched_entity *se)

2768

static void __clear_buddies_skip(struct sched_entity *se)

2768

{

2769

{

2769

for_each_sched_entity(se) {

2770

for_each_sched_entity(se) {

2770

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2771

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2771

if (cfs_rq->skip != se)

2772

if (cfs_rq->skip != se)

2772

break;

2773

break;

2773

2774

cfs_rq->skip = NULL;

2775

cfs_rq->skip = NULL;

2775

}

2776

}

2776

}

2777

}

2777

2778

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)

2779

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)

2779

{

2780

{

2780

if (cfs_rq->last == se)

2781

if (cfs_rq->last == se)

2781

__clear_buddies_last(se);

2782

__clear_buddies_last(se);

2782

2783

if (cfs_rq->next == se)

2784

if (cfs_rq->next == se)

2784

__clear_buddies_next(se);

2785

__clear_buddies_next(se);

2785

2786

if (cfs_rq->skip == se)

2787

if (cfs_rq->skip == se)

2787

__clear_buddies_skip(se);

2788

__clear_buddies_skip(se);

2788

}

2789

}

2789

2790

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

2791

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

2791

2792

static void

2793

static void

2793

dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

2794

dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

2794

{

2795

{

2795

/*

2796

/*

2796

* Update run-time statistics of the 'current'.

2797

* Update run-time statistics of the 'current'.

2797

*/

2798

*/

2798

update_curr(cfs_rq);

2799

update_curr(cfs_rq);

2799

dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

2800

dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

2800

2801

update_stats_dequeue(cfs_rq, se);

2802

update_stats_dequeue(cfs_rq, se);

2802

if (flags & DEQUEUE_SLEEP) {

2803

if (flags & DEQUEUE_SLEEP) {

2803

#ifdef CONFIG_SCHEDSTATS

2804

#ifdef CONFIG_SCHEDSTATS

2804

if (entity_is_task(se)) {

2805

if (entity_is_task(se)) {

2805

struct task_struct *tsk = task_of(se);

2806

struct task_struct *tsk = task_of(se);

2806

2807

if (tsk->state & TASK_INTERRUPTIBLE)

2808

if (tsk->state & TASK_INTERRUPTIBLE)

2808

se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));

2809

se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));

2809

if (tsk->state & TASK_UNINTERRUPTIBLE)

2810

if (tsk->state & TASK_UNINTERRUPTIBLE)

2810

se->statistics.block_start = rq_clock(rq_of(cfs_rq));

2811

se->statistics.block_start = rq_clock(rq_of(cfs_rq));

2811

}

2812

}

2812

#endif

2813

#endif

2813

}

2814

}

2814

2815

clear_buddies(cfs_rq, se);

2816

clear_buddies(cfs_rq, se);

2816

2817

if (se != cfs_rq->curr)

2818

if (se != cfs_rq->curr)

2818

__dequeue_entity(cfs_rq, se);

2819

__dequeue_entity(cfs_rq, se);

2819

se->on_rq = 0;

2820

se->on_rq = 0;

2820

account_entity_dequeue(cfs_rq, se);

2821

account_entity_dequeue(cfs_rq, se);

2821

2822

/*

2823

/*

2823

* Normalize the entity after updating the min_vruntime because the

2824

* Normalize the entity after updating the min_vruntime because the

2824

* update can refer to the ->curr item and we need to reflect this

2825

* update can refer to the ->curr item and we need to reflect this

2825

* movement in our normalized position.

2826

* movement in our normalized position.

2826

*/

2827

*/

2827

if (!(flags & DEQUEUE_SLEEP))

2828

if (!(flags & DEQUEUE_SLEEP))

2828

se->vruntime -= cfs_rq->min_vruntime;

2829

se->vruntime -= cfs_rq->min_vruntime;

2829

2830

/* return excess runtime on last dequeue */

2831

/* return excess runtime on last dequeue */

2831

return_cfs_rq_runtime(cfs_rq);

2832

return_cfs_rq_runtime(cfs_rq);

2832

2833

update_min_vruntime(cfs_rq);

2834

update_min_vruntime(cfs_rq);

2834

update_cfs_shares(cfs_rq);

2835

update_cfs_shares(cfs_rq);

2835

}

2836

}

2836

2837

/*

2838

/*

2838

* Preempt the current task with a newly woken task if needed:

2839

* Preempt the current task with a newly woken task if needed:

2839

*/

2840

*/

2840

static void

2841

static void

2841

check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)

2842

check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)

2842

{

2843

{

2843

unsigned long ideal_runtime, delta_exec;

2844

unsigned long ideal_runtime, delta_exec;

2844

struct sched_entity *se;

2845

struct sched_entity *se;

2845

s64 delta;

2846

s64 delta;

2846

2847

ideal_runtime = sched_slice(cfs_rq, curr);

2848

ideal_runtime = sched_slice(cfs_rq, curr);

2848

delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

2849

delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

2849

if (delta_exec > ideal_runtime) {

2850

if (delta_exec > ideal_runtime) {

2850

resched_task(rq_of(cfs_rq)->curr);

2851

resched_task(rq_of(cfs_rq)->curr);

2851

/*

2852

/*

2852

* The current task ran long enough, ensure it doesn't get

2853

* The current task ran long enough, ensure it doesn't get

2853

* re-elected due to buddy favours.

2854

* re-elected due to buddy favours.

2854

*/

2855

*/

2855

clear_buddies(cfs_rq, curr);

2856

clear_buddies(cfs_rq, curr);

2856

return;

2857

return;

2857

}

2858

}

2858

2859

/*

2860

/*

2860

* Ensure that a task that missed wakeup preemption by a

2861

* Ensure that a task that missed wakeup preemption by a

2861

* narrow margin doesn't have to wait for a full slice.

2862

* narrow margin doesn't have to wait for a full slice.

2862

* This also mitigates buddy induced latencies under load.

2863

* This also mitigates buddy induced latencies under load.

2863

*/

2864

*/

2864

if (delta_exec < sysctl_sched_min_granularity)

2865

if (delta_exec < sysctl_sched_min_granularity)

2865

return;

2866

return;

2866

2867

se = __pick_first_entity(cfs_rq);

2868

se = __pick_first_entity(cfs_rq);

2868

delta = curr->vruntime - se->vruntime;

2869

delta = curr->vruntime - se->vruntime;

2869

2870

if (delta < 0)

2871

if (delta < 0)

2871

return;

2872

return;

2872

2873

if (delta > ideal_runtime)

2874

if (delta > ideal_runtime)

2874

resched_task(rq_of(cfs_rq)->curr);

2875

resched_task(rq_of(cfs_rq)->curr);

2875

}

2876

}

2876

2877

static void

2878

static void

2878

set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

2879

set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

2879

{

2880

{

2880

/* 'current' is not kept within the tree. */

2881

/* 'current' is not kept within the tree. */

2881

if (se->on_rq) {

2882

if (se->on_rq) {

2882

/*

2883

/*

2883

* Any task has to be enqueued before it get to execute on

2884

* Any task has to be enqueued before it get to execute on

2884

* a CPU. So account for the time it spent waiting on the

2885

* a CPU. So account for the time it spent waiting on the

2885

* runqueue.

2886

* runqueue.

2886

*/

2887

*/

2887

update_stats_wait_end(cfs_rq, se);

2888

update_stats_wait_end(cfs_rq, se);

2888

__dequeue_entity(cfs_rq, se);

2889

__dequeue_entity(cfs_rq, se);

2889

}

2890

}

2890

2891

update_stats_curr_start(cfs_rq, se);

2892

update_stats_curr_start(cfs_rq, se);

2892

cfs_rq->curr = se;

2893

cfs_rq->curr = se;

2893

#ifdef CONFIG_SCHEDSTATS

2894

#ifdef CONFIG_SCHEDSTATS

2894

/*

2895

/*

2895

* Track our maximum slice length, if the CPU's load is at

2896

* Track our maximum slice length, if the CPU's load is at

2896

* least twice that of our own weight (i.e. dont track it

2897

* least twice that of our own weight (i.e. dont track it

2897

* when there are only lesser-weight tasks around):

2898

* when there are only lesser-weight tasks around):

2898

*/

2899

*/

2899

if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {

2900

if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {

2900

se->statistics.slice_max = max(se->statistics.slice_max,

2901

se->statistics.slice_max = max(se->statistics.slice_max,

2901

se->sum_exec_runtime - se->prev_sum_exec_runtime);

2902

se->sum_exec_runtime - se->prev_sum_exec_runtime);

2902

}

2903

}

2903

#endif

2904

#endif

2904

se->prev_sum_exec_runtime = se->sum_exec_runtime;

2905

se->prev_sum_exec_runtime = se->sum_exec_runtime;

2905

}

2906

}

2906

2907

static int

2908

static int

2908

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

2909

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

2909

2910

/*

2911

/*

2911

* Pick the next process, keeping these things in mind, in this order:

2912

* Pick the next process, keeping these things in mind, in this order:

2912

* 1) keep things fair between processes/task groups

2913

* 1) keep things fair between processes/task groups

2913

* 2) pick the "next" process, since someone really wants that to run

2914

* 2) pick the "next" process, since someone really wants that to run

2914

* 3) pick the "last" process, for cache locality

2915

* 3) pick the "last" process, for cache locality

2915

* 4) do not run the "skip" process, if something else is available

2916

* 4) do not run the "skip" process, if something else is available

2916

*/

2917

*/

2917

static struct sched_entity *

2918

static struct sched_entity *

2918

pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)

2919

pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)

2919

{

2920

{

2920

struct sched_entity *left = __pick_first_entity(cfs_rq);

2921

struct sched_entity *left = __pick_first_entity(cfs_rq);

2921

struct sched_entity *se;

2922

struct sched_entity *se;

2922

2923

/*

2924

/*

2924

* If curr is set we have to see if its left of the leftmost entity

2925

* If curr is set we have to see if its left of the leftmost entity

2925

* still in the tree, provided there was anything in the tree at all.

2926

* still in the tree, provided there was anything in the tree at all.

2926

*/

2927

*/

2927

if (!left || (curr && entity_before(curr, left)))

2928

if (!left || (curr && entity_before(curr, left)))

2928

left = curr;

2929

left = curr;

2929

2930

se = left; /* ideally we run the leftmost entity */

2931

se = left; /* ideally we run the leftmost entity */

2931

2932

/*

2933

/*

2933

* Avoid running the skip buddy, if running something else can

2934

* Avoid running the skip buddy, if running something else can

2934

* be done without getting too unfair.

2935

* be done without getting too unfair.

2935

*/

2936

*/

2936

if (cfs_rq->skip == se) {

2937

if (cfs_rq->skip == se) {

2937

struct sched_entity *second;

2938

struct sched_entity *second;

2938

2939

if (se == curr) {

2940

if (se == curr) {

2940

second = __pick_first_entity(cfs_rq);

2941

second = __pick_first_entity(cfs_rq);

2941

} else {

2942

} else {

2942

second = __pick_next_entity(se);

2943

second = __pick_next_entity(se);

2943

if (!second || (curr && entity_before(curr, second)))

2944

if (!second || (curr && entity_before(curr, second)))

2944

second = curr;

2945

second = curr;

2945

}

2946

}

2946

2947

if (second && wakeup_preempt_entity(second, left) < 1)

2948

if (second && wakeup_preempt_entity(second, left) < 1)

2948

se = second;

2949

se = second;

2949

}

2950

}

2950

2951

/*

2952

/*

2952

* Prefer last buddy, try to return the CPU to a preempted task.

2953

* Prefer last buddy, try to return the CPU to a preempted task.

2953

*/

2954

*/

2954

if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)

2955

if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)

2955

se = cfs_rq->last;

2956

se = cfs_rq->last;

2956

2957

/*

2958

/*

2958

* Someone really wants this to run. If it's not unfair, run it.

2959

* Someone really wants this to run. If it's not unfair, run it.

2959

*/

2960

*/

2960

if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)

2961

if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)

2961

se = cfs_rq->next;

2962

se = cfs_rq->next;

2962

2963

clear_buddies(cfs_rq, se);

2964

clear_buddies(cfs_rq, se);

2964

2965

return se;

2966

return se;

2966

}

2967

}

2967

2968

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

2969

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

2969

2970

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)

2971

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)

2971

{

2972

{

2972

/*

2973

/*

2973

* If still on the runqueue then deactivate_task()

2974

* If still on the runqueue then deactivate_task()

2974

* was not called and update_curr() has to be done:

2975

* was not called and update_curr() has to be done:

2975

*/

2976

*/

2976

if (prev->on_rq)

2977

if (prev->on_rq)

2977

update_curr(cfs_rq);

2978

update_curr(cfs_rq);

2978

2979

/* throttle cfs_rqs exceeding runtime */

2980

/* throttle cfs_rqs exceeding runtime */

2980

check_cfs_rq_runtime(cfs_rq);

2981

check_cfs_rq_runtime(cfs_rq);

2981

2982

check_spread(cfs_rq, prev);

2983

check_spread(cfs_rq, prev);

2983

if (prev->on_rq) {

2984

if (prev->on_rq) {

2984

update_stats_wait_start(cfs_rq, prev);

2985

update_stats_wait_start(cfs_rq, prev);

2985

/* Put 'current' back into the tree. */

2986

/* Put 'current' back into the tree. */

2986

__enqueue_entity(cfs_rq, prev);

2987

__enqueue_entity(cfs_rq, prev);

2987

/* in !on_rq case, update occurred at dequeue */

2988

/* in !on_rq case, update occurred at dequeue */

2988

update_entity_load_avg(prev, 1);

2989

update_entity_load_avg(prev, 1);

2989

}

2990

}

2990

cfs_rq->curr = NULL;

2991

cfs_rq->curr = NULL;

2991

}

2992

}

2992

2993

static void

2994

static void

2994

entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)

2995

entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)

2995

{

2996

{

2996

/*

2997

/*

2997

* Update run-time statistics of the 'current'.

2998

* Update run-time statistics of the 'current'.

2998

*/

2999

*/

2999

update_curr(cfs_rq);

3000

update_curr(cfs_rq);

3000

3001

/*

3002

/*

3002

* Ensure that runnable average is periodically updated.

3003

* Ensure that runnable average is periodically updated.

3003

*/

3004

*/

3004

update_entity_load_avg(curr, 1);

3005

update_entity_load_avg(curr, 1);

3005

update_cfs_rq_blocked_load(cfs_rq, 1);

3006

update_cfs_rq_blocked_load(cfs_rq, 1);

3006

update_cfs_shares(cfs_rq);

3007

update_cfs_shares(cfs_rq);

3007

3008

#ifdef CONFIG_SCHED_HRTICK

3009

#ifdef CONFIG_SCHED_HRTICK

3009

/*

3010

/*

3010

* queued ticks are scheduled to match the slice, so don't bother

3011

* queued ticks are scheduled to match the slice, so don't bother

3011

* validating it and just reschedule.

3012

* validating it and just reschedule.

3012

*/

3013

*/

3013

if (queued) {

3014

if (queued) {

3014

resched_task(rq_of(cfs_rq)->curr);

3015

resched_task(rq_of(cfs_rq)->curr);

3015

return;

3016

return;

3016

}

3017

}

3017

/*

3018

/*

3018

* don't let the period tick interfere with the hrtick preemption

3019

* don't let the period tick interfere with the hrtick preemption

3019

*/

3020

*/

3020

if (!sched_feat(DOUBLE_TICK) &&

3021

if (!sched_feat(DOUBLE_TICK) &&

3021

hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))

3022

hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))

3022

return;

3023

return;

3023

#endif

3024

#endif

3024

3025

if (cfs_rq->nr_running > 1)

3026

if (cfs_rq->nr_running > 1)

3026

check_preempt_tick(cfs_rq, curr);

3027

check_preempt_tick(cfs_rq, curr);

3027

}

3028

}

3028

3029

3030

/**************************************************

3031

/**************************************************

3031

* CFS bandwidth control machinery

3032

* CFS bandwidth control machinery

3032

*/

3033

*/

3033

3034

#ifdef CONFIG_CFS_BANDWIDTH

3035

#ifdef CONFIG_CFS_BANDWIDTH

3035

3036

#ifdef HAVE_JUMP_LABEL

3037

#ifdef HAVE_JUMP_LABEL

3037

static struct static_key __cfs_bandwidth_used;

3038

static struct static_key __cfs_bandwidth_used;

3038

3039

static inline bool cfs_bandwidth_used(void)

3040

static inline bool cfs_bandwidth_used(void)

3040

{

3041

{

3041

return static_key_false(&__cfs_bandwidth_used);

3042

return static_key_false(&__cfs_bandwidth_used);

3042

}

3043

}

3043

3044

void cfs_bandwidth_usage_inc(void)

3045

void cfs_bandwidth_usage_inc(void)

3045

{

3046

{

3046

static_key_slow_inc(&__cfs_bandwidth_used);

3047

static_key_slow_inc(&__cfs_bandwidth_used);

3047

}

3048

}

3048

3049

void cfs_bandwidth_usage_dec(void)

3050

void cfs_bandwidth_usage_dec(void)

3050

{

3051

{

3051

static_key_slow_dec(&__cfs_bandwidth_used);

3052

static_key_slow_dec(&__cfs_bandwidth_used);

3052

}

3053

}

3053

#else /* HAVE_JUMP_LABEL */

3054

#else /* HAVE_JUMP_LABEL */

3054

static bool cfs_bandwidth_used(void)

3055

static bool cfs_bandwidth_used(void)

3055

{

3056

{

3056

return true;

3057

return true;

3057

}

3058

}

3058

3059

void cfs_bandwidth_usage_inc(void) {}

3060

void cfs_bandwidth_usage_inc(void) {}

3060

void cfs_bandwidth_usage_dec(void) {}

3061

void cfs_bandwidth_usage_dec(void) {}

3061

#endif /* HAVE_JUMP_LABEL */

3062

#endif /* HAVE_JUMP_LABEL */

3062

3063

/*

3064

/*

3064

* default period for cfs group bandwidth.

3065

* default period for cfs group bandwidth.

3065

* default: 0.1s, units: nanoseconds

3066

* default: 0.1s, units: nanoseconds

3066

*/

3067

*/

3067

static inline u64 default_cfs_period(void)

3068

static inline u64 default_cfs_period(void)

3068

{

3069

{

3069

return 100000000ULL;

3070

return 100000000ULL;

3070

}

3071

}

3071

3072

static inline u64 sched_cfs_bandwidth_slice(void)

3073

static inline u64 sched_cfs_bandwidth_slice(void)

3073

{

3074

{

3074

return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;

3075

return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;

3075

}

3076

}

3076

3077

/*

3078

/*

3078

* Replenish runtime according to assigned quota and update expiration time.

3079

* Replenish runtime according to assigned quota and update expiration time.

3079

* We use sched_clock_cpu directly instead of rq->clock to avoid adding

3080

* We use sched_clock_cpu directly instead of rq->clock to avoid adding

3080

* additional synchronization around rq->lock.

3081

* additional synchronization around rq->lock.

3081

*

3082

*

3082

* requires cfs_b->lock

3083

* requires cfs_b->lock

3083

*/

3084

*/

3084

void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)

3085

void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)

3085

{

3086

{

3086

u64 now;

3087

u64 now;

3087

3088

if (cfs_b->quota == RUNTIME_INF)

3089

if (cfs_b->quota == RUNTIME_INF)

3089

return;

3090

return;

3090

3091

now = sched_clock_cpu(smp_processor_id());

3092

now = sched_clock_cpu(smp_processor_id());

3092

cfs_b->runtime = cfs_b->quota;

3093

cfs_b->runtime = cfs_b->quota;

3093

cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);

3094

cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);

3094

}

3095

}

3095

3096

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

3097

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

3097

{

3098

{

3098

return &tg->cfs_bandwidth;

3099

return &tg->cfs_bandwidth;

3099

}

3100

}

3100

3101

/* rq->task_clock normalized against any time this cfs_rq has spent throttled */

3102

/* rq->task_clock normalized against any time this cfs_rq has spent throttled */

3102

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)

3103

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)

3103

{

3104

{

3104

if (unlikely(cfs_rq->throttle_count))

3105

if (unlikely(cfs_rq->throttle_count))

3105

return cfs_rq->throttled_clock_task;

3106

return cfs_rq->throttled_clock_task;

3106

3107

return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;

3108

return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;

3108

}

3109

}

3109

3110

/* returns 0 on failure to allocate runtime */

3111

/* returns 0 on failure to allocate runtime */

3111

static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3112

static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3112

{

3113

{

3113

struct task_group *tg = cfs_rq->tg;

3114

struct task_group *tg = cfs_rq->tg;

3114

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);

3115

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);

3115

u64 amount = 0, min_amount, expires;

3116

u64 amount = 0, min_amount, expires;

3116

3117

/* note: this is a positive sum as runtime_remaining <= 0 */

3118

/* note: this is a positive sum as runtime_remaining <= 0 */

3118

min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

3119

min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

3119

3120

raw_spin_lock(&cfs_b->lock);

3121

raw_spin_lock(&cfs_b->lock);

3121

if (cfs_b->quota == RUNTIME_INF)

3122

if (cfs_b->quota == RUNTIME_INF)

3122

amount = min_amount;

3123

amount = min_amount;

3123

else {

3124

else {

3124

/*

3125

/*

3125

* If the bandwidth pool has become inactive, then at least one

3126

* If the bandwidth pool has become inactive, then at least one

3126

* period must have elapsed since the last consumption.

3127

* period must have elapsed since the last consumption.

3127

* Refresh the global state and ensure bandwidth timer becomes

3128

* Refresh the global state and ensure bandwidth timer becomes

3128

* active.

3129

* active.

3129

*/

3130

*/

3130

if (!cfs_b->timer_active) {

3131

if (!cfs_b->timer_active) {

3131

__refill_cfs_bandwidth_runtime(cfs_b);

3132

__refill_cfs_bandwidth_runtime(cfs_b);

3132

__start_cfs_bandwidth(cfs_b);

3133

__start_cfs_bandwidth(cfs_b);

3133

}

3134

}

3134

3135

if (cfs_b->runtime > 0) {

3136

if (cfs_b->runtime > 0) {

3136

amount = min(cfs_b->runtime, min_amount);

3137

amount = min(cfs_b->runtime, min_amount);

3137

cfs_b->runtime -= amount;

3138

cfs_b->runtime -= amount;

3138

cfs_b->idle = 0;

3139

cfs_b->idle = 0;

3139

}

3140

}

3140

}

3141

}

3141

expires = cfs_b->runtime_expires;

3142

expires = cfs_b->runtime_expires;

3142

raw_spin_unlock(&cfs_b->lock);

3143

raw_spin_unlock(&cfs_b->lock);

3143

3144

cfs_rq->runtime_remaining += amount;

3145

cfs_rq->runtime_remaining += amount;

3145

/*

3146

/*

3146

* we may have advanced our local expiration to account for allowed

3147

* we may have advanced our local expiration to account for allowed

3147

* spread between our sched_clock and the one on which runtime was

3148

* spread between our sched_clock and the one on which runtime was

3148

* issued.

3149

* issued.

3149

*/

3150

*/

3150

if ((s64)(expires - cfs_rq->runtime_expires) > 0)

3151

if ((s64)(expires - cfs_rq->runtime_expires) > 0)

3151

cfs_rq->runtime_expires = expires;

3152

cfs_rq->runtime_expires = expires;

3152

3153

return cfs_rq->runtime_remaining > 0;

3154

return cfs_rq->runtime_remaining > 0;

3154

}

3155

}

3155

3156

/*

3157

/*

3157

* Note: This depends on the synchronization provided by sched_clock and the

3158

* Note: This depends on the synchronization provided by sched_clock and the

3158

* fact that rq->clock snapshots this value.

3159

* fact that rq->clock snapshots this value.

3159

*/

3160

*/

3160

static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3161

static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3161

{

3162

{

3162

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3163

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3163

3164

/* if the deadline is ahead of our clock, nothing to do */

3165

/* if the deadline is ahead of our clock, nothing to do */

3165

if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))

3166

if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))

3166

return;

3167

return;

3167

3168

if (cfs_rq->runtime_remaining < 0)

3169

if (cfs_rq->runtime_remaining < 0)

3169

return;

3170

return;

3170

3171

/*

3172

/*

3172

* If the local deadline has passed we have to consider the

3173

* If the local deadline has passed we have to consider the

3173

* possibility that our sched_clock is 'fast' and the global deadline

3174

* possibility that our sched_clock is 'fast' and the global deadline

3174

* has not truly expired.

3175

* has not truly expired.

3175

*

3176

*

3176

* Fortunately we can check determine whether this the case by checking

3177

* Fortunately we can check determine whether this the case by checking

3177

* whether the global deadline has advanced.

3178

* whether the global deadline has advanced.

3178

*/

3179

*/

3179

3180

if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {

3181

if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {

3181

/* extend local deadline, drift is bounded above by 2 ticks */

3182

/* extend local deadline, drift is bounded above by 2 ticks */

3182

cfs_rq->runtime_expires += TICK_NSEC;

3183

cfs_rq->runtime_expires += TICK_NSEC;

3183

} else {

3184

} else {

3184

/* global deadline is ahead, expiration has passed */

3185

/* global deadline is ahead, expiration has passed */

3185

cfs_rq->runtime_remaining = 0;

3186

cfs_rq->runtime_remaining = 0;

3186

}

3187

}

3187

}

3188

}

3188

3189

/*
 * Charge @delta_exec against @cfs_rq's local runtime and, if that leaves
 * nothing, try to obtain more; reschedule when that fails.
 */
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	/* charge first, then expire: delta_exec may span periods */
	cfs_rq->runtime_remaining -= delta_exec;
	expire_cfs_rq_runtime(cfs_rq);

	if (unlikely(cfs_rq->runtime_remaining <= 0)) {
		if (assign_cfs_rq_runtime(cfs_rq))
			return;

		/*
		 * Runtime could not be extended: resched so that the active
		 * hierarchy can be throttled.
		 */
		if (likely(cfs_rq->curr))
			resched_task(rq_of(cfs_rq)->curr);
	}
}

3205

3206

static __always_inline

3207

static __always_inline

3207

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)

3208

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)

3208

{

3209

{

3209

if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)

3210

if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)

3210

return;

3211

return;

3211

3212

__account_cfs_rq_runtime(cfs_rq, delta_exec);

3213

__account_cfs_rq_runtime(cfs_rq, delta_exec);

3213

}

3214

}

3214

3215

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)

3216

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)

3216

{

3217

{

3217

return cfs_bandwidth_used() && cfs_rq->throttled;

3218

return cfs_bandwidth_used() && cfs_rq->throttled;

3218

}

3219

}

3219

3220

/* check whether cfs_rq, or any parent, is throttled */

3221

/* check whether cfs_rq, or any parent, is throttled */

3221

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)

3222

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)

3222

{

3223

{

3223

return cfs_bandwidth_used() && cfs_rq->throttle_count;

3224

return cfs_bandwidth_used() && cfs_rq->throttle_count;

3224

}

3225

}

3225

3226

/*

3227

/*

3227

* Ensure that neither of the group entities corresponding to src_cpu or

3228

* Ensure that neither of the group entities corresponding to src_cpu or

3228

* dest_cpu are members of a throttled hierarchy when performing group

3229

* dest_cpu are members of a throttled hierarchy when performing group

3229

* load-balance operations.

3230

* load-balance operations.

3230

*/

3231

*/

3231

static inline int throttled_lb_pair(struct task_group *tg,

3232

static inline int throttled_lb_pair(struct task_group *tg,

3232

int src_cpu, int dest_cpu)

3233

int src_cpu, int dest_cpu)

3233

{

3234

{

3234

struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

3235

struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

3235

3236

src_cfs_rq = tg->cfs_rq[src_cpu];

3237

src_cfs_rq = tg->cfs_rq[src_cpu];

3237

dest_cfs_rq = tg->cfs_rq[dest_cpu];

3238

dest_cfs_rq = tg->cfs_rq[dest_cpu];

3238

3239

return throttled_hierarchy(src_cfs_rq) ||

3240

return throttled_hierarchy(src_cfs_rq) ||

3240

throttled_hierarchy(dest_cfs_rq);

3241

throttled_hierarchy(dest_cfs_rq);

3241

}

3242

}

3242

3243

/* updated child weight may affect parent so we have to do this bottom up */

3244

/* updated child weight may affect parent so we have to do this bottom up */

3244

static int tg_unthrottle_up(struct task_group *tg, void *data)

3245

static int tg_unthrottle_up(struct task_group *tg, void *data)

3245

{

3246

{

3246

struct rq *rq = data;

3247

struct rq *rq = data;

3247

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3248

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3248

3249

cfs_rq->throttle_count--;

3250

cfs_rq->throttle_count--;

3250

#ifdef CONFIG_SMP

3251

#ifdef CONFIG_SMP

3251

if (!cfs_rq->throttle_count) {

3252

if (!cfs_rq->throttle_count) {

3252

/* adjust cfs_rq_clock_task() */

3253

/* adjust cfs_rq_clock_task() */

3253

cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -

3254

cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -

3254

cfs_rq->throttled_clock_task;

3255

cfs_rq->throttled_clock_task;

3255

}

3256

}

3256

#endif

3257

#endif

3257

3258

return 0;

3259

return 0;

3259

}

3260

}

3260

3261

static int tg_throttle_down(struct task_group *tg, void *data)

3262

static int tg_throttle_down(struct task_group *tg, void *data)

3262

{

3263

{

3263

struct rq *rq = data;

3264

struct rq *rq = data;

3264

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3265

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3265

3266

/* group is entering throttled state, stop time */

3267

/* group is entering throttled state, stop time */

3267

if (!cfs_rq->throttle_count)

3268

if (!cfs_rq->throttle_count)

3268

cfs_rq->throttled_clock_task = rq_clock_task(rq);

3269

cfs_rq->throttled_clock_task = rq_clock_task(rq);

3269

cfs_rq->throttle_count++;

3270

cfs_rq->throttle_count++;

3270

3271

return 0;

3272

return 0;

3272

}

3273

}

3273

3274

static void throttle_cfs_rq(struct cfs_rq *cfs_rq)

3275

static void throttle_cfs_rq(struct cfs_rq *cfs_rq)

3275

{

3276

{

3276

struct rq *rq = rq_of(cfs_rq);

3277

struct rq *rq = rq_of(cfs_rq);

3277

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3278

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3278

struct sched_entity *se;

3279

struct sched_entity *se;

3279

long task_delta, dequeue = 1;

3280

long task_delta, dequeue = 1;

3280

3281

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

3282

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

3282

3283

/* freeze hierarchy runnable averages while throttled */

3284

/* freeze hierarchy runnable averages while throttled */

3284

rcu_read_lock();

3285

rcu_read_lock();

3285

walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);

3286

walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);

3286

rcu_read_unlock();

3287

rcu_read_unlock();

3287

3288

task_delta = cfs_rq->h_nr_running;

3289

task_delta = cfs_rq->h_nr_running;

3289

for_each_sched_entity(se) {

3290

for_each_sched_entity(se) {

3290

struct cfs_rq *qcfs_rq = cfs_rq_of(se);

3291

struct cfs_rq *qcfs_rq = cfs_rq_of(se);

3291

/* throttled entity or throttle-on-deactivate */

3292

/* throttled entity or throttle-on-deactivate */

3292

if (!se->on_rq)

3293

if (!se->on_rq)

3293

break;

3294

break;

3294

3295

if (dequeue)

3296

if (dequeue)

3296

dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);

3297

dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);

3297

qcfs_rq->h_nr_running -= task_delta;

3298

qcfs_rq->h_nr_running -= task_delta;

3298

3299

if (qcfs_rq->load.weight)

3300

if (qcfs_rq->load.weight)

3300

dequeue = 0;

3301

dequeue = 0;

3301

}

3302

}

3302

3303

if (!se)

3304

if (!se)

3304

rq->nr_running -= task_delta;

3305

rq->nr_running -= task_delta;

3305

3306

cfs_rq->throttled = 1;

3307

cfs_rq->throttled = 1;

3307

cfs_rq->throttled_clock = rq_clock(rq);

3308

cfs_rq->throttled_clock = rq_clock(rq);

3308

raw_spin_lock(&cfs_b->lock);

3309

raw_spin_lock(&cfs_b->lock);

3309

list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);

3310

list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);

3310

if (!cfs_b->timer_active)

3311

if (!cfs_b->timer_active)

3311

__start_cfs_bandwidth(cfs_b);

3312

__start_cfs_bandwidth(cfs_b);

3312

raw_spin_unlock(&cfs_b->lock);

3313

raw_spin_unlock(&cfs_b->lock);

3313

}

3314

}

3314

3315

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

3316

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

3316

{

3317

{

3317

struct rq *rq = rq_of(cfs_rq);

3318

struct rq *rq = rq_of(cfs_rq);

3318

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3319

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3319

struct sched_entity *se;

3320

struct sched_entity *se;

3320

int enqueue = 1;

3321

int enqueue = 1;

3321

long task_delta;

3322

long task_delta;

3322

3323

se = cfs_rq->tg->se[cpu_of(rq)];

3324

se = cfs_rq->tg->se[cpu_of(rq)];

3324

3325

cfs_rq->throttled = 0;

3326

cfs_rq->throttled = 0;

3326

3327

update_rq_clock(rq);

3328

update_rq_clock(rq);

3328

3329

raw_spin_lock(&cfs_b->lock);

3330

raw_spin_lock(&cfs_b->lock);

3330

cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;

3331

cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;

3331

list_del_rcu(&cfs_rq->throttled_list);

3332

list_del_rcu(&cfs_rq->throttled_list);

3332

raw_spin_unlock(&cfs_b->lock);

3333

raw_spin_unlock(&cfs_b->lock);

3333

3334

/* update hierarchical throttle state */

3335

/* update hierarchical throttle state */

3335

walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

3336

walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

3336

3337

if (!cfs_rq->load.weight)

3338

if (!cfs_rq->load.weight)

3338

return;

3339

return;

3339

3340

task_delta = cfs_rq->h_nr_running;

3341

task_delta = cfs_rq->h_nr_running;

3341

for_each_sched_entity(se) {

3342

for_each_sched_entity(se) {

3342

if (se->on_rq)

3343

if (se->on_rq)

3343

enqueue = 0;

3344

enqueue = 0;

3344

3345

cfs_rq = cfs_rq_of(se);

3346

cfs_rq = cfs_rq_of(se);

3346

if (enqueue)

3347

if (enqueue)

3347

enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);

3348

enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);

3348

cfs_rq->h_nr_running += task_delta;

3349

cfs_rq->h_nr_running += task_delta;

3349

3350

if (cfs_rq_throttled(cfs_rq))

3351

if (cfs_rq_throttled(cfs_rq))

3351

break;

3352

break;

3352

}

3353

}

3353

3354

if (!se)

3355

if (!se)

3355

rq->nr_running += task_delta;

3356

rq->nr_running += task_delta;

3356

3357

/* determine whether we need to wake up potentially idle cpu */

3358

/* determine whether we need to wake up potentially idle cpu */

3358

if (rq->curr == rq->idle && rq->cfs.nr_running)

3359

if (rq->curr == rq->idle && rq->cfs.nr_running)

3359

resched_task(rq->curr);

3360

resched_task(rq->curr);

3360

}

3361

}

3361

3362

/*
 * Hand out up to @remaining runtime to the cfs_rqs on @cfs_b's throttled
 * list, stamping each with @expires, and unthrottle every cfs_rq whose
 * runtime_remaining becomes positive.
 *
 * Each rq->lock is taken in turn while the list is walked under RCU.
 *
 * Returns the runtime that was left undistributed.
 */
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
		u64 remaining, u64 expires)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		struct rq *rq = rq_of(cfs_rq);

		raw_spin_lock(&rq->lock);
		/* re-check under rq->lock; skip entries no longer throttled */
		if (cfs_rq_throttled(cfs_rq)) {
			/* just enough to bring runtime_remaining up to 1 */
			u64 runtime = -cfs_rq->runtime_remaining + 1;

			if (runtime > remaining)
				runtime = remaining;
			remaining -= runtime;

			cfs_rq->runtime_remaining += runtime;
			cfs_rq->runtime_expires = expires;

			/* throttled state was verified just above */
			if (cfs_rq->runtime_remaining > 0)
				unthrottle_cfs_rq(cfs_rq);
		}
		raw_spin_unlock(&rq->lock);

		if (!remaining)
			break;
	}
	rcu_read_unlock();

	return remaining;
}

3399

3400

/*

3401

/*

3401

* Responsible for refilling a task_group's bandwidth and unthrottling its

3402

* Responsible for refilling a task_group's bandwidth and unthrottling its

3402

* cfs_rqs as appropriate. If there has been no activity within the last

3403

* cfs_rqs as appropriate. If there has been no activity within the last

3403

* period the timer is deactivated until scheduling resumes; cfs_b->idle is

3404

* period the timer is deactivated until scheduling resumes; cfs_b->idle is

3404

* used to track this state.

3405

* used to track this state.

3405

*/

3406

*/

3406

static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)

3407

static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)

3407

{

3408

{

3408

u64 runtime, runtime_expires;

3409

u64 runtime, runtime_expires;

3409

int idle = 1, throttled;

3410

int idle = 1, throttled;

3410

3411

raw_spin_lock(&cfs_b->lock);

3412

raw_spin_lock(&cfs_b->lock);

3412

/* no need to continue the timer with no bandwidth constraint */

3413

/* no need to continue the timer with no bandwidth constraint */

3413

if (cfs_b->quota == RUNTIME_INF)

3414

if (cfs_b->quota == RUNTIME_INF)

3414

goto out_unlock;

3415

goto out_unlock;

3415

3416

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3417

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3417

/* idle depends on !throttled (for the case of a large deficit) */

3418

/* idle depends on !throttled (for the case of a large deficit) */

3418

idle = cfs_b->idle && !throttled;

3419

idle = cfs_b->idle && !throttled;

3419

cfs_b->nr_periods += overrun;

3420

cfs_b->nr_periods += overrun;

3420

3421

/* if we're going inactive then everything else can be deferred */

3422

/* if we're going inactive then everything else can be deferred */

3422

if (idle)

3423

if (idle)

3423

goto out_unlock;

3424

goto out_unlock;

3424

3425

/*

3426

/*

3426

* if we have relooped after returning idle once, we need to update our

3427

* if we have relooped after returning idle once, we need to update our

3427

* status as actually running, so that other cpus doing

3428

* status as actually running, so that other cpus doing

3428

* __start_cfs_bandwidth will stop trying to cancel us.

3429

* __start_cfs_bandwidth will stop trying to cancel us.

3429

*/

3430

*/

3430

cfs_b->timer_active = 1;

3431

cfs_b->timer_active = 1;

3431

3432

__refill_cfs_bandwidth_runtime(cfs_b);

3433

__refill_cfs_bandwidth_runtime(cfs_b);

3433

3434

if (!throttled) {

3435

if (!throttled) {

3435

/* mark as potentially idle for the upcoming period */

3436

/* mark as potentially idle for the upcoming period */

3436

cfs_b->idle = 1;

3437

cfs_b->idle = 1;

3437

goto out_unlock;

3438

goto out_unlock;

3438

}

3439

}

3439

3440

/* account preceding periods in which throttling occurred */

3441

/* account preceding periods in which throttling occurred */

3441

cfs_b->nr_throttled += overrun;

3442

cfs_b->nr_throttled += overrun;

3442

3443

/*

3444

/*

3444

* There are throttled entities so we must first use the new bandwidth

3445

* There are throttled entities so we must first use the new bandwidth

3445

* to unthrottle them before making it generally available. This

3446

* to unthrottle them before making it generally available. This

3446

* ensures that all existing debts will be paid before a new cfs_rq is

3447

* ensures that all existing debts will be paid before a new cfs_rq is

3447

* allowed to run.

3448

* allowed to run.

3448

*/

3449

*/

3449

runtime = cfs_b->runtime;

3450

runtime = cfs_b->runtime;

3450

runtime_expires = cfs_b->runtime_expires;

3451

runtime_expires = cfs_b->runtime_expires;

3451

cfs_b->runtime = 0;

3452

cfs_b->runtime = 0;

3452

3453

/*

3454

/*

3454

* This check is repeated as we are holding onto the new bandwidth

3455

* This check is repeated as we are holding onto the new bandwidth

3455

* while we unthrottle. This can potentially race with an unthrottled

3456

* while we unthrottle. This can potentially race with an unthrottled

3456

* group trying to acquire new bandwidth from the global pool.

3457

* group trying to acquire new bandwidth from the global pool.

3457

*/

3458

*/

3458

while (throttled && runtime > 0) {

3459

while (throttled && runtime > 0) {

3459

raw_spin_unlock(&cfs_b->lock);

3460

raw_spin_unlock(&cfs_b->lock);

3460

/* we can't nest cfs_b->lock while distributing bandwidth */

3461

/* we can't nest cfs_b->lock while distributing bandwidth */

3461

runtime = distribute_cfs_runtime(cfs_b, runtime,

3462

runtime = distribute_cfs_runtime(cfs_b, runtime,

3462

runtime_expires);

3463

runtime_expires);

3463

raw_spin_lock(&cfs_b->lock);

3464

raw_spin_lock(&cfs_b->lock);

3464

3465

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3466

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3466

}

3467

}

3467

3468

/* return (any) remaining runtime */

3469

/* return (any) remaining runtime */

3469

cfs_b->runtime = runtime;

3470

cfs_b->runtime = runtime;

3470

/*

3471

/*

3471

* While we are ensured activity in the period following an

3472

* While we are ensured activity in the period following an

3472

* unthrottle, this also covers the case in which the new bandwidth is

3473

* unthrottle, this also covers the case in which the new bandwidth is

3473

* insufficient to cover the existing bandwidth deficit. (Forcing the

3474

* insufficient to cover the existing bandwidth deficit. (Forcing the

3474

* timer to remain active while there are any throttled entities.)

3475

* timer to remain active while there are any throttled entities.)

3475

*/

3476

*/

3476

cfs_b->idle = 0;

3477

cfs_b->idle = 0;

3477

out_unlock:

3478

out_unlock:

3478

if (idle)

3479

if (idle)

3479

cfs_b->timer_active = 0;

3480

cfs_b->timer_active = 0;

3480

raw_spin_unlock(&cfs_b->lock);

3481

raw_spin_unlock(&cfs_b->lock);

3481

3482

return idle;

3483

return idle;

3483

}

3484

}

3484

3485

/* a cfs_rq won't donate quota below this amount */

3486

/* a cfs_rq won't donate quota below this amount */

3486

static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;

3487

static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;

3487

/* minimum remaining period time to redistribute slack quota */

3488

/* minimum remaining period time to redistribute slack quota */

3488

static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;

3489

static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;

3489

/* how long we wait to gather additional slack before distributing */

3490

/* how long we wait to gather additional slack before distributing */

3490

static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

3491

static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

3491

3492

/*

3493

/*

3493

* Are we near the end of the current quota period?

3494

* Are we near the end of the current quota period?

3494

*

3495

*

3495

* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the

3496

* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the

3496

* hrtimer base being cleared by __hrtimer_start_range_ns. In the case of

3497

* hrtimer base being cleared by __hrtimer_start_range_ns. In the case of

3497

* migrate_hrtimers, base is never cleared, so we are fine.

3498

* migrate_hrtimers, base is never cleared, so we are fine.

3498

*/

3499

*/

3499

static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)

3500

static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)

3500

{

3501

{

3501

struct hrtimer *refresh_timer = &cfs_b->period_timer;

3502

struct hrtimer *refresh_timer = &cfs_b->period_timer;

3502

u64 remaining;

3503

u64 remaining;

3503

3504

/* if the call-back is running a quota refresh is already occurring */

3505

/* if the call-back is running a quota refresh is already occurring */

3505

if (hrtimer_callback_running(refresh_timer))

3506

if (hrtimer_callback_running(refresh_timer))

3506

return 1;

3507

return 1;

3507

3508

/* is a quota refresh about to occur? */

3509

/* is a quota refresh about to occur? */

3509

remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));

3510

remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));

3510

if (remaining < min_expire)

3511

if (remaining < min_expire)

3511

return 1;

3512

return 1;

3512

3513

return 0;

3514

return 0;

3514

}

3515

}

3515

3516

static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)

3517

static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)

3517

{

3518

{

3518

u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

3519

u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

3519

3520

/* if there's a quota refresh soon don't bother with slack */

3521

/* if there's a quota refresh soon don't bother with slack */

3521

if (runtime_refresh_within(cfs_b, min_left))

3522

if (runtime_refresh_within(cfs_b, min_left))

3522

return;

3523

return;

3523

3524

start_bandwidth_timer(&cfs_b->slack_timer,

3525

start_bandwidth_timer(&cfs_b->slack_timer,

3525

ns_to_ktime(cfs_bandwidth_slack_period));

3526

ns_to_ktime(cfs_bandwidth_slack_period));

3526

}

3527

}

3527

3528

/* we know any runtime found here is valid as update_curr() precedes return */

3529

/* we know any runtime found here is valid as update_curr() precedes return */

3529

static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3530

static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3530

{

3531

{

3531

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3532

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3532

s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

3533

s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

3533

3534

if (slack_runtime <= 0)

3535

if (slack_runtime <= 0)

3535

return;

3536

return;

3536

3537

raw_spin_lock(&cfs_b->lock);

3538

raw_spin_lock(&cfs_b->lock);

3538

if (cfs_b->quota != RUNTIME_INF &&

3539

if (cfs_b->quota != RUNTIME_INF &&

3539

cfs_rq->runtime_expires == cfs_b->runtime_expires) {

3540

cfs_rq->runtime_expires == cfs_b->runtime_expires) {

3540

cfs_b->runtime += slack_runtime;

3541

cfs_b->runtime += slack_runtime;

3541

3542

/* we are under rq->lock, defer unthrottling using a timer */

3543

/* we are under rq->lock, defer unthrottling using a timer */

3543

if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&

3544

if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&

3544

!list_empty(&cfs_b->throttled_cfs_rq))

3545

!list_empty(&cfs_b->throttled_cfs_rq))

3545

start_cfs_slack_bandwidth(cfs_b);

3546

start_cfs_slack_bandwidth(cfs_b);

3546

}

3547

}

3547

raw_spin_unlock(&cfs_b->lock);

3548

raw_spin_unlock(&cfs_b->lock);

3548

3549

/* even if it's not valid for return we don't want to try again */

3550

/* even if it's not valid for return we don't want to try again */

3550

cfs_rq->runtime_remaining -= slack_runtime;

3551

cfs_rq->runtime_remaining -= slack_runtime;

3551

}

3552

}

3552

3553

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3554

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3554

{

3555

{

3555

if (!cfs_bandwidth_used())

3556

if (!cfs_bandwidth_used())

3556

return;

3557

return;

3557

3558

if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)

3559

if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)

3559

return;

3560

return;

3560

3561

__return_cfs_rq_runtime(cfs_rq);

3562

__return_cfs_rq_runtime(cfs_rq);

3562

}

3563

}

3563

3564

/*

3565

/*

3565

* This is done with a timer (instead of inline with bandwidth return) since

3566

* This is done with a timer (instead of inline with bandwidth return) since

3566

* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.

3567

* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.

3567

*/

3568

*/

3568

static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)

3569

static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)

3569

{

3570

{

3570

u64 runtime = 0, slice = sched_cfs_bandwidth_slice();

3571

u64 runtime = 0, slice = sched_cfs_bandwidth_slice();

3571

u64 expires;

3572

u64 expires;

3572

3573

/* confirm we're still not at a refresh boundary */

3574

/* confirm we're still not at a refresh boundary */

3574

raw_spin_lock(&cfs_b->lock);

3575

raw_spin_lock(&cfs_b->lock);

3575

if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {

3576

if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {

3576

raw_spin_unlock(&cfs_b->lock);

3577

raw_spin_unlock(&cfs_b->lock);

3577

return;

3578

return;

3578

}

3579

}

3579

3580

if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {

3581

if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {

3581

runtime = cfs_b->runtime;

3582

runtime = cfs_b->runtime;

3582

cfs_b->runtime = 0;

3583

cfs_b->runtime = 0;

3583

}

3584

}

3584

expires = cfs_b->runtime_expires;

3585

expires = cfs_b->runtime_expires;

3585

raw_spin_unlock(&cfs_b->lock);

3586

raw_spin_unlock(&cfs_b->lock);

3586

3587

if (!runtime)

3588

if (!runtime)

3588

return;

3589

return;

3589

3590

runtime = distribute_cfs_runtime(cfs_b, runtime, expires);

3591

runtime = distribute_cfs_runtime(cfs_b, runtime, expires);

3591

3592

raw_spin_lock(&cfs_b->lock);

3593

raw_spin_lock(&cfs_b->lock);

3593

if (expires == cfs_b->runtime_expires)

3594

if (expires == cfs_b->runtime_expires)

3594

cfs_b->runtime = runtime;

3595

cfs_b->runtime = runtime;

3595

raw_spin_unlock(&cfs_b->lock);

3596

raw_spin_unlock(&cfs_b->lock);

3596

}

3597

}

3597

3598

/*

3599

/*

3599

* When a group wakes up we want to make sure that its quota is not already

3600

* When a group wakes up we want to make sure that its quota is not already

3600

* expired/exceeded, otherwise it may be allowed to steal additional ticks of

3601

* expired/exceeded, otherwise it may be allowed to steal additional ticks of

3601

* runtime as update_curr() throttling can not not trigger until it's on-rq.

3602

* runtime as update_curr() throttling can not not trigger until it's on-rq.

3602

*/

3603

*/

3603

static void check_enqueue_throttle(struct cfs_rq *cfs_rq)

3604

static void check_enqueue_throttle(struct cfs_rq *cfs_rq)

3604

{

3605

{

3605

if (!cfs_bandwidth_used())

3606

if (!cfs_bandwidth_used())

3606

return;

3607

return;

3607

3608

/* an active group must be handled by the update_curr()->put() path */

3609

/* an active group must be handled by the update_curr()->put() path */

3609

if (!cfs_rq->runtime_enabled || cfs_rq->curr)

3610

if (!cfs_rq->runtime_enabled || cfs_rq->curr)

3610

return;

3611

return;

3611

3612

/* ensure the group is not already throttled */

3613

/* ensure the group is not already throttled */

3613

if (cfs_rq_throttled(cfs_rq))

3614

if (cfs_rq_throttled(cfs_rq))

3614

return;

3615

return;

3615

3616

/* update runtime allocation */

3617

/* update runtime allocation */

3617

account_cfs_rq_runtime(cfs_rq, 0);

3618

account_cfs_rq_runtime(cfs_rq, 0);

3618

if (cfs_rq->runtime_remaining <= 0)

3619

if (cfs_rq->runtime_remaining <= 0)

3619

throttle_cfs_rq(cfs_rq);

3620

throttle_cfs_rq(cfs_rq);

3620

}

3621

}

3621

3622

/* conditionally throttle active cfs_rq's from put_prev_entity() */

3623

/* conditionally throttle active cfs_rq's from put_prev_entity() */

3623

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3624

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3624

{

3625

{

3625

if (!cfs_bandwidth_used())

3626

if (!cfs_bandwidth_used())

3626

return false;

3627

return false;

3627

3628

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))

3629

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))

3629

return false;

3630

return false;

3630

3631

/*

3632

/*

3632

* it's possible for a throttled entity to be forced into a running

3633

* it's possible for a throttled entity to be forced into a running

3633

* state (e.g. set_curr_task), in this case we're finished.

3634

* state (e.g. set_curr_task), in this case we're finished.

3634

*/

3635

*/

3635

if (cfs_rq_throttled(cfs_rq))

3636

if (cfs_rq_throttled(cfs_rq))

3636

return true;

3637

return true;

3637

3638

throttle_cfs_rq(cfs_rq);

3639

throttle_cfs_rq(cfs_rq);

3639

return true;

3640

return true;

3640

}

3641

}

3641

3642

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)

3643

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)

3643

{

3644

{

3644

struct cfs_bandwidth *cfs_b =

3645

struct cfs_bandwidth *cfs_b =

3645

container_of(timer, struct cfs_bandwidth, slack_timer);

3646

container_of(timer, struct cfs_bandwidth, slack_timer);

3646

do_sched_cfs_slack_timer(cfs_b);

3647

do_sched_cfs_slack_timer(cfs_b);

3647

3648

return HRTIMER_NORESTART;

3649

return HRTIMER_NORESTART;

3649

}

3650

}

3650

3651

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)

3652

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)

3652

{

3653

{

3653

struct cfs_bandwidth *cfs_b =

3654

struct cfs_bandwidth *cfs_b =

3654

container_of(timer, struct cfs_bandwidth, period_timer);

3655

container_of(timer, struct cfs_bandwidth, period_timer);

3655

ktime_t now;

3656

ktime_t now;

3656

int overrun;

3657

int overrun;

3657

int idle = 0;

3658

int idle = 0;

3658

3659

for (;;) {

3660

for (;;) {

3660

now = hrtimer_cb_get_time(timer);

3661

now = hrtimer_cb_get_time(timer);

3661

overrun = hrtimer_forward(timer, now, cfs_b->period);

3662

overrun = hrtimer_forward(timer, now, cfs_b->period);

3662

3663

if (!overrun)

3664

if (!overrun)

3664

break;

3665

break;

3665

3666

idle = do_sched_cfs_period_timer(cfs_b, overrun);

3667

idle = do_sched_cfs_period_timer(cfs_b, overrun);

3667

}

3668

}

3668

3669

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

3670

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

3670

}

3671

}

3671

3672

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3673

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3673

{

3674

{

3674

raw_spin_lock_init(&cfs_b->lock);

3675

raw_spin_lock_init(&cfs_b->lock);

3675

cfs_b->runtime = 0;

3676

cfs_b->runtime = 0;

3676

cfs_b->quota = RUNTIME_INF;

3677

cfs_b->quota = RUNTIME_INF;

3677

cfs_b->period = ns_to_ktime(default_cfs_period());

3678

cfs_b->period = ns_to_ktime(default_cfs_period());

3678

3679

INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);

3680

INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);

3680

hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3681

hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3681

cfs_b->period_timer.function = sched_cfs_period_timer;

3682

cfs_b->period_timer.function = sched_cfs_period_timer;

3682

hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3683

hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3683

cfs_b->slack_timer.function = sched_cfs_slack_timer;

3684

cfs_b->slack_timer.function = sched_cfs_slack_timer;

3684

}

3685

}

3685

3686

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3687

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3687

{

3688

{

3688

cfs_rq->runtime_enabled = 0;

3689

cfs_rq->runtime_enabled = 0;

3689

INIT_LIST_HEAD(&cfs_rq->throttled_list);

3690

INIT_LIST_HEAD(&cfs_rq->throttled_list);

3690

}

3691

}

3691

3692

/* requires cfs_b->lock, may release to reprogram timer */

3693

/* requires cfs_b->lock, may release to reprogram timer */

3693

void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3694

void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3694

{

3695

{

3695

/*

3696

/*

3696

* The timer may be active because we're trying to set a new bandwidth

3697

* The timer may be active because we're trying to set a new bandwidth

3697

* period or because we're racing with the tear-down path

3698

* period or because we're racing with the tear-down path

3698

* (timer_active==0 becomes visible before the hrtimer call-back

3699

* (timer_active==0 becomes visible before the hrtimer call-back

3699

* terminates). In either case we ensure that it's re-programmed

3700

* terminates). In either case we ensure that it's re-programmed

3700

*/

3701

*/

3701

while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&

3702

while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&

3702

hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {

3703

hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {

3703

/* bounce the lock to allow do_sched_cfs_period_timer to run */

3704

/* bounce the lock to allow do_sched_cfs_period_timer to run */

3704

raw_spin_unlock(&cfs_b->lock);

3705

raw_spin_unlock(&cfs_b->lock);

3705

cpu_relax();

3706

cpu_relax();

3706

raw_spin_lock(&cfs_b->lock);

3707

raw_spin_lock(&cfs_b->lock);

3707

/* if someone else restarted the timer then we're done */

3708

/* if someone else restarted the timer then we're done */

3708

if (cfs_b->timer_active)

3709

if (cfs_b->timer_active)

3709

return;

3710

return;

3710

}

3711

}

3711

3712

cfs_b->timer_active = 1;

3713

cfs_b->timer_active = 1;

3713

start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);

3714

start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);

3714

}

3715

}

3715

3716

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3717

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3717

{

3718

{

3718

hrtimer_cancel(&cfs_b->period_timer);

3719

hrtimer_cancel(&cfs_b->period_timer);

3719

hrtimer_cancel(&cfs_b->slack_timer);

3720

hrtimer_cancel(&cfs_b->slack_timer);

3720

}

3721

}

3721

3722

/*
 * Walk every leaf cfs_rq of @rq that has runtime accounting enabled, refill
 * its runtime_remaining from the group's quota and unthrottle it if it is
 * currently throttled.
 */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	for_each_leaf_cfs_rq(rq, cfs_rq) {
		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

		/* groups without runtime accounting are left untouched */
		if (!cfs_rq->runtime_enabled)
			continue;

		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
		cfs_rq->runtime_remaining = cfs_b->quota;

		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
	}
}

3741

3742

#else /* CONFIG_CFS_BANDWIDTH */

3743

#else /* CONFIG_CFS_BANDWIDTH */

3743

/* !CONFIG_CFS_BANDWIDTH: a cfs_rq's task clock is just its rq's task clock. */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);

	return rq_clock_task(rq);
}

3747

3748

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}

3749

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}

3749

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }

3750

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }

3750

static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}

3751

static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}

3751

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

3752

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

3752

3753

/* !CONFIG_CFS_BANDWIDTH stubs: all throttling predicates return 0. */
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	return 0;
}

3768

3769

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

3770

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

3770

3771

#ifdef CONFIG_FAIR_GROUP_SCHED

3772

#ifdef CONFIG_FAIR_GROUP_SCHED

3772

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

3773

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

3773

#endif

3774

#endif

3774

3775

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

3776

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

3776

{

3777

{

3777

return NULL;

3778

return NULL;

3778

}

3779

}

3779

static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

3780

static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

3780

static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

3781

static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

3781

3782

#endif /* CONFIG_CFS_BANDWIDTH */

3783

#endif /* CONFIG_CFS_BANDWIDTH */

3783

3784

/**************************************************

3785

/**************************************************

3785

* CFS operations on tasks:

3786

* CFS operations on tasks:

3786

*/

3787

*/

3787

3788

#ifdef CONFIG_SCHED_HRTICK

3789

#ifdef CONFIG_SCHED_HRTICK

3789

static void hrtick_start_fair(struct rq *rq, struct task_struct *p)

3790

static void hrtick_start_fair(struct rq *rq, struct task_struct *p)

3790

{

3791

{

3791

struct sched_entity *se = &p->se;

3792

struct sched_entity *se = &p->se;

3792

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3793

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3793

3794

WARN_ON(task_rq(p) != rq);

3795

WARN_ON(task_rq(p) != rq);

3795

3796

if (cfs_rq->nr_running > 1) {

3797

if (cfs_rq->nr_running > 1) {

3797

u64 slice = sched_slice(cfs_rq, se);

3798

u64 slice = sched_slice(cfs_rq, se);

3798

u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;

3799

u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;

3799

s64 delta = slice - ran;

3800

s64 delta = slice - ran;

3800

3801

if (delta < 0) {

3802

if (delta < 0) {

3802

if (rq->curr == p)

3803

if (rq->curr == p)

3803

resched_task(p);

3804

resched_task(p);

3804

return;

3805

return;

3805

}

3806

}

3806

3807

/*

3808

/*

3808

* Don't schedule slices shorter than 10000ns, that just

3809

* Don't schedule slices shorter than 10000ns, that just

3809

* doesn't make sense. Rely on vruntime for fairness.

3810

* doesn't make sense. Rely on vruntime for fairness.

3810

*/

3811

*/

3811

if (rq->curr != p)

3812

if (rq->curr != p)

3812

delta = max_t(s64, 10000LL, delta);

3813

delta = max_t(s64, 10000LL, delta);

3813

3814

hrtick_start(rq, delta);

3815

hrtick_start(rq, delta);

3815

}

3816

}

3816

}

3817

}

3817

3818

/*

3819

/*

3819

* called from enqueue/dequeue and updates the hrtick when the

3820

* called from enqueue/dequeue and updates the hrtick when the

3820

* current task is from our class and nr_running is low enough

3821

* current task is from our class and nr_running is low enough

3821

* to matter.

3822

* to matter.

3822

*/

3823

*/

3823

static void hrtick_update(struct rq *rq)

3824

static void hrtick_update(struct rq *rq)

3824

{

3825

{

3825

struct task_struct *curr = rq->curr;

3826

struct task_struct *curr = rq->curr;

3826

3827

if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)

3828

if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)

3828

return;

3829

return;

3829

3830

if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)

3831

if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)

3831

hrtick_start_fair(rq, curr);

3832

hrtick_start_fair(rq, curr);

3832

}

3833

}

3833

#else /* !CONFIG_SCHED_HRTICK */

3834

#else /* !CONFIG_SCHED_HRTICK */

3834

/* !CONFIG_SCHED_HRTICK: both hrtick hooks are empty. */
static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}

3842

#endif

3843

#endif

3843

3844

/*

3845

/*

3845

* The enqueue_task method is called before nr_running is

3846

* The enqueue_task method is called before nr_running is

3846

* increased. Here we update the fair scheduling stats and

3847

* increased. Here we update the fair scheduling stats and

3847

* then put the task into the rbtree:

3848

* then put the task into the rbtree:

3848

*/

3849

*/

3849

static void

3850

static void

3850

enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

3851

enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

3851

{

3852

{

3852

struct cfs_rq *cfs_rq;

3853

struct cfs_rq *cfs_rq;

3853

struct sched_entity *se = &p->se;

3854

struct sched_entity *se = &p->se;

3854

3855

for_each_sched_entity(se) {

3856

for_each_sched_entity(se) {

3856

if (se->on_rq)

3857

if (se->on_rq)

3857

break;

3858

break;

3858

cfs_rq = cfs_rq_of(se);

3859

cfs_rq = cfs_rq_of(se);

3859

enqueue_entity(cfs_rq, se, flags);

3860

enqueue_entity(cfs_rq, se, flags);

3860

3861

/*

3862

/*

3862

* end evaluation on encountering a throttled cfs_rq

3863

* end evaluation on encountering a throttled cfs_rq

3863

*

3864

*

3864

* note: in the case of encountering a throttled cfs_rq we will

3865

* note: in the case of encountering a throttled cfs_rq we will

3865

* post the final h_nr_running increment below.

3866

* post the final h_nr_running increment below.

3866

*/

3867

*/

3867

if (cfs_rq_throttled(cfs_rq))

3868

if (cfs_rq_throttled(cfs_rq))

3868

break;

3869

break;

3869

cfs_rq->h_nr_running++;

3870

cfs_rq->h_nr_running++;

3870

3871

flags = ENQUEUE_WAKEUP;

3872

flags = ENQUEUE_WAKEUP;

3872

}

3873

}

3873

3874

for_each_sched_entity(se) {

3875

for_each_sched_entity(se) {

3875

cfs_rq = cfs_rq_of(se);

3876

cfs_rq = cfs_rq_of(se);

3876

cfs_rq->h_nr_running++;

3877

cfs_rq->h_nr_running++;

3877

3878

if (cfs_rq_throttled(cfs_rq))

3879

if (cfs_rq_throttled(cfs_rq))

3879

break;

3880

break;

3880

3881

update_cfs_shares(cfs_rq);

3882

update_cfs_shares(cfs_rq);

3882

update_entity_load_avg(se, 1);

3883

update_entity_load_avg(se, 1);

3883

}

3884

}

3884

3885

if (!se) {

3886

if (!se) {

3886

update_rq_runnable_avg(rq, rq->nr_running);

3887

update_rq_runnable_avg(rq, rq->nr_running);

3887

inc_nr_running(rq);

3888

inc_nr_running(rq);

3888

}

3889

}

3889

hrtick_update(rq);

3890

hrtick_update(rq);

3890

}

3891

}

3891

3892

static void set_next_buddy(struct sched_entity *se);

3893

static void set_next_buddy(struct sched_entity *se);

3893

3894

/*

3895

/*

3895

* The dequeue_task method is called before nr_running is

3896

* The dequeue_task method is called before nr_running is

3896

* decreased. We remove the task from the rbtree and

3897

* decreased. We remove the task from the rbtree and

3897

* update the fair scheduling stats:

3898

* update the fair scheduling stats:

3898

*/

3899

*/

3899

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

3900

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

3900

{

3901

{

3901

struct cfs_rq *cfs_rq;

3902

struct cfs_rq *cfs_rq;

3902

struct sched_entity *se = &p->se;

3903

struct sched_entity *se = &p->se;

3903

int task_sleep = flags & DEQUEUE_SLEEP;

3904

int task_sleep = flags & DEQUEUE_SLEEP;

3904

3905

for_each_sched_entity(se) {

3906

for_each_sched_entity(se) {

3906

cfs_rq = cfs_rq_of(se);

3907

cfs_rq = cfs_rq_of(se);

3907

dequeue_entity(cfs_rq, se, flags);

3908

dequeue_entity(cfs_rq, se, flags);

3908

3909

/*

3910

/*

3910

* end evaluation on encountering a throttled cfs_rq

3911

* end evaluation on encountering a throttled cfs_rq

3911

*

3912

*

3912

* note: in the case of encountering a throttled cfs_rq we will

3913

* note: in the case of encountering a throttled cfs_rq we will

3913

* post the final h_nr_running decrement below.

3914

* post the final h_nr_running decrement below.

3914

*/

3915

*/

3915

if (cfs_rq_throttled(cfs_rq))

3916

if (cfs_rq_throttled(cfs_rq))

3916

break;

3917

break;

3917

cfs_rq->h_nr_running--;

3918

cfs_rq->h_nr_running--;

3918

3919

/* Don't dequeue parent if it has other entities besides us */

3920

/* Don't dequeue parent if it has other entities besides us */

3920

if (cfs_rq->load.weight) {

3921

if (cfs_rq->load.weight) {

3921

/*

3922

/*

3922

* Bias pick_next to pick a task from this cfs_rq, as

3923

* Bias pick_next to pick a task from this cfs_rq, as

3923

* p is sleeping when it is within its sched_slice.

3924

* p is sleeping when it is within its sched_slice.

3924

*/

3925

*/

3925

if (task_sleep && parent_entity(se))

3926

if (task_sleep && parent_entity(se))

3926

set_next_buddy(parent_entity(se));

3927

set_next_buddy(parent_entity(se));

3927

3928

/* avoid re-evaluating load for this entity */

3929

/* avoid re-evaluating load for this entity */

3929

se = parent_entity(se);

3930

se = parent_entity(se);

3930

break;

3931

break;

3931

}

3932

}

3932

flags |= DEQUEUE_SLEEP;

3933

flags |= DEQUEUE_SLEEP;

3933

}

3934

}

3934

3935

for_each_sched_entity(se) {

3936

for_each_sched_entity(se) {

3936

cfs_rq = cfs_rq_of(se);

3937

cfs_rq = cfs_rq_of(se);

3937

cfs_rq->h_nr_running--;

3938

cfs_rq->h_nr_running--;

3938

3939

if (cfs_rq_throttled(cfs_rq))

3940

if (cfs_rq_throttled(cfs_rq))

3940

break;

3941

break;

3941

3942

update_cfs_shares(cfs_rq);

3943

update_cfs_shares(cfs_rq);

3943

update_entity_load_avg(se, 1);

3944

update_entity_load_avg(se, 1);

3944

}

3945

}

3945

3946

if (!se) {

3947

if (!se) {

3947

dec_nr_running(rq);

3948

dec_nr_running(rq);

3948

update_rq_runnable_avg(rq, 1);

3949

update_rq_runnable_avg(rq, 1);

3949

}

3950

}

3950

hrtick_update(rq);

3951

hrtick_update(rq);

3951

}

3952

}

3952

3953

#ifdef CONFIG_SMP

3954

#ifdef CONFIG_SMP

3954

/* Used instead of source_load when we know the type == 0 */

3955

/* Used instead of source_load when we know the type == 0 */

3955

static unsigned long weighted_cpuload(const int cpu)

3956

static unsigned long weighted_cpuload(const int cpu)

3956

{

3957

{

3957

return cpu_rq(cpu)->cfs.runnable_load_avg;

3958

return cpu_rq(cpu)->cfs.runnable_load_avg;

3958

}

3959

}

3959

3960

/*

3961

/*

3961

* Return a low guess at the load of a migration-source cpu weighted

3962

* Return a low guess at the load of a migration-source cpu weighted

3962

* according to the scheduling class and "nice" value.

3963

* according to the scheduling class and "nice" value.

3963

*

3964

*

3964

* We want to under-estimate the load of migration sources, to

3965

* We want to under-estimate the load of migration sources, to

3965

* balance conservatively.

3966

* balance conservatively.

3966

*/

3967

*/

3967

static unsigned long source_load(int cpu, int type)

3968

static unsigned long source_load(int cpu, int type)

3968

{

3969

{

3969

struct rq *rq = cpu_rq(cpu);

3970

struct rq *rq = cpu_rq(cpu);

3970

unsigned long total = weighted_cpuload(cpu);

3971

unsigned long total = weighted_cpuload(cpu);

3971

3972

if (type == 0 || !sched_feat(LB_BIAS))

3973

if (type == 0 || !sched_feat(LB_BIAS))

3973

return total;

3974

return total;

3974

3975

return min(rq->cpu_load[type-1], total);

3976

return min(rq->cpu_load[type-1], total);

3976

}

3977

}

3977

3978

/*

3979

/*

3979

* Return a high guess at the load of a migration-target cpu weighted

3980

* Return a high guess at the load of a migration-target cpu weighted

3980

* according to the scheduling class and "nice" value.

3981

* according to the scheduling class and "nice" value.

3981

*/

3982

*/

3982

static unsigned long target_load(int cpu, int type)

3983

static unsigned long target_load(int cpu, int type)

3983

{

3984

{

3984

struct rq *rq = cpu_rq(cpu);

3985

struct rq *rq = cpu_rq(cpu);

3985

unsigned long total = weighted_cpuload(cpu);

3986

unsigned long total = weighted_cpuload(cpu);

3986

3987

if (type == 0 || !sched_feat(LB_BIAS))

3988

if (type == 0 || !sched_feat(LB_BIAS))

3988

return total;

3989

return total;

3989

3990

return max(rq->cpu_load[type-1], total);

3991

return max(rq->cpu_load[type-1], total);

3991

}

3992

}

3992

3993

static unsigned long power_of(int cpu)

3994

static unsigned long power_of(int cpu)

3994

{

3995

{

3995

return cpu_rq(cpu)->cpu_power;

3996

return cpu_rq(cpu)->cpu_power;

3996

}

3997

}

3997

3998

static unsigned long cpu_avg_load_per_task(int cpu)

3999

static unsigned long cpu_avg_load_per_task(int cpu)

3999

{

4000

{

4000

struct rq *rq = cpu_rq(cpu);

4001

struct rq *rq = cpu_rq(cpu);

4001

unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

4002

unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

4002

unsigned long load_avg = rq->cfs.runnable_load_avg;

4003

unsigned long load_avg = rq->cfs.runnable_load_avg;

4003

4004

if (nr_running)

4005

if (nr_running)

4005

return load_avg / nr_running;

4006

return load_avg / nr_running;

4006

4007

return 0;

4008

return 0;

4008

}

4009

}

4009

4010

static void record_wakee(struct task_struct *p)

4011

static void record_wakee(struct task_struct *p)

4011

{

4012

{

4012

/*

4013

/*

4013

* Rough decay (wiping) for cost saving, don't worry

4014

* Rough decay (wiping) for cost saving, don't worry

4014

* about the boundary, really active task won't care

4015

* about the boundary, really active task won't care

4015

* about the loss.

4016

* about the loss.

4016

*/

4017

*/

4017

if (jiffies > current->wakee_flip_decay_ts + HZ) {

4018

if (jiffies > current->wakee_flip_decay_ts + HZ) {

4018

current->wakee_flips = 0;

4019

current->wakee_flips = 0;

4019

current->wakee_flip_decay_ts = jiffies;

4020

current->wakee_flip_decay_ts = jiffies;

4020

}

4021

}

4021

4022

if (current->last_wakee != p) {

4023

if (current->last_wakee != p) {

4023

current->last_wakee = p;

4024

current->last_wakee = p;

4024

current->wakee_flips++;

4025

current->wakee_flips++;

4025

}

4026

}

4026

}

4027

}

4027

4028

static void task_waking_fair(struct task_struct *p)

4029

static void task_waking_fair(struct task_struct *p)

4029

{

4030

{

4030

struct sched_entity *se = &p->se;

4031

struct sched_entity *se = &p->se;

4031

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4032

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4032

u64 min_vruntime;

4033

u64 min_vruntime;

4033

4034

#ifndef CONFIG_64BIT

4035

#ifndef CONFIG_64BIT

4035

u64 min_vruntime_copy;

4036

u64 min_vruntime_copy;

4036

4037

do {

4038

do {

4038

min_vruntime_copy = cfs_rq->min_vruntime_copy;

4039

min_vruntime_copy = cfs_rq->min_vruntime_copy;

4039

smp_rmb();

4040

smp_rmb();

4040

min_vruntime = cfs_rq->min_vruntime;

4041

min_vruntime = cfs_rq->min_vruntime;

4041

} while (min_vruntime != min_vruntime_copy);

4042

} while (min_vruntime != min_vruntime_copy);

4042

#else

4043

#else

4043

min_vruntime = cfs_rq->min_vruntime;

4044

min_vruntime = cfs_rq->min_vruntime;

4044

#endif

4045

#endif

4045

4046

se->vruntime -= min_vruntime;

4047

se->vruntime -= min_vruntime;

4047

record_wakee(p);

4048

record_wakee(p);

4048

}

4049

}

4049

4050

#ifdef CONFIG_FAIR_GROUP_SCHED

4051

#ifdef CONFIG_FAIR_GROUP_SCHED

4051

/*

4052

/*

4052

* effective_load() calculates the load change as seen from the root_task_group

4053

* effective_load() calculates the load change as seen from the root_task_group

4053

*

4054

*

4054

* Adding load to a group doesn't make a group heavier, but can cause movement

4055

* Adding load to a group doesn't make a group heavier, but can cause movement

4055

* of group shares between cpus. Assuming the shares were perfectly aligned one

4056

* of group shares between cpus. Assuming the shares were perfectly aligned one

4056

* can calculate the shift in shares.

4057

* can calculate the shift in shares.

4057

*

4058

*

4058

* Calculate the effective load difference if @wl is added (subtracted) to @tg

4059

* Calculate the effective load difference if @wl is added (subtracted) to @tg

4059

* on this @cpu and results in a total addition (subtraction) of @wg to the

4060

* on this @cpu and results in a total addition (subtraction) of @wg to the

4060

* total group weight.

4061

* total group weight.

4061

*

4062

*

4062

* Given a runqueue weight distribution (rw_i) we can compute a shares

4063

* Given a runqueue weight distribution (rw_i) we can compute a shares

4063

* distribution (s_i) using:

4064

* distribution (s_i) using:

4064

*

4065

*

4065

* s_i = rw_i / \Sum rw_j (1)

4066

* s_i = rw_i / \Sum rw_j (1)

4066

*

4067

*

4067

* Suppose we have 4 CPUs and our @tg is a direct child of the root group and

4068

* Suppose we have 4 CPUs and our @tg is a direct child of the root group and

4068

* has 7 equal weight tasks, distributed as below (rw_i), with the resulting

4069

* has 7 equal weight tasks, distributed as below (rw_i), with the resulting

4069

* shares distribution (s_i):

4070

* shares distribution (s_i):

4070

*

4071

*

4071

* rw_i = { 2, 4, 1, 0 }

4072

* rw_i = { 2, 4, 1, 0 }

4072

* s_i = { 2/7, 4/7, 1/7, 0 }

4073

* s_i = { 2/7, 4/7, 1/7, 0 }

4073

*

4074

*

4074

* As per wake_affine() we're interested in the load of two CPUs (the CPU the

4075

* As per wake_affine() we're interested in the load of two CPUs (the CPU the

4075

* task used to run on and the CPU the waker is running on), we need to

4076

* task used to run on and the CPU the waker is running on), we need to

4076

* compute the effect of waking a task on either CPU and, in case of a sync

4077

* compute the effect of waking a task on either CPU and, in case of a sync

4077

* wakeup, compute the effect of the current task going to sleep.

4078

* wakeup, compute the effect of the current task going to sleep.

4078

*

4079

*

4079

* So for a change of @wl to the local @cpu with an overall group weight change

4080

* So for a change of @wl to the local @cpu with an overall group weight change

4080

* of @wg we can compute the new shares distribution (s'_i) using:

4081

* of @wg we can compute the new shares distribution (s'_i) using:

4081

*

4082

*

4082

* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)

4083

* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)

4083

*

4084

*

4084

* Suppose we're interested in CPUs 0 and 1, and want to compute the load

4085

* Suppose we're interested in CPUs 0 and 1, and want to compute the load

4085

* differences in waking a task to CPU 0. The additional task changes the

4086

* differences in waking a task to CPU 0. The additional task changes the

4086

* weight and shares distributions like:

4087

* weight and shares distributions like:

4087

*

4088

*

4088

* rw'_i = { 3, 4, 1, 0 }

4089

* rw'_i = { 3, 4, 1, 0 }

4089

* s'_i = { 3/8, 4/8, 1/8, 0 }

4090

* s'_i = { 3/8, 4/8, 1/8, 0 }

4090

*

4091

*

4091

* We can then compute the difference in effective weight by using:

4092

* We can then compute the difference in effective weight by using:

4092

*

4093

*

4093

* dw_i = S * (s'_i - s_i) (3)

4094

* dw_i = S * (s'_i - s_i) (3)

4094

*

4095

*

4095

* Where 'S' is the group weight as seen by its parent.

4096

* Where 'S' is the group weight as seen by its parent.

4096

*

4097

*

4097

* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)

4098

* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)

4098

* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -

4099

* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -

4099

* 4/7) times the weight of the group.

4100

* 4/7) times the weight of the group.

4100

*/

4101

*/

4101

static long effective_load(struct task_group *tg, int cpu, long wl, long wg)

4102

static long effective_load(struct task_group *tg, int cpu, long wl, long wg)

4102

{

4103

{

4103

struct sched_entity *se = tg->se[cpu];

4104

struct sched_entity *se = tg->se[cpu];

4104

4105

if (!tg->parent) /* the trivial, non-cgroup case */

4106

if (!tg->parent) /* the trivial, non-cgroup case */

4106

return wl;

4107

return wl;

4107

4108

for_each_sched_entity(se) {

4109

for_each_sched_entity(se) {

4109

long w, W;

4110

long w, W;

4110

4111

tg = se->my_q->tg;

4112

tg = se->my_q->tg;

4112

4113

/*

4114

/*

4114

* W = @wg + \Sum rw_j

4115

* W = @wg + \Sum rw_j

4115

*/

4116

*/

4116

W = wg + calc_tg_weight(tg, se->my_q);

4117

W = wg + calc_tg_weight(tg, se->my_q);

4117

4118

/*

4119

/*

4119

* w = rw_i + @wl

4120

* w = rw_i + @wl

4120

*/

4121

*/

4121

w = se->my_q->load.weight + wl;

4122

w = se->my_q->load.weight + wl;

4122

4123

/*

4124

/*

4124

* wl = S * s'_i; see (2)

4125

* wl = S * s'_i; see (2)

4125

*/

4126

*/

4126

if (W > 0 && w < W)

4127

if (W > 0 && w < W)

4127

wl = (w * tg->shares) / W;

4128

wl = (w * tg->shares) / W;

4128

else

4129

else

4129

wl = tg->shares;

4130

wl = tg->shares;

4130

4131

/*

4132

/*

4132

* Per the above, wl is the new se->load.weight value; since

4133

* Per the above, wl is the new se->load.weight value; since

4133

* those are clipped to [MIN_SHARES, ...) do so now. See

4134

* those are clipped to [MIN_SHARES, ...) do so now. See

4134

* calc_cfs_shares().

4135

* calc_cfs_shares().

4135

*/

4136

*/

4136

if (wl < MIN_SHARES)

4137

if (wl < MIN_SHARES)

4137

wl = MIN_SHARES;

4138

wl = MIN_SHARES;

4138

4139

/*

4140

/*

4140

* wl = dw_i = S * (s'_i - s_i); see (3)

4141

* wl = dw_i = S * (s'_i - s_i); see (3)

4141

*/

4142

*/

4142

wl -= se->load.weight;

4143

wl -= se->load.weight;

4143

4144

/*

4145

/*

4145

* Recursively apply this logic to all parent groups to compute

4146

* Recursively apply this logic to all parent groups to compute

4146

* the final effective load change on the root group. Since

4147

* the final effective load change on the root group. Since

4147

* only the @tg group gets extra weight, all parent groups can

4148

* only the @tg group gets extra weight, all parent groups can

4148

* only redistribute existing shares. @wl is the shift in shares

4149

* only redistribute existing shares. @wl is the shift in shares

4149

* resulting from this level per the above.

4150

* resulting from this level per the above.

4150

*/

4151

*/

4151

wg = 0;

4152

wg = 0;

4152

}

4153

}

4153

4154

return wl;

4155

return wl;

4155

}

4156

}

4156

#else

4157

#else

4157

4158

/*
 * Variant built in the #else branch: there is no group hierarchy to walk
 * here, so the local weight change @wl is the effective load change.
 * @tg, @cpu and @wg are unused.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	return wl;
}

4163

#endif

4164

#endif

4164

4165

static int wake_wide(struct task_struct *p)

4166

static int wake_wide(struct task_struct *p)

4166

{

4167

{

4167

int factor = this_cpu_read(sd_llc_size);

4168

int factor = this_cpu_read(sd_llc_size);

4168

4169

/*

4170

/*

4170

* Yeah, it's the switching-frequency, could means many wakee or

4171

* Yeah, it's the switching-frequency, could means many wakee or

4171

* rapidly switch, use factor here will just help to automatically

4172

* rapidly switch, use factor here will just help to automatically

4172

* adjust the loose-degree, so bigger node will lead to more pull.

4173

* adjust the loose-degree, so bigger node will lead to more pull.

4173

*/

4174

*/

4174

if (p->wakee_flips > factor) {

4175

if (p->wakee_flips > factor) {

4175

/*

4176

/*

4176

* wakee is somewhat hot, it needs certain amount of cpu

4177

* wakee is somewhat hot, it needs certain amount of cpu

4177

* resource, so if waker is far more hot, prefer to leave

4178

* resource, so if waker is far more hot, prefer to leave

4178

* it alone.

4179

* it alone.

4179

*/

4180

*/

4180

if (current->wakee_flips > (factor * p->wakee_flips))

4181

if (current->wakee_flips > (factor * p->wakee_flips))

4181

return 1;

4182

return 1;

4182

}

4183

}

4183

4184

return 0;

4185

return 0;

4185

}

4186

}

4186

4187

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)

4188

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)

4188

{

4189

{

4189

s64 this_load, load;

4190

s64 this_load, load;

4190

int idx, this_cpu, prev_cpu;

4191

int idx, this_cpu, prev_cpu;

4191

unsigned long tl_per_task;

4192

unsigned long tl_per_task;

4192

struct task_group *tg;

4193

struct task_group *tg;

4193

unsigned long weight;

4194

unsigned long weight;

4194

int balanced;

4195

int balanced;

4195

4196

/*

4197

/*

4197

* If we wake multiple tasks be careful to not bounce

4198

* If we wake multiple tasks be careful to not bounce

4198

* ourselves around too much.

4199

* ourselves around too much.

4199

*/

4200

*/

4200

if (wake_wide(p))

4201

if (wake_wide(p))

4201

return 0;

4202

return 0;

4202

4203

idx = sd->wake_idx;

4204

idx = sd->wake_idx;

4204

this_cpu = smp_processor_id();

4205

this_cpu = smp_processor_id();

4205

prev_cpu = task_cpu(p);

4206

prev_cpu = task_cpu(p);

4206

load = source_load(prev_cpu, idx);

4207

load = source_load(prev_cpu, idx);

4207

this_load = target_load(this_cpu, idx);

4208

this_load = target_load(this_cpu, idx);

4208

4209

/*

4210

/*

4210

* If sync wakeup then subtract the (maximum possible)

4211

* If sync wakeup then subtract the (maximum possible)

4211

* effect of the currently running task from the load

4212

* effect of the currently running task from the load

4212

* of the current CPU:

4213

* of the current CPU:

4213

*/

4214

*/

4214

if (sync) {

4215

if (sync) {

4215

tg = task_group(current);

4216

tg = task_group(current);

4216

weight = current->se.load.weight;

4217

weight = current->se.load.weight;

4217

4218

this_load += effective_load(tg, this_cpu, -weight, -weight);

4219

this_load += effective_load(tg, this_cpu, -weight, -weight);

4219

load += effective_load(tg, prev_cpu, 0, -weight);

4220

load += effective_load(tg, prev_cpu, 0, -weight);

4220

}

4221

}

4221

4222

tg = task_group(p);

4223

tg = task_group(p);

4223

weight = p->se.load.weight;

4224

weight = p->se.load.weight;

4224

4225

/*

4226

/*

4226

* In low-load situations, where prev_cpu is idle and this_cpu is idle

4227

* In low-load situations, where prev_cpu is idle and this_cpu is idle

4227

* due to the sync cause above having dropped this_load to 0, we'll

4228

* due to the sync cause above having dropped this_load to 0, we'll

4228

* always have an imbalance, but there's really nothing you can do

4229

* always have an imbalance, but there's really nothing you can do

4229

* about that, so that's good too.

4230

* about that, so that's good too.

4230

*

4231

*

4231

* Otherwise check if either cpus are near enough in load to allow this

4232

* Otherwise check if either cpus are near enough in load to allow this

4232

* task to be woken on this_cpu.

4233

* task to be woken on this_cpu.

4233

*/

4234

*/

4234

if (this_load > 0) {

4235

if (this_load > 0) {

4235

s64 this_eff_load, prev_eff_load;

4236

s64 this_eff_load, prev_eff_load;

4236

4237

this_eff_load = 100;

4238

this_eff_load = 100;

4238

this_eff_load *= power_of(prev_cpu);

4239

this_eff_load *= power_of(prev_cpu);

4239

this_eff_load *= this_load +

4240

this_eff_load *= this_load +

4240

effective_load(tg, this_cpu, weight, weight);

4241

effective_load(tg, this_cpu, weight, weight);

4241

4242

prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;

4243

prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;

4243

prev_eff_load *= power_of(this_cpu);

4244

prev_eff_load *= power_of(this_cpu);

4244

prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

4245

prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

4245

4246

balanced = this_eff_load <= prev_eff_load;

4247

balanced = this_eff_load <= prev_eff_load;

4247

} else

4248

} else

4248

balanced = true;

4249

balanced = true;

4249

4250

/*

4251

/*

4251

* If the currently running task will sleep within

4252

* If the currently running task will sleep within

4252

* a reasonable amount of time then attract this newly

4253

* a reasonable amount of time then attract this newly

4253

* woken task:

4254

* woken task:

4254

*/

4255

*/

4255

if (sync && balanced)

4256

if (sync && balanced)

4256

return 1;

4257

return 1;

4257

4258

schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

4259

schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

4259

tl_per_task = cpu_avg_load_per_task(this_cpu);

4260

tl_per_task = cpu_avg_load_per_task(this_cpu);

4260

4261

if (balanced ||

4262

if (balanced ||

4262

(this_load <= load &&

4263

(this_load <= load &&

4263

this_load + target_load(prev_cpu, idx) <= tl_per_task)) {

4264

this_load + target_load(prev_cpu, idx) <= tl_per_task)) {

4264

/*

4265

/*

4265

* This domain has SD_WAKE_AFFINE and

4266

* This domain has SD_WAKE_AFFINE and

4266

* p is cache cold in this domain, and

4267

* p is cache cold in this domain, and

4267

* there is no bad imbalance.

4268

* there is no bad imbalance.

4268

*/

4269

*/

4269

schedstat_inc(sd, ttwu_move_affine);

4270

schedstat_inc(sd, ttwu_move_affine);

4270

schedstat_inc(p, se.statistics.nr_wakeups_affine);

4271

schedstat_inc(p, se.statistics.nr_wakeups_affine);

4271

4272

return 1;

4273

return 1;

4273

}

4274

}

4274

return 0;

4275

return 0;

4275

}

4276

}

4276

4277

/*

4278

/*

4278

* find_idlest_group finds and returns the least busy CPU group within the

4279

* find_idlest_group finds and returns the least busy CPU group within the

4279

* domain.

4280

* domain.

4280

*/

4281

*/

4281

static struct sched_group *

4282

static struct sched_group *

4282

find_idlest_group(struct sched_domain *sd, struct task_struct *p,

4283

find_idlest_group(struct sched_domain *sd, struct task_struct *p,

4283

int this_cpu, int sd_flag)

4284

int this_cpu, int sd_flag)

4284

{

4285

{

4285

struct sched_group *idlest = NULL, *group = sd->groups;

4286

struct sched_group *idlest = NULL, *group = sd->groups;

4286

unsigned long min_load = ULONG_MAX, this_load = 0;

4287

unsigned long min_load = ULONG_MAX, this_load = 0;

4287

int load_idx = sd->forkexec_idx;

4288

int load_idx = sd->forkexec_idx;

4288

int imbalance = 100 + (sd->imbalance_pct-100)/2;

4289

int imbalance = 100 + (sd->imbalance_pct-100)/2;

4289

4290

if (sd_flag & SD_BALANCE_WAKE)

4291

if (sd_flag & SD_BALANCE_WAKE)

4291

load_idx = sd->wake_idx;

4292

load_idx = sd->wake_idx;

4292

4293

do {

4294

do {

4294

unsigned long load, avg_load;

4295

unsigned long load, avg_load;

4295

int local_group;

4296

int local_group;

4296

int i;

4297

int i;

4297

4298

/* Skip over this group if it has no CPUs allowed */

4299

/* Skip over this group if it has no CPUs allowed */

4299

if (!cpumask_intersects(sched_group_cpus(group),

4300

if (!cpumask_intersects(sched_group_cpus(group),

4300

tsk_cpus_allowed(p)))

4301

tsk_cpus_allowed(p)))

4301

continue;

4302

continue;

4302

4303

local_group = cpumask_test_cpu(this_cpu,

4304

local_group = cpumask_test_cpu(this_cpu,

4304

sched_group_cpus(group));

4305

sched_group_cpus(group));

4305

4306

/* Tally up the load of all CPUs in the group */

4307

/* Tally up the load of all CPUs in the group */

4307

avg_load = 0;

4308

avg_load = 0;

4308

4309

for_each_cpu(i, sched_group_cpus(group)) {

4310

for_each_cpu(i, sched_group_cpus(group)) {

4310

/* Bias balancing toward cpus of our domain */

4311

/* Bias balancing toward cpus of our domain */

4311

if (local_group)

4312

if (local_group)

4312

load = source_load(i, load_idx);

4313

load = source_load(i, load_idx);

4313

else

4314

else

4314

load = target_load(i, load_idx);

4315

load = target_load(i, load_idx);

4315

4316

avg_load += load;

4317

avg_load += load;

4317

}

4318

}

4318

4319

/* Adjust by relative CPU power of the group */

4320

/* Adjust by relative CPU power of the group */

4320

avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;

4321

avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;

4321

4322

if (local_group) {

4323

if (local_group) {

4323

this_load = avg_load;

4324

this_load = avg_load;

4324

} else if (avg_load < min_load) {

4325

} else if (avg_load < min_load) {

4325

min_load = avg_load;

4326

min_load = avg_load;

4326

idlest = group;

4327

idlest = group;

4327

}

4328

}

4328

} while (group = group->next, group != sd->groups);

4329

} while (group = group->next, group != sd->groups);

4329

4330

if (!idlest || 100*this_load < imbalance*min_load)

4331

if (!idlest || 100*this_load < imbalance*min_load)

4331

return NULL;

4332

return NULL;

4332

return idlest;

4333

return idlest;

4333

}

4334

}

4334

4335

/*

4336

/*

4336

* find_idlest_cpu - find the idlest cpu among the cpus in group.

4337

* find_idlest_cpu - find the idlest cpu among the cpus in group.

4337

*/

4338

*/

4338

static int

4339

static int

4339

find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)

4340

find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)

4340

{

4341

{

4341

unsigned long load, min_load = ULONG_MAX;

4342

unsigned long load, min_load = ULONG_MAX;

4342

int idlest = -1;

4343

int idlest = -1;

4343

int i;

4344

int i;

4344

4345

/* Traverse only the allowed CPUs */

4346

/* Traverse only the allowed CPUs */

4346

for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {

4347

for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {

4347

load = weighted_cpuload(i);

4348

load = weighted_cpuload(i);

4348

4349

if (load < min_load || (load == min_load && i == this_cpu)) {

4350

if (load < min_load || (load == min_load && i == this_cpu)) {

4350

min_load = load;

4351

min_load = load;

4351

idlest = i;

4352

idlest = i;

4352

}

4353

}

4353

}

4354

}

4354

4355

return idlest;

4356

return idlest;

4356

}

4357

}

4357

4358

/*

4359

/*

4359

* Try and locate an idle CPU in the sched_domain.

4360

* Try and locate an idle CPU in the sched_domain.

4360

*/

4361

*/

4361

static int select_idle_sibling(struct task_struct *p, int target)

4362

static int select_idle_sibling(struct task_struct *p, int target)

4362

{

4363

{

4363

struct sched_domain *sd;

4364

struct sched_domain *sd;

4364

struct sched_group *sg;

4365

struct sched_group *sg;

4365

int i = task_cpu(p);

4366

int i = task_cpu(p);

4366

4367

if (idle_cpu(target))

4368

if (idle_cpu(target))

4368

return target;

4369

return target;

4369

4370

/*

4371

/*

4371

* If the prevous cpu is cache affine and idle, don't be stupid.

4372

* If the prevous cpu is cache affine and idle, don't be stupid.

4372

*/

4373

*/

4373

if (i != target && cpus_share_cache(i, target) && idle_cpu(i))

4374

if (i != target && cpus_share_cache(i, target) && idle_cpu(i))

4374

return i;

4375

return i;

4375

4376

/*

4377

/*

4377

* Otherwise, iterate the domains and find an elegible idle cpu.

4378

* Otherwise, iterate the domains and find an elegible idle cpu.

4378

*/

4379

*/

4379

sd = rcu_dereference(per_cpu(sd_llc, target));

4380

sd = rcu_dereference(per_cpu(sd_llc, target));

4380

for_each_lower_domain(sd) {

4381

for_each_lower_domain(sd) {

4381

sg = sd->groups;

4382

sg = sd->groups;

4382

do {

4383

do {

4383

if (!cpumask_intersects(sched_group_cpus(sg),

4384

if (!cpumask_intersects(sched_group_cpus(sg),

4384

tsk_cpus_allowed(p)))

4385

tsk_cpus_allowed(p)))

4385

goto next;

4386

goto next;

4386

4387

for_each_cpu(i, sched_group_cpus(sg)) {

4388

for_each_cpu(i, sched_group_cpus(sg)) {

4388

if (i == target || !idle_cpu(i))

4389

if (i == target || !idle_cpu(i))

4389

goto next;

4390

goto next;

4390

}

4391

}

4391

4392

target = cpumask_first_and(sched_group_cpus(sg),

4393

target = cpumask_first_and(sched_group_cpus(sg),

4393

tsk_cpus_allowed(p));

4394

tsk_cpus_allowed(p));

4394

goto done;

4395

goto done;

4395

sg = sg->next;

4397

sg = sg->next;

4397

} while (sg != sd->groups);

4398

} while (sg != sd->groups);

4398

}

4399

}

4399

done:

4400

done:

4400

return target;

4401

return target;

4401

}

4402

}

4402

4403

/*

4404

/*

4404

* select_task_rq_fair: Select target runqueue for the waking task in domains

4405

* select_task_rq_fair: Select target runqueue for the waking task in domains

4405

* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,

4406

* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,

4406

* SD_BALANCE_FORK, or SD_BALANCE_EXEC.

4407

* SD_BALANCE_FORK, or SD_BALANCE_EXEC.

4407

*

4408

*

4408

* Balances load by selecting the idlest cpu in the idlest group, or under

4409

* Balances load by selecting the idlest cpu in the idlest group, or under

4409

* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.

4410

* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.

4410

*

4411

*

4411

* Returns the target cpu number.

4412

* Returns the target cpu number.

4412

*

4413

*

4413

* preempt must be disabled.

4414

* preempt must be disabled.

4414

*/

4415

*/

4415

static int

4416

static int

4416

select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)

4417

select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)

4417

{

4418

{

4418

struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;

4419

struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;

4419

int cpu = smp_processor_id();

4420

int cpu = smp_processor_id();

4420

int new_cpu = cpu;

4421

int new_cpu = cpu;

4421

int want_affine = 0;

4422

int want_affine = 0;

4422

int sync = wake_flags & WF_SYNC;

4423

int sync = wake_flags & WF_SYNC;

4423

4424

if (p->nr_cpus_allowed == 1)

4425

if (p->nr_cpus_allowed == 1)

4425

return prev_cpu;

4426

return prev_cpu;

4426

4427

if (sd_flag & SD_BALANCE_WAKE) {

4428

if (sd_flag & SD_BALANCE_WAKE) {

4428

if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))

4429

if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))

4429

want_affine = 1;

4430

want_affine = 1;

4430

new_cpu = prev_cpu;

4431

new_cpu = prev_cpu;

4431

}

4432

}

4432

4433

rcu_read_lock();

4434

rcu_read_lock();

4434

for_each_domain(cpu, tmp) {

4435

for_each_domain(cpu, tmp) {

4435

if (!(tmp->flags & SD_LOAD_BALANCE))

4436

if (!(tmp->flags & SD_LOAD_BALANCE))

4436

continue;

4437

continue;

4437

4438

/*

4439

/*

4439

* If both cpu and prev_cpu are part of this domain,

4440

* If both cpu and prev_cpu are part of this domain,

4440

* cpu is a valid SD_WAKE_AFFINE target.

4441

* cpu is a valid SD_WAKE_AFFINE target.

4441

*/

4442

*/

4442

if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&

4443

if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&

4443

cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {

4444

cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {

4444

affine_sd = tmp;

4445

affine_sd = tmp;

4445

break;

4446

break;

4446

}

4447

}

4447

4448

if (tmp->flags & sd_flag)

4449

if (tmp->flags & sd_flag)

4449

sd = tmp;

4450

sd = tmp;

4450

}

4451

}

4451

4452

if (affine_sd) {

4453

if (affine_sd) {

4453

if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))

4454

if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))

4454

prev_cpu = cpu;

4455

prev_cpu = cpu;

4455

4456

new_cpu = select_idle_sibling(p, prev_cpu);

4457

new_cpu = select_idle_sibling(p, prev_cpu);

4457

goto unlock;

4458

goto unlock;

4458

}

4459

}

4459

4460

while (sd) {

4461

while (sd) {

4461

struct sched_group *group;

4462

struct sched_group *group;

4462

int weight;

4463

int weight;

4463

4464

if (!(sd->flags & sd_flag)) {

4465

if (!(sd->flags & sd_flag)) {

4465

sd = sd->child;

4466

sd = sd->child;

4466

continue;

4467

continue;

4467

}

4468

}

4468

4469

group = find_idlest_group(sd, p, cpu, sd_flag);

4470

group = find_idlest_group(sd, p, cpu, sd_flag);

4470

if (!group) {

4471

if (!group) {

4471

sd = sd->child;

4472

sd = sd->child;

4472

continue;

4473

continue;

4473

}

4474

}

4474

4475

new_cpu = find_idlest_cpu(group, p, cpu);

4476

new_cpu = find_idlest_cpu(group, p, cpu);

4476

if (new_cpu == -1 || new_cpu == cpu) {

4477

if (new_cpu == -1 || new_cpu == cpu) {

4477

/* Now try balancing at a lower domain level of cpu */

4478

/* Now try balancing at a lower domain level of cpu */

4478

sd = sd->child;

4479

sd = sd->child;

4479

continue;

4480

continue;

4480

}

4481

}

4481

4482

/* Now try balancing at a lower domain level of new_cpu */

4483

/* Now try balancing at a lower domain level of new_cpu */

4483

cpu = new_cpu;

4484

cpu = new_cpu;

4484

weight = sd->span_weight;

4485

weight = sd->span_weight;

4485

sd = NULL;

4486

sd = NULL;

4486

for_each_domain(cpu, tmp) {

4487

for_each_domain(cpu, tmp) {

4487

if (weight <= tmp->span_weight)

4488

if (weight <= tmp->span_weight)

4488

break;

4489

break;

4489

if (tmp->flags & sd_flag)

4490

if (tmp->flags & sd_flag)

4490

sd = tmp;

4491

sd = tmp;

4491

}

4492

}

4492

/* while loop will break here if sd == NULL */

4493

/* while loop will break here if sd == NULL */

4493

}

4494

}

4494

unlock:

4495

unlock:

4495

rcu_read_unlock();

4496

rcu_read_unlock();

4496

4497

return new_cpu;

4498

return new_cpu;

4498

}

4499

}

4499

4500

/*

4501

/*

4501

* Called immediately before a task is migrated to a new cpu; task_cpu(p) and

4502

* Called immediately before a task is migrated to a new cpu; task_cpu(p) and

4502

* cfs_rq_of(p) references at time of call are still valid and identify the

4503

* cfs_rq_of(p) references at time of call are still valid and identify the

4503

* previous cpu. However, the caller only guarantees p->pi_lock is held; no

4504

* previous cpu. However, the caller only guarantees p->pi_lock is held; no

4504

* other assumptions, including the state of rq->lock, should be made.

4505

* other assumptions, including the state of rq->lock, should be made.

4505

*/

4506

*/

4506

static void

4507

static void

4507

migrate_task_rq_fair(struct task_struct *p, int next_cpu)

4508

migrate_task_rq_fair(struct task_struct *p, int next_cpu)

4508

{

4509

{

4509

struct sched_entity *se = &p->se;

4510

struct sched_entity *se = &p->se;

4510

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4511

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4511

4512

/*

4513

/*

4513

* Load tracking: accumulate removed load so that it can be processed

4514

* Load tracking: accumulate removed load so that it can be processed

4514

* when we next update owning cfs_rq under rq->lock. Tasks contribute

4515

* when we next update owning cfs_rq under rq->lock. Tasks contribute

4515

* to blocked load iff they have a positive decay-count. It can never

4516

* to blocked load iff they have a positive decay-count. It can never

4516

* be negative here since on-rq tasks have decay-count == 0.

4517

* be negative here since on-rq tasks have decay-count == 0.

4517

*/

4518

*/

4518

if (se->avg.decay_count) {

4519

if (se->avg.decay_count) {

4519

se->avg.decay_count = -__synchronize_entity_decay(se);

4520

se->avg.decay_count = -__synchronize_entity_decay(se);

4520

atomic_long_add(se->avg.load_avg_contrib,

4521

atomic_long_add(se->avg.load_avg_contrib,

4521

&cfs_rq->removed_load);

4522

&cfs_rq->removed_load);

4522

}

4523

}

4523

}

4524

}

4524

#endif /* CONFIG_SMP */

4525

#endif /* CONFIG_SMP */

4525

4526

static unsigned long

4527

static unsigned long

4527

wakeup_gran(struct sched_entity *curr, struct sched_entity *se)

4528

wakeup_gran(struct sched_entity *curr, struct sched_entity *se)

4528

{

4529

{

4529

unsigned long gran = sysctl_sched_wakeup_granularity;

4530

unsigned long gran = sysctl_sched_wakeup_granularity;

4530

4531

/*

4532

/*

4532

* Since its curr running now, convert the gran from real-time

4533

* Since its curr running now, convert the gran from real-time

4533

* to virtual-time in his units.

4534

* to virtual-time in his units.

4534

*

4535

*

4535

* By using 'se' instead of 'curr' we penalize light tasks, so

4536

* By using 'se' instead of 'curr' we penalize light tasks, so

4536

* they get preempted easier. That is, if 'se' < 'curr' then

4537

* they get preempted easier. That is, if 'se' < 'curr' then

4537

* the resulting gran will be larger, therefore penalizing the

4538

* the resulting gran will be larger, therefore penalizing the

4538

* lighter, if otoh 'se' > 'curr' then the resulting gran will

4539

* lighter, if otoh 'se' > 'curr' then the resulting gran will

4539

* be smaller, again penalizing the lighter task.

4540

* be smaller, again penalizing the lighter task.

4540

*

4541

*

4541

* This is especially important for buddies when the leftmost

4542

* This is especially important for buddies when the leftmost

4542

* task is higher priority than the buddy.

4543

* task is higher priority than the buddy.

4543

*/

4544

*/

4544

return calc_delta_fair(gran, se);

4545

return calc_delta_fair(gran, se);

4545

}

4546

}

4546

4547

/*

4548

/*

4548

* Should 'se' preempt 'curr'.

4549

* Should 'se' preempt 'curr'.

4549

*

4550

*

4550

* |s1

4551

* |s1

4551

* |s2

4552

* |s2

4552

* |s3

4553

* |s3

4553

* g

4554

* g

4554

* |<--->|c

4555

* |<--->|c

4555

*

4556

*

4556

* w(c, s1) = -1

4557

* w(c, s1) = -1

4557

* w(c, s2) = 0

4558

* w(c, s2) = 0

4558

* w(c, s3) = 1

4559

* w(c, s3) = 1

4559

*

4560

*

4560

*/

4561

*/

4561

static int

4562

static int

4562

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)

4563

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)

4563

{

4564

{

4564

s64 gran, vdiff = curr->vruntime - se->vruntime;

4565

s64 gran, vdiff = curr->vruntime - se->vruntime;

4565

4566

if (vdiff <= 0)

4567

if (vdiff <= 0)

4567

return -1;

4568

return -1;

4568

4569

gran = wakeup_gran(curr, se);

4570

gran = wakeup_gran(curr, se);

4570

if (vdiff > gran)

4571

if (vdiff > gran)

4571

return 1;

4572

return 1;

4572

4573

return 0;

4574

return 0;

4574

}

4575

}

4575

4576

static void set_last_buddy(struct sched_entity *se)

4577

static void set_last_buddy(struct sched_entity *se)

4577

{

4578

{

4578

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4579

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4579

return;

4580

return;

4580

4581

for_each_sched_entity(se)

4582

for_each_sched_entity(se)

4582

cfs_rq_of(se)->last = se;

4583

cfs_rq_of(se)->last = se;

4583

}

4584

}

4584

4585

static void set_next_buddy(struct sched_entity *se)

4586

static void set_next_buddy(struct sched_entity *se)

4586

{

4587

{

4587

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4588

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4588

return;

4589

return;

4589

4590

for_each_sched_entity(se)

4591

for_each_sched_entity(se)

4591

cfs_rq_of(se)->next = se;

4592

cfs_rq_of(se)->next = se;

4592

}

4593

}

4593

4594

static void set_skip_buddy(struct sched_entity *se)

4595

static void set_skip_buddy(struct sched_entity *se)

4595

{

4596

{

4596

for_each_sched_entity(se)

4597

for_each_sched_entity(se)

4597

cfs_rq_of(se)->skip = se;

4598

cfs_rq_of(se)->skip = se;

4598

}

4599

}

4599

4600

/*

4601

/*

4601

* Preempt the current task with a newly woken task if needed:

4602

* Preempt the current task with a newly woken task if needed:

4602

*/

4603

*/

4603

static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

4604

static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

4604

{

4605

{

4605

struct task_struct *curr = rq->curr;

4606

struct task_struct *curr = rq->curr;

4606

struct sched_entity *se = &curr->se, *pse = &p->se;

4607

struct sched_entity *se = &curr->se, *pse = &p->se;

4607

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

4608

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

4608

int scale = cfs_rq->nr_running >= sched_nr_latency;

4609

int scale = cfs_rq->nr_running >= sched_nr_latency;

4609

int next_buddy_marked = 0;

4610

int next_buddy_marked = 0;

4610

4611

if (unlikely(se == pse))

4612

if (unlikely(se == pse))

4612

return;

4613

return;

4613

4614

/*

4615

/*

4615

* This is possible from callers such as move_task(), in which we

4616

* This is possible from callers such as move_task(), in which we

4616

* unconditionally check_prempt_curr() after an enqueue (which may have

4617

* unconditionally check_prempt_curr() after an enqueue (which may have

4617

* lead to a throttle). This both saves work and prevents false

4618

* lead to a throttle). This both saves work and prevents false

4618

* next-buddy nomination below.

4619

* next-buddy nomination below.

4619

*/

4620

*/

4620

if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))

4621

if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))

4621

return;

4622

return;

4622

4623

if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {

4624

if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {

4624

set_next_buddy(pse);

4625

set_next_buddy(pse);

4625

next_buddy_marked = 1;

4626

next_buddy_marked = 1;

4626

}

4627

}

4627

4628

/*

4629

/*

4629

* We can come here with TIF_NEED_RESCHED already set from new task

4630

* We can come here with TIF_NEED_RESCHED already set from new task

4630

* wake up path.

4631

* wake up path.

4631

*

4632

*

4632

* Note: this also catches the edge-case of curr being in a throttled

4633

* Note: this also catches the edge-case of curr being in a throttled

4633

* group (e.g. via set_curr_task), since update_curr() (in the

4634

* group (e.g. via set_curr_task), since update_curr() (in the

4634

* enqueue of curr) will have resulted in resched being set. This

4635

* enqueue of curr) will have resulted in resched being set. This

4635

* prevents us from potentially nominating it as a false LAST_BUDDY

4636

* prevents us from potentially nominating it as a false LAST_BUDDY

4636

* below.

4637

* below.

4637

*/

4638

*/

4638

if (test_tsk_need_resched(curr))

4639

if (test_tsk_need_resched(curr))

4639

return;

4640

return;

4640

4641

/* Idle tasks are by definition preempted by non-idle tasks. */

4642

/* Idle tasks are by definition preempted by non-idle tasks. */

4642

if (unlikely(curr->policy == SCHED_IDLE) &&

4643

if (unlikely(curr->policy == SCHED_IDLE) &&

4643

likely(p->policy != SCHED_IDLE))

4644

likely(p->policy != SCHED_IDLE))

4644

goto preempt;

4645

goto preempt;

4645

4646

/*

4647

/*

4647

* Batch and idle tasks do not preempt non-idle tasks (their preemption

4648

* Batch and idle tasks do not preempt non-idle tasks (their preemption

4648

* is driven by the tick):

4649

* is driven by the tick):

4649

*/

4650

*/

4650

if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))

4651

if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))

4651

return;

4652

return;

4652

4653

find_matching_se(&se, &pse);

4654

find_matching_se(&se, &pse);

4654

update_curr(cfs_rq_of(se));

4655

update_curr(cfs_rq_of(se));

4655

BUG_ON(!pse);

4656

BUG_ON(!pse);

4656

if (wakeup_preempt_entity(se, pse) == 1) {

4657

if (wakeup_preempt_entity(se, pse) == 1) {

4657

/*

4658

/*

4658

* Bias pick_next to pick the sched entity that is

4659

* Bias pick_next to pick the sched entity that is

4659

* triggering this preemption.

4660

* triggering this preemption.

4660

*/

4661

*/

4661

if (!next_buddy_marked)

4662

if (!next_buddy_marked)

4662

set_next_buddy(pse);

4663

set_next_buddy(pse);

4663

goto preempt;

4664

goto preempt;

4664

}

4665

}

4665

4666

return;

4667

return;

4667

4668

preempt:

4669

preempt:

4669

resched_task(curr);

4670

resched_task(curr);

4670

/*

4671

/*

4671

* Only set the backward buddy when the current task is still

4672

* Only set the backward buddy when the current task is still

4672

* on the rq. This can happen when a wakeup gets interleaved

4673

* on the rq. This can happen when a wakeup gets interleaved

4673

* with schedule on the ->pre_schedule() or idle_balance()

4674

* with schedule on the ->pre_schedule() or idle_balance()

4674

* point, either of which can * drop the rq lock.

4675

* point, either of which can * drop the rq lock.

4675

*

4676

*

4676

* Also, during early boot the idle thread is in the fair class,

4677

* Also, during early boot the idle thread is in the fair class,

4677

* for obvious reasons its a bad idea to schedule back to it.

4678

* for obvious reasons its a bad idea to schedule back to it.

4678

*/

4679

*/

4679

if (unlikely(!se->on_rq || curr == rq->idle))

4680

if (unlikely(!se->on_rq || curr == rq->idle))

4680

return;

4681

return;

4681

4682

if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))

4683

if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))

4683

set_last_buddy(se);

4684

set_last_buddy(se);

4684

}

4685

}

4685

4686

static struct task_struct *

4687

static struct task_struct *

4687

pick_next_task_fair(struct rq *rq, struct task_struct *prev)

4688

pick_next_task_fair(struct rq *rq, struct task_struct *prev)

4688

{

4689

{

4689

struct cfs_rq *cfs_rq = &rq->cfs;

4690

struct cfs_rq *cfs_rq = &rq->cfs;

4690

struct sched_entity *se;

4691

struct sched_entity *se;

4691

struct task_struct *p;

4692

struct task_struct *p;

4692

int new_tasks;

4693

int new_tasks;

4693

4694

again:

4695

again:

4695

#ifdef CONFIG_FAIR_GROUP_SCHED

4696

#ifdef CONFIG_FAIR_GROUP_SCHED

4696

if (!cfs_rq->nr_running)

4697

if (!cfs_rq->nr_running)

4697

goto idle;

4698

goto idle;

4698

4699

if (prev->sched_class != &fair_sched_class)

4700

if (prev->sched_class != &fair_sched_class)

4700

goto simple;

4701

goto simple;

4701

4702

/*

4703

/*

4703

* Because of the set_next_buddy() in dequeue_task_fair() it is rather

4704

* Because of the set_next_buddy() in dequeue_task_fair() it is rather

4704

* likely that a next task is from the same cgroup as the current.

4705

* likely that a next task is from the same cgroup as the current.

4705

*

4706

*

4706

* Therefore attempt to avoid putting and setting the entire cgroup

4707

* Therefore attempt to avoid putting and setting the entire cgroup

4707

* hierarchy, only change the part that actually changes.

4708

* hierarchy, only change the part that actually changes.

4708

*/

4709

*/

4709

4710

do {

4711

do {

4711

struct sched_entity *curr = cfs_rq->curr;

4712

struct sched_entity *curr = cfs_rq->curr;

4712

4713

/*

4714

/*

4714

* Since we got here without doing put_prev_entity() we also

4715

* Since we got here without doing put_prev_entity() we also

4715

* have to consider cfs_rq->curr. If it is still a runnable

4716

* have to consider cfs_rq->curr. If it is still a runnable

4716

* entity, update_curr() will update its vruntime, otherwise

4717

* entity, update_curr() will update its vruntime, otherwise

4717

* forget we've ever seen it.

4718

* forget we've ever seen it.

4718

*/

4719

*/

4719

if (curr && curr->on_rq)

4720

if (curr && curr->on_rq)

4720

update_curr(cfs_rq);

4721

update_curr(cfs_rq);

4721

else

4722

else

4722

curr = NULL;

4723

curr = NULL;

4723

4724

/*

4725

/*

4725

* This call to check_cfs_rq_runtime() will do the throttle and

4726

* This call to check_cfs_rq_runtime() will do the throttle and

4726

* dequeue its entity in the parent(s). Therefore the 'simple'

4727

* dequeue its entity in the parent(s). Therefore the 'simple'

4727

* nr_running test will indeed be correct.

4728

* nr_running test will indeed be correct.

4728

*/

4729

*/

4729

if (unlikely(check_cfs_rq_runtime(cfs_rq)))

4730

if (unlikely(check_cfs_rq_runtime(cfs_rq)))

4730

goto simple;

4731

goto simple;

4731

4732

se = pick_next_entity(cfs_rq, curr);

4733

se = pick_next_entity(cfs_rq, curr);

4733

cfs_rq = group_cfs_rq(se);

4734

cfs_rq = group_cfs_rq(se);

4734

} while (cfs_rq);

4735

} while (cfs_rq);

4735

4736

p = task_of(se);

4737

p = task_of(se);

4737

4738

/*

4739

/*

4739

* Since we haven't yet done put_prev_entity and if the selected task

4740

* Since we haven't yet done put_prev_entity and if the selected task

4740

* is a different task than we started out with, try and touch the

4741

* is a different task than we started out with, try and touch the

4741

* least amount of cfs_rqs.

4742

* least amount of cfs_rqs.

4742

*/

4743

*/

4743

if (prev != p) {

4744

if (prev != p) {

4744

struct sched_entity *pse = &prev->se;

4745

struct sched_entity *pse = &prev->se;

4745

4746

while (!(cfs_rq = is_same_group(se, pse))) {

4747

while (!(cfs_rq = is_same_group(se, pse))) {

4747

int se_depth = se->depth;

4748

int se_depth = se->depth;

4748

int pse_depth = pse->depth;

4749

int pse_depth = pse->depth;

4749

4750

if (se_depth <= pse_depth) {

4751

if (se_depth <= pse_depth) {

4751

put_prev_entity(cfs_rq_of(pse), pse);

4752

put_prev_entity(cfs_rq_of(pse), pse);

4752

pse = parent_entity(pse);

4753

pse = parent_entity(pse);

4753

}

4754

}

4754

if (se_depth >= pse_depth) {

4755

if (se_depth >= pse_depth) {

4755

set_next_entity(cfs_rq_of(se), se);

4756

set_next_entity(cfs_rq_of(se), se);

4756

se = parent_entity(se);

4757

se = parent_entity(se);

4757

}

4758

}

4758

}

4759

}

4759

4760

put_prev_entity(cfs_rq, pse);

4761

put_prev_entity(cfs_rq, pse);

4761

set_next_entity(cfs_rq, se);

4762

set_next_entity(cfs_rq, se);

4762

}

4763

}

4763

4764

if (hrtick_enabled(rq))

4765

if (hrtick_enabled(rq))

4765

hrtick_start_fair(rq, p);

4766

hrtick_start_fair(rq, p);

4766

4767

return p;

4768

return p;

4768

simple:

4769

simple:

4769

cfs_rq = &rq->cfs;

4770

cfs_rq = &rq->cfs;

4770

#endif

4771

#endif

4771

4772

if (!cfs_rq->nr_running)

4773

if (!cfs_rq->nr_running)

4773

goto idle;

4774

goto idle;

4774

4775

put_prev_task(rq, prev);

4776

put_prev_task(rq, prev);

4776

4777

do {

4778

do {

4778

se = pick_next_entity(cfs_rq, NULL);

4779

se = pick_next_entity(cfs_rq, NULL);

4779

set_next_entity(cfs_rq, se);

4780

set_next_entity(cfs_rq, se);

4780

cfs_rq = group_cfs_rq(se);

4781

cfs_rq = group_cfs_rq(se);

4781

} while (cfs_rq);

4782

} while (cfs_rq);

4782

4783

p = task_of(se);

4784

p = task_of(se);

4784

4785

if (hrtick_enabled(rq))

4786

if (hrtick_enabled(rq))

4786

hrtick_start_fair(rq, p);

4787

hrtick_start_fair(rq, p);

4787

4788

return p;

4789

return p;

4789

4790

idle:

4791

idle:

4791

new_tasks = idle_balance(rq);

4792

new_tasks = idle_balance(rq);

4792

/*

4793

/*

4793

* Because idle_balance() releases (and re-acquires) rq->lock, it is

4794

* Because idle_balance() releases (and re-acquires) rq->lock, it is

4794

* possible for any higher priority task to appear. In that case we

4795

* possible for any higher priority task to appear. In that case we

4795

* must re-start the pick_next_entity() loop.

4796

* must re-start the pick_next_entity() loop.

4796

*/

4797

*/

4797

if (new_tasks < 0)

4798

if (new_tasks < 0)

4798

return RETRY_TASK;

4799

return RETRY_TASK;

4799

4800

if (new_tasks > 0)

4801

if (new_tasks > 0)

4801

goto again;

4802

goto again;

4802

4803

return NULL;

4804

return NULL;

4804

}

4805

}

4805

4806

/*

4807

/*

4807

* Account for a descheduled task:

4808

* Account for a descheduled task:

4808

*/

4809

*/

4809

static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)

4810

static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)

4810

{

4811

{

4811

struct sched_entity *se = &prev->se;

4812

struct sched_entity *se = &prev->se;

4812

struct cfs_rq *cfs_rq;

4813

struct cfs_rq *cfs_rq;

4813

4814

for_each_sched_entity(se) {

4815

for_each_sched_entity(se) {

4815

cfs_rq = cfs_rq_of(se);

4816

cfs_rq = cfs_rq_of(se);

4816

put_prev_entity(cfs_rq, se);

4817

put_prev_entity(cfs_rq, se);

4817

}

4818

}

4818

}

4819

}

4819

4820

/*

4821

/*

4821

* sched_yield() is very simple

4822

* sched_yield() is very simple

4822

*

4823

*

4823

* The magic of dealing with the ->skip buddy is in pick_next_entity.

4824

* The magic of dealing with the ->skip buddy is in pick_next_entity.

4824

*/

4825

*/

4825

static void yield_task_fair(struct rq *rq)

4826

static void yield_task_fair(struct rq *rq)

4826

{

4827

{

4827

struct task_struct *curr = rq->curr;

4828

struct task_struct *curr = rq->curr;

4828

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

4829

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

4829

struct sched_entity *se = &curr->se;

4830

struct sched_entity *se = &curr->se;

4830

4831

/*

4832

/*

4832

* Are we the only task in the tree?

4833

* Are we the only task in the tree?

4833

*/

4834

*/

4834

if (unlikely(rq->nr_running == 1))

4835

if (unlikely(rq->nr_running == 1))

4835

return;

4836

return;

4836

4837

clear_buddies(cfs_rq, se);

4838

clear_buddies(cfs_rq, se);

4838

4839

if (curr->policy != SCHED_BATCH) {

4840

if (curr->policy != SCHED_BATCH) {

4840

update_rq_clock(rq);

4841

update_rq_clock(rq);

4841

/*

4842

/*

4842

* Update run-time statistics of the 'current'.

4843

* Update run-time statistics of the 'current'.

4843

*/

4844

*/

4844

update_curr(cfs_rq);

4845

update_curr(cfs_rq);

4845

/*

4846

/*

4846

* Tell update_rq_clock() that we've just updated,

4847

* Tell update_rq_clock() that we've just updated,

4847

* so we don't do microscopic update in schedule()

4848

* so we don't do microscopic update in schedule()

4848

* and double the fastpath cost.

4849

* and double the fastpath cost.

4849

*/

4850

*/

4850

rq->skip_clock_update = 1;

4851

rq->skip_clock_update = 1;

4851

}

4852

}

4852

4853

set_skip_buddy(se);

4854

set_skip_buddy(se);

4854

}

4855

}

4855

4856

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)

4857

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)

4857

{

4858

{

4858

struct sched_entity *se = &p->se;

4859

struct sched_entity *se = &p->se;

4859

4860

/* throttled hierarchies are not runnable */

4861

/* throttled hierarchies are not runnable */

4861

if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))

4862

if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))

4862

return false;

4863

return false;

4863

4864

/* Tell the scheduler that we'd really like pse to run next. */

4865

/* Tell the scheduler that we'd really like pse to run next. */

4865

set_next_buddy(se);

4866

set_next_buddy(se);

4866

4867

yield_task_fair(rq);

4868

yield_task_fair(rq);

4868

4869

return true;

4870

return true;

4870

}

4871

}

4871

4872

#ifdef CONFIG_SMP

4873

#ifdef CONFIG_SMP

4873

/**************************************************

4874

/**************************************************

4874

* Fair scheduling class load-balancing methods.

4875

* Fair scheduling class load-balancing methods.

4875

*

4876

*

4876

* BASICS

4877

* BASICS

4877

*

4878

*

4878

* The purpose of load-balancing is to achieve the same basic fairness the

4879

* The purpose of load-balancing is to achieve the same basic fairness the

4879

* per-cpu scheduler provides, namely provide a proportional amount of compute

4880

* per-cpu scheduler provides, namely provide a proportional amount of compute

4880

* time to each task. This is expressed in the following equation:

4881

* time to each task. This is expressed in the following equation:

4881

*

4882

*

4882

* W_i,n/P_i == W_j,n/P_j for all i,j (1)

4883

* W_i,n/P_i == W_j,n/P_j for all i,j (1)

4883

*

4884

*

4884

* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight

4885

* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight

4885

* W_i,0 is defined as:

4886

* W_i,0 is defined as:

4886

*

4887

*

4887

* W_i,0 = \Sum_j w_i,j (2)

4888

* W_i,0 = \Sum_j w_i,j (2)

4888

*

4889

*

4889

* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight

4890

* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight

4890

* is derived from the nice value as per prio_to_weight[].

4891

* is derived from the nice value as per prio_to_weight[].

4891

*

4892

*

4892

* The weight average is an exponential decay average of the instantaneous

4893

* The weight average is an exponential decay average of the instantaneous

4893

* weight:

4894

* weight:

4894

*

4895

*

4895

* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)

4896

* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)

4896

*

4897

*

4897

* P_i is the cpu power (or compute capacity) of cpu i, typically it is the

4898

* P_i is the cpu power (or compute capacity) of cpu i, typically it is the

4898

* fraction of 'recent' time available for SCHED_OTHER task execution. But it

4899

* fraction of 'recent' time available for SCHED_OTHER task execution. But it

4899

* can also include other factors [XXX].

4900

* can also include other factors [XXX].

4900

*

4901

*

4901

* To achieve this balance we define a measure of imbalance which follows

4902

* To achieve this balance we define a measure of imbalance which follows

4902

* directly from (1):

4903

* directly from (1):

4903

*

4904

*

4904

* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)

4905

* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)

4905

*

4906

*

4906

* We then move tasks around to minimize the imbalance. In the continuous

4907

* We then move tasks around to minimize the imbalance. In the continuous

4907

* function space it is obvious this converges, in the discrete case we get

4908

* function space it is obvious this converges, in the discrete case we get

4908

* a few fun cases generally called infeasible weight scenarios.

4909

* a few fun cases generally called infeasible weight scenarios.

4909

*

4910

*

4910

* [XXX expand on:

4911

* [XXX expand on:

4911

* - infeasible weights;

4912

* - infeasible weights;

4912

* - local vs global optima in the discrete case. ]

4913

* - local vs global optima in the discrete case. ]

4913

*

4914

*

4914

*

4915

*

4915

* SCHED DOMAINS

4916

* SCHED DOMAINS

4916

*

4917

*

4917

* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)

4918

* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)

4918

* for all i,j solution, we create a tree of cpus that follows the hardware

4919

* for all i,j solution, we create a tree of cpus that follows the hardware

4919

* topology where each level pairs two lower groups (or better). This results

4920

* topology where each level pairs two lower groups (or better). This results

4920

* in O(log n) layers. Furthermore we reduce the number of cpus going up the

4921

* in O(log n) layers. Furthermore we reduce the number of cpus going up the

4921

* tree to only the first of the previous level and we decrease the frequency

4922

* tree to only the first of the previous level and we decrease the frequency

4922

* of load-balance at each level inv. proportional to the number of cpus in

4923

* of load-balance at each level inv. proportional to the number of cpus in

4923

* the groups.

4924

* the groups.

4924

*

4925

*

4925

* This yields:

4926

* This yields:

4926

*

4927

*

4927

*     log_2 n     1     n

4928

*     log_2 n     1     n

4928

*   \Sum       { --- * --- * 2^i } = O(n)  (5)

4929

*   \Sum       { --- * --- * 2^i } = O(n)  (5)

4929

*     i = 0      2^i   2^i

4930

*     i = 0      2^i   2^i

4930

*                             `- size of each group

4931

*                             `- size of each group

4931

*         |         |     `- number of cpus doing load-balance

4932

*         |         |     `- number of cpus doing load-balance

4932

*         |         `- freq

4933

*         |         `- freq

4933

*         `- sum over all levels

4934

*         `- sum over all levels

4934

*

4935

*

4935

* Coupled with a limit on how many tasks we can migrate every balance pass,

4936

* Coupled with a limit on how many tasks we can migrate every balance pass,

4936

* this makes (5) the runtime complexity of the balancer.

4937

* this makes (5) the runtime complexity of the balancer.

4937

*

4938

*

4938

* An important property here is that each CPU is still (indirectly) connected

4939

* An important property here is that each CPU is still (indirectly) connected

4939

* to every other cpu in at most O(log n) steps:

4940

* to every other cpu in at most O(log n) steps:

4940

*

4941

*

4941

* The adjacency matrix of the resulting graph is given by:

4942

* The adjacency matrix of the resulting graph is given by:

4942

*

4943

*

4943

* log_2 n

4944

* log_2 n

4944

* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)

4945

* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)

4945

* k = 0

4946

* k = 0

4946

*

4947

*

4947

* And you'll find that:

4948

* And you'll find that:

4948

*

4949

*

4949

* A^(log_2 n)_i,j != 0 for all i,j (7)

4950

* A^(log_2 n)_i,j != 0 for all i,j (7)

4950

*

4951

*

4951

* Showing there's indeed a path between every cpu in at most O(log n) steps.

4952

* Showing there's indeed a path between every cpu in at most O(log n) steps.

4952

* The task movement gives a factor of O(m), giving a convergence complexity

4953

* The task movement gives a factor of O(m), giving a convergence complexity

4953

* of:

4954

* of:

4954

*

4955

*

4955

* O(nm log n), n := nr_cpus, m := nr_tasks (8)

4956

* O(nm log n), n := nr_cpus, m := nr_tasks (8)

4956

*

4957

*

4957

*

4958

*

4958

* WORK CONSERVING

4959

* WORK CONSERVING

4959

*

4960

*

4960

* In order to avoid CPUs going idle while there's still work to do, new idle

4961

* In order to avoid CPUs going idle while there's still work to do, new idle

4961

* balancing is more aggressive and has the newly idle cpu iterate up the domain

4962

* balancing is more aggressive and has the newly idle cpu iterate up the domain

4962

* tree itself instead of relying on other CPUs to bring it work.

4963

* tree itself instead of relying on other CPUs to bring it work.

4963

*

4964

*

4964

* This adds some complexity to both (5) and (8) but it reduces the total idle

4965

* This adds some complexity to both (5) and (8) but it reduces the total idle

4965

* time.

4966

* time.

4966

*

4967

*

4967

* [XXX more?]

4968

* [XXX more?]

4968

*

4969

*

4969

*

4970

*

4970

* CGROUPS

4971

* CGROUPS

4971

*

4972

*

4972

* Cgroups make a horror show out of (2), instead of a simple sum we get:

4973

* Cgroups make a horror show out of (2), instead of a simple sum we get:

4973

*

4974

*

4974

*                                s_k,i

4975

*                                s_k,i

4975

*   W_i,0 = \Sum_j \Prod_k w_k * -----  (9)

4976

*   W_i,0 = \Sum_j \Prod_k w_k * -----  (9)

4976

*                                 S_k

4977

*                                 S_k

4977

*

4978

*

4978

* Where

4979

* Where

4979

*

4980

*

4980

* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)

4981

* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)

4981

*

4982

*

4982

* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.

4983

* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.

4983

*

4984

*

4984

* The big problem is S_k, it's a global sum needed to compute a local (W_i)

4985

* The big problem is S_k, it's a global sum needed to compute a local (W_i)

4985

* property.

4986

* property.

4986

*

4987

*

4987

* [XXX write more on how we solve this.. _after_ merging pjt's patches that

4988

* [XXX write more on how we solve this.. _after_ merging pjt's patches that

4988

* rewrite all of this once again.]

4989

* rewrite all of this once again.]

4989

*/

4990

*/

4990

4991

/*
 * NOTE(review): presumably an upper bound on the load-balance interval,
 * expressed in jiffies (HZ / 10) - confirm against its users.
 */
static unsigned long __read_mostly max_load_balance_interval = HZ / 10;

/* Stored in lb_env::fbq_type; its users are outside this excerpt. */
enum fbq_type {
	regular,
	remote,
	all,
};

/* Bits for lb_env::flags. */
/* Cleared by can_migrate_task() once a task that may run on dst_cpu is seen */
#define LBF_ALL_PINNED	0x01
/* NOTE(review): not referenced in this excerpt - see the balancing loop */
#define LBF_NEED_BREAK	0x02
/* Set together with new_dst_cpu when an alternative destination is found */
#define LBF_DST_PINNED	0x04
/* Set when a task's cpus_allowed excludes dst_cpu */
#define LBF_SOME_PINNED	0x08

5000

struct lb_env {

5001

struct lb_env {

5001

struct sched_domain *sd;

5002

struct sched_domain *sd;

5002

5003

struct rq *src_rq;

5004

struct rq *src_rq;

5004

int src_cpu;

5005

int src_cpu;

5005

5006

int dst_cpu;

5007

int dst_cpu;

5007

struct rq *dst_rq;

5008

struct rq *dst_rq;

5008

5009

struct cpumask *dst_grpmask;

5010

struct cpumask *dst_grpmask;

5010

int new_dst_cpu;

5011

int new_dst_cpu;

5011

enum cpu_idle_type idle;

5012

enum cpu_idle_type idle;

5012

long imbalance;

5013

long imbalance;

5013

/* The set of CPUs under consideration for load-balancing */

5014

/* The set of CPUs under consideration for load-balancing */

5014

struct cpumask *cpus;

5015

struct cpumask *cpus;

5015

5016

unsigned int flags;

5017

unsigned int flags;

5017

5018

unsigned int loop;

5019

unsigned int loop;

5019

unsigned int loop_break;

5020

unsigned int loop_break;

5020

unsigned int loop_max;

5021

unsigned int loop_max;

5021

5022

enum fbq_type fbq_type;

5023

enum fbq_type fbq_type;

5023

};

5024

};

5024

5025

/*

5026

/*

5026

* move_task - move a task from one runqueue to another runqueue.

5027

* move_task - move a task from one runqueue to another runqueue.

5027

* Both runqueues must be locked.

5028

* Both runqueues must be locked.

5028

*/

5029

*/

5029

static void move_task(struct task_struct *p, struct lb_env *env)

5030

static void move_task(struct task_struct *p, struct lb_env *env)

5030

{

5031

{

5031

deactivate_task(env->src_rq, p, 0);

5032

deactivate_task(env->src_rq, p, 0);

5032

set_task_cpu(p, env->dst_cpu);

5033

set_task_cpu(p, env->dst_cpu);

5033

activate_task(env->dst_rq, p, 0);

5034

activate_task(env->dst_rq, p, 0);

5034

check_preempt_curr(env->dst_rq, p, 0);

5035

check_preempt_curr(env->dst_rq, p, 0);

5035

}

5036

}

5036

5037

/*

5038

/*

5038

* Is this task likely cache-hot:

5039

* Is this task likely cache-hot:

5039

*/

5040

*/

5040

static int

5041

static int

5041

task_hot(struct task_struct *p, u64 now)

5042

task_hot(struct task_struct *p, u64 now)

5042

{

5043

{

5043

s64 delta;

5044

s64 delta;

5044

5045

if (p->sched_class != &fair_sched_class)

5046

if (p->sched_class != &fair_sched_class)

5046

return 0;

5047

return 0;

5047

5048

if (unlikely(p->policy == SCHED_IDLE))

5049

if (unlikely(p->policy == SCHED_IDLE))

5049

return 0;

5050

return 0;

5050

5051

/*

5052

/*

5052

* Buddy candidates are cache hot:

5053

* Buddy candidates are cache hot:

5053

*/

5054

*/

5054

if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&

5055

if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&

5055

(&p->se == cfs_rq_of(&p->se)->next ||

5056

(&p->se == cfs_rq_of(&p->se)->next ||

5056

&p->se == cfs_rq_of(&p->se)->last))

5057

&p->se == cfs_rq_of(&p->se)->last))

5057

return 1;

5058

return 1;

5058

5059

if (sysctl_sched_migration_cost == -1)

5060

if (sysctl_sched_migration_cost == -1)

5060

return 1;

5061

return 1;

5061

if (sysctl_sched_migration_cost == 0)

5062

if (sysctl_sched_migration_cost == 0)

5062

return 0;

5063

return 0;

5063

5064

delta = now - p->se.exec_start;

5065

delta = now - p->se.exec_start;

5065

5066

return delta < (s64)sysctl_sched_migration_cost;

5067

return delta < (s64)sysctl_sched_migration_cost;

5067

}

5068

}

5068

5069

#ifdef CONFIG_NUMA_BALANCING

5070

#ifdef CONFIG_NUMA_BALANCING

5070

/* Returns true if the destination node has incurred more faults */

5071

/* Returns true if the destination node has incurred more faults */

5071

static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)

5072

static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)

5072

{

5073

{

5073

int src_nid, dst_nid;

5074

int src_nid, dst_nid;

5074

5075

if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||

5076

if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||

5076

!(env->sd->flags & SD_NUMA)) {

5077

!(env->sd->flags & SD_NUMA)) {

5077

return false;

5078

return false;

5078

}

5079

}

5079

5080

src_nid = cpu_to_node(env->src_cpu);

5081

src_nid = cpu_to_node(env->src_cpu);

5081

dst_nid = cpu_to_node(env->dst_cpu);

5082

dst_nid = cpu_to_node(env->dst_cpu);

5082

5083

if (src_nid == dst_nid)

5084

if (src_nid == dst_nid)

5084

return false;

5085

return false;

5085

5086

/* Always encourage migration to the preferred node. */

5087

/* Always encourage migration to the preferred node. */

5087

if (dst_nid == p->numa_preferred_nid)

5088

if (dst_nid == p->numa_preferred_nid)

5088

return true;

5089

return true;

5089

5090

/* If both task and group weight improve, this move is a winner. */

5091

/* If both task and group weight improve, this move is a winner. */

5091

if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&

5092

if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&

5092

group_weight(p, dst_nid) > group_weight(p, src_nid))

5093

group_weight(p, dst_nid) > group_weight(p, src_nid))

5093

return true;

5094

return true;

5094

5095

return false;

5096

return false;

5096

}

5097

}

5097

5098

5099

static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)

5100

static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)

5100

{

5101

{

5101

int src_nid, dst_nid;

5102

int src_nid, dst_nid;

5102

5103

if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))

5104

if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))

5104

return false;

5105

return false;

5105

5106

if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))

5107

if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))

5107

return false;

5108

return false;

5108

5109

src_nid = cpu_to_node(env->src_cpu);

5110

src_nid = cpu_to_node(env->src_cpu);

5110

dst_nid = cpu_to_node(env->dst_cpu);

5111

dst_nid = cpu_to_node(env->dst_cpu);

5111

5112

if (src_nid == dst_nid)

5113

if (src_nid == dst_nid)

5113

return false;

5114

return false;

5114

5115

/* Migrating away from the preferred node is always bad. */

5116

/* Migrating away from the preferred node is always bad. */

5116

if (src_nid == p->numa_preferred_nid)

5117

if (src_nid == p->numa_preferred_nid)

5117

return true;

5118

return true;

5118

5119

/* If either task or group weight get worse, don't do it. */

5120

/* If either task or group weight get worse, don't do it. */

5120

if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||

5121

if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||

5121

group_weight(p, dst_nid) < group_weight(p, src_nid))

5122

group_weight(p, dst_nid) < group_weight(p, src_nid))

5122

return true;

5123

return true;

5123

5124

return false;

5125

return false;

5125

}

5126

}

5126

5127

#else

5128

#else

5128

/* Stub for !CONFIG_NUMA_BALANCING: a migration never improves locality. */
static inline bool
migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
	return false;
}

5134

/* Stub for !CONFIG_NUMA_BALANCING: a migration never degrades locality. */
static inline bool
migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	return false;
}

#endif

5140

#endif

5140

5141

/*

5142

/*

5142

* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?

5143

* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?

5143

*/

5144

*/

5144

static

5145

static

5145

int can_migrate_task(struct task_struct *p, struct lb_env *env)

5146

int can_migrate_task(struct task_struct *p, struct lb_env *env)

5146

{

5147

{

5147

int tsk_cache_hot = 0;

5148

int tsk_cache_hot = 0;

5148

/*

5149

/*

5149

* We do not migrate tasks that are:

5150

* We do not migrate tasks that are:

5150

* 1) throttled_lb_pair, or

5151

* 1) throttled_lb_pair, or

5151

* 2) cannot be migrated to this CPU due to cpus_allowed, or

5152

* 2) cannot be migrated to this CPU due to cpus_allowed, or

5152

* 3) running (obviously), or

5153

* 3) running (obviously), or

5153

* 4) are cache-hot on their current CPU.

5154

* 4) are cache-hot on their current CPU.

5154

*/

5155

*/

5155

if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))

5156

if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))

5156

return 0;

5157

return 0;

5157

5158

if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {

5159

if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {

5159

int cpu;

5160

int cpu;

5160

5161

schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

5162

schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

5162

5163

env->flags |= LBF_SOME_PINNED;

5164

env->flags |= LBF_SOME_PINNED;

5164

5165

/*

5166

/*

5166

* Remember if this task can be migrated to any other cpu in

5167

* Remember if this task can be migrated to any other cpu in

5167

* our sched_group. We may want to revisit it if we couldn't

5168

* our sched_group. We may want to revisit it if we couldn't

5168

* meet load balance goals by pulling other tasks on src_cpu.

5169

* meet load balance goals by pulling other tasks on src_cpu.

5169

*

5170

*

5170

* Also avoid computing new_dst_cpu if we have already computed

5171

* Also avoid computing new_dst_cpu if we have already computed

5171

* one in current iteration.

5172

* one in current iteration.

5172

*/

5173

*/

5173

if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))

5174

if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))

5174

return 0;

5175

return 0;

5175

5176

/* Prevent to re-select dst_cpu via env's cpus */

5177

/* Prevent to re-select dst_cpu via env's cpus */

5177

for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {

5178

for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {

5178

if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {

5179

if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {

5179

env->flags |= LBF_DST_PINNED;

5180

env->flags |= LBF_DST_PINNED;

5180

env->new_dst_cpu = cpu;

5181

env->new_dst_cpu = cpu;

5181

break;

5182

break;

5182

}

5183

}

5183

}

5184

}

5184

5185

return 0;

5186

return 0;

5186

}

5187

}

5187

5188

/* Record that we found atleast one task that could run on dst_cpu */

5189

/* Record that we found atleast one task that could run on dst_cpu */

5189

env->flags &= ~LBF_ALL_PINNED;

5190

env->flags &= ~LBF_ALL_PINNED;

5190

5191

if (task_running(env->src_rq, p)) {

5192

if (task_running(env->src_rq, p)) {

5192

schedstat_inc(p, se.statistics.nr_failed_migrations_running);

5193

schedstat_inc(p, se.statistics.nr_failed_migrations_running);

5193

return 0;

5194

return 0;

5194

}

5195

}

5195

5196

/*

5197

/*

5197

* Aggressive migration if:

5198

* Aggressive migration if:

5198

* 1) destination numa is preferred

5199

* 1) destination numa is preferred

5199

* 2) task is cache cold, or

5200

* 2) task is cache cold, or

5200

* 3) too many balance attempts have failed.

5201

* 3) too many balance attempts have failed.

5201

*/

5202

*/

5202

tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));

5203

tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));

5203

if (!tsk_cache_hot)

5204

if (!tsk_cache_hot)

5204

tsk_cache_hot = migrate_degrades_locality(p, env);

5205

tsk_cache_hot = migrate_degrades_locality(p, env);

5205

5206

if (migrate_improves_locality(p, env)) {

5207

if (migrate_improves_locality(p, env)) {

5207

#ifdef CONFIG_SCHEDSTATS

5208

#ifdef CONFIG_SCHEDSTATS

5208

if (tsk_cache_hot) {

5209

if (tsk_cache_hot) {

5209

schedstat_inc(env->sd, lb_hot_gained[env->idle]);

5210

schedstat_inc(env->sd, lb_hot_gained[env->idle]);

5210

schedstat_inc(p, se.statistics.nr_forced_migrations);

5211

schedstat_inc(p, se.statistics.nr_forced_migrations);

5211

}

5212

}

5212

#endif

5213

#endif

5213

return 1;

5214

return 1;

5214

}

5215

}

5215

5216

if (!tsk_cache_hot ||

5217

if (!tsk_cache_hot ||

5217

env->sd->nr_balance_failed > env->sd->cache_nice_tries) {

5218

env->sd->nr_balance_failed > env->sd->cache_nice_tries) {

5218

5219

if (tsk_cache_hot) {

5220

if (tsk_cache_hot) {

5220

schedstat_inc(env->sd, lb_hot_gained[env->idle]);

5221

schedstat_inc(env->sd, lb_hot_gained[env->idle]);

5221

schedstat_inc(p, se.statistics.nr_forced_migrations);

5222

schedstat_inc(p, se.statistics.nr_forced_migrations);

5222

}

5223

}

5223

5224

return 1;

5225

return 1;

5225

}

5226

}

5226

5227

schedstat_inc(p, se.statistics.nr_failed_migrations_hot);

5228

schedstat_inc(p, se.statistics.nr_failed_migrations_hot);

5228

return 0;

5229

return 0;

5229

}

5230

}

5230

5231

/*

5232

/*

5232

* move_one_task tries to move exactly one task from busiest to this_rq, as

5233

* move_one_task tries to move exactly one task from busiest to this_rq, as

5233

* part of active balancing operations within "domain".

5234

* part of active balancing operations within "domain".

5234

* Returns 1 if successful and 0 otherwise.

5235

* Returns 1 if successful and 0 otherwise.

5235

*

5236

*

5236

* Called with both runqueues locked.

5237

* Called with both runqueues locked.

5237

*/

5238

*/

5238

static int move_one_task(struct lb_env *env)

5239

static int move_one_task(struct lb_env *env)

5239

{

5240

{

5240

struct task_struct *p, *n;

5241

struct task_struct *p, *n;

5241

5242

list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {

5243

list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {

5243

if (!can_migrate_task(p, env))

5244

if (!can_migrate_task(p, env))

5244

continue;

5245

continue;

5245

5246

move_task(p, env);

5247

move_task(p, env);

5247

/*

5248

/*

5248

* Right now, this is only the second place move_task()

5249

* Right now, this is only the second place move_task()

5249

* is called, so we can safely collect move_task()

5250

* is called, so we can safely collect move_task()

5250

* stats here rather than inside move_task().

5251

* stats here rather than inside move_task().

5251

*/

5252

*/

5252

schedstat_inc(env->sd, lb_gained[env->idle]);

5253

schedstat_inc(env->sd, lb_gained[env->idle]);

5253

return 1;

5254

return 1;

5254

}

5255

}

5255

return 0;

5256

return 0;

5256

}

5257

}

5257

5258

/* Step by which move_tasks() advances env->loop_break before taking a break. */
static const unsigned int sched_nr_migrate_break = 32;

5259

5260

/*

5261

/*

5261

* move_tasks tries to move up to imbalance weighted load from busiest to

5262

* move_tasks tries to move up to imbalance weighted load from busiest to

5262

* this_rq, as part of a balancing operation within domain "sd".

5263

* this_rq, as part of a balancing operation within domain "sd".

5263

* Returns 1 if successful and 0 otherwise.

5264

* Returns 1 if successful and 0 otherwise.

5264

*

5265

*

5265

* Called with both runqueues locked.

5266

* Called with both runqueues locked.

5266

*/

5267

*/

5267

static int move_tasks(struct lb_env *env)

5268

static int move_tasks(struct lb_env *env)

5268

{

5269

{

5269

struct list_head *tasks = &env->src_rq->cfs_tasks;

5270

struct list_head *tasks = &env->src_rq->cfs_tasks;

5270

struct task_struct *p;

5271

struct task_struct *p;

5271

unsigned long load;

5272

unsigned long load;

5272

int pulled = 0;

5273

int pulled = 0;

5273

5274

if (env->imbalance <= 0)

5275

if (env->imbalance <= 0)

5275

return 0;

5276

return 0;

5276

5277

while (!list_empty(tasks)) {

5278

while (!list_empty(tasks)) {

5278

p = list_first_entry(tasks, struct task_struct, se.group_node);

5279

p = list_first_entry(tasks, struct task_struct, se.group_node);

5279

5280

env->loop++;

5281

env->loop++;

5281

/* We've more or less seen every task there is, call it quits */

5282

/* We've more or less seen every task there is, call it quits */

5282

if (env->loop > env->loop_max)

5283

if (env->loop > env->loop_max)

5283

break;

5284

break;

5284

5285

/* take a breather every nr_migrate tasks */

5286

/* take a breather every nr_migrate tasks */

5286

if (env->loop > env->loop_break) {

5287

if (env->loop > env->loop_break) {

5287

env->loop_break += sched_nr_migrate_break;

5288

env->loop_break += sched_nr_migrate_break;

5288

env->flags |= LBF_NEED_BREAK;

5289

env->flags |= LBF_NEED_BREAK;

5289

break;

5290

break;

5290

}

5291

}

5291

5292

if (!can_migrate_task(p, env))

5293

if (!can_migrate_task(p, env))

5293

goto next;

5294

goto next;

5294

5295

load = task_h_load(p);

5296

load = task_h_load(p);

5296

5297

if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

5298

if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

5298

goto next;

5299

goto next;

5299

5300

if ((load / 2) > env->imbalance)

5301

if ((load / 2) > env->imbalance)

5301

goto next;

5302

goto next;

5302

5303

move_task(p, env);

5304

move_task(p, env);

5304

pulled++;

5305

pulled++;

5305

env->imbalance -= load;

5306

env->imbalance -= load;

5306

5307

#ifdef CONFIG_PREEMPT

5308

#ifdef CONFIG_PREEMPT

5308

/*

5309

/*

5309

* NEWIDLE balancing is a source of latency, so preemptible

5310

* NEWIDLE balancing is a source of latency, so preemptible

5310

* kernels will stop after the first task is pulled to minimize

5311

* kernels will stop after the first task is pulled to minimize

5311

* the critical section.

5312

* the critical section.

5312

*/

5313

*/

5313

if (env->idle == CPU_NEWLY_IDLE)

5314

if (env->idle == CPU_NEWLY_IDLE)

5314

break;

5315

break;

5315

#endif

5316

#endif

5316

5317

/*

5318

/*

5318

* We only want to steal up to the prescribed amount of

5319

* We only want to steal up to the prescribed amount of

5319

* weighted load.

5320

* weighted load.

5320

*/

5321

*/

5321

if (env->imbalance <= 0)

5322

if (env->imbalance <= 0)

5322

break;

5323

break;

5323

5324

continue;

5325

continue;

5325

list_move_tail(&p->se.group_node, tasks);

5327

list_move_tail(&p->se.group_node, tasks);

5327

}

5328

}

5328

5329

/*

5330

/*

5330

* Right now, this is one of only two places move_task() is called,

5331

* Right now, this is one of only two places move_task() is called,

5331

* so we can safely collect move_task() stats here rather than

5332

* so we can safely collect move_task() stats here rather than

5332

* inside move_task().

5333

* inside move_task().

5333

*/

5334

*/

5334

schedstat_add(env->sd, lb_gained[env->idle], pulled);

5335

schedstat_add(env->sd, lb_gained[env->idle], pulled);

5335

5336

return pulled;

5337

return pulled;

5337

}

5338

}

5338

5339

#ifdef CONFIG_FAIR_GROUP_SCHED

5340

#ifdef CONFIG_FAIR_GROUP_SCHED

5340

/*

5341

/*

5341

* update tg->load_weight by folding this cpu's load_avg

5342

* update tg->load_weight by folding this cpu's load_avg

5342

*/

5343

*/

5343

static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)

5344

static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)

5344

{

5345

{

5345

struct sched_entity *se = tg->se[cpu];

5346

struct sched_entity *se = tg->se[cpu];

5346

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

5347

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

5347

5348

/* throttled entities do not contribute to load */

5349

/* throttled entities do not contribute to load */

5349

if (throttled_hierarchy(cfs_rq))

5350

if (throttled_hierarchy(cfs_rq))

5350

return;

5351

return;

5351

5352

update_cfs_rq_blocked_load(cfs_rq, 1);

5353

update_cfs_rq_blocked_load(cfs_rq, 1);

5353

5354

if (se) {

5355

if (se) {

5355

update_entity_load_avg(se, 1);

5356

update_entity_load_avg(se, 1);

5356

/*

5357

/*

5357

* We pivot on our runnable average having decayed to zero for

5358

* We pivot on our runnable average having decayed to zero for

5358

* list removal. This generally implies that all our children

5359

* list removal. This generally implies that all our children

5359

* have also been removed (modulo rounding error or bandwidth

5360

* have also been removed (modulo rounding error or bandwidth

5360

* control); however, such cases are rare and we can fix these

5361

* control); however, such cases are rare and we can fix these

5361

* at enqueue.

5362

* at enqueue.

5362

*

5363

*

5363

* TODO: fix up out-of-order children on enqueue.

5364

* TODO: fix up out-of-order children on enqueue.

5364

*/

5365

*/

5365

if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)

5366

if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)

5366

list_del_leaf_cfs_rq(cfs_rq);

5367

list_del_leaf_cfs_rq(cfs_rq);

5367

} else {

5368

} else {

5368

struct rq *rq = rq_of(cfs_rq);

5369

struct rq *rq = rq_of(cfs_rq);

5369

update_rq_runnable_avg(rq, rq->nr_running);

5370

update_rq_runnable_avg(rq, rq->nr_running);

5370

}

5371

}

5371

}

5372

}

5372

5373

static void update_blocked_averages(int cpu)

5374

static void update_blocked_averages(int cpu)

5374

{

5375

{

5375

struct rq *rq = cpu_rq(cpu);

5376

struct rq *rq = cpu_rq(cpu);

5376

struct cfs_rq *cfs_rq;

5377

struct cfs_rq *cfs_rq;

5377

unsigned long flags;

5378

unsigned long flags;

5378

5379

raw_spin_lock_irqsave(&rq->lock, flags);

5380

raw_spin_lock_irqsave(&rq->lock, flags);

5380

update_rq_clock(rq);

5381

update_rq_clock(rq);

5381

/*

5382

/*

5382

* Iterates the task_group tree in a bottom up fashion, see

5383

* Iterates the task_group tree in a bottom up fashion, see

5383

* list_add_leaf_cfs_rq() for details.

5384

* list_add_leaf_cfs_rq() for details.

5384

*/

5385

*/

5385

for_each_leaf_cfs_rq(rq, cfs_rq) {

5386

for_each_leaf_cfs_rq(rq, cfs_rq) {

5386

/*

5387

/*

5387

* Note: We may want to consider periodically releasing

5388

* Note: We may want to consider periodically releasing

5388

* rq->lock about these updates so that creating many task

5389

* rq->lock about these updates so that creating many task

5389

* groups does not result in continually extending hold time.

5390

* groups does not result in continually extending hold time.

5390

*/

5391

*/

5391

__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);

5392

__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);

5392

}

5393

}

5393

5394

raw_spin_unlock_irqrestore(&rq->lock, flags);

5395

raw_spin_unlock_irqrestore(&rq->lock, flags);

5395

}

5396

}

5396

5397

/*

5398

/*

5398

* Compute the hierarchical load factor for cfs_rq and all its ascendants.

5399

* Compute the hierarchical load factor for cfs_rq and all its ascendants.

5399

* This needs to be done in a top-down fashion because the load of a child

5400

* This needs to be done in a top-down fashion because the load of a child

5400

* group is a fraction of its parents load.

5401

* group is a fraction of its parents load.

5401

*/

5402

*/

5402

static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)

5403

static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)

5403

{

5404

{

5404

struct rq *rq = rq_of(cfs_rq);

5405

struct rq *rq = rq_of(cfs_rq);

5405

struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

5406

struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

5406

unsigned long now = jiffies;

5407

unsigned long now = jiffies;

5407

unsigned long load;

5408

unsigned long load;

5408

5409

if (cfs_rq->last_h_load_update == now)

5410

if (cfs_rq->last_h_load_update == now)

5410

return;

5411

return;

5411

5412

cfs_rq->h_load_next = NULL;

5413

cfs_rq->h_load_next = NULL;

5413

for_each_sched_entity(se) {

5414

for_each_sched_entity(se) {

5414

cfs_rq = cfs_rq_of(se);

5415

cfs_rq = cfs_rq_of(se);

5415

cfs_rq->h_load_next = se;

5416

cfs_rq->h_load_next = se;

5416

if (cfs_rq->last_h_load_update == now)

5417

if (cfs_rq->last_h_load_update == now)

5417

break;

5418

break;

5418

}

5419

}

5419

5420

if (!se) {

5421

if (!se) {

5421

cfs_rq->h_load = cfs_rq->runnable_load_avg;

5422

cfs_rq->h_load = cfs_rq->runnable_load_avg;

5422

cfs_rq->last_h_load_update = now;

5423

cfs_rq->last_h_load_update = now;

5423

}

5424

}

5424

5425

while ((se = cfs_rq->h_load_next) != NULL) {

5426

while ((se = cfs_rq->h_load_next) != NULL) {

5426

load = cfs_rq->h_load;

5427

load = cfs_rq->h_load;

5427

load = div64_ul(load * se->avg.load_avg_contrib,

5428

load = div64_ul(load * se->avg.load_avg_contrib,

5428

cfs_rq->runnable_load_avg + 1);

5429

cfs_rq->runnable_load_avg + 1);

5429

cfs_rq = group_cfs_rq(se);

5430

cfs_rq = group_cfs_rq(se);

5430

cfs_rq->h_load = load;

5431

cfs_rq->h_load = load;

5431

cfs_rq->last_h_load_update = now;

5432

cfs_rq->last_h_load_update = now;

5432

}

5433

}

5433

}

5434

}

5434

5435

static unsigned long task_h_load(struct task_struct *p)

5436

static unsigned long task_h_load(struct task_struct *p)

5436

{

5437

{

5437

struct cfs_rq *cfs_rq = task_cfs_rq(p);

5438

struct cfs_rq *cfs_rq = task_cfs_rq(p);

5438

5439

update_cfs_rq_h_load(cfs_rq);

5440

update_cfs_rq_h_load(cfs_rq);

5440

return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,

5441

return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,

5441

cfs_rq->runnable_load_avg + 1);

5442

cfs_rq->runnable_load_avg + 1);

5442

}

5443

}

5443

#else

5444

#else

5444

/* Without CONFIG_FAIR_GROUP_SCHED there is nothing to update: no-op stub. */
static inline void update_blocked_averages(int cpu)
{
}

5447

5448

static unsigned long task_h_load(struct task_struct *p)

5449

static unsigned long task_h_load(struct task_struct *p)

5449

{

5450

{

5450

return p->se.avg.load_avg_contrib;

5451

return p->se.avg.load_avg_contrib;

5451

}

5452

}

5452

#endif

5453

#endif

5453

5454

/********** Helpers for find_busiest_group ************************/

5455

/********** Helpers for find_busiest_group ************************/

5455

/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
	unsigned long avg_load; /* Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long load_per_task;
	unsigned long group_power;
	unsigned int sum_nr_running; /* Nr tasks running in the group */
	unsigned int group_capacity;
	unsigned int idle_cpus;
	unsigned int group_weight;
	int group_imb; /* Is there an imbalance in the group ? */
	int group_has_capacity; /* Is there extra capacity in the group? */
#ifdef CONFIG_NUMA_BALANCING
	unsigned int nr_numa_running;
	unsigned int nr_preferred_running;
#endif
};

5475

5476

/*

5477

/*

5477

* sd_lb_stats - Structure to store the statistics of a sched_domain

5478

* sd_lb_stats - Structure to store the statistics of a sched_domain

5478

* during load balancing.

5479

* during load balancing.

5479

*/

5480

*/

5480

struct sd_lb_stats {

5481

struct sd_lb_stats {

5481

struct sched_group *busiest; /* Busiest group in this sd */

5482

struct sched_group *busiest; /* Busiest group in this sd */

5482

struct sched_group *local; /* Local group in this sd */

5483

struct sched_group *local; /* Local group in this sd */

5483

unsigned long total_load; /* Total load of all groups in sd */

5484

unsigned long total_load; /* Total load of all groups in sd */

5484

unsigned long total_pwr; /* Total power of all groups in sd */

5485

unsigned long total_pwr; /* Total power of all groups in sd */

5485

unsigned long avg_load; /* Average load across all groups in sd */

5486

unsigned long avg_load; /* Average load across all groups in sd */

5486

5487

struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */

5488

struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */

5488

struct sg_lb_stats local_stat; /* Statistics of the local group */

5489

struct sg_lb_stats local_stat; /* Statistics of the local group */

5489

};

5490

};

5490

5491

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)

5492

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)

5492

{

5493

{

5493

/*

5494

/*

5494

* Skimp on the clearing to avoid duplicate work. We can avoid clearing

5495

* Skimp on the clearing to avoid duplicate work. We can avoid clearing

5495

* local_stat because update_sg_lb_stats() does a full clear/assignment.

5496

* local_stat because update_sg_lb_stats() does a full clear/assignment.

5496

* We must however clear busiest_stat::avg_load because

5497

* We must however clear busiest_stat::avg_load because

5497

* update_sd_pick_busiest() reads this before assignment.

5498

* update_sd_pick_busiest() reads this before assignment.

5498

*/

5499

*/

5499

*sds = (struct sd_lb_stats){

5500

*sds = (struct sd_lb_stats){

5500

.busiest = NULL,

5501

.busiest = NULL,

5501

.local = NULL,

5502

.local = NULL,

5502

.total_load = 0UL,

5503

.total_load = 0UL,

5503

.total_pwr = 0UL,

5504

.total_pwr = 0UL,

5504

.busiest_stat = {

5505

.busiest_stat = {

5505

.avg_load = 0UL,

5506

.avg_load = 0UL,

5506

},

5507

},

5507

};

5508

};

5508

}

5509

}

5509

5510

/**

5511

/**

5511

* get_sd_load_idx - Obtain the load index for a given sched domain.

5512

* get_sd_load_idx - Obtain the load index for a given sched domain.

5512

* @sd: The sched_domain whose load_idx is to be obtained.

5513

* @sd: The sched_domain whose load_idx is to be obtained.

5513

* @idle: The idle status of the CPU for whose sd load_idx is obtained.

5514

* @idle: The idle status of the CPU for whose sd load_idx is obtained.

5514

*

5515

*

5515

* Return: The load index.

5516

* Return: The load index.

5516

*/

5517

*/

5517

static inline int get_sd_load_idx(struct sched_domain *sd,

5518

static inline int get_sd_load_idx(struct sched_domain *sd,

5518

enum cpu_idle_type idle)

5519

enum cpu_idle_type idle)

5519

{

5520

{

5520

int load_idx;

5521

int load_idx;

5521

5522

switch (idle) {

5523

switch (idle) {

5523

case CPU_NOT_IDLE:

5524

case CPU_NOT_IDLE:

5524

load_idx = sd->busy_idx;

5525

load_idx = sd->busy_idx;

5525

break;

5526

break;

5526

5527

case CPU_NEWLY_IDLE:

5528

case CPU_NEWLY_IDLE:

5528

load_idx = sd->newidle_idx;

5529

load_idx = sd->newidle_idx;

5529

break;

5530

break;

5530

default:

5531

default:

5531

load_idx = sd->idle_idx;

5532

load_idx = sd->idle_idx;

5532

break;

5533

break;

5533

}

5534

}

5534

5535

return load_idx;

5536

return load_idx;

5536

}

5537

}

5537

5538

static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)

5539

static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)

5539

{

5540

{

5540

return SCHED_POWER_SCALE;

5541

return SCHED_POWER_SCALE;

5541

}

5542

}

5542

5543

/*
 * Weak definition that falls back to default_scale_freq_power(); being
 * __weak, another definition of this symbol can replace it at link time.
 */
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}

5547

5548

static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)

5549

static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)

5549

{

5550

{

5550

unsigned long weight = sd->span_weight;

5551

unsigned long weight = sd->span_weight;

5551

unsigned long smt_gain = sd->smt_gain;

5552

unsigned long smt_gain = sd->smt_gain;

5552

5553

smt_gain /= weight;

5554

smt_gain /= weight;

5554

5555

return smt_gain;

5556

return smt_gain;

5556

}

5557

}

5557

5558

/*
 * Weak definition that falls back to default_scale_smt_power(); being
 * __weak, another definition of this symbol can replace it at link time.
 */
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}

5562

5563

static unsigned long scale_rt_power(int cpu)

5564

static unsigned long scale_rt_power(int cpu)

5564

{

5565

{

5565

struct rq *rq = cpu_rq(cpu);

5566

struct rq *rq = cpu_rq(cpu);

5566

u64 total, available, age_stamp, avg;

5567

u64 total, available, age_stamp, avg;

5567

5568

/*

5569

/*

5569

* Since we're reading these variables without serialization make sure

5570

* Since we're reading these variables without serialization make sure

5570

* we read them once before doing sanity checks on them.

5571

* we read them once before doing sanity checks on them.

5571

*/

5572

*/

5572

age_stamp = ACCESS_ONCE(rq->age_stamp);

5573

age_stamp = ACCESS_ONCE(rq->age_stamp);

5573

avg = ACCESS_ONCE(rq->rt_avg);

5574

avg = ACCESS_ONCE(rq->rt_avg);

5574

5575

total = sched_avg_period() + (rq_clock(rq) - age_stamp);

5576

total = sched_avg_period() + (rq_clock(rq) - age_stamp);

5576

5577

if (unlikely(total < avg)) {

5578

if (unlikely(total < avg)) {

5578

/* Ensures that power won't end up being negative */

5579

/* Ensures that power won't end up being negative */

5579

available = 0;

5580

available = 0;

5580

} else {

5581

} else {

5581

available = total - avg;

5582

available = total - avg;

5582

}

5583

}

5583

5584

if (unlikely((s64)total < SCHED_POWER_SCALE))

5585

if (unlikely((s64)total < SCHED_POWER_SCALE))

5585

total = SCHED_POWER_SCALE;

5586

total = SCHED_POWER_SCALE;

5586

5587

total >>= SCHED_POWER_SHIFT;

5588

total >>= SCHED_POWER_SHIFT;

5588

5589

return div_u64(available, total);

5590

return div_u64(available, total);

5590

}

5591

}

5591

5592

static void update_cpu_power(struct sched_domain *sd, int cpu)

5593

static void update_cpu_power(struct sched_domain *sd, int cpu)

5593

{

5594

{

5594

unsigned long weight = sd->span_weight;

5595

unsigned long weight = sd->span_weight;

5595

unsigned long power = SCHED_POWER_SCALE;

5596

unsigned long power = SCHED_POWER_SCALE;

5596

struct sched_group *sdg = sd->groups;

5597

struct sched_group *sdg = sd->groups;

5597

5598

if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {

5599

if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {

5599

if (sched_feat(ARCH_POWER))

5600

if (sched_feat(ARCH_POWER))

5600

power *= arch_scale_smt_power(sd, cpu);

5601

power *= arch_scale_smt_power(sd, cpu);

5601

else

5602

else

5602

power *= default_scale_smt_power(sd, cpu);

5603

power *= default_scale_smt_power(sd, cpu);

5603

5604

power >>= SCHED_POWER_SHIFT;

5605

power >>= SCHED_POWER_SHIFT;

5605

}

5606

}

5606

5607

sdg->sgp->power_orig = power;

5608

sdg->sgp->power_orig = power;

5608

5609

if (sched_feat(ARCH_POWER))

5610

if (sched_feat(ARCH_POWER))

5610

power *= arch_scale_freq_power(sd, cpu);

5611

power *= arch_scale_freq_power(sd, cpu);

5611

else

5612

else

5612

power *= default_scale_freq_power(sd, cpu);

5613

power *= default_scale_freq_power(sd, cpu);

5613

5614

power >>= SCHED_POWER_SHIFT;

5615

power >>= SCHED_POWER_SHIFT;

5615

5616

power *= scale_rt_power(cpu);

5617

power *= scale_rt_power(cpu);

5617

power >>= SCHED_POWER_SHIFT;

5618

power >>= SCHED_POWER_SHIFT;

5618

5619

if (!power)

5620

if (!power)

5620

power = 1;

5621

power = 1;

5621

5622

cpu_rq(cpu)->cpu_power = power;

5623

cpu_rq(cpu)->cpu_power = power;

5623

sdg->sgp->power = power;

5624

sdg->sgp->power = power;

5624

}

5625

}

5625

5626

void update_group_power(struct sched_domain *sd, int cpu)

5627

void update_group_power(struct sched_domain *sd, int cpu)

5627

{

5628

{

5628

struct sched_domain *child = sd->child;

5629

struct sched_domain *child = sd->child;

5629

struct sched_group *group, *sdg = sd->groups;

5630

struct sched_group *group, *sdg = sd->groups;

5630

unsigned long power, power_orig;

5631

unsigned long power, power_orig;

5631

unsigned long interval;

5632

unsigned long interval;

5632

5633

interval = msecs_to_jiffies(sd->balance_interval);

5634

interval = msecs_to_jiffies(sd->balance_interval);

5634

interval = clamp(interval, 1UL, max_load_balance_interval);

5635

interval = clamp(interval, 1UL, max_load_balance_interval);

5635

sdg->sgp->next_update = jiffies + interval;

5636

sdg->sgp->next_update = jiffies + interval;

5636

5637

if (!child) {

5638

if (!child) {

5638

update_cpu_power(sd, cpu);

5639

update_cpu_power(sd, cpu);

5639

return;

5640

return;

5640

}

5641

}

5641

5642

power_orig = power = 0;

5643

power_orig = power = 0;

5643

5644

if (child->flags & SD_OVERLAP) {

5645

if (child->flags & SD_OVERLAP) {

5645

/*

5646

/*

5646

* SD_OVERLAP domains cannot assume that child groups

5647

* SD_OVERLAP domains cannot assume that child groups

5647

* span the current group.

5648

* span the current group.

5648

*/

5649

*/

5649

5650

for_each_cpu(cpu, sched_group_cpus(sdg)) {

5651

for_each_cpu(cpu, sched_group_cpus(sdg)) {

5651

struct sched_group_power *sgp;

5652

struct sched_group_power *sgp;

5652

struct rq *rq = cpu_rq(cpu);

5653

struct rq *rq = cpu_rq(cpu);

5653

5654

/*

5655

/*

5655

* build_sched_domains() -> init_sched_groups_power()

5656

* build_sched_domains() -> init_sched_groups_power()

5656

* gets here before we've attached the domains to the

5657

* gets here before we've attached the domains to the

5657

* runqueues.

5658

* runqueues.

5658

*

5659

*

5659

* Use power_of(), which is set irrespective of domains

5660

* Use power_of(), which is set irrespective of domains

5660

* in update_cpu_power().

5661

* in update_cpu_power().

5661

*

5662

*

5662

* This avoids power/power_orig from being 0 and

5663

* This avoids power/power_orig from being 0 and

5663

* causing divide-by-zero issues on boot.

5664

* causing divide-by-zero issues on boot.

5664

*

5665

*

5665

* Runtime updates will correct power_orig.

5666

* Runtime updates will correct power_orig.

5666

*/

5667

*/

5667

if (unlikely(!rq->sd)) {

5668

if (unlikely(!rq->sd)) {

5668

power_orig += power_of(cpu);

5669

power_orig += power_of(cpu);

5669

power += power_of(cpu);

5670

power += power_of(cpu);

5670

continue;

5671

continue;

5671

}

5672

}

5672

5673

sgp = rq->sd->groups->sgp;

5674

sgp = rq->sd->groups->sgp;

5674

power_orig += sgp->power_orig;

5675

power_orig += sgp->power_orig;

5675

power += sgp->power;

5676

power += sgp->power;

5676

}

5677

}

5677

} else {

5678

} else {

5678

/*

5679

/*

5679

* !SD_OVERLAP domains can assume that child groups

5680

* !SD_OVERLAP domains can assume that child groups

5680

* span the current group.

5681

* span the current group.

5681

*/

5682

*/

5682

5683

group = child->groups;

5684

group = child->groups;

5684

do {

5685

do {

5685

power_orig += group->sgp->power_orig;

5686

power_orig += group->sgp->power_orig;

5686

power += group->sgp->power;

5687

power += group->sgp->power;

5687

group = group->next;

5688

group = group->next;

5688

} while (group != child->groups);

5689

} while (group != child->groups);

5689

}

5690

}

5690

5691

sdg->sgp->power_orig = power_orig;

5692

sdg->sgp->power_orig = power_orig;

5692

sdg->sgp->power = power;

5693

sdg->sgp->power = power;

5693

}

5694

}

5694

5695

/*

5696

/*

5696

* Try and fix up capacity for tiny siblings, this is needed when

5697

* Try and fix up capacity for tiny siblings, this is needed when

5697

* things like SD_ASYM_PACKING need f_b_g to select another sibling

5698

* things like SD_ASYM_PACKING need f_b_g to select another sibling

5698

* which on its own isn't powerful enough.

5699

* which on its own isn't powerful enough.

5699

*

5700

*

5700

* See update_sd_pick_busiest() and check_asym_packing().

5701

* See update_sd_pick_busiest() and check_asym_packing().

5701

*/

5702

*/

5702

static inline int

5703

static inline int

5703

fix_small_capacity(struct sched_domain *sd, struct sched_group *group)

5704

fix_small_capacity(struct sched_domain *sd, struct sched_group *group)

5704

{

5705

{

5705

/*

5706

/*

5706

* Only siblings can have significantly less than SCHED_POWER_SCALE

5707

* Only siblings can have significantly less than SCHED_POWER_SCALE

5707

*/

5708

*/

5708

if (!(sd->flags & SD_SHARE_CPUPOWER))

5709

if (!(sd->flags & SD_SHARE_CPUPOWER))

5709

return 0;

5710

return 0;

5710

5711

/*

5712

/*

5712

* If ~90% of the cpu_power is still there, we're good.

5713

* If ~90% of the cpu_power is still there, we're good.

5713

*/

5714

*/

5714

if (group->sgp->power * 32 > group->sgp->power_orig * 29)

5715

if (group->sgp->power * 32 > group->sgp->power_orig * 29)

5715

return 1;

5716

return 1;

5716

5717

return 0;

5718

return 0;

5718

}

5719

}

5719

5720

/*

5721

/*

5721

* Group imbalance indicates (and tries to solve) the problem where balancing

5722

* Group imbalance indicates (and tries to solve) the problem where balancing

5722

* groups is inadequate due to tsk_cpus_allowed() constraints.

5723

* groups is inadequate due to tsk_cpus_allowed() constraints.

5723

*

5724

*

5724

* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a

5725

* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a

5725

* cpumask covering 1 cpu of the first group and 3 cpus of the second group.

5726

* cpumask covering 1 cpu of the first group and 3 cpus of the second group.

5726

* Something like:

5727

* Something like:

5727

*

5728

*

5728

* { 0 1 2 3 } { 4 5 6 7 }

5729

* { 0 1 2 3 } { 4 5 6 7 }

5729

* * * * *

5730

* * * * *

5730

*

5731

*

5731

* If we were to balance group-wise we'd place two tasks in the first group and

5732

* If we were to balance group-wise we'd place two tasks in the first group and

5732

* two tasks in the second group. Clearly this is undesired as it will overload

5733

* two tasks in the second group. Clearly this is undesired as it will overload

5733

* cpu 3 and leave one of the cpus in the second group unused.

5734

* cpu 3 and leave one of the cpus in the second group unused.

5734

*

5735

*

5735

* The current solution to this issue is detecting the skew in the first group

5736

* The current solution to this issue is detecting the skew in the first group

5736

* by noticing the lower domain failed to reach balance and had difficulty

5737

* by noticing the lower domain failed to reach balance and had difficulty

5737

* moving tasks due to affinity constraints.

5738

* moving tasks due to affinity constraints.

5738

*

5739

*

5739

* When this is so detected; this group becomes a candidate for busiest; see

5740

* When this is so detected; this group becomes a candidate for busiest; see

5740

* update_sd_pick_busiest(). And calculate_imbalance() and

5741

* update_sd_pick_busiest(). And calculate_imbalance() and

5741

* find_busiest_group() avoid some of the usual balance conditions to allow it

5742

* find_busiest_group() avoid some of the usual balance conditions to allow it

5742

* to create an effective group imbalance.

5743

* to create an effective group imbalance.

5743

*

5744

*

5744

* This is a somewhat tricky proposition since the next run might not find the

5745

* This is a somewhat tricky proposition since the next run might not find the

5745

* group imbalance and decide the groups need to be balanced again. A most

5746

* group imbalance and decide the groups need to be balanced again. A most

5746

* subtle and fragile situation.

5747

* subtle and fragile situation.

5747

*/

5748

*/

5748

5749

static inline int sg_imbalanced(struct sched_group *group)

5750

static inline int sg_imbalanced(struct sched_group *group)

5750

{

5751

{

5751

return group->sgp->imbalance;

5752

return group->sgp->imbalance;

5752

}

5753

}

5753

5754

/*

5755

/*

5755

* Compute the group capacity.

5756

* Compute the group capacity.

5756

*

5757

*

5757

* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by

5758

* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by

5758

* first dividing out the smt factor and computing the actual number of cores

5759

* first dividing out the smt factor and computing the actual number of cores

5759

* and limit power unit capacity with that.

5760

* and limit power unit capacity with that.

5760

*/

5761

*/

5761

static inline int sg_capacity(struct lb_env *env, struct sched_group *group)

5762

static inline int sg_capacity(struct lb_env *env, struct sched_group *group)

5762

{

5763

{

5763

unsigned int capacity, smt, cpus;

5764

unsigned int capacity, smt, cpus;

5764

unsigned int power, power_orig;

5765

unsigned int power, power_orig;

5765

5766

power = group->sgp->power;

5767

power = group->sgp->power;

5767

power_orig = group->sgp->power_orig;

5768

power_orig = group->sgp->power_orig;

5768

cpus = group->group_weight;

5769

cpus = group->group_weight;

5769

5770

/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */

5771

/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */

5771

smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);

5772

smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);

5772

capacity = cpus / smt; /* cores */

5773

capacity = cpus / smt; /* cores */

5773

5774

capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));

5775

capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));

5775

if (!capacity)

5776

if (!capacity)

5776

capacity = fix_small_capacity(env->sd, group);

5777

capacity = fix_small_capacity(env->sd, group);

5777

5778

return capacity;

5779

return capacity;

5779

}

5780

}

5780

5781

/**

5782

/**

5782

* update_sg_lb_stats - Update sched_group's statistics for load balancing.

5783

* update_sg_lb_stats - Update sched_group's statistics for load balancing.

5783

* @env: The load balancing environment.

5784

* @env: The load balancing environment.

5784

* @group: sched_group whose statistics are to be updated.

5785

* @group: sched_group whose statistics are to be updated.

5785

* @load_idx: Load index of sched_domain of this_cpu for load calc.

5786

* @load_idx: Load index of sched_domain of this_cpu for load calc.

5786

* @local_group: Does group contain this_cpu.

5787

* @local_group: Does group contain this_cpu.

5787

* @sgs: variable to hold the statistics for this group.

5788

* @sgs: variable to hold the statistics for this group.

5788

*/

5789

*/

5789

static inline void update_sg_lb_stats(struct lb_env *env,

5790

static inline void update_sg_lb_stats(struct lb_env *env,

5790

struct sched_group *group, int load_idx,

5791

struct sched_group *group, int load_idx,

5791

int local_group, struct sg_lb_stats *sgs)

5792

int local_group, struct sg_lb_stats *sgs)

5792

{

5793

{

5793

unsigned long load;

5794

unsigned long load;

5794

int i;

5795

int i;

5795

5796

memset(sgs, 0, sizeof(*sgs));

5797

memset(sgs, 0, sizeof(*sgs));

5797

5798

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

5799

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

5799

struct rq *rq = cpu_rq(i);

5800

struct rq *rq = cpu_rq(i);

5800

5801

/* Bias balancing toward cpus of our domain */

5802

/* Bias balancing toward cpus of our domain */

5802

if (local_group)

5803

if (local_group)

5803

load = target_load(i, load_idx);

5804

load = target_load(i, load_idx);

5804

else

5805

else

5805

load = source_load(i, load_idx);

5806

load = source_load(i, load_idx);

5806

5807

sgs->group_load += load;

5808

sgs->group_load += load;

5808

sgs->sum_nr_running += rq->nr_running;

5809

sgs->sum_nr_running += rq->nr_running;

5809

#ifdef CONFIG_NUMA_BALANCING

5810

#ifdef CONFIG_NUMA_BALANCING

5810

sgs->nr_numa_running += rq->nr_numa_running;

5811

sgs->nr_numa_running += rq->nr_numa_running;

5811

sgs->nr_preferred_running += rq->nr_preferred_running;

5812

sgs->nr_preferred_running += rq->nr_preferred_running;

5812

#endif

5813

#endif

5813

sgs->sum_weighted_load += weighted_cpuload(i);

5814

sgs->sum_weighted_load += weighted_cpuload(i);

5814

if (idle_cpu(i))

5815

if (idle_cpu(i))

5815

sgs->idle_cpus++;

5816

sgs->idle_cpus++;

5816

}

5817

}

5817

5818

/* Adjust by relative CPU power of the group */

5819

/* Adjust by relative CPU power of the group */

5819

sgs->group_power = group->sgp->power;

5820

sgs->group_power = group->sgp->power;

5820

sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;

5821

sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;

5821

5822

if (sgs->sum_nr_running)

5823

if (sgs->sum_nr_running)

5823

sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

5824

sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

5824

5825

sgs->group_weight = group->group_weight;

5826

sgs->group_weight = group->group_weight;

5826

5827

sgs->group_imb = sg_imbalanced(group);

5828

sgs->group_imb = sg_imbalanced(group);

5828

sgs->group_capacity = sg_capacity(env, group);

5829

sgs->group_capacity = sg_capacity(env, group);

5829

5830

if (sgs->group_capacity > sgs->sum_nr_running)

5831

if (sgs->group_capacity > sgs->sum_nr_running)

5831

sgs->group_has_capacity = 1;

5832

sgs->group_has_capacity = 1;

5832

}

5833

}

5833

5834

/**

5835

/**

5835

* update_sd_pick_busiest - return 1 on busiest group

5836

* update_sd_pick_busiest - return 1 on busiest group

5836

* @env: The load balancing environment.

5837

* @env: The load balancing environment.

5837

* @sds: sched_domain statistics

5838

* @sds: sched_domain statistics

5838

* @sg: sched_group candidate to be checked for being the busiest

5839

* @sg: sched_group candidate to be checked for being the busiest

5839

* @sgs: sched_group statistics

5840

* @sgs: sched_group statistics

5840

*

5841

*

5841

* Determine if @sg is a busier group than the previously selected

5842

* Determine if @sg is a busier group than the previously selected

5842

* busiest group.

5843

* busiest group.

5843

*

5844

*

5844

* Return: %true if @sg is a busier group than the previously selected

5845

* Return: %true if @sg is a busier group than the previously selected

5845

* busiest group. %false otherwise.

5846

* busiest group. %false otherwise.

5846

*/

5847

*/

5847

static bool update_sd_pick_busiest(struct lb_env *env,

5848

static bool update_sd_pick_busiest(struct lb_env *env,

5848

struct sd_lb_stats *sds,

5849

struct sd_lb_stats *sds,

5849

struct sched_group *sg,

5850

struct sched_group *sg,

5850

struct sg_lb_stats *sgs)

5851

struct sg_lb_stats *sgs)

5851

{

5852

{

5852

if (sgs->avg_load <= sds->busiest_stat.avg_load)

5853

if (sgs->avg_load <= sds->busiest_stat.avg_load)

5853

return false;

5854

return false;

5854

5855

if (sgs->sum_nr_running > sgs->group_capacity)

5856

if (sgs->sum_nr_running > sgs->group_capacity)

5856

return true;

5857

return true;

5857

5858

if (sgs->group_imb)

5859

if (sgs->group_imb)

5859

return true;

5860

return true;

5860

5861

/*

5862

/*

5862

* ASYM_PACKING needs to move all the work to the lowest

5863

* ASYM_PACKING needs to move all the work to the lowest

5863

* numbered CPUs in the group, therefore mark all groups

5864

* numbered CPUs in the group, therefore mark all groups

5864

* higher than ourself as busy.

5865

* higher than ourself as busy.

5865

*/

5866

*/

5866

if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&

5867

if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&

5867

env->dst_cpu < group_first_cpu(sg)) {

5868

env->dst_cpu < group_first_cpu(sg)) {

5868

if (!sds->busiest)

5869

if (!sds->busiest)

5869

return true;

5870

return true;

5870

5871

if (group_first_cpu(sds->busiest) > group_first_cpu(sg))

5872

if (group_first_cpu(sds->busiest) > group_first_cpu(sg))

5872

return true;

5873

return true;

5873

}

5874

}

5874

5875

return false;

5876

return false;

5876

}

5877

}

5877

5878

#ifdef CONFIG_NUMA_BALANCING

5879

#ifdef CONFIG_NUMA_BALANCING

5879

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

5880

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

5880

{

5881

{

5881

if (sgs->sum_nr_running > sgs->nr_numa_running)

5882

if (sgs->sum_nr_running > sgs->nr_numa_running)

5882

return regular;

5883

return regular;

5883

if (sgs->sum_nr_running > sgs->nr_preferred_running)

5884

if (sgs->sum_nr_running > sgs->nr_preferred_running)

5884

return remote;

5885

return remote;

5885

return all;

5886

return all;

5886

}

5887

}

5887

5888

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

5889

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

5889

{

5890

{

5890

if (rq->nr_running > rq->nr_numa_running)

5891

if (rq->nr_running > rq->nr_numa_running)

5891

return regular;

5892

return regular;

5892

if (rq->nr_running > rq->nr_preferred_running)

5893

if (rq->nr_running > rq->nr_preferred_running)

5893

return remote;

5894

return remote;

5894

return all;

5895

return all;

5895

}

5896

}

5896

#else

5897

#else

5897

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

5898

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

5898

{

5899

{

5899

return all;

5900

return all;

5900

}

5901

}

5901

5902

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

5903

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

5903

{

5904

{

5904

return regular;

5905

return regular;

5905

}

5906

}

5906

#endif /* CONFIG_NUMA_BALANCING */

5907

#endif /* CONFIG_NUMA_BALANCING */

5907

5908

/**

5909

/**

5909

* update_sd_lb_stats - Update sched_domain's statistics for load balancing.

5910

* update_sd_lb_stats - Update sched_domain's statistics for load balancing.

5910

* @env: The load balancing environment.

5911

* @env: The load balancing environment.

5911

* @sds: variable to hold the statistics for this sched_domain.

5912

* @sds: variable to hold the statistics for this sched_domain.

5912

*/

5913

*/

5913

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)

5914

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)

5914

{

5915

{

5915

struct sched_domain *child = env->sd->child;

5916

struct sched_domain *child = env->sd->child;

5916

struct sched_group *sg = env->sd->groups;

5917

struct sched_group *sg = env->sd->groups;

5917

struct sg_lb_stats tmp_sgs;

5918

struct sg_lb_stats tmp_sgs;

5918

int load_idx, prefer_sibling = 0;

5919

int load_idx, prefer_sibling = 0;

5919

5920

if (child && child->flags & SD_PREFER_SIBLING)

5921

if (child && child->flags & SD_PREFER_SIBLING)

5921

prefer_sibling = 1;

5922

prefer_sibling = 1;

5922

5923

load_idx = get_sd_load_idx(env->sd, env->idle);

5924

load_idx = get_sd_load_idx(env->sd, env->idle);

5924

5925

do {

5926

do {

5926

struct sg_lb_stats *sgs = &tmp_sgs;

5927

struct sg_lb_stats *sgs = &tmp_sgs;

5927

int local_group;

5928

int local_group;

5928

5929

local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));

5930

local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));

5930

if (local_group) {

5931

if (local_group) {

5931

sds->local = sg;

5932

sds->local = sg;

5932

sgs = &sds->local_stat;

5933

sgs = &sds->local_stat;

5933

5934

if (env->idle != CPU_NEWLY_IDLE ||

5935

if (env->idle != CPU_NEWLY_IDLE ||

5935

time_after_eq(jiffies, sg->sgp->next_update))

5936

time_after_eq(jiffies, sg->sgp->next_update))

5936

update_group_power(env->sd, env->dst_cpu);

5937

update_group_power(env->sd, env->dst_cpu);

5937

}

5938

}

5938

5939

update_sg_lb_stats(env, sg, load_idx, local_group, sgs);

5940

update_sg_lb_stats(env, sg, load_idx, local_group, sgs);

5940

5941

if (local_group)

5942

if (local_group)

5942

goto next_group;

5943

goto next_group;

5943

5944

/*

5945

/*

5945

* In case the child domain prefers tasks go to siblings

5946

* In case the child domain prefers tasks go to siblings

5946

* first, lower the sg capacity to one so that we'll try

5947

* first, lower the sg capacity to one so that we'll try

5947

* and move all the excess tasks away. We lower the capacity

5948

* and move all the excess tasks away. We lower the capacity

5948

* of a group only if the local group has the capacity to fit

5949

* of a group only if the local group has the capacity to fit

5949

* these excess tasks, i.e. nr_running < group_capacity. The

5950

* these excess tasks, i.e. nr_running < group_capacity. The

5950

* extra check prevents the case where you always pull from the

5951

* extra check prevents the case where you always pull from the

5951

* heaviest group when it is already under-utilized (possible

5952

* heaviest group when it is already under-utilized (possible

5952

* with a large weight task outweighs the tasks on the system).

5953

* with a large weight task outweighs the tasks on the system).

5953

*/

5954

*/

5954

if (prefer_sibling && sds->local &&

5955

if (prefer_sibling && sds->local &&

5955

sds->local_stat.group_has_capacity)

5956

sds->local_stat.group_has_capacity)

5956

sgs->group_capacity = min(sgs->group_capacity, 1U);

5957

sgs->group_capacity = min(sgs->group_capacity, 1U);

5957

5958

if (update_sd_pick_busiest(env, sds, sg, sgs)) {

5959

if (update_sd_pick_busiest(env, sds, sg, sgs)) {

5959

sds->busiest = sg;

5960

sds->busiest = sg;

5960

sds->busiest_stat = *sgs;

5961

sds->busiest_stat = *sgs;

5961

}

5962

}

5962

5963

next_group:

5964

next_group:

5964

/* Now, start updating sd_lb_stats */

5965

/* Now, start updating sd_lb_stats */

5965

sds->total_load += sgs->group_load;

5966

sds->total_load += sgs->group_load;

5966

sds->total_pwr += sgs->group_power;

5967

sds->total_pwr += sgs->group_power;

5967

5968

sg = sg->next;

5969

sg = sg->next;

5969

} while (sg != env->sd->groups);

5970

} while (sg != env->sd->groups);

5970

5971

if (env->sd->flags & SD_NUMA)

5972

if (env->sd->flags & SD_NUMA)

5972

env->fbq_type = fbq_classify_group(&sds->busiest_stat);

5973

env->fbq_type = fbq_classify_group(&sds->busiest_stat);

5973

}

5974

}

5974

5975

/**

5976

/**

5976

* check_asym_packing - Check to see if the group is packed into the

5977

* check_asym_packing - Check to see if the group is packed into the

5977

* sched doman.

5978

* sched doman.

5978

*

5979

*

5979

* This is primarily intended to used at the sibling level. Some

5980

* This is primarily intended to used at the sibling level. Some

5980

* cores like POWER7 prefer to use lower numbered SMT threads. In the

5981

* cores like POWER7 prefer to use lower numbered SMT threads. In the

5981

* case of POWER7, it can move to lower SMT modes only when higher

5982

* case of POWER7, it can move to lower SMT modes only when higher

5982

* threads are idle. When in lower SMT modes, the threads will

5983

* threads are idle. When in lower SMT modes, the threads will

5983

* perform better since they share less core resources. Hence when we

5984

* perform better since they share less core resources. Hence when we

5984

* have idle threads, we want them to be the higher ones.

5985

* have idle threads, we want them to be the higher ones.

5985

*

5986

*

5986

* This packing function is run on idle threads. It checks to see if

5987

* This packing function is run on idle threads. It checks to see if

5987

* the busiest CPU in this domain (core in the P7 case) has a higher

5988

* the busiest CPU in this domain (core in the P7 case) has a higher

5988

* CPU number than the packing function is being run on. Here we are

5989

* CPU number than the packing function is being run on. Here we are

5989

* assuming lower CPU number will be equivalent to lower a SMT thread

5990

* assuming lower CPU number will be equivalent to lower a SMT thread

5990

* number.

5991

* number.

5991

*

5992

*

5992

* Return: 1 when packing is required and a task should be moved to

5993

* Return: 1 when packing is required and a task should be moved to

5993

* this CPU. The amount of the imbalance is returned in *imbalance.

5994

* this CPU. The amount of the imbalance is returned in *imbalance.

5994

*

5995

*

5995

* @env: The load balancing environment.

5996

* @env: The load balancing environment.

5996

* @sds: Statistics of the sched_domain which is to be packed

5997

* @sds: Statistics of the sched_domain which is to be packed

5997

*/

5998

*/

5998

static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)

5999

static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)

5999

{

6000

{

6000

int busiest_cpu;

6001

int busiest_cpu;

6001

6002

if (!(env->sd->flags & SD_ASYM_PACKING))

6003

if (!(env->sd->flags & SD_ASYM_PACKING))

6003

return 0;

6004

return 0;

6004

6005

if (!sds->busiest)

6006

if (!sds->busiest)

6006

return 0;

6007

return 0;

6007

6008

busiest_cpu = group_first_cpu(sds->busiest);

6009

busiest_cpu = group_first_cpu(sds->busiest);

6009

if (env->dst_cpu > busiest_cpu)

6010

if (env->dst_cpu > busiest_cpu)

6010

return 0;

6011

return 0;

6011

6012

env->imbalance = DIV_ROUND_CLOSEST(

6013

env->imbalance = DIV_ROUND_CLOSEST(

6013

sds->busiest_stat.avg_load * sds->busiest_stat.group_power,

6014

sds->busiest_stat.avg_load * sds->busiest_stat.group_power,

6014

SCHED_POWER_SCALE);

6015

SCHED_POWER_SCALE);

6015

6016

return 1;

6017

return 1;

6017

}

6018

}

6018

6019

/**

6020

/**

6020

* fix_small_imbalance - Calculate the minor imbalance that exists

6021

* fix_small_imbalance - Calculate the minor imbalance that exists

6021

* amongst the groups of a sched_domain, during

6022

* amongst the groups of a sched_domain, during

6022

* load balancing.

6023

* load balancing.

6023

* @env: The load balancing environment.

6024

* @env: The load balancing environment.

6024

* @sds: Statistics of the sched_domain whose imbalance is to be calculated.

6025

* @sds: Statistics of the sched_domain whose imbalance is to be calculated.

6025

*/

6026

*/

6026

static inline

6027

static inline

6027

void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6028

void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6028

{

6029

{

6029

unsigned long tmp, pwr_now = 0, pwr_move = 0;

6030

unsigned long tmp, pwr_now = 0, pwr_move = 0;

6030

unsigned int imbn = 2;

6031

unsigned int imbn = 2;

6031

unsigned long scaled_busy_load_per_task;

6032

unsigned long scaled_busy_load_per_task;

6032

struct sg_lb_stats *local, *busiest;

6033

struct sg_lb_stats *local, *busiest;

6033

6034

local = &sds->local_stat;

6035

local = &sds->local_stat;

6035

busiest = &sds->busiest_stat;

6036

busiest = &sds->busiest_stat;

6036

6037

if (!local->sum_nr_running)

6038

if (!local->sum_nr_running)

6038

local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);

6039

local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);

6039

else if (busiest->load_per_task > local->load_per_task)

6040

else if (busiest->load_per_task > local->load_per_task)

6040

imbn = 1;

6041

imbn = 1;

6041

6042

scaled_busy_load_per_task =

6043

scaled_busy_load_per_task =

6043

(busiest->load_per_task * SCHED_POWER_SCALE) /

6044

(busiest->load_per_task * SCHED_POWER_SCALE) /

6044

busiest->group_power;

6045

busiest->group_power;

6045

6046

if (busiest->avg_load + scaled_busy_load_per_task >=

6047

if (busiest->avg_load + scaled_busy_load_per_task >=

6047

local->avg_load + (scaled_busy_load_per_task * imbn)) {

6048

local->avg_load + (scaled_busy_load_per_task * imbn)) {

6048

env->imbalance = busiest->load_per_task;

6049

env->imbalance = busiest->load_per_task;

6049

return;

6050

return;

6050

}

6051

}

6051

6052

/*

6053

/*

6053

* OK, we don't have enough imbalance to justify moving tasks,

6054

* OK, we don't have enough imbalance to justify moving tasks,

6054

* however we may be able to increase total CPU power used by

6055

* however we may be able to increase total CPU power used by

6055

* moving them.

6056

* moving them.

6056

*/

6057

*/

6057

6058

pwr_now += busiest->group_power *

6059

pwr_now += busiest->group_power *

6059

min(busiest->load_per_task, busiest->avg_load);

6060

min(busiest->load_per_task, busiest->avg_load);

6060

pwr_now += local->group_power *

6061

pwr_now += local->group_power *

6061

min(local->load_per_task, local->avg_load);

6062

min(local->load_per_task, local->avg_load);

6062

pwr_now /= SCHED_POWER_SCALE;

6063

pwr_now /= SCHED_POWER_SCALE;

6063

6064

/* Amount of load we'd subtract */

6065

/* Amount of load we'd subtract */

6065

if (busiest->avg_load > scaled_busy_load_per_task) {

6066

if (busiest->avg_load > scaled_busy_load_per_task) {

6066

pwr_move += busiest->group_power *

6067

pwr_move += busiest->group_power *

6067

min(busiest->load_per_task,

6068

min(busiest->load_per_task,

6068

busiest->avg_load - scaled_busy_load_per_task);

6069

busiest->avg_load - scaled_busy_load_per_task);

6069

}

6070

}

6070

6071

/* Amount of load we'd add */

6072

/* Amount of load we'd add */

6072

if (busiest->avg_load * busiest->group_power <

6073

if (busiest->avg_load * busiest->group_power <

6073

busiest->load_per_task * SCHED_POWER_SCALE) {

6074

busiest->load_per_task * SCHED_POWER_SCALE) {

6074

tmp = (busiest->avg_load * busiest->group_power) /

6075

tmp = (busiest->avg_load * busiest->group_power) /

6075

local->group_power;

6076

local->group_power;

6076

} else {

6077

} else {

6077

tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /

6078

tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /

6078

local->group_power;

6079

local->group_power;

6079

}

6080

}

6080

pwr_move += local->group_power *

6081

pwr_move += local->group_power *

6081

min(local->load_per_task, local->avg_load + tmp);

6082

min(local->load_per_task, local->avg_load + tmp);

6082

pwr_move /= SCHED_POWER_SCALE;

6083

pwr_move /= SCHED_POWER_SCALE;

6083

6084

/* Move if we gain throughput */

6085

/* Move if we gain throughput */

6085

if (pwr_move > pwr_now)

6086

if (pwr_move > pwr_now)

6086

env->imbalance = busiest->load_per_task;

6087

env->imbalance = busiest->load_per_task;

6087

}

6088

}

6088

6089

/**

6090

/**

6090

* calculate_imbalance - Calculate the amount of imbalance present within the

6091

* calculate_imbalance - Calculate the amount of imbalance present within the

6091

* groups of a given sched_domain during load balance.

6092

* groups of a given sched_domain during load balance.

6092

* @env: load balance environment

6093

* @env: load balance environment

6093

* @sds: statistics of the sched_domain whose imbalance is to be calculated.

6094

* @sds: statistics of the sched_domain whose imbalance is to be calculated.

6094

*/

6095

*/

6095

static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6096

static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6096

{

6097

{

6097

unsigned long max_pull, load_above_capacity = ~0UL;

6098

unsigned long max_pull, load_above_capacity = ~0UL;

6098

struct sg_lb_stats *local, *busiest;

6099

struct sg_lb_stats *local, *busiest;

6099

6100

local = &sds->local_stat;

6101

local = &sds->local_stat;

6101

busiest = &sds->busiest_stat;

6102

busiest = &sds->busiest_stat;

6102

6103

if (busiest->group_imb) {

6104

if (busiest->group_imb) {

6104

/*

6105

/*

6105

* In the group_imb case we cannot rely on group-wide averages

6106

* In the group_imb case we cannot rely on group-wide averages

6106

* to ensure cpu-load equilibrium, look at wider averages. XXX

6107

* to ensure cpu-load equilibrium, look at wider averages. XXX

6107

*/

6108

*/

6108

busiest->load_per_task =

6109

busiest->load_per_task =

6109

min(busiest->load_per_task, sds->avg_load);

6110

min(busiest->load_per_task, sds->avg_load);

6110

}

6111

}

6111

6112

/*

6113

/*

6113

* In the presence of smp nice balancing, certain scenarios can have

6114

* In the presence of smp nice balancing, certain scenarios can have

6114

* max load less than avg load(as we skip the groups at or below

6115

* max load less than avg load(as we skip the groups at or below

6115

* its cpu_power, while calculating max_load..)

6116

* its cpu_power, while calculating max_load..)

6116

*/

6117

*/

6117

if (busiest->avg_load <= sds->avg_load ||

6118

if (busiest->avg_load <= sds->avg_load ||

6118

local->avg_load >= sds->avg_load) {

6119

local->avg_load >= sds->avg_load) {

6119

env->imbalance = 0;

6120

env->imbalance = 0;

6120

return fix_small_imbalance(env, sds);

6121

return fix_small_imbalance(env, sds);

6121

}

6122

}

6122

6123

if (!busiest->group_imb) {

6124

if (!busiest->group_imb) {

6124

/*

6125

/*

6125

* Don't want to pull so many tasks that a group would go idle.

6126

* Don't want to pull so many tasks that a group would go idle.

6126

* Except of course for the group_imb case, since then we might

6127

* Except of course for the group_imb case, since then we might

6127

* have to drop below capacity to reach cpu-load equilibrium.

6128

* have to drop below capacity to reach cpu-load equilibrium.

6128

*/

6129

*/

6129

load_above_capacity =

6130

load_above_capacity =

6130

(busiest->sum_nr_running - busiest->group_capacity);

6131

(busiest->sum_nr_running - busiest->group_capacity);

6131

6132

load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);

6133

load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);

6133

load_above_capacity /= busiest->group_power;

6134

load_above_capacity /= busiest->group_power;

6134

}

6135

}

6135

6136

/*

6137

/*

6137

* We're trying to get all the cpus to the average_load, so we don't

6138

* We're trying to get all the cpus to the average_load, so we don't

6138

* want to push ourselves above the average load, nor do we wish to

6139

* want to push ourselves above the average load, nor do we wish to

6139

* reduce the max loaded cpu below the average load. At the same time,

6140

* reduce the max loaded cpu below the average load. At the same time,

6140

* we also don't want to reduce the group load below the group capacity

6141

* we also don't want to reduce the group load below the group capacity

6141

* (so that we can implement power-savings policies etc). Thus we look

6142

* (so that we can implement power-savings policies etc). Thus we look

6142

* for the minimum possible imbalance.

6143

* for the minimum possible imbalance.

6143

*/

6144

*/

6144

max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

6145

max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

6145

6146

/* How much load to actually move to equalise the imbalance */

6147

/* How much load to actually move to equalise the imbalance */

6147

env->imbalance = min(

6148

env->imbalance = min(

6148

max_pull * busiest->group_power,

6149

max_pull * busiest->group_power,

6149

(sds->avg_load - local->avg_load) * local->group_power

6150

(sds->avg_load - local->avg_load) * local->group_power

6150

) / SCHED_POWER_SCALE;

6151

) / SCHED_POWER_SCALE;

6151

6152

/*

6153

/*

6153

* if *imbalance is less than the average load per runnable task

6154

* if *imbalance is less than the average load per runnable task

6154

* there is no guarantee that any tasks will be moved so we'll have

6155

* there is no guarantee that any tasks will be moved so we'll have

6155

* a think about bumping its value to force at least one task to be

6156

* a think about bumping its value to force at least one task to be

6156

* moved

6157

* moved

6157

*/

6158

*/

6158

if (env->imbalance < busiest->load_per_task)

6159

if (env->imbalance < busiest->load_per_task)

6159

return fix_small_imbalance(env, sds);

6160

return fix_small_imbalance(env, sds);

6160

}

6161

}

6161

6162

/******* find_busiest_group() helpers end here *********************/

6163

/******* find_busiest_group() helpers end here *********************/

6163

6164

/**

6165

/**

6165

* find_busiest_group - Returns the busiest group within the sched_domain

6166

* find_busiest_group - Returns the busiest group within the sched_domain

6166

* if there is an imbalance. If there isn't an imbalance, and

6167

* if there is an imbalance. If there isn't an imbalance, and

6167

* the user has opted for power-savings, it returns a group whose

6168

* the user has opted for power-savings, it returns a group whose

6168

* CPUs can be put to idle by rebalancing those tasks elsewhere, if

6169

* CPUs can be put to idle by rebalancing those tasks elsewhere, if

6169

* such a group exists.

6170

* such a group exists.

6170

*

6171

*

6171

* Also calculates the amount of weighted load which should be moved

6172

* Also calculates the amount of weighted load which should be moved

6172

* to restore balance.

6173

* to restore balance.

6173

*

6174

*

6174

* @env: The load balancing environment.

6175

* @env: The load balancing environment.

6175

*

6176

*

6176

* Return: - The busiest group if imbalance exists.

6177

* Return: - The busiest group if imbalance exists.

6177

* - If no imbalance and user has opted for power-savings balance,

6178

* - If no imbalance and user has opted for power-savings balance,

6178

* return the least loaded group whose CPUs can be

6179

* return the least loaded group whose CPUs can be

6179

* put to idle by rebalancing its tasks onto our group.

6180

* put to idle by rebalancing its tasks onto our group.

6180

*/

6181

*/

6181

static struct sched_group *find_busiest_group(struct lb_env *env)

6182

static struct sched_group *find_busiest_group(struct lb_env *env)

6182

{

6183

{

6183

struct sg_lb_stats *local, *busiest;

6184

struct sg_lb_stats *local, *busiest;

6184

struct sd_lb_stats sds;

6185

struct sd_lb_stats sds;

6185

6186

init_sd_lb_stats(&sds);

6187

init_sd_lb_stats(&sds);

6187

6188

/*

6189

/*

6189

* Compute the various statistics relavent for load balancing at

6190

* Compute the various statistics relavent for load balancing at

6190

* this level.

6191

* this level.

6191

*/

6192

*/

6192

update_sd_lb_stats(env, &sds);

6193

update_sd_lb_stats(env, &sds);

6193

local = &sds.local_stat;

6194

local = &sds.local_stat;

6194

busiest = &sds.busiest_stat;

6195

busiest = &sds.busiest_stat;

6195

6196

if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&

6197

if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&

6197

check_asym_packing(env, &sds))

6198

check_asym_packing(env, &sds))

6198

return sds.busiest;

6199

return sds.busiest;

6199

6200

/* There is no busy sibling group to pull tasks from */

6201

/* There is no busy sibling group to pull tasks from */

6201

if (!sds.busiest || busiest->sum_nr_running == 0)

6202

if (!sds.busiest || busiest->sum_nr_running == 0)

6202

goto out_balanced;

6203

goto out_balanced;

6203

6204

sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;

6205

sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;

6205

6206

/*

6207

/*

6207

* If the busiest group is imbalanced the below checks don't

6208

* If the busiest group is imbalanced the below checks don't

6208

* work because they assume all things are equal, which typically

6209

* work because they assume all things are equal, which typically

6209

* isn't true due to cpus_allowed constraints and the like.

6210

* isn't true due to cpus_allowed constraints and the like.

6210

*/

6211

*/

6211

if (busiest->group_imb)

6212

if (busiest->group_imb)

6212

goto force_balance;

6213

goto force_balance;

6213

6214

/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */

6215

/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */

6215

if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&

6216

if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&

6216

!busiest->group_has_capacity)

6217

!busiest->group_has_capacity)

6217

goto force_balance;

6218

goto force_balance;

6218

6219

/*

6220

/*

6220

* If the local group is more busy than the selected busiest group

6221

* If the local group is more busy than the selected busiest group

6221

* don't try and pull any tasks.

6222

* don't try and pull any tasks.

6222

*/

6223

*/

6223

if (local->avg_load >= busiest->avg_load)

6224

if (local->avg_load >= busiest->avg_load)

6224

goto out_balanced;

6225

goto out_balanced;

6225

6226

/*

6227

/*

6227

* Don't pull any tasks if this group is already above the domain

6228

* Don't pull any tasks if this group is already above the domain

6228

* average load.

6229

* average load.

6229

*/

6230

*/

6230

if (local->avg_load >= sds.avg_load)

6231

if (local->avg_load >= sds.avg_load)

6231

goto out_balanced;

6232

goto out_balanced;

6232

6233

if (env->idle == CPU_IDLE) {

6234

if (env->idle == CPU_IDLE) {

6234

/*

6235

/*

6235

* This cpu is idle. If the busiest group load doesn't

6236

* This cpu is idle. If the busiest group load doesn't

6236

* have more tasks than the number of available cpu's and

6237

* have more tasks than the number of available cpu's and

6237

* there is no imbalance between this and busiest group

6238

* there is no imbalance between this and busiest group

6238

* wrt to idle cpu's, it is balanced.

6239

* wrt to idle cpu's, it is balanced.

6239

*/

6240

*/

6240

if ((local->idle_cpus < busiest->idle_cpus) &&

6241

if ((local->idle_cpus < busiest->idle_cpus) &&

6241

busiest->sum_nr_running <= busiest->group_weight)

6242

busiest->sum_nr_running <= busiest->group_weight)

6242

goto out_balanced;

6243

goto out_balanced;

6243

} else {

6244

} else {

6244

/*

6245

/*

6245

* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use

6246

* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use

6246

* imbalance_pct to be conservative.

6247

* imbalance_pct to be conservative.

6247

*/

6248

*/

6248

if (100 * busiest->avg_load <=

6249

if (100 * busiest->avg_load <=

6249

env->sd->imbalance_pct * local->avg_load)

6250

env->sd->imbalance_pct * local->avg_load)

6250

goto out_balanced;

6251

goto out_balanced;

6251

}

6252

}

6252

6253

force_balance:

6254

force_balance:

6254

/* Looks like there is an imbalance. Compute it */

6255

/* Looks like there is an imbalance. Compute it */

6255

calculate_imbalance(env, &sds);

6256

calculate_imbalance(env, &sds);

6256

return sds.busiest;

6257

return sds.busiest;

6257

6258

out_balanced:

6259

out_balanced:

6259

env->imbalance = 0;

6260

env->imbalance = 0;

6260

return NULL;

6261

return NULL;

6261

}

6262

}

6262

6263

/*

6264

/*

6264

* find_busiest_queue - find the busiest runqueue among the cpus in group.

6265

* find_busiest_queue - find the busiest runqueue among the cpus in group.

6265

*/

6266

*/

6266

static struct rq *find_busiest_queue(struct lb_env *env,

6267

static struct rq *find_busiest_queue(struct lb_env *env,

6267

struct sched_group *group)

6268

struct sched_group *group)

6268

{

6269

{

6269

struct rq *busiest = NULL, *rq;

6270

struct rq *busiest = NULL, *rq;

6270

unsigned long busiest_load = 0, busiest_power = 1;

6271

unsigned long busiest_load = 0, busiest_power = 1;

6271

int i;

6272

int i;

6272

6273

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

6274

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

6274

unsigned long power, capacity, wl;

6275

unsigned long power, capacity, wl;

6275

enum fbq_type rt;

6276

enum fbq_type rt;

6276

6277

rq = cpu_rq(i);

6278

rq = cpu_rq(i);

6278

rt = fbq_classify_rq(rq);

6279

rt = fbq_classify_rq(rq);

6279

6280

/*

6281

/*

6281

* We classify groups/runqueues into three groups:

6282

* We classify groups/runqueues into three groups:

6282

* - regular: there are !numa tasks

6283

* - regular: there are !numa tasks

6283

* - remote: there are numa tasks that run on the 'wrong' node

6284

* - remote: there are numa tasks that run on the 'wrong' node

6284

* - all: there is no distinction

6285

* - all: there is no distinction

6285

*

6286

*

6286

* In order to avoid migrating ideally placed numa tasks,

6287

* In order to avoid migrating ideally placed numa tasks,

6287

* ignore those when there's better options.

6288

* ignore those when there's better options.

6288

*

6289

*

6289

* If we ignore the actual busiest queue to migrate another

6290

* If we ignore the actual busiest queue to migrate another

6290

* task, the next balance pass can still reduce the busiest

6291

* task, the next balance pass can still reduce the busiest

6291

* queue by moving tasks around inside the node.

6292

* queue by moving tasks around inside the node.

6292

*

6293

*

6293

* If we cannot move enough load due to this classification

6294

* If we cannot move enough load due to this classification

6294

* the next pass will adjust the group classification and

6295

* the next pass will adjust the group classification and

6295

* allow migration of more tasks.

6296

* allow migration of more tasks.

6296

*

6297

*

6297

* Both cases only affect the total convergence complexity.

6298

* Both cases only affect the total convergence complexity.

6298

*/

6299

*/

6299

if (rt > env->fbq_type)

6300

if (rt > env->fbq_type)

6300

continue;

6301

continue;

6301

6302

power = power_of(i);

6303

power = power_of(i);

6303

capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

6304

capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

6304

if (!capacity)

6305

if (!capacity)

6305

capacity = fix_small_capacity(env->sd, group);

6306

capacity = fix_small_capacity(env->sd, group);

6306

6307

wl = weighted_cpuload(i);

6308

wl = weighted_cpuload(i);

6308

6309

/*

6310

/*

6310

* When comparing with imbalance, use weighted_cpuload()

6311

* When comparing with imbalance, use weighted_cpuload()

6311

* which is not scaled with the cpu power.

6312

* which is not scaled with the cpu power.

6312

*/

6313

*/

6313

if (capacity && rq->nr_running == 1 && wl > env->imbalance)

6314

if (capacity && rq->nr_running == 1 && wl > env->imbalance)

6314

continue;

6315

continue;

6315

6316

/*

6317

/*

6317

* For the load comparisons with the other cpu's, consider

6318

* For the load comparisons with the other cpu's, consider

6318

* the weighted_cpuload() scaled with the cpu power, so that

6319

* the weighted_cpuload() scaled with the cpu power, so that

6319

* the load can be moved away from the cpu that is potentially

6320

* the load can be moved away from the cpu that is potentially

6320

* running at a lower capacity.

6321

* running at a lower capacity.

6321

*

6322

*

6322

* Thus we're looking for max(wl_i / power_i), crosswise

6323

* Thus we're looking for max(wl_i / power_i), crosswise

6323

* multiplication to rid ourselves of the division works out

6324

* multiplication to rid ourselves of the division works out

6324

* to: wl_i * power_j > wl_j * power_i; where j is our

6325

* to: wl_i * power_j > wl_j * power_i; where j is our

6325

* previous maximum.

6326

* previous maximum.

6326

*/

6327

*/

6327

if (wl * busiest_power > busiest_load * power) {

6328

if (wl * busiest_power > busiest_load * power) {

6328

busiest_load = wl;

6329

busiest_load = wl;

6329

busiest_power = power;

6330

busiest_power = power;

6330

busiest = rq;

6331

busiest = rq;

6331

}

6332

}

6332

}

6333

}

6333

6334

return busiest;

6335

return busiest;

6335

}

6336

}

6336

6337

/*
 * Ceiling for the balance-interval back-off applied when pinned tasks are
 * encountered (compared against sd->balance_interval).  The value is pretty
 * arbitrary; it only needs to be large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Per-cpu working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);

6345

6346

static int need_active_balance(struct lb_env *env)

6347

static int need_active_balance(struct lb_env *env)

6347

{

6348

{

6348

struct sched_domain *sd = env->sd;

6349

struct sched_domain *sd = env->sd;

6349

6350

if (env->idle == CPU_NEWLY_IDLE) {

6351

if (env->idle == CPU_NEWLY_IDLE) {

6351

6352

/*

6353

/*

6353

* ASYM_PACKING needs to force migrate tasks from busy but

6354

* ASYM_PACKING needs to force migrate tasks from busy but

6354

* higher numbered CPUs in order to pack all tasks in the

6355

* higher numbered CPUs in order to pack all tasks in the

6355

* lowest numbered CPUs.

6356

* lowest numbered CPUs.

6356

*/

6357

*/

6357

if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)

6358

if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)

6358

return 1;

6359

return 1;

6359

}

6360

}

6360

6361

return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);

6362

return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);

6362

}

6363

}

6363

6364

static int active_load_balance_cpu_stop(void *data);

6365

static int active_load_balance_cpu_stop(void *data);

6365

6366

static int should_we_balance(struct lb_env *env)

6367

static int should_we_balance(struct lb_env *env)

6367

{

6368

{

6368

struct sched_group *sg = env->sd->groups;

6369

struct sched_group *sg = env->sd->groups;

6369

struct cpumask *sg_cpus, *sg_mask;

6370

struct cpumask *sg_cpus, *sg_mask;

6370

int cpu, balance_cpu = -1;

6371

int cpu, balance_cpu = -1;

6371

6372

/*

6373

/*

6373

* In the newly idle case, we will allow all the cpu's

6374

* In the newly idle case, we will allow all the cpu's

6374

* to do the newly idle load balance.

6375

* to do the newly idle load balance.

6375

*/

6376

*/

6376

if (env->idle == CPU_NEWLY_IDLE)

6377

if (env->idle == CPU_NEWLY_IDLE)

6377

return 1;

6378

return 1;

6378

6379

sg_cpus = sched_group_cpus(sg);

6380

sg_cpus = sched_group_cpus(sg);

6380

sg_mask = sched_group_mask(sg);

6381

sg_mask = sched_group_mask(sg);

6381

/* Try to find first idle cpu */

6382

/* Try to find first idle cpu */

6382

for_each_cpu_and(cpu, sg_cpus, env->cpus) {

6383

for_each_cpu_and(cpu, sg_cpus, env->cpus) {

6383

if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))

6384

if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))

6384

continue;

6385

continue;

6385

6386

balance_cpu = cpu;

6387

balance_cpu = cpu;

6387

break;

6388

break;

6388

}

6389

}

6389

6390

if (balance_cpu == -1)

6391

if (balance_cpu == -1)

6391

balance_cpu = group_balance_cpu(sg);

6392

balance_cpu = group_balance_cpu(sg);

6392

6393

/*

6394

/*

6394

* First idle cpu or the first cpu(busiest) in this sched group

6395

* First idle cpu or the first cpu(busiest) in this sched group

6395

* is eligible for doing load balancing at this and above domains.

6396

* is eligible for doing load balancing at this and above domains.

6396

*/

6397

*/

6397

return balance_cpu == env->dst_cpu;

6398

return balance_cpu == env->dst_cpu;

6398

}

6399

}

6399

6400

/*

6401

/*

6401

* Check this_cpu to ensure it is balanced within domain. Attempt to move

6402

* Check this_cpu to ensure it is balanced within domain. Attempt to move

6402

* tasks if there is an imbalance.

6403

* tasks if there is an imbalance.

6403

*/

6404

*/

6404

static int load_balance(int this_cpu, struct rq *this_rq,

6405

static int load_balance(int this_cpu, struct rq *this_rq,

6405

struct sched_domain *sd, enum cpu_idle_type idle,

6406

struct sched_domain *sd, enum cpu_idle_type idle,

6406

int *continue_balancing)

6407

int *continue_balancing)

6407

{

6408

{

6408

int ld_moved, cur_ld_moved, active_balance = 0;

6409

int ld_moved, cur_ld_moved, active_balance = 0;

6409

struct sched_domain *sd_parent = sd->parent;

6410

struct sched_domain *sd_parent = sd->parent;

6410

struct sched_group *group;

6411

struct sched_group *group;

6411

struct rq *busiest;

6412

struct rq *busiest;

6412

unsigned long flags;

6413

unsigned long flags;

6413

struct cpumask *cpus = __get_cpu_var(load_balance_mask);

6414

struct cpumask *cpus = __get_cpu_var(load_balance_mask);

6414

6415

struct lb_env env = {

6416

struct lb_env env = {

6416

.sd = sd,

6417

.sd = sd,

6417

.dst_cpu = this_cpu,

6418

.dst_cpu = this_cpu,

6418

.dst_rq = this_rq,

6419

.dst_rq = this_rq,

6419

.dst_grpmask = sched_group_cpus(sd->groups),

6420

.dst_grpmask = sched_group_cpus(sd->groups),

6420

.idle = idle,

6421

.idle = idle,

6421

.loop_break = sched_nr_migrate_break,

6422

.loop_break = sched_nr_migrate_break,

6422

.cpus = cpus,

6423

.cpus = cpus,

6423

.fbq_type = all,

6424

.fbq_type = all,

6424

};

6425

};

6425

6426

/*

6427

/*

6427

* For NEWLY_IDLE load_balancing, we don't need to consider

6428

* For NEWLY_IDLE load_balancing, we don't need to consider

6428

* other cpus in our group

6429

* other cpus in our group

6429

*/

6430

*/

6430

if (idle == CPU_NEWLY_IDLE)

6431

if (idle == CPU_NEWLY_IDLE)

6431

env.dst_grpmask = NULL;

6432

env.dst_grpmask = NULL;

6432

6433

cpumask_copy(cpus, cpu_active_mask);

6434

cpumask_copy(cpus, cpu_active_mask);

6434

6435

schedstat_inc(sd, lb_count[idle]);

6436

schedstat_inc(sd, lb_count[idle]);

6436

6437

redo:

6438

redo:

6438

if (!should_we_balance(&env)) {

6439

if (!should_we_balance(&env)) {

6439

*continue_balancing = 0;

6440

*continue_balancing = 0;

6440

goto out_balanced;

6441

goto out_balanced;

6441

}

6442

}

6442

6443

group = find_busiest_group(&env);

6444

group = find_busiest_group(&env);

6444

if (!group) {

6445

if (!group) {

6445

schedstat_inc(sd, lb_nobusyg[idle]);

6446

schedstat_inc(sd, lb_nobusyg[idle]);

6446

goto out_balanced;

6447

goto out_balanced;

6447

}

6448

}

6448

6449

busiest = find_busiest_queue(&env, group);

6450

busiest = find_busiest_queue(&env, group);

6450

if (!busiest) {

6451

if (!busiest) {

6451

schedstat_inc(sd, lb_nobusyq[idle]);

6452

schedstat_inc(sd, lb_nobusyq[idle]);

6452

goto out_balanced;

6453

goto out_balanced;

6453

}

6454

}

6454

6455

BUG_ON(busiest == env.dst_rq);

6456

BUG_ON(busiest == env.dst_rq);

6456

6457

schedstat_add(sd, lb_imbalance[idle], env.imbalance);

6458

schedstat_add(sd, lb_imbalance[idle], env.imbalance);

6458

6459

ld_moved = 0;

6460

ld_moved = 0;

6460

if (busiest->nr_running > 1) {

6461

if (busiest->nr_running > 1) {

6461

/*

6462

/*

6462

* Attempt to move tasks. If find_busiest_group has found

6463

* Attempt to move tasks. If find_busiest_group has found

6463

* an imbalance but busiest->nr_running <= 1, the group is

6464

* an imbalance but busiest->nr_running <= 1, the group is

6464

* still unbalanced. ld_moved simply stays zero, so it is

6465

* still unbalanced. ld_moved simply stays zero, so it is

6465

* correctly treated as an imbalance.

6466

* correctly treated as an imbalance.

6466

*/

6467

*/

6467

env.flags |= LBF_ALL_PINNED;

6468

env.flags |= LBF_ALL_PINNED;

6468

env.src_cpu = busiest->cpu;

6469

env.src_cpu = busiest->cpu;

6469

env.src_rq = busiest;

6470

env.src_rq = busiest;

6470

env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

6471

env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

6471

6472

more_balance:

6473

more_balance:

6473

local_irq_save(flags);

6474

local_irq_save(flags);

6474

double_rq_lock(env.dst_rq, busiest);

6475

double_rq_lock(env.dst_rq, busiest);

6475

6476

/*

6477

/*

6477

* cur_ld_moved - load moved in current iteration

6478

* cur_ld_moved - load moved in current iteration

6478

* ld_moved - cumulative load moved across iterations

6479

* ld_moved - cumulative load moved across iterations

6479

*/

6480

*/

6480

cur_ld_moved = move_tasks(&env);

6481

cur_ld_moved = move_tasks(&env);

6481

ld_moved += cur_ld_moved;

6482

ld_moved += cur_ld_moved;

6482

double_rq_unlock(env.dst_rq, busiest);

6483

double_rq_unlock(env.dst_rq, busiest);

6483

local_irq_restore(flags);

6484

local_irq_restore(flags);

6484

6485

/*

6486

/*

6486

* some other cpu did the load balance for us.

6487

* some other cpu did the load balance for us.

6487

*/

6488

*/

6488

if (cur_ld_moved && env.dst_cpu != smp_processor_id())

6489

if (cur_ld_moved && env.dst_cpu != smp_processor_id())

6489

resched_cpu(env.dst_cpu);

6490

resched_cpu(env.dst_cpu);

6490

6491

if (env.flags & LBF_NEED_BREAK) {

6492

if (env.flags & LBF_NEED_BREAK) {

6492

env.flags &= ~LBF_NEED_BREAK;

6493

env.flags &= ~LBF_NEED_BREAK;

6493

goto more_balance;

6494

goto more_balance;

6494

}

6495

}

6495

6496

/*

6497

/*

6497

* Revisit (affine) tasks on src_cpu that couldn't be moved to

6498

* Revisit (affine) tasks on src_cpu that couldn't be moved to

6498

* us and move them to an alternate dst_cpu in our sched_group

6499

* us and move them to an alternate dst_cpu in our sched_group

6499

* where they can run. The upper limit on how many times we

6500

* where they can run. The upper limit on how many times we

6500

* iterate on same src_cpu is dependent on number of cpus in our

6501

* iterate on same src_cpu is dependent on number of cpus in our

6501

* sched_group.

6502

* sched_group.

6502

*

6503

*

6503

* This changes load balance semantics a bit on who can move

6504

* This changes load balance semantics a bit on who can move

6504

* load to a given_cpu. In addition to the given_cpu itself

6505

* load to a given_cpu. In addition to the given_cpu itself

6505

* (or a ilb_cpu acting on its behalf where given_cpu is

6506

* (or a ilb_cpu acting on its behalf where given_cpu is

6506

* nohz-idle), we now have balance_cpu in a position to move

6507

* nohz-idle), we now have balance_cpu in a position to move

6507

* load to given_cpu. In rare situations, this may cause

6508

* load to given_cpu. In rare situations, this may cause

6508

* conflicts (balance_cpu and given_cpu/ilb_cpu deciding

6509

* conflicts (balance_cpu and given_cpu/ilb_cpu deciding

6509

* _independently_ and at _same_ time to move some load to

6510

* _independently_ and at _same_ time to move some load to

6510

* given_cpu) causing exceess load to be moved to given_cpu.

6511

* given_cpu) causing exceess load to be moved to given_cpu.

6511

* This however should not happen so much in practice and

6512

* This however should not happen so much in practice and

6512

* moreover subsequent load balance cycles should correct the

6513

* moreover subsequent load balance cycles should correct the

6513

* excess load moved.

6514

* excess load moved.

6514

*/

6515

*/

6515

if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

6516

if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

6516

6517

/* Prevent to re-select dst_cpu via env's cpus */

6518

/* Prevent to re-select dst_cpu via env's cpus */

6518

cpumask_clear_cpu(env.dst_cpu, env.cpus);

6519

cpumask_clear_cpu(env.dst_cpu, env.cpus);

6519

6520

env.dst_rq = cpu_rq(env.new_dst_cpu);

6521

env.dst_rq = cpu_rq(env.new_dst_cpu);

6521

env.dst_cpu = env.new_dst_cpu;

6522

env.dst_cpu = env.new_dst_cpu;

6522

env.flags &= ~LBF_DST_PINNED;

6523

env.flags &= ~LBF_DST_PINNED;

6523

env.loop = 0;

6524

env.loop = 0;

6524

env.loop_break = sched_nr_migrate_break;

6525

env.loop_break = sched_nr_migrate_break;

6525

6526

/*

6527

/*

6527

* Go back to "more_balance" rather than "redo" since we

6528

* Go back to "more_balance" rather than "redo" since we

6528

* need to continue with same src_cpu.

6529

* need to continue with same src_cpu.

6529

*/

6530

*/

6530

goto more_balance;

6531

goto more_balance;

6531

}

6532

}

6532

6533

/*

6534

/*

6534

* We failed to reach balance because of affinity.

6535

* We failed to reach balance because of affinity.

6535

*/

6536

*/

6536

if (sd_parent) {

6537

if (sd_parent) {

6537

int *group_imbalance = &sd_parent->groups->sgp->imbalance;

6538

int *group_imbalance = &sd_parent->groups->sgp->imbalance;

6538

6539

if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {

6540

if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {

6540

*group_imbalance = 1;

6541

*group_imbalance = 1;

6541

} else if (*group_imbalance)

6542

} else if (*group_imbalance)

6542

*group_imbalance = 0;

6543

*group_imbalance = 0;

6543

}

6544

}

6544

6545

/* All tasks on this runqueue were pinned by CPU affinity */

6546

/* All tasks on this runqueue were pinned by CPU affinity */

6546

if (unlikely(env.flags & LBF_ALL_PINNED)) {

6547

if (unlikely(env.flags & LBF_ALL_PINNED)) {

6547

cpumask_clear_cpu(cpu_of(busiest), cpus);

6548

cpumask_clear_cpu(cpu_of(busiest), cpus);

6548

if (!cpumask_empty(cpus)) {

6549

if (!cpumask_empty(cpus)) {

6549

env.loop = 0;

6550

env.loop = 0;

6550

env.loop_break = sched_nr_migrate_break;

6551

env.loop_break = sched_nr_migrate_break;

6551

goto redo;

6552

goto redo;

6552

}

6553

}

6553

goto out_balanced;

6554

goto out_balanced;

6554

}

6555

}

6555

}

6556

}

6556

6557

if (!ld_moved) {

6558

if (!ld_moved) {

6558

schedstat_inc(sd, lb_failed[idle]);

6559

schedstat_inc(sd, lb_failed[idle]);

6559

/*

6560

/*

6560

* Increment the failure counter only on periodic balance.

6561

* Increment the failure counter only on periodic balance.

6561

* We do not want newidle balance, which can be very

6562

* We do not want newidle balance, which can be very

6562

* frequent, pollute the failure counter causing

6563

* frequent, pollute the failure counter causing

6563

* excessive cache_hot migrations and active balances.

6564

* excessive cache_hot migrations and active balances.

6564

*/

6565

*/

6565

if (idle != CPU_NEWLY_IDLE)

6566

if (idle != CPU_NEWLY_IDLE)

6566

sd->nr_balance_failed++;

6567

sd->nr_balance_failed++;

6567

6568

if (need_active_balance(&env)) {

6569

if (need_active_balance(&env)) {

6569

raw_spin_lock_irqsave(&busiest->lock, flags);

6570

raw_spin_lock_irqsave(&busiest->lock, flags);

6570

6571

/* don't kick the active_load_balance_cpu_stop,

6572

/* don't kick the active_load_balance_cpu_stop,

6572

* if the curr task on busiest cpu can't be

6573

* if the curr task on busiest cpu can't be

6573

* moved to this_cpu

6574

* moved to this_cpu

6574

*/

6575

*/

6575

if (!cpumask_test_cpu(this_cpu,

6576

if (!cpumask_test_cpu(this_cpu,

6576

tsk_cpus_allowed(busiest->curr))) {

6577

tsk_cpus_allowed(busiest->curr))) {

6577

raw_spin_unlock_irqrestore(&busiest->lock,

6578

raw_spin_unlock_irqrestore(&busiest->lock,

6578

flags);

6579

flags);

6579

env.flags |= LBF_ALL_PINNED;

6580

env.flags |= LBF_ALL_PINNED;

6580

goto out_one_pinned;

6581

goto out_one_pinned;

6581

}

6582

}

6582

6583

/*

6584

/*

6584

* ->active_balance synchronizes accesses to

6585

* ->active_balance synchronizes accesses to

6585

* ->active_balance_work. Once set, it's cleared

6586

* ->active_balance_work. Once set, it's cleared

6586

* only after active load balance is finished.

6587

* only after active load balance is finished.

6587

*/

6588

*/

6588

if (!busiest->active_balance) {

6589

if (!busiest->active_balance) {

6589

busiest->active_balance = 1;

6590

busiest->active_balance = 1;

6590

busiest->push_cpu = this_cpu;

6591

busiest->push_cpu = this_cpu;

6591

active_balance = 1;

6592

active_balance = 1;

6592

}

6593

}

6593

raw_spin_unlock_irqrestore(&busiest->lock, flags);

6594

raw_spin_unlock_irqrestore(&busiest->lock, flags);

6594

6595

if (active_balance) {

6596

if (active_balance) {

6596

stop_one_cpu_nowait(cpu_of(busiest),

6597

stop_one_cpu_nowait(cpu_of(busiest),

6597

active_load_balance_cpu_stop, busiest,

6598

active_load_balance_cpu_stop, busiest,

6598

&busiest->active_balance_work);

6599

&busiest->active_balance_work);

6599

}

6600

}

6600

6601

/*

6602

/*

6602

* We've kicked active balancing, reset the failure

6603

* We've kicked active balancing, reset the failure

6603

* counter.

6604

* counter.

6604

*/

6605

*/

6605

sd->nr_balance_failed = sd->cache_nice_tries+1;

6606

sd->nr_balance_failed = sd->cache_nice_tries+1;

6606

}

6607

}

6607

} else

6608

} else

6608

sd->nr_balance_failed = 0;

6609

sd->nr_balance_failed = 0;

6609

6610

if (likely(!active_balance)) {

6611

if (likely(!active_balance)) {

6611

/* We were unbalanced, so reset the balancing interval */

6612

/* We were unbalanced, so reset the balancing interval */

6612

sd->balance_interval = sd->min_interval;

6613

sd->balance_interval = sd->min_interval;

6613

} else {

6614

} else {

6614

/*

6615

/*

6615

* If we've begun active balancing, start to back off. This

6616

* If we've begun active balancing, start to back off. This

6616

* case may not be covered by the all_pinned logic if there

6617

* case may not be covered by the all_pinned logic if there

6617

* is only 1 task on the busy runqueue (because we don't call

6618

* is only 1 task on the busy runqueue (because we don't call

6618

* move_tasks).

6619

* move_tasks).

6619

*/

6620

*/

6620

if (sd->balance_interval < sd->max_interval)

6621

if (sd->balance_interval < sd->max_interval)

6621

sd->balance_interval *= 2;

6622

sd->balance_interval *= 2;

6622

}

6623

}

6623

6624

goto out;

6625

goto out;

6625

6626

out_balanced:

6627

out_balanced:

6627

schedstat_inc(sd, lb_balanced[idle]);

6628

schedstat_inc(sd, lb_balanced[idle]);

6628

6629

sd->nr_balance_failed = 0;

6630

sd->nr_balance_failed = 0;

6630

6631

out_one_pinned:

6632

out_one_pinned:

6632

/* tune up the balancing interval */

6633

/* tune up the balancing interval */

6633

if (((env.flags & LBF_ALL_PINNED) &&

6634

if (((env.flags & LBF_ALL_PINNED) &&

6634

sd->balance_interval < MAX_PINNED_INTERVAL) ||

6635

sd->balance_interval < MAX_PINNED_INTERVAL) ||

6635

(sd->balance_interval < sd->max_interval))

6636

(sd->balance_interval < sd->max_interval))

6636

sd->balance_interval *= 2;

6637

sd->balance_interval *= 2;

6637

6638

ld_moved = 0;

6639

ld_moved = 0;

6639

out:

6640

out:

6640

return ld_moved;

6641

return ld_moved;

6641

}

6642

}

6642

6643

/*

6644

/*

6644

* idle_balance is called by schedule() if this_cpu is about to become

6645

* idle_balance is called by schedule() if this_cpu is about to become

6645

* idle. Attempts to pull tasks from other CPUs.

6646

* idle. Attempts to pull tasks from other CPUs.

6646

*/

6647

*/

6647

static int idle_balance(struct rq *this_rq)

6648

static int idle_balance(struct rq *this_rq)

6648

{

6649

{

6649

struct sched_domain *sd;

6650

struct sched_domain *sd;

6650

int pulled_task = 0;

6651

int pulled_task = 0;

6651

unsigned long next_balance = jiffies + HZ;

6652

unsigned long next_balance = jiffies + HZ;

6652

u64 curr_cost = 0;

6653

u64 curr_cost = 0;

6653

int this_cpu = this_rq->cpu;

6654

int this_cpu = this_rq->cpu;

6654

6655

idle_enter_fair(this_rq);

6656

idle_enter_fair(this_rq);

6656

6657

/*

6658

/*

6658

* We must set idle_stamp _before_ calling idle_balance(), such that we

6659

* We must set idle_stamp _before_ calling idle_balance(), such that we

6659

* measure the duration of idle_balance() as idle time.

6660

* measure the duration of idle_balance() as idle time.

6660

*/

6661

*/

6661

this_rq->idle_stamp = rq_clock(this_rq);

6662

this_rq->idle_stamp = rq_clock(this_rq);

6662

6663

if (this_rq->avg_idle < sysctl_sched_migration_cost)

6664

if (this_rq->avg_idle < sysctl_sched_migration_cost)

6664

goto out;

6665

goto out;

6665

6666

/*

6667

/*

6667

* Drop the rq->lock, but keep IRQ/preempt disabled.

6668

* Drop the rq->lock, but keep IRQ/preempt disabled.

6668

*/

6669

*/

6669

raw_spin_unlock(&this_rq->lock);

6670

raw_spin_unlock(&this_rq->lock);

6670

6671

update_blocked_averages(this_cpu);

6672

update_blocked_averages(this_cpu);

6672

rcu_read_lock();

6673

rcu_read_lock();

6673

for_each_domain(this_cpu, sd) {

6674

for_each_domain(this_cpu, sd) {

6674

unsigned long interval;

6675

unsigned long interval;

6675

int continue_balancing = 1;

6676

int continue_balancing = 1;

6676

u64 t0, domain_cost;

6677

u64 t0, domain_cost;

6677

6678

if (!(sd->flags & SD_LOAD_BALANCE))

6679

if (!(sd->flags & SD_LOAD_BALANCE))

6679

continue;

6680

continue;

6680

6681

if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)

6682

if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)

6682

break;

6683

break;

6683

6684

if (sd->flags & SD_BALANCE_NEWIDLE) {

6685

if (sd->flags & SD_BALANCE_NEWIDLE) {

6685

t0 = sched_clock_cpu(this_cpu);

6686

t0 = sched_clock_cpu(this_cpu);

6686

6687

/* If we've pulled tasks over stop searching: */

6688

/* If we've pulled tasks over stop searching: */

6688

pulled_task = load_balance(this_cpu, this_rq,

6689

pulled_task = load_balance(this_cpu, this_rq,

6689

sd, CPU_NEWLY_IDLE,

6690

sd, CPU_NEWLY_IDLE,

6690

&continue_balancing);

6691

&continue_balancing);

6691

6692

domain_cost = sched_clock_cpu(this_cpu) - t0;

6693

domain_cost = sched_clock_cpu(this_cpu) - t0;

6693

if (domain_cost > sd->max_newidle_lb_cost)

6694

if (domain_cost > sd->max_newidle_lb_cost)

6694

sd->max_newidle_lb_cost = domain_cost;

6695

sd->max_newidle_lb_cost = domain_cost;

6695

6696

curr_cost += domain_cost;

6697

curr_cost += domain_cost;

6697

}

6698

}

6698

6699

interval = msecs_to_jiffies(sd->balance_interval);

6700

interval = msecs_to_jiffies(sd->balance_interval);

6700

if (time_after(next_balance, sd->last_balance + interval))

6701

if (time_after(next_balance, sd->last_balance + interval))

6701

next_balance = sd->last_balance + interval;

6702

next_balance = sd->last_balance + interval;

6702

if (pulled_task)

6703

if (pulled_task)

6703

break;

6704

break;

6704

}

6705

}

6705

rcu_read_unlock();

6706

rcu_read_unlock();

6706

6707

raw_spin_lock(&this_rq->lock);

6708

raw_spin_lock(&this_rq->lock);

6708

6709

if (curr_cost > this_rq->max_idle_balance_cost)

6710

if (curr_cost > this_rq->max_idle_balance_cost)

6710

this_rq->max_idle_balance_cost = curr_cost;

6711

this_rq->max_idle_balance_cost = curr_cost;

6711

6712

/*

6713

/*

6713

* While browsing the domains, we released the rq lock, a task could

6714

* While browsing the domains, we released the rq lock, a task could

6714

* have been enqueued in the meantime. Since we're not going idle,

6715

* have been enqueued in the meantime. Since we're not going idle,

6715

* pretend we pulled a task.

6716

* pretend we pulled a task.

6716

*/

6717

*/

6717

if (this_rq->cfs.h_nr_running && !pulled_task)

6718

if (this_rq->cfs.h_nr_running && !pulled_task)

6718

pulled_task = 1;

6719

pulled_task = 1;

6719

6720

if (pulled_task || time_after(jiffies, this_rq->next_balance)) {

6721

if (pulled_task || time_after(jiffies, this_rq->next_balance)) {

6721

/*

6722

/*

6722

* We are going idle. next_balance may be set based on

6723

* We are going idle. next_balance may be set based on

6723

* a busy processor. So reset next_balance.

6724

* a busy processor. So reset next_balance.

6724

*/

6725

*/

6725

this_rq->next_balance = next_balance;

6726

this_rq->next_balance = next_balance;

6726

}

6727

}

6727

6728

out:

6729

out:

6729

/* Is there a task of a high priority class? */

6730

/* Is there a task of a high priority class? */

6730

if (this_rq->nr_running != this_rq->cfs.h_nr_running &&

6731

if (this_rq->nr_running != this_rq->cfs.h_nr_running &&

6731

((this_rq->stop && this_rq->stop->on_rq) ||

6732

((this_rq->stop && this_rq->stop->on_rq) ||

6732

this_rq->dl.dl_nr_running ||

6733

this_rq->dl.dl_nr_running ||

6733

(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))

6734

(this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))

6734

pulled_task = -1;

6735

pulled_task = -1;

6735

6736

if (pulled_task) {

6737

if (pulled_task) {

6737

idle_exit_fair(this_rq);

6738

idle_exit_fair(this_rq);

6738

this_rq->idle_stamp = 0;

6739

this_rq->idle_stamp = 0;

6739

}

6740

}

6740

6741

return pulled_task;

6742

return pulled_task;

6742

}

6743

}

6743

6744

/*

6745

/*

6745

* active_load_balance_cpu_stop is run by cpu stopper. It pushes

6746

* active_load_balance_cpu_stop is run by cpu stopper. It pushes

6746

* running tasks off the busiest CPU onto idle CPUs. It requires at

6747

* running tasks off the busiest CPU onto idle CPUs. It requires at

6747

* least 1 task to be running on each physical CPU where possible, and

6748

* least 1 task to be running on each physical CPU where possible, and

6748

* avoids physical / logical imbalances.

6749

* avoids physical / logical imbalances.

6749

*/

6750

*/

6750

static int active_load_balance_cpu_stop(void *data)

6751

static int active_load_balance_cpu_stop(void *data)

6751

{

6752

{

6752

struct rq *busiest_rq = data;

6753

struct rq *busiest_rq = data;

6753

int busiest_cpu = cpu_of(busiest_rq);

6754

int busiest_cpu = cpu_of(busiest_rq);

6754

int target_cpu = busiest_rq->push_cpu;

6755

int target_cpu = busiest_rq->push_cpu;

6755

struct rq *target_rq = cpu_rq(target_cpu);

6756

struct rq *target_rq = cpu_rq(target_cpu);

6756

struct sched_domain *sd;

6757

struct sched_domain *sd;

6757

6758

raw_spin_lock_irq(&busiest_rq->lock);

6759

raw_spin_lock_irq(&busiest_rq->lock);

6759

6760

/* make sure the requested cpu hasn't gone down in the meantime */

6761

/* make sure the requested cpu hasn't gone down in the meantime */

6761

if (unlikely(busiest_cpu != smp_processor_id() ||

6762

if (unlikely(busiest_cpu != smp_processor_id() ||

6762

!busiest_rq->active_balance))

6763

!busiest_rq->active_balance))

6763

goto out_unlock;

6764

goto out_unlock;

6764

6765

/* Is there any task to move? */

6766

/* Is there any task to move? */

6766

if (busiest_rq->nr_running <= 1)

6767

if (busiest_rq->nr_running <= 1)

6767

goto out_unlock;

6768

goto out_unlock;

6768

6769

/*

6770

/*

6770

* This condition is "impossible", if it occurs

6771

* This condition is "impossible", if it occurs

6771

* we need to fix it. Originally reported by

6772

* we need to fix it. Originally reported by

6772

* Bjorn Helgaas on a 128-cpu setup.

6773

* Bjorn Helgaas on a 128-cpu setup.

6773

*/

6774

*/

6774

BUG_ON(busiest_rq == target_rq);

6775

BUG_ON(busiest_rq == target_rq);

6775

6776

/* move a task from busiest_rq to target_rq */

6777

/* move a task from busiest_rq to target_rq */

6777

double_lock_balance(busiest_rq, target_rq);

6778

double_lock_balance(busiest_rq, target_rq);

6778

6779

/* Search for an sd spanning us and the target CPU. */

6780

/* Search for an sd spanning us and the target CPU. */

6780

rcu_read_lock();

6781

rcu_read_lock();

6781

for_each_domain(target_cpu, sd) {

6782

for_each_domain(target_cpu, sd) {

6782

if ((sd->flags & SD_LOAD_BALANCE) &&

6783

if ((sd->flags & SD_LOAD_BALANCE) &&

6783

cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))

6784

cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))

6784

break;

6785

break;

6785

}

6786

}

6786

6787

if (likely(sd)) {

6788

if (likely(sd)) {

6788

struct lb_env env = {

6789

struct lb_env env = {

6789

.sd = sd,

6790

.sd = sd,

6790

.dst_cpu = target_cpu,

6791

.dst_cpu = target_cpu,

6791

.dst_rq = target_rq,

6792

.dst_rq = target_rq,

6792

.src_cpu = busiest_rq->cpu,

6793

.src_cpu = busiest_rq->cpu,

6793

.src_rq = busiest_rq,

6794

.src_rq = busiest_rq,

6794

.idle = CPU_IDLE,

6795

.idle = CPU_IDLE,

6795

};

6796

};

6796

6797

schedstat_inc(sd, alb_count);

6798

schedstat_inc(sd, alb_count);

6798

6799

if (move_one_task(&env))

6800

if (move_one_task(&env))

6800

schedstat_inc(sd, alb_pushed);

6801

schedstat_inc(sd, alb_pushed);

6801

else

6802

else

6802

schedstat_inc(sd, alb_failed);

6803

schedstat_inc(sd, alb_failed);

6803

}

6804

}

6804

rcu_read_unlock();

6805

rcu_read_unlock();

6805

double_unlock_balance(busiest_rq, target_rq);

6806

double_unlock_balance(busiest_rq, target_rq);

6806

out_unlock:

6807

out_unlock:

6807

busiest_rq->active_balance = 0;

6808

busiest_rq->active_balance = 0;

6808

raw_spin_unlock_irq(&busiest_rq->lock);

6809

raw_spin_unlock_irq(&busiest_rq->lock);

6809

return 0;

6810

return 0;

6810

}

6811

}

6811

6812

static inline int on_null_domain(struct rq *rq)

6813

static inline int on_null_domain(struct rq *rq)

6813

{

6814

{

6814

return unlikely(!rcu_dereference_sched(rq->sd));

6815

return unlikely(!rcu_dereference_sched(rq->sd));

6815

}

6816

}

6816

6817

#ifdef CONFIG_NO_HZ_COMMON

6818

#ifdef CONFIG_NO_HZ_COMMON

6818

/*

6819

/*

6819

* idle load balancing details

6820

* idle load balancing details

6820

* - When one of the busy CPUs notice that there may be an idle rebalancing

6821

* - When one of the busy CPUs notice that there may be an idle rebalancing

6821

* needed, they will kick the idle load balancer, which then does idle

6822

* needed, they will kick the idle load balancer, which then does idle

6822

* load balancing for all the idle CPUs.

6823

* load balancing for all the idle CPUs.

6823

*/

6824

*/

6824

static struct {

6825

static struct {

6825

cpumask_var_t idle_cpus_mask;

6826

cpumask_var_t idle_cpus_mask;

6826

atomic_t nr_cpus;

6827

atomic_t nr_cpus;

6827

unsigned long next_balance; /* in jiffy units */

6828

unsigned long next_balance; /* in jiffy units */

6828

} nohz ____cacheline_aligned;

6829

} nohz ____cacheline_aligned;

6829

6830

static inline int find_new_ilb(void)

6831

static inline int find_new_ilb(void)

6831

{

6832

{

6832

int ilb = cpumask_first(nohz.idle_cpus_mask);

6833

int ilb = cpumask_first(nohz.idle_cpus_mask);

6833

6834

if (ilb < nr_cpu_ids && idle_cpu(ilb))

6835

if (ilb < nr_cpu_ids && idle_cpu(ilb))

6835

return ilb;

6836

return ilb;

6836

6837

return nr_cpu_ids;

6838

return nr_cpu_ids;

6838

}

6839

}

6839

6840

/*

6841

/*

6841

* Kick a CPU to do the nohz balancing, if it is time for it. We pick the

6842

* Kick a CPU to do the nohz balancing, if it is time for it. We pick the

6842

* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle

6843

* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle

6843

* CPU (if there is one).

6844

* CPU (if there is one).

6844

*/

6845

*/

6845

static void nohz_balancer_kick(void)

6846

static void nohz_balancer_kick(void)

6846

{

6847

{

6847

int ilb_cpu;

6848

int ilb_cpu;

6848

6849

nohz.next_balance++;

6850

nohz.next_balance++;

6850

6851

ilb_cpu = find_new_ilb();

6852

ilb_cpu = find_new_ilb();

6852

6853

if (ilb_cpu >= nr_cpu_ids)

6854

if (ilb_cpu >= nr_cpu_ids)

6854

return;

6855

return;

6855

6856

if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))

6857

if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))

6857

return;

6858

return;

6858

/*

6859

/*

6859

* Use smp_send_reschedule() instead of resched_cpu().

6860

* Use smp_send_reschedule() instead of resched_cpu().

6860

* This way we generate a sched IPI on the target cpu which

6861

* This way we generate a sched IPI on the target cpu which

6861

* is idle. And the softirq performing nohz idle load balance

6862

* is idle. And the softirq performing nohz idle load balance

6862

* will be run before returning from the IPI.

6863

* will be run before returning from the IPI.

6863

*/

6864

*/

6864

smp_send_reschedule(ilb_cpu);

6865

smp_send_reschedule(ilb_cpu);

6865

return;

6866

return;

6866

}

6867

}

6867

6868

static inline void nohz_balance_exit_idle(int cpu)

6869

static inline void nohz_balance_exit_idle(int cpu)

6869

{

6870

{

6870

if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {

6871

if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {

6871

/*

6872

/*

6872

* Completely isolated CPUs don't ever set, so we must test.

6873

* Completely isolated CPUs don't ever set, so we must test.

6873

*/

6874

*/

6874

if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {

6875

if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {

6875

cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

6876

cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

6876

atomic_dec(&nohz.nr_cpus);

6877

atomic_dec(&nohz.nr_cpus);

6877

}

6878

}

6878

clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

6879

clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

6879

}

6880

}

6880

}

6881

}

6881

6882

static inline void set_cpu_sd_state_busy(void)

6883

static inline void set_cpu_sd_state_busy(void)

6883

{

6884

{

6884

struct sched_domain *sd;

6885

struct sched_domain *sd;

6885

int cpu = smp_processor_id();

6886

int cpu = smp_processor_id();

6886

6887

rcu_read_lock();

6888

rcu_read_lock();

6888

sd = rcu_dereference(per_cpu(sd_busy, cpu));

6889

sd = rcu_dereference(per_cpu(sd_busy, cpu));

6889

6890

if (!sd || !sd->nohz_idle)

6891

if (!sd || !sd->nohz_idle)

6891

goto unlock;

6892

goto unlock;

6892

sd->nohz_idle = 0;

6893

sd->nohz_idle = 0;

6893

6894

atomic_inc(&sd->groups->sgp->nr_busy_cpus);

6895

atomic_inc(&sd->groups->sgp->nr_busy_cpus);

6895

unlock:

6896

unlock:

6896

rcu_read_unlock();

6897

rcu_read_unlock();

6897

}

6898

}

6898

6899

void set_cpu_sd_state_idle(void)

6900

void set_cpu_sd_state_idle(void)

6900

{

6901

{

6901

struct sched_domain *sd;

6902

struct sched_domain *sd;

6902

int cpu = smp_processor_id();

6903

int cpu = smp_processor_id();

6903

6904

rcu_read_lock();

6905

rcu_read_lock();

6905

sd = rcu_dereference(per_cpu(sd_busy, cpu));

6906

sd = rcu_dereference(per_cpu(sd_busy, cpu));

6906

6907

if (!sd || sd->nohz_idle)

6908

if (!sd || sd->nohz_idle)

6908

goto unlock;

6909

goto unlock;

6909

sd->nohz_idle = 1;

6910

sd->nohz_idle = 1;

6910

6911

atomic_dec(&sd->groups->sgp->nr_busy_cpus);

6912

atomic_dec(&sd->groups->sgp->nr_busy_cpus);

6912

unlock:

6913

unlock:

6913

rcu_read_unlock();

6914

rcu_read_unlock();

6914

}

6915

}

6915

6916

/*

6917

/*

6917

* This routine will record that the cpu is going idle with tick stopped.

6918

* This routine will record that the cpu is going idle with tick stopped.

6918

* This info will be used in performing idle load balancing in the future.

6919

* This info will be used in performing idle load balancing in the future.

6919

*/

6920

*/

6920

void nohz_balance_enter_idle(int cpu)

6921

void nohz_balance_enter_idle(int cpu)

6921

{

6922

{

6922

/*

6923

/*

6923

* If this cpu is going down, then nothing needs to be done.

6924

* If this cpu is going down, then nothing needs to be done.

6924

*/

6925

*/

6925

if (!cpu_active(cpu))

6926

if (!cpu_active(cpu))

6926

return;

6927

return;

6927

6928

if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))

6929

if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))

6929

return;

6930

return;

6930

6931

/*

6932

/*

6932

* If we're a completely isolated CPU, we don't play.

6933

* If we're a completely isolated CPU, we don't play.

6933

*/

6934

*/

6934

if (on_null_domain(cpu_rq(cpu)))

6935

if (on_null_domain(cpu_rq(cpu)))

6935

return;

6936

return;

6936

6937

cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

6938

cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

6938

atomic_inc(&nohz.nr_cpus);

6939

atomic_inc(&nohz.nr_cpus);

6939

set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

6940

set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

6940

}

6941

}

6941

6942

static int sched_ilb_notifier(struct notifier_block *nfb,

6943

static int sched_ilb_notifier(struct notifier_block *nfb,

6943

unsigned long action, void *hcpu)

6944

unsigned long action, void *hcpu)

6944

{

6945

{

6945

switch (action & ~CPU_TASKS_FROZEN) {

6946

switch (action & ~CPU_TASKS_FROZEN) {

6946

case CPU_DYING:

6947

case CPU_DYING:

6947

nohz_balance_exit_idle(smp_processor_id());

6948

nohz_balance_exit_idle(smp_processor_id());

6948

return NOTIFY_OK;

6949

return NOTIFY_OK;

6949

default:

6950

default:

6950

return NOTIFY_DONE;

6951

return NOTIFY_DONE;

6951

}

6952

}

6952

}

6953

}

6953

#endif

6954

#endif

6954

6955

/* Taken (via trylock) while balancing a domain flagged SD_SERIALIZE. */
static DEFINE_SPINLOCK(balancing);

6956

6957

/*

6958

/*

6958

* Scale the max load_balance interval with the number of CPUs in the system.

6959

* Scale the max load_balance interval with the number of CPUs in the system.

6959

* This trades load-balance latency on larger machines for less cross talk.

6960

* This trades load-balance latency on larger machines for less cross talk.

6960

*/

6961

*/

6961

void update_max_interval(void)

6962

void update_max_interval(void)

6962

{

6963

{

6963

max_load_balance_interval = HZ*num_online_cpus()/10;

6964

max_load_balance_interval = HZ*num_online_cpus()/10;

6964

}

6965

}

6965

6966

/*

6967

/*

6967

* It checks each scheduling domain to see if it is due to be balanced,

6968

* It checks each scheduling domain to see if it is due to be balanced,

6968

* and initiates a balancing operation if so.

6969

* and initiates a balancing operation if so.

6969

*

6970

*

6970

* Balancing parameters are set up in init_sched_domains.

6971

* Balancing parameters are set up in init_sched_domains.

6971

*/

6972

*/

6972

static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

6973

static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

6973

{

6974

{

6974

int continue_balancing = 1;

6975

int continue_balancing = 1;

6975

int cpu = rq->cpu;

6976

int cpu = rq->cpu;

6976

unsigned long interval;

6977

unsigned long interval;

6977

struct sched_domain *sd;

6978

struct sched_domain *sd;

6978

/* Earliest time when we have to do rebalance again */

6979

/* Earliest time when we have to do rebalance again */

6979

unsigned long next_balance = jiffies + 60*HZ;

6980

unsigned long next_balance = jiffies + 60*HZ;

6980

int update_next_balance = 0;

6981

int update_next_balance = 0;

6981

int need_serialize, need_decay = 0;

6982

int need_serialize, need_decay = 0;

6982

u64 max_cost = 0;

6983

u64 max_cost = 0;

6983

6984

update_blocked_averages(cpu);

6985

update_blocked_averages(cpu);

6985

6986

rcu_read_lock();

6987

rcu_read_lock();

6987

for_each_domain(cpu, sd) {

6988

for_each_domain(cpu, sd) {

6988

/*

6989

/*

6989

* Decay the newidle max times here because this is a regular

6990

* Decay the newidle max times here because this is a regular

6990

* visit to all the domains. Decay ~1% per second.

6991

* visit to all the domains. Decay ~1% per second.

6991

*/

6992

*/

6992

if (time_after(jiffies, sd->next_decay_max_lb_cost)) {

6993

if (time_after(jiffies, sd->next_decay_max_lb_cost)) {

6993

sd->max_newidle_lb_cost =

6994

sd->max_newidle_lb_cost =

6994

(sd->max_newidle_lb_cost * 253) / 256;

6995

(sd->max_newidle_lb_cost * 253) / 256;

6995

sd->next_decay_max_lb_cost = jiffies + HZ;

6996

sd->next_decay_max_lb_cost = jiffies + HZ;

6996

need_decay = 1;

6997

need_decay = 1;

6997

}

6998

}

6998

max_cost += sd->max_newidle_lb_cost;

6999

max_cost += sd->max_newidle_lb_cost;

6999

7000

if (!(sd->flags & SD_LOAD_BALANCE))

7001

if (!(sd->flags & SD_LOAD_BALANCE))

7001

continue;

7002

continue;

7002

7003

/*

7004

/*

7004

* Stop the load balance at this level. There is another

7005

* Stop the load balance at this level. There is another

7005

* CPU in our sched group which is doing load balancing more

7006

* CPU in our sched group which is doing load balancing more

7006

* actively.

7007

* actively.

7007

*/

7008

*/

7008

if (!continue_balancing) {

7009

if (!continue_balancing) {

7009

if (need_decay)

7010

if (need_decay)

7010

continue;

7011

continue;

7011

break;

7012

break;

7012

}

7013

}

7013

7014

interval = sd->balance_interval;

7015

interval = sd->balance_interval;

7015

if (idle != CPU_IDLE)

7016

if (idle != CPU_IDLE)

7016

interval *= sd->busy_factor;

7017

interval *= sd->busy_factor;

7017

7018

/* scale ms to jiffies */

7019

/* scale ms to jiffies */

7019

interval = msecs_to_jiffies(interval);

7020

interval = msecs_to_jiffies(interval);

7020

interval = clamp(interval, 1UL, max_load_balance_interval);

7021

interval = clamp(interval, 1UL, max_load_balance_interval);

7021

7022

need_serialize = sd->flags & SD_SERIALIZE;

7023

need_serialize = sd->flags & SD_SERIALIZE;

7023

7024

if (need_serialize) {

7025

if (need_serialize) {

7025

if (!spin_trylock(&balancing))

7026

if (!spin_trylock(&balancing))

7026

goto out;

7027

goto out;

7027

}

7028

}

7028

7029

if (time_after_eq(jiffies, sd->last_balance + interval)) {

7030

if (time_after_eq(jiffies, sd->last_balance + interval)) {

7030

if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {

7031

if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {

7031

/*

7032

/*

7032

* The LBF_DST_PINNED logic could have changed

7033

* The LBF_DST_PINNED logic could have changed

7033

* env->dst_cpu, so we can't know our idle

7034

* env->dst_cpu, so we can't know our idle

7034

* state even if we migrated tasks. Update it.

7035

* state even if we migrated tasks. Update it.

7035

*/

7036

*/

7036

idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;

7037

idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;

7037

}

7038

}

7038

sd->last_balance = jiffies;

7039

sd->last_balance = jiffies;

7039

}

7040

}

7040

if (need_serialize)

7041

if (need_serialize)

7041

spin_unlock(&balancing);

7042

spin_unlock(&balancing);

7042

out:

7043

out:

7043

if (time_after(next_balance, sd->last_balance + interval)) {

7044

if (time_after(next_balance, sd->last_balance + interval)) {

7044

next_balance = sd->last_balance + interval;

7045

next_balance = sd->last_balance + interval;

7045

update_next_balance = 1;

7046

update_next_balance = 1;

7046

}

7047

}

7047

}

7048

}

7048

if (need_decay) {

7049

if (need_decay) {

7049

/*

7050

/*

7050

* Ensure the rq-wide value also decays but keep it at a

7051

* Ensure the rq-wide value also decays but keep it at a

7051

* reasonable floor to avoid funnies with rq->avg_idle.

7052

* reasonable floor to avoid funnies with rq->avg_idle.

7052

*/

7053

*/

7053

rq->max_idle_balance_cost =

7054

rq->max_idle_balance_cost =

7054

max((u64)sysctl_sched_migration_cost, max_cost);

7055

max((u64)sysctl_sched_migration_cost, max_cost);

7055

}

7056

}

7056

rcu_read_unlock();

7057

rcu_read_unlock();

7057

7058

/*

7059

/*

7059

* next_balance will be updated only when there is a need.

7060

* next_balance will be updated only when there is a need.

7060

* When the cpu is attached to null domain for ex, it will not be

7061

* When the cpu is attached to null domain for ex, it will not be

7061

* updated.

7062

* updated.

7062

*/

7063

*/

7063

if (likely(update_next_balance))

7064

if (likely(update_next_balance))

7064

rq->next_balance = next_balance;

7065

rq->next_balance = next_balance;

7065

}

7066

}

7066

7067

#ifdef CONFIG_NO_HZ_COMMON

7068

#ifdef CONFIG_NO_HZ_COMMON

7068

/*

7069

/*

7069

* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the

7070

* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the

7070

* rebalancing for all the cpus for whom scheduler ticks are stopped.

7071

* rebalancing for all the cpus for whom scheduler ticks are stopped.

7071

*/

7072

*/

7072

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

7073

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

7073

{

7074

{

7074

int this_cpu = this_rq->cpu;

7075

int this_cpu = this_rq->cpu;

7075

struct rq *rq;

7076

struct rq *rq;

7076

int balance_cpu;

7077

int balance_cpu;

7077

7078

if (idle != CPU_IDLE ||

7079

if (idle != CPU_IDLE ||

7079

!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))

7080

!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))

7080

goto end;

7081

goto end;

7081

7082

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {

7083

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {

7083

if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))

7084

if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))

7084

continue;

7085

continue;

7085

7086

/*

7087

/*

7087

* If this cpu gets work to do, stop the load balancing

7088

* If this cpu gets work to do, stop the load balancing

7088

* work being done for other cpus. Next load

7089

* work being done for other cpus. Next load

7089

* balancing owner will pick it up.

7090

* balancing owner will pick it up.

7090

*/

7091

*/

7091

if (need_resched())

7092

if (need_resched())

7092

break;

7093

break;

7093

7094

rq = cpu_rq(balance_cpu);

7095

rq = cpu_rq(balance_cpu);

7095

7096

raw_spin_lock_irq(&rq->lock);

7097

raw_spin_lock_irq(&rq->lock);

7097

update_rq_clock(rq);

7098

update_rq_clock(rq);

7098

update_idle_cpu_load(rq);

7099

update_idle_cpu_load(rq);

7099

raw_spin_unlock_irq(&rq->lock);

7100

raw_spin_unlock_irq(&rq->lock);

7100

7101

rebalance_domains(rq, CPU_IDLE);

7102

rebalance_domains(rq, CPU_IDLE);

7102

7103

if (time_after(this_rq->next_balance, rq->next_balance))

7104

if (time_after(this_rq->next_balance, rq->next_balance))

7104

this_rq->next_balance = rq->next_balance;

7105

this_rq->next_balance = rq->next_balance;

7105

}

7106

}

7106

nohz.next_balance = this_rq->next_balance;

7107

nohz.next_balance = this_rq->next_balance;

7107

end:

7108

end:

7108

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));

7109

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));

7109

}

7110

}

7110

7111

/*

7112

/*

7112

* Current heuristic for kicking the idle load balancer in the presence

7113

* Current heuristic for kicking the idle load balancer in the presence

7113

* of an idle cpu is the system.

7114

* of an idle cpu is the system.

7114

* - This rq has more than one task.

7115

* - This rq has more than one task.

7115

* - At any scheduler domain level, this cpu's scheduler group has multiple

7116

* - At any scheduler domain level, this cpu's scheduler group has multiple

7116

* busy cpu's exceeding the group's power.

7117

* busy cpu's exceeding the group's power.

7117

* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler

7118

* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler

7118

* domain span are idle.

7119

* domain span are idle.

7119

*/

7120

*/

7120

static inline int nohz_kick_needed(struct rq *rq)

7121

static inline int nohz_kick_needed(struct rq *rq)

7121

{

7122

{

7122

unsigned long now = jiffies;

7123

unsigned long now = jiffies;

7123

struct sched_domain *sd;

7124

struct sched_domain *sd;

7124

struct sched_group_power *sgp;

7125

struct sched_group_power *sgp;

7125

int nr_busy, cpu = rq->cpu;

7126

int nr_busy, cpu = rq->cpu;

7126

7127

if (unlikely(rq->idle_balance))

7128

if (unlikely(rq->idle_balance))

7128

return 0;

7129

return 0;

7129

7130

/*

7131

/*

7131

* We may be recently in ticked or tickless idle mode. At the first

7132

* We may be recently in ticked or tickless idle mode. At the first

7132

* busy tick after returning from idle, we will update the busy stats.

7133

* busy tick after returning from idle, we will update the busy stats.

7133

*/

7134

*/

7134

set_cpu_sd_state_busy();

7135

set_cpu_sd_state_busy();

7135

nohz_balance_exit_idle(cpu);

7136

nohz_balance_exit_idle(cpu);

7136

7137

/*

7138

/*

7138

* None are in tickless mode and hence no need for NOHZ idle load

7139

* None are in tickless mode and hence no need for NOHZ idle load

7139

* balancing.

7140

* balancing.

7140

*/

7141

*/

7141

if (likely(!atomic_read(&nohz.nr_cpus)))

7142

if (likely(!atomic_read(&nohz.nr_cpus)))

7142

return 0;

7143

return 0;

7143

7144

if (time_before(now, nohz.next_balance))

7145

if (time_before(now, nohz.next_balance))

7145

return 0;

7146

return 0;

7146

7147

if (rq->nr_running >= 2)

7148

if (rq->nr_running >= 2)

7148

goto need_kick;

7149

goto need_kick;

7149

7150

rcu_read_lock();

7151

rcu_read_lock();

7151

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7152

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7152

7153

if (sd) {

7154

if (sd) {

7154

sgp = sd->groups->sgp;

7155

sgp = sd->groups->sgp;

7155

nr_busy = atomic_read(&sgp->nr_busy_cpus);

7156

nr_busy = atomic_read(&sgp->nr_busy_cpus);

7156

7157

if (nr_busy > 1)

7158

if (nr_busy > 1)

7158

goto need_kick_unlock;

7159

goto need_kick_unlock;

7159

}

7160

}

7160

7161

sd = rcu_dereference(per_cpu(sd_asym, cpu));

7162

sd = rcu_dereference(per_cpu(sd_asym, cpu));

7162

7163

if (sd && (cpumask_first_and(nohz.idle_cpus_mask,

7164

if (sd && (cpumask_first_and(nohz.idle_cpus_mask,

7164

sched_domain_span(sd)) < cpu))

7165

sched_domain_span(sd)) < cpu))

7165

goto need_kick_unlock;

7166

goto need_kick_unlock;

7166

7167

rcu_read_unlock();

7168

rcu_read_unlock();

7168

return 0;

7169

return 0;

7169

7170

need_kick_unlock:

7171

need_kick_unlock:

7171

rcu_read_unlock();

7172

rcu_read_unlock();

7172

need_kick:

7173

need_kick:

7173

return 1;

7174

return 1;

7174

}

7175

}

7175

#else

7176

#else

7176

/* Without CONFIG_NO_HZ_COMMON there is no nohz idle balancing to perform. */
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
}

7177

#endif

7178

#endif

7178

7179

/*

7180

/*

7180

* run_rebalance_domains is triggered when needed from the scheduler tick.

7181

* run_rebalance_domains is triggered when needed from the scheduler tick.

7181

* Also triggered for nohz idle balancing (with nohz_balancing_kick set).

7182

* Also triggered for nohz idle balancing (with nohz_balancing_kick set).

7182

*/

7183

*/

7183

static void run_rebalance_domains(struct softirq_action *h)

7184

static void run_rebalance_domains(struct softirq_action *h)

7184

{

7185

{

7185

struct rq *this_rq = this_rq();

7186

struct rq *this_rq = this_rq();

7186

enum cpu_idle_type idle = this_rq->idle_balance ?

7187

enum cpu_idle_type idle = this_rq->idle_balance ?

7187

CPU_IDLE : CPU_NOT_IDLE;

7188

CPU_IDLE : CPU_NOT_IDLE;

7188

7189

rebalance_domains(this_rq, idle);

7190

rebalance_domains(this_rq, idle);

7190

7191

/*

7192

/*

7192

* If this cpu has a pending nohz_balance_kick, then do the

7193

* If this cpu has a pending nohz_balance_kick, then do the

7193

* balancing on behalf of the other idle cpus whose ticks are

7194

* balancing on behalf of the other idle cpus whose ticks are

7194

* stopped.

7195

* stopped.

7195

*/

7196

*/

7196

nohz_idle_balance(this_rq, idle);

7197

nohz_idle_balance(this_rq, idle);

7197

}

7198

}

7198

7199

/*

7200

/*

7200

* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.

7201

* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.

7201

*/

7202

*/

7202

void trigger_load_balance(struct rq *rq)

7203

void trigger_load_balance(struct rq *rq)

7203

{

7204

{

7204

/* Don't need to rebalance while attached to NULL domain */

7205

/* Don't need to rebalance while attached to NULL domain */

7205

if (unlikely(on_null_domain(rq)))

7206

if (unlikely(on_null_domain(rq)))

7206

return;

7207

return;

7207

7208

if (time_after_eq(jiffies, rq->next_balance))

7209

if (time_after_eq(jiffies, rq->next_balance))

7209

raise_softirq(SCHED_SOFTIRQ);

7210

raise_softirq(SCHED_SOFTIRQ);

7210

#ifdef CONFIG_NO_HZ_COMMON

7211

#ifdef CONFIG_NO_HZ_COMMON

7211

if (nohz_kick_needed(rq))

7212

if (nohz_kick_needed(rq))

7212

nohz_balancer_kick();

7213

nohz_balancer_kick();

7213

#endif

7214

#endif

7214

}

7215

}

7215

7216

/*
 * ->rq_online callback of fair_sched_class: re-runs update_sysctl().
 * The @rq argument itself is not used.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}

7220

7221

/*
 * ->rq_offline callback of fair_sched_class: re-runs update_sysctl() and
 * then unthrottles the cfs_rqs of the runqueue that is going offline.
 */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}

7228

7229

#endif /* CONFIG_SMP */

7230

#endif /* CONFIG_SMP */

7230

7231

/*

7232

/*

7232

* scheduler tick hitting a task of our scheduling class:

7233

* scheduler tick hitting a task of our scheduling class:

7233

*/

7234

*/

7234

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

7235

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

7235

{

7236

{

7236

struct cfs_rq *cfs_rq;

7237

struct cfs_rq *cfs_rq;

7237

struct sched_entity *se = &curr->se;

7238

struct sched_entity *se = &curr->se;

7238

7239

for_each_sched_entity(se) {

7240

for_each_sched_entity(se) {

7240

cfs_rq = cfs_rq_of(se);

7241

cfs_rq = cfs_rq_of(se);

7241

entity_tick(cfs_rq, se, queued);

7242

entity_tick(cfs_rq, se, queued);

7242

}

7243

}

7243

7244

if (numabalancing_enabled)

7245

if (numabalancing_enabled)

7245

task_tick_numa(rq, curr);

7246

task_tick_numa(rq, curr);

7246

7247

update_rq_runnable_avg(rq, 1);

7248

update_rq_runnable_avg(rq, 1);

7248

}

7249

}

7249

7250

/*

7251

/*

7251

* called on fork with the child task as argument from the parent's context

7252

* called on fork with the child task as argument from the parent's context

7252

* - child not yet on the tasklist

7253

* - child not yet on the tasklist

7253

* - preemption disabled

7254

* - preemption disabled

7254

*/

7255

*/

7255

static void task_fork_fair(struct task_struct *p)

7256

static void task_fork_fair(struct task_struct *p)

7256

{

7257

{

7257

struct cfs_rq *cfs_rq;

7258

struct cfs_rq *cfs_rq;

7258

struct sched_entity *se = &p->se, *curr;

7259

struct sched_entity *se = &p->se, *curr;

7259

int this_cpu = smp_processor_id();

7260

int this_cpu = smp_processor_id();

7260

struct rq *rq = this_rq();

7261

struct rq *rq = this_rq();

7261

unsigned long flags;

7262

unsigned long flags;

7262

7263

raw_spin_lock_irqsave(&rq->lock, flags);

7264

raw_spin_lock_irqsave(&rq->lock, flags);

7264

7265

update_rq_clock(rq);

7266

update_rq_clock(rq);

7266

7267

cfs_rq = task_cfs_rq(current);

7268

cfs_rq = task_cfs_rq(current);

7268

curr = cfs_rq->curr;

7269

curr = cfs_rq->curr;

7269

7270

/*

7271

/*

7271

* Not only the cpu but also the task_group of the parent might have

7272

* Not only the cpu but also the task_group of the parent might have

7272

* been changed after parent->se.parent,cfs_rq were copied to

7273

* been changed after parent->se.parent,cfs_rq were copied to

7273

* child->se.parent,cfs_rq. So call __set_task_cpu() to make those

7274

* child->se.parent,cfs_rq. So call __set_task_cpu() to make those

7274

* of child point to valid ones.

7275

* of child point to valid ones.

7275

*/

7276

*/

7276

rcu_read_lock();

7277

rcu_read_lock();

7277

__set_task_cpu(p, this_cpu);

7278

__set_task_cpu(p, this_cpu);

7278

rcu_read_unlock();

7279

rcu_read_unlock();

7279

7280

update_curr(cfs_rq);

7281

update_curr(cfs_rq);

7281

7282

if (curr)

7283

if (curr)

7283

se->vruntime = curr->vruntime;

7284

se->vruntime = curr->vruntime;

7284

place_entity(cfs_rq, se, 1);

7285

place_entity(cfs_rq, se, 1);

7285

7286

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

7287

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

7287

/*

7288

/*

7288

* Upon rescheduling, sched_class::put_prev_task() will place

7289

* Upon rescheduling, sched_class::put_prev_task() will place

7289

* 'current' within the tree based on its new key value.

7290

* 'current' within the tree based on its new key value.

7290

*/

7291

*/

7291

swap(curr->vruntime, se->vruntime);

7292

swap(curr->vruntime, se->vruntime);

7292

resched_task(rq->curr);

7293

resched_task(rq->curr);

7293

}

7294

}

7294

7295

se->vruntime -= cfs_rq->min_vruntime;

7296

se->vruntime -= cfs_rq->min_vruntime;

7296

7297

raw_spin_unlock_irqrestore(&rq->lock, flags);

7298

raw_spin_unlock_irqrestore(&rq->lock, flags);

7298

}

7299

}

7299

7300

/*

7301

/*

7301

* Priority of the task has changed. Check to see if we preempt

7302

* Priority of the task has changed. Check to see if we preempt

7302

* the current task.

7303

* the current task.

7303

*/

7304

*/

7304

static void

7305

static void

7305

prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)

7306

prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)

7306

{

7307

{

7307

if (!p->se.on_rq)

7308

if (!p->se.on_rq)

7308

return;

7309

return;

7309

7310

/*

7311

/*

7311

* Reschedule if we are currently running on this runqueue and

7312

* Reschedule if we are currently running on this runqueue and

7312

* our priority decreased, or if we are not currently running on

7313

* our priority decreased, or if we are not currently running on

7313

* this runqueue and our priority is higher than the current's

7314

* this runqueue and our priority is higher than the current's

7314

*/

7315

*/

7315

if (rq->curr == p) {

7316

if (rq->curr == p) {

7316

if (p->prio > oldprio)

7317

if (p->prio > oldprio)

7317

resched_task(rq->curr);

7318

resched_task(rq->curr);

7318

} else

7319

} else

7319

check_preempt_curr(rq, p, 0);

7320

check_preempt_curr(rq, p, 0);

7320

}

7321

}

7321

7322

static void switched_from_fair(struct rq *rq, struct task_struct *p)

7323

static void switched_from_fair(struct rq *rq, struct task_struct *p)

7323

{

7324

{

7324

struct sched_entity *se = &p->se;

7325

struct sched_entity *se = &p->se;

7325

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7326

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7326

7327

/*

7328

/*

7328

* Ensure the task's vruntime is normalized, so that when it's

7329

* Ensure the task's vruntime is normalized, so that when it's

7329

* switched back to the fair class the enqueue_entity(.flags=0) will

7330

* switched back to the fair class the enqueue_entity(.flags=0) will

7330

* do the right thing.

7331

* do the right thing.

7331

*

7332

*

7332

* If it's on_rq, then the dequeue_entity(.flags=0) will already

7333

* If it's on_rq, then the dequeue_entity(.flags=0) will already

7333

* have normalized the vruntime, if it's !on_rq, then only when

7334

* have normalized the vruntime, if it's !on_rq, then only when

7334

* the task is sleeping will it still have non-normalized vruntime.

7335

* the task is sleeping will it still have non-normalized vruntime.

7335

*/

7336

*/

7336

if (!p->on_rq && p->state != TASK_RUNNING) {

7337

if (!p->on_rq && p->state != TASK_RUNNING) {

7337

/*

7338

/*

7338

* Fix up our vruntime so that the current sleep doesn't

7339

* Fix up our vruntime so that the current sleep doesn't

7339

* cause 'unlimited' sleep bonus.

7340

* cause 'unlimited' sleep bonus.

7340

*/

7341

*/

7341

place_entity(cfs_rq, se, 0);

7342

place_entity(cfs_rq, se, 0);

7342

se->vruntime -= cfs_rq->min_vruntime;

7343

se->vruntime -= cfs_rq->min_vruntime;

7343

}

7344

}

7344

7345

#ifdef CONFIG_SMP

7346

#ifdef CONFIG_SMP

7346

/*

7347

/*

7347

* Remove our load from contribution when we leave sched_fair

7348

* Remove our load from contribution when we leave sched_fair

7348

* and ensure we don't carry in an old decay_count if we

7349

* and ensure we don't carry in an old decay_count if we

7349

* switch back.

7350

* switch back.

7350

*/

7351

*/

7351

if (se->avg.decay_count) {

7352

if (se->avg.decay_count) {

7352

__synchronize_entity_decay(se);

7353

__synchronize_entity_decay(se);

7353

subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);

7354

subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);

7354

}

7355

}

7355

#endif

7356

#endif

7356

}

7357

}

7357

7358

/*

7359

/*

7359

* We switched to the sched_fair class.

7360

* We switched to the sched_fair class.

7360

*/

7361

*/

7361

static void switched_to_fair(struct rq *rq, struct task_struct *p)

7362

static void switched_to_fair(struct rq *rq, struct task_struct *p)

7362

{

7363

{

7363

struct sched_entity *se = &p->se;

7364

struct sched_entity *se = &p->se;

7364

#ifdef CONFIG_FAIR_GROUP_SCHED

7365

#ifdef CONFIG_FAIR_GROUP_SCHED

7365

/*

7366

/*

7366

* Since the real-depth could have been changed (only FAIR

7367

* Since the real-depth could have been changed (only FAIR

7367

* class maintain depth value), reset depth properly.

7368

* class maintain depth value), reset depth properly.

7368

*/

7369

*/

7369

se->depth = se->parent ? se->parent->depth + 1 : 0;

7370

se->depth = se->parent ? se->parent->depth + 1 : 0;

7370

#endif

7371

#endif

7371

if (!se->on_rq)

7372

if (!se->on_rq)

7372

return;

7373

return;

7373

7374

/*

7375

/*

7375

* We were most likely switched from sched_rt, so

7376

* We were most likely switched from sched_rt, so

7376

* kick off the schedule if running, otherwise just see

7377

* kick off the schedule if running, otherwise just see

7377

* if we can still preempt the current task.

7378

* if we can still preempt the current task.

7378

*/

7379

*/

7379

if (rq->curr == p)

7380

if (rq->curr == p)

7380

resched_task(rq->curr);

7381

resched_task(rq->curr);

7381

else

7382

else

7382

check_preempt_curr(rq, p, 0);

7383

check_preempt_curr(rq, p, 0);

7383

}

7384

}

7384

7385

/* Account for a task changing its policy or group.

7386

/* Account for a task changing its policy or group.

7386

*

7387

*

7387

* This routine is mostly called to set cfs_rq->curr field when a task

7388

* This routine is mostly called to set cfs_rq->curr field when a task

7388

* migrates between groups/classes.

7389

* migrates between groups/classes.

7389

*/

7390

*/

7390

static void set_curr_task_fair(struct rq *rq)

7391

static void set_curr_task_fair(struct rq *rq)

7391

{

7392

{

7392

struct sched_entity *se = &rq->curr->se;

7393

struct sched_entity *se = &rq->curr->se;

7393

7394

for_each_sched_entity(se) {

7395

for_each_sched_entity(se) {

7395

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7396

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7396

7397

set_next_entity(cfs_rq, se);

7398

set_next_entity(cfs_rq, se);

7398

/* ensure bandwidth has been allocated on our new cfs_rq */

7399

/* ensure bandwidth has been allocated on our new cfs_rq */

7399

account_cfs_rq_runtime(cfs_rq, 0);

7400

account_cfs_rq_runtime(cfs_rq, 0);

7400

}

7401

}

7401

}

7402

}

7402

7403

void init_cfs_rq(struct cfs_rq *cfs_rq)

7404

void init_cfs_rq(struct cfs_rq *cfs_rq)

7404

{

7405

{

7405

cfs_rq->tasks_timeline = RB_ROOT;

7406

cfs_rq->tasks_timeline = RB_ROOT;

7406

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

7407

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

7407

#ifndef CONFIG_64BIT

7408

#ifndef CONFIG_64BIT

7408

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

7409

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

7409

#endif

7410

#endif

7410

#ifdef CONFIG_SMP

7411

#ifdef CONFIG_SMP

7411

atomic64_set(&cfs_rq->decay_counter, 1);

7412

atomic64_set(&cfs_rq->decay_counter, 1);

7412

atomic_long_set(&cfs_rq->removed_load, 0);

7413

atomic_long_set(&cfs_rq->removed_load, 0);

7413

#endif

7414

#endif

7414

}

7415

}

7415

7416

#ifdef CONFIG_FAIR_GROUP_SCHED

7417

#ifdef CONFIG_FAIR_GROUP_SCHED

7417

static void task_move_group_fair(struct task_struct *p, int on_rq)

7418

static void task_move_group_fair(struct task_struct *p, int on_rq)

7418

{

7419

{

7419

struct sched_entity *se = &p->se;

7420

struct sched_entity *se = &p->se;

7420

struct cfs_rq *cfs_rq;

7421

struct cfs_rq *cfs_rq;

7421

7422

/*

7423

/*

7423

* If the task was not on the rq at the time of this cgroup movement

7424

* If the task was not on the rq at the time of this cgroup movement

7424

* it must have been asleep, sleeping tasks keep their ->vruntime

7425

* it must have been asleep, sleeping tasks keep their ->vruntime

7425

* absolute on their old rq until wakeup (needed for the fair sleeper

7426

* absolute on their old rq until wakeup (needed for the fair sleeper

7426

* bonus in place_entity()).

7427

* bonus in place_entity()).

7427

*

7428

*

7428

* If it was on the rq, we've just 'preempted' it, which does convert

7429

* If it was on the rq, we've just 'preempted' it, which does convert

7429

* ->vruntime to a relative base.

7430

* ->vruntime to a relative base.

7430

*

7431

*

7431

* Make sure both cases convert their relative position when migrating

7432

* Make sure both cases convert their relative position when migrating

7432

* to another cgroup's rq. This does somewhat interfere with the

7433

* to another cgroup's rq. This does somewhat interfere with the

7433

* fair sleeper stuff for the first placement, but who cares.

7434

* fair sleeper stuff for the first placement, but who cares.

7434

*/

7435

*/

7435

/*

7436

/*

7436

* When !on_rq, vruntime of the task has usually NOT been normalized.

7437

* When !on_rq, vruntime of the task has usually NOT been normalized.

7437

* But there are some cases where it has already been normalized:

7438

* But there are some cases where it has already been normalized:

7438

*

7439

*

7439

* - Moving a forked child which is waiting for being woken up by

7440

* - Moving a forked child which is waiting for being woken up by

7440

* wake_up_new_task().

7441

* wake_up_new_task().

7441

* - Moving a task which has been woken up by try_to_wake_up() and

7442

* - Moving a task which has been woken up by try_to_wake_up() and

7442

* waiting for actually being woken up by sched_ttwu_pending().

7443

* waiting for actually being woken up by sched_ttwu_pending().

7443

*

7444

*

7444

* To prevent boost or penalty in the new cfs_rq caused by delta

7445

* To prevent boost or penalty in the new cfs_rq caused by delta

7445

* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.

7446

* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.

7446

*/

7447

*/

7447

if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))

7448

if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))

7448

on_rq = 1;

7449

on_rq = 1;

7449

7450

if (!on_rq)

7451

if (!on_rq)

7451

se->vruntime -= cfs_rq_of(se)->min_vruntime;

7452

se->vruntime -= cfs_rq_of(se)->min_vruntime;

7452

set_task_rq(p, task_cpu(p));

7453

set_task_rq(p, task_cpu(p));

7453

se->depth = se->parent ? se->parent->depth + 1 : 0;

7454

se->depth = se->parent ? se->parent->depth + 1 : 0;

7454

if (!on_rq) {

7455

if (!on_rq) {

7455

cfs_rq = cfs_rq_of(se);

7456

cfs_rq = cfs_rq_of(se);

7456

se->vruntime += cfs_rq->min_vruntime;

7457

se->vruntime += cfs_rq->min_vruntime;

7457

#ifdef CONFIG_SMP

7458

#ifdef CONFIG_SMP

7458

/*

7459

/*

7459

* migrate_task_rq_fair() will have removed our previous

7460

* migrate_task_rq_fair() will have removed our previous

7460

* contribution, but we must synchronize for ongoing future

7461

* contribution, but we must synchronize for ongoing future

7461

* decay.

7462

* decay.

7462

*/

7463

*/

7463

se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);

7464

se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);

7464

cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;

7465

cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;

7465

#endif

7466

#endif

7466

}

7467

}

7467

}

7468

}

7468

7469

void free_fair_sched_group(struct task_group *tg)

7470

void free_fair_sched_group(struct task_group *tg)

7470

{

7471

{

7471

int i;

7472

int i;

7472

7473

destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

7474

destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

7474

7475

for_each_possible_cpu(i) {

7476

for_each_possible_cpu(i) {

7476

if (tg->cfs_rq)

7477

if (tg->cfs_rq)

7477

kfree(tg->cfs_rq[i]);

7478

kfree(tg->cfs_rq[i]);

7478

if (tg->se)

7479

if (tg->se)

7479

kfree(tg->se[i]);

7480

kfree(tg->se[i]);

7480

}

7481

}

7481

7482

kfree(tg->cfs_rq);

7483

kfree(tg->cfs_rq);

7483

kfree(tg->se);

7484

kfree(tg->se);

7484

}

7485

}

7485

7486

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

7487

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

7487

{

7488

{

7488

struct cfs_rq *cfs_rq;

7489

struct cfs_rq *cfs_rq;

7489

struct sched_entity *se;

7490

struct sched_entity *se;

7490

int i;

7491

int i;

7491

7492

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

7493

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

7493

if (!tg->cfs_rq)

7494

if (!tg->cfs_rq)

7494

goto err;

7495

goto err;

7495

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

7496

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

7496

if (!tg->se)

7497

if (!tg->se)

7497

goto err;

7498

goto err;

7498

7499

tg->shares = NICE_0_LOAD;

7500

tg->shares = NICE_0_LOAD;

7500

7501

init_cfs_bandwidth(tg_cfs_bandwidth(tg));

7502

init_cfs_bandwidth(tg_cfs_bandwidth(tg));

7502

7503

for_each_possible_cpu(i) {

7504

for_each_possible_cpu(i) {

7504

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

7505

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

7505

GFP_KERNEL, cpu_to_node(i));

7506

GFP_KERNEL, cpu_to_node(i));

7506

if (!cfs_rq)

7507

if (!cfs_rq)

7507

goto err;

7508

goto err;

7508

7509

se = kzalloc_node(sizeof(struct sched_entity),

7510

se = kzalloc_node(sizeof(struct sched_entity),

7510

GFP_KERNEL, cpu_to_node(i));

7511

GFP_KERNEL, cpu_to_node(i));

7511

if (!se)

7512

if (!se)

7512

goto err_free_rq;

7513

goto err_free_rq;

7513

7514

init_cfs_rq(cfs_rq);

7515

init_cfs_rq(cfs_rq);

7515

init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);

7516

init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);

7516

}

7517

}

7517

7518

return 1;

7519

return 1;

7519

7520

err_free_rq:

7521

err_free_rq:

7521

kfree(cfs_rq);

7522

kfree(cfs_rq);

7522

err:

7523

err:

7523

return 0;

7524

return 0;

7524

}

7525

}

7525

7526

void unregister_fair_sched_group(struct task_group *tg, int cpu)

7527

void unregister_fair_sched_group(struct task_group *tg, int cpu)

7527

{

7528

{

7528

struct rq *rq = cpu_rq(cpu);

7529

struct rq *rq = cpu_rq(cpu);

7529

unsigned long flags;

7530

unsigned long flags;

7530

7531

/*

7532

/*

7532

* Only empty task groups can be destroyed; so we can speculatively

7533

* Only empty task groups can be destroyed; so we can speculatively

7533

* check on_list without danger of it being re-added.

7534

* check on_list without danger of it being re-added.

7534

*/

7535

*/

7535

if (!tg->cfs_rq[cpu]->on_list)

7536

if (!tg->cfs_rq[cpu]->on_list)

7536

return;

7537

return;

7537

7538

raw_spin_lock_irqsave(&rq->lock, flags);

7539

raw_spin_lock_irqsave(&rq->lock, flags);

7539

list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);

7540

list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);

7540

raw_spin_unlock_irqrestore(&rq->lock, flags);

7541

raw_spin_unlock_irqrestore(&rq->lock, flags);

7541

}

7542

}

7542

7543

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

7544

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

7544

struct sched_entity *se, int cpu,

7545

struct sched_entity *se, int cpu,

7545

struct sched_entity *parent)

7546

struct sched_entity *parent)

7546

{

7547

{

7547

struct rq *rq = cpu_rq(cpu);

7548

struct rq *rq = cpu_rq(cpu);

7548

7549

cfs_rq->tg = tg;

7550

cfs_rq->tg = tg;

7550

cfs_rq->rq = rq;

7551

cfs_rq->rq = rq;

7551

init_cfs_rq_runtime(cfs_rq);

7552

init_cfs_rq_runtime(cfs_rq);

7552

7553

tg->cfs_rq[cpu] = cfs_rq;

7554

tg->cfs_rq[cpu] = cfs_rq;

7554

tg->se[cpu] = se;

7555

tg->se[cpu] = se;

7555

7556

/* se could be NULL for root_task_group */

7557

/* se could be NULL for root_task_group */

7557

if (!se)

7558

if (!se)

7558

return;

7559

return;

7559

7560

if (!parent) {

7561

if (!parent) {

7561

se->cfs_rq = &rq->cfs;

7562

se->cfs_rq = &rq->cfs;

7562

se->depth = 0;

7563

se->depth = 0;

7563

} else {

7564

} else {

7564

se->cfs_rq = parent->my_q;

7565

se->cfs_rq = parent->my_q;

7565

se->depth = parent->depth + 1;

7566

se->depth = parent->depth + 1;

7566

}

7567

}

7567

7568

se->my_q = cfs_rq;

7569

se->my_q = cfs_rq;

7569

/* guarantee group entities always have weight */

7570

/* guarantee group entities always have weight */

7570

update_load_set(&se->load, NICE_0_LOAD);

7571

update_load_set(&se->load, NICE_0_LOAD);

7571

se->parent = parent;

7572

se->parent = parent;

7572

}

7573

}

7573

7574

static DEFINE_MUTEX(shares_mutex);

7575

static DEFINE_MUTEX(shares_mutex);

7575

7576

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

7577

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

7577

{

7578

{

7578

int i;

7579

int i;

7579

unsigned long flags;

7580

unsigned long flags;

7580

7581

/*

7582

/*

7582

* We can't change the weight of the root cgroup.

7583

* We can't change the weight of the root cgroup.

7583

*/

7584

*/

7584

if (!tg->se[0])

7585

if (!tg->se[0])

7585

return -EINVAL;

7586

return -EINVAL;

7586

7587

shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

7588

shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

7588

7589

mutex_lock(&shares_mutex);

7590

mutex_lock(&shares_mutex);

7590

if (tg->shares == shares)

7591

if (tg->shares == shares)

7591

goto done;

7592

goto done;

7592

7593

tg->shares = shares;

7594

tg->shares = shares;

7594

for_each_possible_cpu(i) {

7595

for_each_possible_cpu(i) {

7595

struct rq *rq = cpu_rq(i);

7596

struct rq *rq = cpu_rq(i);

7596

struct sched_entity *se;

7597

struct sched_entity *se;

7597

7598

se = tg->se[i];

7599

se = tg->se[i];

7599

/* Propagate contribution to hierarchy */

7600

/* Propagate contribution to hierarchy */

7600

raw_spin_lock_irqsave(&rq->lock, flags);

7601

raw_spin_lock_irqsave(&rq->lock, flags);

7601

7602

/* Possible calls to update_curr() need rq clock */

7603

/* Possible calls to update_curr() need rq clock */

7603

update_rq_clock(rq);

7604

update_rq_clock(rq);

7604

for_each_sched_entity(se)

7605

for_each_sched_entity(se)

7605

update_cfs_shares(group_cfs_rq(se));

7606

update_cfs_shares(group_cfs_rq(se));

7606

raw_spin_unlock_irqrestore(&rq->lock, flags);

7607

raw_spin_unlock_irqrestore(&rq->lock, flags);

7607

}

7608

}

7608

7609

done:

7610

done:

7610

mutex_unlock(&shares_mutex);

7611

mutex_unlock(&shares_mutex);

7611

return 0;

7612

return 0;

7612

}

7613

}

7613

#else /* CONFIG_FAIR_GROUP_SCHED */

7614

#else /* CONFIG_FAIR_GROUP_SCHED */

7614

7615

/* Without CONFIG_FAIR_GROUP_SCHED there is no per-group state to free. */
void free_fair_sched_group(struct task_group *tg)
{
}

/* Nothing to allocate without group scheduling; always reports success (1). */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* No leaf cfs_rq to unregister without group scheduling. */
void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}

7623

7624

#endif /* CONFIG_FAIR_GROUP_SCHED */

7625

#endif /* CONFIG_FAIR_GROUP_SCHED */

7625

7626

7627

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)

7628

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)

7628

{

7629

{

7629

struct sched_entity *se = &task->se;

7630

struct sched_entity *se = &task->se;

7630

unsigned int rr_interval = 0;

7631

unsigned int rr_interval = 0;

7631

7632

/*

7633

/*

7633

* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise

7634

* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise

7634

* idle runqueue:

7635

* idle runqueue:

7635

*/

7636

*/

7636

if (rq->cfs.load.weight)

7637

if (rq->cfs.load.weight)

7637

rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

7638

rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

7638

7639

return rr_interval;

7640

return rr_interval;

7640

}

7641

}

7641

7642

/*

7643

/*

7643

* All the scheduling class methods:

7644

* All the scheduling class methods:

7644

*/

7645

*/

7645

const struct sched_class fair_sched_class = {

7646

const struct sched_class fair_sched_class = {

7646

.next = &idle_sched_class,

7647

.next = &idle_sched_class,

7647

.enqueue_task = enqueue_task_fair,

7648

.enqueue_task = enqueue_task_fair,

7648

.dequeue_task = dequeue_task_fair,

7649

.dequeue_task = dequeue_task_fair,

7649

.yield_task = yield_task_fair,

7650

.yield_task = yield_task_fair,

7650

.yield_to_task = yield_to_task_fair,

7651

.yield_to_task = yield_to_task_fair,

7651

7652

.check_preempt_curr = check_preempt_wakeup,

7653

.check_preempt_curr = check_preempt_wakeup,

7653

7654

.pick_next_task = pick_next_task_fair,

7655

.pick_next_task = pick_next_task_fair,

7655

.put_prev_task = put_prev_task_fair,

7656

.put_prev_task = put_prev_task_fair,

7656

7657

#ifdef CONFIG_SMP

7658

#ifdef CONFIG_SMP

7658

.select_task_rq = select_task_rq_fair,

7659

.select_task_rq = select_task_rq_fair,

7659

.migrate_task_rq = migrate_task_rq_fair,

7660

.migrate_task_rq = migrate_task_rq_fair,

7660

7661

.rq_online = rq_online_fair,

7662

.rq_online = rq_online_fair,

7662

.rq_offline = rq_offline_fair,

7663

.rq_offline = rq_offline_fair,

7663

7664

.task_waking = task_waking_fair,

7665

.task_waking = task_waking_fair,

7665

#endif

7666

#endif

7666

7667

.set_curr_task = set_curr_task_fair,

7668

.set_curr_task = set_curr_task_fair,

7668

.task_tick = task_tick_fair,

7669

.task_tick = task_tick_fair,

7669

.task_fork = task_fork_fair,

7670

.task_fork = task_fork_fair,

7670

7671

.prio_changed = prio_changed_fair,

7672

.prio_changed = prio_changed_fair,

7672

.switched_from = switched_from_fair,

7673

.switched_from = switched_from_fair,

7673

.switched_to = switched_to_fair,

7674

.switched_to = switched_to_fair,

7674

7675

.get_rr_interval = get_rr_interval_fair,

7676

.get_rr_interval = get_rr_interval_fair,

7676

7677

#ifdef CONFIG_FAIR_GROUP_SCHED

7678

#ifdef CONFIG_FAIR_GROUP_SCHED

7678

.task_move_group = task_move_group_fair,

7679

.task_move_group = task_move_group_fair,

7679

#endif

7680

#endif

7680

};

7681

};

7681

7682

#ifdef CONFIG_SCHED_DEBUG

7683

#ifdef CONFIG_SCHED_DEBUG

7683

void print_cfs_stats(struct seq_file *m, int cpu)

7684

void print_cfs_stats(struct seq_file *m, int cpu)

7684

{

7685

{

7685

struct cfs_rq *cfs_rq;

7686

struct cfs_rq *cfs_rq;

7686

7687

rcu_read_lock();

7688

rcu_read_lock();

7688

for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)

7689

for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)

7689

print_cfs_rq(m, cpu, cfs_rq);

7690

print_cfs_rq(m, cpu, cfs_rq);

7690

rcu_read_unlock();

7691

rcu_read_unlock();

7691

}

7692

}

7692

#endif

7693

#endif

7693

7694

/*
 * Boot-time setup of the fair class. On SMP it registers
 * run_rebalance_domains() as the SCHED_SOFTIRQ handler; with
 * CONFIG_NO_HZ_COMMON it also initialises the nohz balancing state and
 * registers sched_ilb_notifier. Does nothing on !SMP builds.
 */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	/* The allocation result is not checked here. */
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}

GITLAB

sched/numa: Fix use of spin_{un}lock_irq() when interrupts are disabled

 /*
  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
  *
  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
  *  Interactivity improvements by Mike Galbraith
  *  (C) 2007 Mike Galbraith <efault@gmx.de>
  *
  *  Various enhancements by Dmitry Adamushko.
  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  *
  *  Group scheduling enhancements by Srivatsa Vaddagiri
  *  Copyright IBM Corporation, 2007
  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  *
  *  Scaled math optimizations by Thomas Gleixner
  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  *
  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 #include <linux/latencytop.h>
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
 #include <trace/events/sched.h>
 #include "sched.h"
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
  * and have no persistent notion like in traditional, time-slice
  * based scheduling concepts.
  *
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 /*
  * The initial- and re-scaling of tunables is configurable
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  *
  * Options are:
  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
  * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
  */
 enum sched_tunable_scaling sysctl_sched_tunable_scaling
 	= SCHED_TUNABLESCALING_LOG;
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 /*
  * sched_nr_latency is kept at
  * sysctl_sched_latency / sysctl_sched_min_granularity
  */
 static unsigned int sched_nr_latency = 8;
 /*
  * After fork, child runs first. If set to 0 (default) then
  * parent will (try to) run first.
  */
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 /*
  * SCHED_OTHER wake-up granularity.
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 /*
  * The exponential sliding  window over which load is averaged for shares
  * distribution.
  * (default: 10msec)
  */
 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
  * each time a cfs_rq requests quota.
  *
  * Note: in the case that the slice exceeds the runtime remaining (either due
  * to consumption or the quota being specified to be smaller than the slice)
  * we will always only issue the remaining available time.
  *
  * default: 5 msec, units: microseconds
   */
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 /*
  * Grow the weight of @lw by @inc. The cached inverse weight is cleared
  * so that __update_inv_weight() recomputes it on next use.
  */
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->inv_weight = 0;
 	lw->weight = lw->weight + inc;
 }
 /*
  * Shrink the weight of @lw by @dec. The cached inverse weight is cleared
  * so that __update_inv_weight() recomputes it on next use.
  */
 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->inv_weight = 0;
 	lw->weight = lw->weight - dec;
 }
 /*
  * Overwrite the weight of @lw with @w. The cached inverse weight is
  * cleared so that __update_inv_weight() recomputes it on next use.
  */
 static inline void update_load_set(struct load_weight *lw, unsigned long w)
 {
 	lw->inv_weight = 0;
 	lw->weight = w;
 }
 /*
  * Scaling factor applied to the normalized scheduler tunables.
  *
  * The granularity values grow with the number of CPUs, because with
  * more CPUs the 'effective latency' as visible to users decreases.
  * The relationship is not linear, so the default is a second-best
  * guess: the log2 of the number of online CPUs (capped at 8) plus one.
  *
  * This idea comes from the SD scheduler of Con Kolivas.
  *
  * Returns 1 for SCHED_TUNABLESCALING_NONE, the capped CPU count for
  * SCHED_TUNABLESCALING_LINEAR, and 1 + ilog2(cpus) otherwise.
  */
 static int get_update_sysctl_factor(void)
 {
 	unsigned int ncpus = min_t(int, num_online_cpus(), 8);
 	unsigned int scale;
 	if (sysctl_sched_tunable_scaling == SCHED_TUNABLESCALING_NONE)
 		scale = 1;
 	else if (sysctl_sched_tunable_scaling == SCHED_TUNABLESCALING_LINEAR)
 		scale = ncpus;
 	else	/* SCHED_TUNABLESCALING_LOG and anything unrecognised */
 		scale = 1 + ilog2(ncpus);
 	return scale;
 }
 /*
  * Recompute the effective scheduler tunables from their normalized
  * counterparts, multiplying each by the current scaling factor.
  */
 static void update_sysctl(void)
 {
 	unsigned int factor = get_update_sysctl_factor();
 	sysctl_sched_min_granularity =
 		factor * normalized_sysctl_sched_min_granularity;
 	sysctl_sched_latency =
 		factor * normalized_sysctl_sched_latency;
 	sysctl_sched_wakeup_granularity =
 		factor * normalized_sysctl_sched_wakeup_granularity;
 }
 /*
  * Initialise the scaled scheduler tunables; simply delegates to
  * update_sysctl().
  */
 void sched_init_granularity(void)
 {
 	update_sysctl();
 }
 #define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 /*
  * Fill in lw->inv_weight (roughly WMULT_CONST / weight) if it is not
  * cached yet. A non-zero inv_weight is treated as valid and left alone.
  */
 static void __update_inv_weight(struct load_weight *lw)
 {
 	unsigned long weight;
 	if (likely(lw->inv_weight))
 		return;
 	weight = scale_load_down(lw->weight);
 	/* weight too large for a 32-bit reciprocal: use the smallest one */
 	if (BITS_PER_LONG > 32 && unlikely(weight >= WMULT_CONST)) {
 		lw->inv_weight = 1;
 		return;
 	}
 	/* a zero weight must not be divided by: use the largest reciprocal */
 	if (unlikely(!weight)) {
 		lw->inv_weight = WMULT_CONST;
 		return;
 	}
 	lw->inv_weight = WMULT_CONST / weight;
 }
 /*
  * Compute
  *
  *   delta_exec * weight / lw.weight
  *
  * as the fixed-point product
  *
  *   (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
  *
  * Either weight := NICE_0_LOAD and lw is an element of prio_to_wmult[],
  * in which case we're guaranteed shift stays positive because inv_weight
  * is guaranteed to fit 32 bits, and NICE_0_LOAD gives another 10 bits;
  * therefore shift >= 22.
  *
  * Or, weight <= lw.weight (because lw.weight is the runqueue weight),
  * thus weight/lw.weight <= 1, and therefore our shift will also be
  * positive.
  */
 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
 	u64 factor = scale_load_down(weight);
 	int shr = WMULT_SHIFT;
 	__update_inv_weight(lw);
 	/* squeeze the scaled weight into 32 bits, trading away low bits */
 	for (; unlikely(factor >> 32); shr--)
 		factor >>= 1;
 	/* hint to use a 32x32->64 mul */
 	factor = (u64)(u32)factor * lw->inv_weight;
 	/* the product must fit 32 bits again for the final multiply */
 	for (; factor >> 32; shr--)
 		factor >>= 1;
 	return mul_u64_u32_shr(delta_exec, factor, shr);
 }
 const struct sched_class fair_sched_class;
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * cpu runqueue to which this cfs_rq is attached
  *
  * With group scheduling the owning rq is read from cfs_rq->rq.
  */
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
 	return cfs_rq->rq;
 }
 /*
  * An entity is a task if it doesn't "own" a runqueue.
  *
  * @se is parenthesized so that any pointer expression (not only a plain
  * identifier) can be passed safely.
  */
 #define entity_is_task(se)	(!(se)->my_q)
 /*
  * Map a scheduling entity back to the task_struct that embeds it.
  * With CONFIG_SCHED_DEBUG a one-time warning is emitted if @se is not
  * a task entity (see entity_is_task()).
  */
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
 	WARN_ON_ONCE(!entity_is_task(se));
 #endif
 	return container_of(se, struct task_struct, se);
 }
 /*
  * Walk up the scheduling entity hierarchy, starting at @se and following
  * ->parent until it is NULL. @se is modified by the walk and must be an
  * lvalue; it is parenthesized so that expressions such as *pp work.
  */
 #define for_each_sched_entity(se) \
 		for (; (se); (se) = (se)->parent)
 /* Return the cfs_rq recorded in the task's own scheduling entity. */
 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
 	return p->se.cfs_rq;
 }
 /*
  * runqueue on which this entity is (to be) queued
  *
  * With group scheduling this is simply the se->cfs_rq back-pointer.
  */
 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 {
 	return se->cfs_rq;
 }
 /*
  * runqueue "owned" by this group
  *
  * Returns grp->my_q; entity_is_task() treats a NULL my_q as "this entity
  * is a task".
  */
 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 {
 	return grp->my_q;
 }
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 				       int force_update);
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
 		/*
 		 * Ensure we either appear before our parent (if already
 		 * enqueued) or force our parent to appear after us when it is
 		 * enqueued.  The fact that we always enqueue bottom-up
 		 * reduces this to two cases.
 		 */
 		if (cfs_rq->tg->parent &&
 		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
 		} else {
 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 				&rq_of(cfs_rq)->leaf_cfs_rq_list);
 		}
 		cfs_rq->on_list = 1;
 		/* We should have no load, but we need to update last_decay. */
 		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 }
 /*
  * Take @cfs_rq off the leaf list again; a no-op when it is not listed.
  */
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list)
 		return;
 	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 	cfs_rq->on_list = 0;
 }
 /*
  * Iterate through all leaf cfs_rq's on a runqueue.
  *
  * @rq:     pointer expression yielding the runqueue (parenthesized so
  *          that arbitrary expressions are safe)
  * @cfs_rq: iteration cursor
  *
  * Expands to list_for_each_entry_rcu() over rq->leaf_cfs_rq_list.
  */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
 /*
  * Do the two (enqueued) entities belong to the same group?
  * Returns their common cfs_rq if so, NULL otherwise.
  */
 static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	return (se->cfs_rq == pse->cfs_rq) ? se->cfs_rq : NULL;
 }
 /* Return the entity one level up the hierarchy (se->parent). */
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return se->parent;
 }
 /*
  * A preemption test can only be made between sibling entities, i.e.
  * entities queued on the same cfs_rq (sharing a parent). Replace *se and
  * *pse by their closest ancestors that satisfy this.
  */
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
 	int depth_se = (*se)->depth;
 	int depth_pse = (*pse)->depth;
 	/* First bring whichever entity is deeper up to the other's depth */
 	for (; depth_se > depth_pse; depth_se--)
 		*se = parent_entity(*se);
 	for (; depth_pse > depth_se; depth_pse--)
 		*pse = parent_entity(*pse);
 	/* Then climb in lock-step until both sit on the same cfs_rq */
 	while (!is_same_group(*se, *pse)) {
 		*se = parent_entity(*se);
 		*pse = parent_entity(*pse);
 	}
 }
 #else	/* !CONFIG_FAIR_GROUP_SCHED */
 /*
  * Map a scheduling entity back to the task_struct that embeds it
  * (variant used without CONFIG_FAIR_GROUP_SCHED).
  */
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
 	return container_of(se, struct task_struct, se);
 }
 /*
  * Without group scheduling the cfs_rq is the 'cfs' member embedded in
  * struct rq, so the owning runqueue is recovered with container_of().
  */
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
 	return container_of(cfs_rq, struct rq, cfs);
 }
 /* Without group scheduling every entity is a task. */
 #define entity_is_task(se)	(1)
 /*
  * Without group scheduling there is no hierarchy: visit @se once (if it
  * is non-NULL) and leave it set to NULL afterwards.
  */
 #define for_each_sched_entity(se) \
 		for (; (se); (se) = NULL)
 /* Return the cfs_rq embedded in the runqueue the task belongs to. */
 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
 	return &task_rq(p)->cfs;
 }
 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 {
 	struct task_struct *p = task_of(se);
 	struct rq *rq = task_rq(p);
 	return &rq->cfs;
 }
 /*
  * runqueue "owned" by this group
  *
  * Without group scheduling no entity owns a runqueue, so this is
  * always NULL.
  */
 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 {
 	return NULL;
 }
 /* No leaf list to maintain without group scheduling: intentionally empty. */
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 /* No leaf list to maintain without group scheduling: intentionally empty. */
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 /*
  * Without group scheduling the only cfs_rq of a runqueue is rq->cfs, so
  * the loop body runs exactly once with @cfs_rq pointing at it.
  * Both arguments are parenthesized so that expressions are safe.
  */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for ((cfs_rq) = &(rq)->cfs; (cfs_rq); (cfs_rq) = NULL)
 /* Without group scheduling entities have no parent: always NULL. */
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return NULL;
 }
 /*
  * Nothing to match up without group scheduling; *se and *pse are left
  * untouched.
  */
 static inline void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
 }
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 /**************************************************************
  * Scheduling class tree data structure manipulation methods:
  */
 /*
  * Return the later of two vruntime values. The comparison is done on the
  * signed difference so that it stays correct across u64 wrap-around.
  */
 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 {
 	if ((s64)(vruntime - max_vruntime) > 0)
 		return vruntime;
 	return max_vruntime;
 }
 /*
  * Return the earlier of two vruntime values. The comparison is done on
  * the signed difference so that it stays correct across u64 wrap-around.
  */
 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 {
 	if ((s64)(vruntime - min_vruntime) < 0)
 		return vruntime;
 	return min_vruntime;
 }
 static inline int entity_before(struct sched_entity *a,
 				struct sched_entity *b)
 {
 	return (s64)(a->vruntime - b->vruntime) < 0;
 }
 /*
  * Advance cfs_rq->min_vruntime to the smallest vruntime among the
  * current entity and the leftmost queued entity; it never moves
  * backwards.
  */
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	struct rb_node *leftmost = cfs_rq->rb_leftmost;
 	u64 candidate = curr ? curr->vruntime : cfs_rq->min_vruntime;
 	if (leftmost) {
 		struct sched_entity *first = rb_entry(leftmost,
 						      struct sched_entity,
 						      run_node);
 		if (curr)
 			candidate = min_vruntime(candidate, first->vruntime);
 		else
 			candidate = first->vruntime;
 	}
 	/* ensure we never gain time by being placed backwards. */
 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, candidate);
 #ifndef CONFIG_64BIT
 	/* publish a second copy after a write barrier on non-64-bit builds */
 	smp_wmb();
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 }
 /*
  * Insert @se into the cfs_rq's rb-tree (keyed by vruntime) and keep the
  * cached leftmost pointer up to date.
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct rb_node **slot = &cfs_rq->tasks_timeline.rb_node;
 	struct rb_node *above = NULL;
 	int went_right = 0;
 	/* Descend to the empty slot where the new node belongs */
 	while (*slot) {
 		struct sched_entity *other;
 		above = *slot;
 		other = rb_entry(above, struct sched_entity, run_node);
 		/*
 		 * We don't care about collisions. Nodes with
 		 * the same key stay together (new ones go right).
 		 */
 		if (!entity_before(se, other)) {
 			slot = &above->rb_right;
 			went_right = 1;
 		} else {
 			slot = &above->rb_left;
 		}
 	}
 	/*
 	 * Only a descent that never turned right ends at the leftmost
 	 * position; the leftmost entry is cached because it is used often.
 	 */
 	if (!went_right)
 		cfs_rq->rb_leftmost = &se->run_node;
 	rb_link_node(&se->run_node, above, slot);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 }
 /*
  * Remove @se from the cfs_rq's rb-tree. If it was the cached leftmost
  * node, the cache moves on to its in-order successor first.
  */
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct rb_node *node = &se->run_node;
 	if (cfs_rq->rb_leftmost == node)
 		cfs_rq->rb_leftmost = rb_next(node);
 	rb_erase(node, &cfs_rq->tasks_timeline);
 }
 /*
  * Return the entity behind the cached leftmost rb-tree node, or NULL
  * when no leftmost node is cached.
  */
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->rb_leftmost;
 	if (node)
 		return rb_entry(node, struct sched_entity, run_node);
 	return NULL;
 }
 /*
  * Return the in-order successor of @se in the rb-tree, or NULL if @se
  * is the last node.
  */
 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 {
 	struct rb_node *succ = rb_next(&se->run_node);
 	if (succ)
 		return rb_entry(succ, struct sched_entity, run_node);
 	return NULL;
 }
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * Return the entity behind the rightmost rb-tree node, or NULL when the
  * tree is empty. Only built with CONFIG_SCHED_DEBUG.
  */
 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *tail = rb_last(&cfs_rq->tasks_timeline);
 	if (tail)
 		return rb_entry(tail, struct sched_entity, run_node);
 	return NULL;
 }
 /**************************************************************
  * Scheduling class statistics methods:
  */
 /*
  * sysctl handler for the scaled scheduler tunables.
  *
  * Delegates the actual parsing to proc_dointvec_minmax(). After a
  * successful write, sched_nr_latency is recomputed and the normalized
  * copies of the tunables are refreshed by dividing the new values by
  * the current scaling factor.
  *
  * Returns the proc_dointvec_minmax() result (0 on a successful write).
  */
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	int factor = get_update_sysctl_factor();
 	if (!ret && write) {
 		sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 						sysctl_sched_min_granularity);
 		normalized_sysctl_sched_min_granularity =
 			sysctl_sched_min_granularity / factor;
 		normalized_sysctl_sched_latency =
 			sysctl_sched_latency / factor;
 		normalized_sysctl_sched_wakeup_granularity =
 			sysctl_sched_wakeup_granularity / factor;
 	}
 	return ret;
 }
 #endif
 /*
  * delta /= w
  *
  * Scale @delta by NICE_0_LOAD / se->load.weight. Entities that already
  * have the nice-0 weight get @delta back unchanged.
  */
 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 {
 	if (likely(se->load.weight == NICE_0_LOAD))
 		return delta;
 	return __calc_delta(delta, NICE_0_LOAD, &se->load);
 }
 /*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (more than sched_nr_latency) we have to
  * stretch this period because otherwise the slices get too small:
  *
  * p = (nr <= nl) ? l : l*nr/nl
  *
  * The stretched period is computed as
  * sysctl_sched_min_granularity * nr_running.
  */
 static u64 __sched_period(unsigned long nr_running)
 {
 	unsigned long nr_latency = sched_nr_latency;
 	u64 stretched;
 	if (likely(nr_running <= nr_latency))
 		return sysctl_sched_latency;
 	stretched = sysctl_sched_min_granularity;
 	stretched *= nr_running;
 	return stretched;
 }
 /*
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight, at every level of the hierarchy:
  *
  * s = p*P[w/rw]
  *
  * An entity that is not yet queued is accounted for as if it were: it
  * counts towards nr_running and its weight is added to a private copy
  * of the runqueue load.
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 	for_each_sched_entity(se) {
 		struct cfs_rq *level_rq = cfs_rq_of(se);
 		struct load_weight *total = &level_rq->load;
 		struct load_weight with_se;
 		if (unlikely(!se->on_rq)) {
 			with_se = level_rq->load;
 			update_load_add(&with_se, se->load.weight);
 			total = &with_se;
 		}
 		slice = __calc_delta(slice, se->load.weight, total);
 	}
 	return slice;
 }
 /*
  * We calculate the vruntime slice of a to-be-inserted task.
  *
  * vs = s/w
  *
  * i.e. the wall-time slice from sched_slice() run through
  * calc_delta_fair().
  */
 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	u64 wall_slice = sched_slice(cfs_rq, se);
 	return calc_delta_fair(wall_slice, se);
 }
 #ifdef CONFIG_SMP
 static unsigned long task_h_load(struct task_struct *p);
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 /* Give new task start runnable values to heavy its load in infant time */
 void init_task_runnable_average(struct task_struct *p)
 {
 	u32 slice;
 	p->se.avg.decay_count = 0;
 	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
 	p->se.avg.runnable_avg_sum = slice;
 	p->se.avg.runnable_avg_period = slice;
 	__update_task_entity_contrib(&p->se);
 }
 #else
 /* !CONFIG_SMP variant: there is nothing to initialise. */
 void init_task_runnable_average(struct task_struct *p)
 {
 }
 #endif
 /*
  * Update the current task's runtime statistics.
  *
  * Charges the time elapsed since curr->exec_start to the entity that is
  * currently running on @cfs_rq: sum_exec_runtime, vruntime (weighted via
  * calc_delta_fair()), the cfs_rq's min_vruntime and, for task entities,
  * the tracepoint, cpuacct and thread-group accounting. Finally the time
  * is reported to account_cfs_rq_runtime().
  *
  * Does nothing when there is no current entity or when no (positive)
  * time has elapsed.
  */
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
 	u64 delta_exec;
 	if (unlikely(!curr))
 		return;
 	delta_exec = now - curr->exec_start;
 	/* treat the difference as signed: zero or negative means no progress */
 	if (unlikely((s64)delta_exec <= 0))
 		return;
 	/* the next delta is measured from this point on */
 	curr->exec_start = now;
 	schedstat_set(curr->statistics.exec_max,
 		      max(delta_exec, curr->statistics.exec_max));
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
 	update_min_vruntime(cfs_rq);
 	/* per-task accounting only applies to task entities */
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 /*
  * Record the runqueue clock as the start of @se's wait period
  * (schedstat field statistics.wait_start).
  */
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
 }
 /*
  * Task is being enqueued - update stats.
  *
  * Only a waiting entity starts a wait period; for the current entity a
  * dequeue/enqueue event is a NOP.
  */
 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (se == cfs_rq->curr)
 		return;
 	update_stats_wait_start(cfs_rq, se);
 }
 /*
  * Close @se's wait period: fold the time since statistics.wait_start
  * into wait_max, wait_count and wait_sum, emit the sched_stat_wait
  * tracepoint for task entities (CONFIG_SCHEDSTATS only) and reset
  * wait_start to 0.
  */
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 	}
 #endif
 	/* must come last: every computation above reads wait_start */
 	schedstat_set(se->statistics.wait_start, 0);
 }
 /*
  * Entity is being dequeued - update stats.
  *
  * Mark the end of the wait period if dequeueing a waiting entity; the
  * current entity is not waiting, so nothing is done for it.
  */
 static inline void
 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (se == cfs_rq->curr)
 		return;
 	update_stats_wait_end(cfs_rq, se);
 }
 /*
  * We are picking a new current task - update its stats:
  * exec_start is set to the runqueue's task clock, which update_curr()
  * later uses as the base for the elapsed run time.
  */
 static inline void
 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	/*
 	 * We are starting a new run period:
 	 */
 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }
 /**************************************************
  * Scheduling class queueing methods:
  */
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Approximate time to scan a full NUMA task in ms. The task scan period is
  * calculated based on the tasks virtual memory size and
  * numa_balancing_scan_size.
  */
 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 /*
  * Number of scan windows (of sysctl_numa_balancing_scan_size MB each)
  * needed to cover the resident set of @p, rounded up, and at least 1.
  *
  * The window count is computed with DIV_ROUND_UP() rather than
  * round_up() followed by a division: round_up() is only valid for
  * power-of-two multiples, while the scan size is a free-form sysctl.
  */
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss;
 	unsigned long nr_scan_pages;
 	/*
 	 * Calculations based on RSS as non-present and empty pages are skipped
 	 * by the PTE scanner and NUMA hinting faults should be trapped based
 	 * on resident pages
 	 */
 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
 	rss = get_mm_rss(p->mm);
 	/* an empty resident set still counts as one window */
 	if (!rss)
 		rss = nr_scan_pages;
 	return DIV_ROUND_UP(rss, nr_scan_pages);
 }
 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
 #define MAX_SCAN_WINDOW 2560
 /*
  * Minimum scan period for @p: sysctl_numa_balancing_scan_period_min
  * divided over the task's scan windows, but never below the floor that
  * keeps the scan rate within MAX_SCAN_WINDOW.
  */
 static unsigned int task_scan_min(struct task_struct *p)
 {
 	unsigned int max_windows = 1;
 	unsigned int floor, period;
 	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
 		max_windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
 	floor = 1000 / max_windows;
 	period = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
 	if (period > floor)
 		return period;
 	return floor;
 }
 /*
  * Maximum scan period for @p: sysctl_numa_balancing_scan_period_max
  * divided over the task's scan windows, but never below task_scan_min().
  */
 static unsigned int task_scan_max(struct task_struct *p)
 {
 	unsigned int lower = task_scan_min(p);
 	unsigned int upper;
 	/* The floor applied to the minimum may push it above the maximum */
 	upper = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 	return max(lower, upper);
 }
 /*
  * Account @p on @rq's NUMA counters: nr_numa_running counts tasks that
  * have a preferred node, nr_preferred_running those whose preferred
  * node is the one they are on.
  */
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	if (p->numa_preferred_nid != -1)
 		rq->nr_numa_running++;
 	if (p->numa_preferred_nid == task_node(p))
 		rq->nr_preferred_running++;
 }
 /*
  * Undo account_numa_enqueue(): drop @p from @rq's nr_numa_running and
  * nr_preferred_running counters where it was counted.
  */
 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 	if (p->numa_preferred_nid != -1)
 		rq->nr_numa_running--;
 	if (p->numa_preferred_nid == task_node(p))
 		rq->nr_preferred_running--;
 }
 /*
  * A group of tasks whose NUMA fault statistics are accumulated together.
  *
  * The per-node fault counters live in the trailing flexible array
  * @faults, indexed by task_faults_idx().
  */
 struct numa_group {
 	atomic_t refcount;
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	/* group id, reported by task_numa_group_id() */
 	pid_t gid;
 	struct list_head task_list;
 	struct rcu_head rcu;
 	/* nodes in active use by this group, see should_numa_migrate_memory() */
 	nodemask_t active_nodes;
 	/* divisor used by group_weight() */
 	unsigned long total_faults;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
 	 * more by CPU use than by memory faults.
 	 */
 	unsigned long *faults_cpu;
 	/* C99 flexible array member; must remain the last field */
 	unsigned long faults[];
 };
 /* Shared or private faults. */
 #define NR_NUMA_HINT_FAULT_TYPES 2
 /* Memory and CPU locality */
 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
 /* Averaged statistics, and temporary buffers. */
 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
 /*
  * Return the gid of the NUMA group @p belongs to, or 0 when it is not
  * in any group.
  */
 pid_t task_numa_group_id(struct task_struct *p)
 {
 	if (!p->numa_group)
 		return 0;
 	return p->numa_group->gid;
 }
 /*
  * Index into a NUMA fault array: each node owns
  * NR_NUMA_HINT_FAULT_TYPES consecutive slots and @priv selects the slot
  * within node @nid.
  */
 static inline int task_faults_idx(int nid, int priv)
 {
 	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
 }
 /*
  * Sum of both fault types recorded for @p on node @nid, or 0 when the
  * task has no numa_faults_memory array.
  */
 static inline unsigned long task_faults(struct task_struct *p, int nid)
 {
 	int first, second;
 	if (!p->numa_faults_memory)
 		return 0;
 	first = task_faults_idx(nid, 0);
 	second = task_faults_idx(nid, 1);
 	return p->numa_faults_memory[first] + p->numa_faults_memory[second];
 }
 /*
  * Sum of both fault types recorded for @p's NUMA group on node @nid, or
  * 0 when the task is not in a group.
  */
 static inline unsigned long group_faults(struct task_struct *p, int nid)
 {
 	int first, second;
 	if (!p->numa_group)
 		return 0;
 	first = task_faults_idx(nid, 0);
 	second = task_faults_idx(nid, 1);
 	return p->numa_group->faults[first] + p->numa_group->faults[second];
 }
 /*
  * Sum of both fault types in @group's faults_cpu statistics for node
  * @nid. @group must not be NULL.
  */
 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 {
 	int first = task_faults_idx(nid, 0);
 	int second = task_faults_idx(nid, 1);
 	return group->faults_cpu[first] + group->faults_cpu[second];
 }
 /*
  * task_weight() and group_weight() return the fraction of accesses done
  * by a particular task, or task group, on a particular numa node.  The
  * group weight is given a larger multiplier, in order to group tasks
  * together that are almost evenly spread out between numa nodes.
  *
  * task_weight() yields 1000 * task_faults(p, nid) / p->total_numa_faults,
  * or 0 when the task has no fault statistics yet.
  */
 static inline unsigned long task_weight(struct task_struct *p, int nid)
 {
 	unsigned long total;
 	if (!p->numa_faults_memory)
 		return 0;
 	total = p->total_numa_faults;
 	return total ? 1000 * task_faults(p, nid) / total : 0;
 }
 /*
  * 1000 * group_faults(p, nid) / total faults of @p's NUMA group, or 0
  * when the task has no group or the group has recorded no faults.
  */
 static inline unsigned long group_weight(struct task_struct *p, int nid)
 {
 	if (!p->numa_group)
 		return 0;
 	if (!p->numa_group->total_faults)
 		return 0;
 	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
 }
 /*
  * Decide whether @page, currently on node @src_nid, should be migrated
  * towards the node of @dst_cpu on behalf of task @p.
  *
  * As a side effect the page's last cpupid is replaced by one built from
  * @dst_cpu and current->pid (page_cpupid_xchg_last()); the previous
  * value drives the decision.
  * NOTE(review): the cpupid uses current->pid rather than p->pid -
  * presumably callers pass p == current; confirm.
  *
  * Returns true if the page should be migrated, false otherwise.
  */
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 				int src_nid, int dst_cpu)
 {
 	struct numa_group *ng = p->numa_group;
 	int dst_nid = cpu_to_node(dst_cpu);
 	int last_cpupid, this_cpupid;
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
 	/*
 	 * Multi-stage node selection is used in conjunction with a periodic
 	 * migration fault to build a temporal task<->page relation. By using
 	 * a two-stage filter we remove short/unlikely relations.
 	 *
 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
 	 * a task's usage of a particular page (n_p) per total usage of this
 	 * page (n_t) (in a given time-span) to a probability.
 	 *
 	 * Our periodic faults will sample this probability and getting the
 	 * same result twice in a row, given these samples are fully
 	 * independent, is then given by P(n)^2, provided our sample period
 	 * is sufficiently short compared to the usage pattern.
 	 *
 	 * This quadric squishes small probabilities, making it less likely we
 	 * act on an unlikely task<->page relation.
 	 */
 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
 	/* second stage of the filter: the previous fault came from another node */
 	if (!cpupid_pid_unset(last_cpupid) &&
 				cpupid_to_nid(last_cpupid) != dst_nid)
 		return false;
 	/* Always allow migrate on private faults */
 	if (cpupid_match_pid(p, last_cpupid))
 		return true;
 	/* A shared fault, but p->numa_group has not been set up yet. */
 	if (!ng)
 		return true;
 	/*
 	 * Do not migrate if the destination is not a node that
 	 * is actively used by this numa group.
 	 */
 	if (!node_isset(dst_nid, ng->active_nodes))
 		return false;
 	/*
 	 * Source is a node that is not actively used by this
 	 * numa group, while the destination is. Migrate.
 	 */
 	if (!node_isset(src_nid, ng->active_nodes))
 		return true;
 	/*
 	 * Both source and destination are nodes in active
 	 * use by this numa group. Maximize memory bandwidth
 	 * by migrating from more heavily used groups, to less
 	 * heavily used ones, spreading the load around.
 	 * Use a 1/4 hysteresis to avoid spurious page movement.
 	 */
 	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
 }
 static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long power_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 /*
  * Cached statistics for all CPUs within a node, filled in by
  * update_numa_stats().
  */
 struct numa_stats {
 	/* sum of rq->nr_running over the node's CPUs */
 	unsigned long nr_running;
 	/* summed weighted_cpuload(), normalised by the node's power */
 	unsigned long load;
 	/* Total compute capacity of CPUs on a node */
 	unsigned long power;
 	/* Approximate capacity in terms of runnable tasks on a node */
 	unsigned long capacity;
 	/* non-zero when nr_running < capacity */
 	int has_capacity;
 };
 /*
  * Collect the statistics of node @nid into @ns: running tasks, load and
  * compute power summed over the node's CPUs, then the derived capacity.
  *
  * XXX borrowed from update_sg_lb_stats
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
 	int cpu, seen = 0;
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		ns->nr_running += cpu_rq(cpu)->nr_running;
 		ns->load += weighted_cpuload(cpu);
 		ns->power += power_of(cpu);
 		seen++;
 	}
 	/*
 	 * If we raced with hotplug and there are no CPUs left in our mask
 	 * the @ns structure is left zeroed and task_numa_compare() will
 	 * not find this node attractive.
 	 *
 	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
 	 * and bail there.
 	 */
 	if (seen) {
 		ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
 		ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
 		ns->has_capacity = (ns->nr_running < ns->capacity);
 	}
 }
 /*
  * Working state of one NUMA placement search (see task_numa_compare()
  * and task_numa_find_cpu()).
  */
 struct task_numa_env {
 	/* the task a better placement is being searched for */
 	struct task_struct *p;
 	/* where the task is now */
 	int src_cpu, src_nid;
 	/* candidate destination currently being evaluated */
 	int dst_cpu, dst_nid;
 	struct numa_stats src_stats, dst_stats;
 	/* load imbalance threshold, in percent, used by task_numa_compare() */
 	int imbalance_pct;
 	/*
 	 * Best candidate found so far, recorded by task_numa_assign():
 	 * the task to swap with (NULL for none), the improvement score
 	 * and the destination cpu.
 	 */
 	struct task_struct *best_task;
 	long best_imp;
 	int best_cpu;
 };
 /*
  * Record env->dst_cpu as the best destination found so far, together
  * with the task @p to swap with (may be NULL) and the improvement @imp.
  * The reference on the previous best task is dropped and one is taken
  * on @p.
  */
 static void task_numa_assign(struct task_numa_env *env,
 			     struct task_struct *p, long imp)
 {
 	struct task_struct *previous = env->best_task;
 	if (previous)
 		put_task_struct(previous);
 	if (p)
 		get_task_struct(p);
 	env->best_cpu = env->dst_cpu;
 	env->best_imp = imp;
 	env->best_task = p;
 }
 /*
  * This checks if the overall compute and NUMA accesses of the system would
  * be improved if the source tasks was migrated to the target dst_cpu taking
  * into account that it might be best if task running on the dst_cpu should
  * be exchanged with the source task
  *
  * @env:      search state; env->dst_cpu is the candidate being judged and
  *            env->best_* is updated (via task_numa_assign()) when the
  *            candidate is accepted
  * @taskimp:  improvement of the source task's own fault weight
  * @groupimp: improvement of the source task's group fault weight
  */
 static void task_numa_compare(struct task_numa_env *env,
 			      long taskimp, long groupimp)
 {
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
 	long dst_load, src_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
 	rcu_read_lock();
 	cur = ACCESS_ONCE(dst_rq->curr);
 	if (cur->pid == 0) /* idle */
 		cur = NULL;
 	/*
 	 * "imp" is the fault differential for the source task between the
 	 * source and destination node. Calculate the total differential for
 	 * the source task and potential destination task. The more negative
 	 * the value is, the more remote accesses that would be expected to
 	 * be incurred if the tasks were swapped.
 	 */
 	if (cur) {
 		/* Skip this swap candidate if cannot move to the source cpu */
 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
 			goto unlock;
 		/*
 		 * If dst and source tasks are in the same NUMA group, or not
 		 * in any group then look only at task weights.
 		 */
 		if (cur->numa_group == env->p->numa_group) {
 			imp = taskimp + task_weight(cur, env->src_nid) -
 			      task_weight(cur, env->dst_nid);
 			/*
 			 * Add some hysteresis to prevent swapping the
 			 * tasks within a group over tiny differences.
 			 */
 			if (cur->numa_group)
 				imp -= imp/16;
 		} else {
 			/*
 			 * Compare the group weights. If a task is all by
 			 * itself (not part of a group), use the task weight
 			 * instead.
 			 */
 			if (env->p->numa_group)
 				imp = groupimp;
 			else
 				imp = taskimp;
 			if (cur->numa_group)
 				imp += group_weight(cur, env->src_nid) -
 				       group_weight(cur, env->dst_nid);
 			else
 				imp += task_weight(cur, env->src_nid) -
 				       task_weight(cur, env->dst_nid);
 		}
 	}
 	/* not better than what has already been found */
 	if (imp < env->best_imp)
 		goto unlock;
 	if (!cur) {
 		/* Is there capacity at our destination? */
 		if (env->src_stats.has_capacity &&
 		    !env->dst_stats.has_capacity)
 			goto unlock;
 		goto balance;
 	}
 	/* Balance doesn't matter much if we're running a task per cpu */
 	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
 		goto assign;
 	/*
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
 	dst_load = env->dst_stats.load;
 	src_load = env->src_stats.load;
 	/* XXX missing power terms */
 	load = task_h_load(env->p);
 	dst_load += load;
 	src_load -= load;
 	/* a swap also moves cur's load in the opposite direction */
 	if (cur) {
 		load = task_h_load(cur);
 		dst_load -= load;
 		src_load += load;
 	}
 	/* make src_load the smaller */
 	if (dst_load < src_load)
 		swap(dst_load, src_load);
 	/* reject the move if it would leave the loads too far apart */
 	if (src_load * env->imbalance_pct < dst_load * 100)
 		goto unlock;
 assign:
 	task_numa_assign(env, cur, imp);
 unlock:
 	rcu_read_unlock();
 }
 /*
  * Evaluate every CPU of env->dst_nid that the source task is allowed to
  * run on, letting task_numa_compare() record the best one in @env.
  */
 static void task_numa_find_cpu(struct task_numa_env *env,
 				long taskimp, long groupimp)
 {
 	int candidate;
 	for_each_cpu(candidate, cpumask_of_node(env->dst_nid)) {
 		/* Only consider CPUs the source task can migrate to */
 		if (cpumask_test_cpu(candidate, tsk_cpus_allowed(env->p))) {
 			env->dst_cpu = candidate;
 			task_numa_compare(env, taskimp, groupimp);
 		}
 	}
 }
 /*
  * Try to move @p to a CPU on a node where it, or its NUMA group, has a
  * larger share of faults than on the node it currently runs on. The
  * preferred node is tried first (when it has capacity); every other online
  * node is only considered if that yields no candidate.
  *
  * Returns 0 on success, -EINVAL when the source CPU has no SD_NUMA domain,
  * -EAGAIN when no better CPU was found, or otherwise the non-zero result of
  * migrate_task_to() / migrate_swap().
  */
 static int task_numa_migrate(struct task_struct *p)
 {
 	struct task_numa_env env = {
 		.p = p,
 		.src_cpu = task_cpu(p),
 		.src_nid = task_node(p),
 		.imbalance_pct = 112,
 		.best_task = NULL,
 		.best_imp = 0,
 		.best_cpu = -1
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
 	int nid, ret, best_nid, prev_best_cpu;
 	long taskimp, groupimp;
 	/*
 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
 	 * imbalance and would be the first to start moving tasks about.
 	 *
 	 * And we want to avoid any moving of tasks about, as that would create
 	 * random movement of tasks -- counter the numa conditions we're trying
 	 * to satisfy here.
 	 */
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
 	if (sd)
 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 	rcu_read_unlock();
 	/*
 	 * Cpusets can break the scheduler domain tree into smaller
 	 * balance domains, some of which do not cross NUMA boundaries.
 	 * Tasks that are "trapped" in such domains cannot be migrated
 	 * elsewhere, so there is no point in (re)trying.
 	 */
 	if (unlikely(!sd)) {
 		p->numa_preferred_nid = task_node(p);
 		return -EINVAL;
 	}
 	taskweight = task_weight(p, env.src_nid);
 	groupweight = group_weight(p, env.src_nid);
 	update_numa_stats(&env.src_stats, env.src_nid);
 	env.dst_nid = p->numa_preferred_nid;
 	/* Any CPU picked before the fallback loop lives on the preferred node */
 	best_nid = env.dst_nid;
 	taskimp = task_weight(p, env.dst_nid) - taskweight;
 	groupimp = group_weight(p, env.dst_nid) - groupweight;
 	update_numa_stats(&env.dst_stats, env.dst_nid);
 	/* If the preferred nid has capacity, try to use it. */
 	if (env.dst_stats.has_capacity)
 		task_numa_find_cpu(&env, taskimp, groupimp);
 	/* No space available on the preferred nid. Look elsewhere. */
 	if (env.best_cpu == -1) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
 			/* Skip nodes where neither the task nor its group benefits */
 			taskimp = task_weight(p, nid) - taskweight;
 			groupimp = group_weight(p, nid) - groupweight;
 			if (taskimp < 0 && groupimp < 0)
 				continue;
 			env.dst_nid = nid;
 			update_numa_stats(&env.dst_stats, env.dst_nid);
 			prev_best_cpu = env.best_cpu;
 			task_numa_find_cpu(&env, taskimp, groupimp);
 			/*
 			 * env.dst_nid keeps being overwritten by later
 			 * iterations, so remember which node actually
 			 * supplied the current best CPU.
 			 */
 			if (env.best_cpu != prev_best_cpu)
 				best_nid = nid;
 		}
 	}
 	/* No better CPU than the current one was found. */
 	if (env.best_cpu == -1)
 		return -EAGAIN;
 	/*
 	 * Record the node that owns best_cpu, not the last node the loop
 	 * above happened to examine.
 	 */
 	sched_setnuma(p, best_nid);
 	/*
 	 * Reset the scan period if the task is being rescheduled on an
 	 * alternative node to recheck if the tasks is now properly placed.
 	 */
 	p->numa_scan_period = task_scan_min(p);
 	/* No task to swap with: plain migration to the chosen CPU */
 	if (env.best_task == NULL) {
 		ret = migrate_task_to(p, env.best_cpu);
 		if (ret != 0)
 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
 		return ret;
 	}
 	ret = migrate_swap(p, env.best_task);
 	if (ret != 0)
 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
 	/* Drop the reference held on best_task now that the swap was attempted */
 	put_task_struct(env.best_task);
 	return ret;
 }
 /*
  * Try to get @p running on its preferred NUMA node, and arm the timer for
  * the next retry.
  */
 static void numa_migrate_preferred(struct task_struct *p)
 {
 	/* Without a preferred node there is nothing to aim for */
 	if (unlikely(p->numa_preferred_nid == -1))
 		return;
 	/* No fault statistics have been allocated for this task yet */
 	if (unlikely(!p->numa_faults_memory))
 		return;
 	/* Schedule the next periodic retry one second (HZ jiffies) from now */
 	p->numa_migrate_retry = jiffies + HZ;
 	/* Only migrate when the task is not already on the preferred node */
 	if (task_node(p) != p->numa_preferred_nid)
 		task_numa_migrate(p);
 }
 /*
  * Maintain the set of nodes a NUMA group is actively running on, based on
  * the per-node CPU fault counts returned by group_faults_cpu().
  *
  * Hysteresis keeps the mask stable: a node joins the mask once its faults
  * exceed 6/16 of the busiest node's faults, and leaves it only after they
  * fall below 3/16 of that maximum.
  */
 static void update_numa_active_node_mask(struct numa_group *numa_group)
 {
 	unsigned long peak = 0, join_above, leave_below, seen;
 	int nid;
 	/* First pass: find the highest per-node fault count */
 	for_each_online_node(nid) {
 		seen = group_faults_cpu(numa_group, nid);
 		if (peak < seen)
 			peak = seen;
 	}
 	join_above = peak * 6 / 16;
 	leave_below = peak * 3 / 16;
 	/* Second pass: apply the two thresholds to every online node */
 	for_each_online_node(nid) {
 		seen = group_faults_cpu(numa_group, nid);
 		if (node_isset(nid, numa_group->active_nodes)) {
 			if (seen < leave_below)
 				node_clear(nid, numa_group->active_nodes);
 		} else if (seen > join_above) {
 			node_set(nid, numa_group->active_nodes);
 		}
 	}
 }
 /*
  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
  * increments. The more local the fault statistics are, the higher the scan
  * period will be for the next scan window. If the share of local faults,
  * local / (local + remote) scaled to 0..NUMA_PERIOD_SLOTS, is below
  * NUMA_PERIOD_THRESHOLD the scan period will decrease.
  */
 #define NUMA_PERIOD_SLOTS 10
 #define NUMA_PERIOD_THRESHOLD 3
 /*
  * Increase the scan period (slow down scanning) if the majority of
  * our memory is already on our local node, or if the majority of
  * the page accesses are shared with other processes.
  * Otherwise, decrease the scan period.
  *
  * @p:       task whose numa_scan_period is adjusted
  * @shared:  shared-access faults seen since the last placement
  * @private: private-access faults seen since the last placement
  *
  * Local/remote counts are read from p->numa_faults_locality[] and are
  * cleared again before returning on the normal path.
  */
 static void update_task_scan_period(struct task_struct *p,
 			unsigned long shared, unsigned long private)
 {
 	unsigned int period_slot;
 	int ratio;
 	int diff;
 	unsigned long remote = p->numa_faults_locality[0];
 	unsigned long local = p->numa_faults_locality[1];
 	/*
 	 * If there were no recorded hinting faults then either the task is
 	 * completely idle or all activity is in areas that are not of
 	 * interest to automatic numa balancing. Scan slower: double the
 	 * period (capped at numa_scan_period_max) and push out the mm's
 	 * next scan. Note that numa_faults_locality[] is not cleared on
 	 * this path.
 	 */
 	if (local + shared == 0) {
 		p->numa_scan_period = min(p->numa_scan_period_max,
 			p->numa_scan_period << 1);
 		p->mm->numa_next_scan = jiffies +
 			msecs_to_jiffies(p->numa_scan_period);
 		return;
 	}
 	/*
 	 * Prepare to scale scan period relative to the current period.
 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower),
 	 *	    by at least one slot even when ratio == threshold
 	 */
 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
 		/* Never leave the period unchanged on this branch */
 		if (!slot)
 			slot = 1;
 		diff = slot * period_slot;
 	} else {
 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
 		/*
 		 * Scale scan rate increases based on sharing. There is an
 		 * inverse relationship between the degree of sharing and
 		 * the adjustment made to the scanning period. Broadly
 		 * speaking the intent is that there is little point
 		 * scanning faster if shared accesses dominate as it may
 		 * simply bounce migrations uselessly
 		 */
 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
 	}
 	/* Keep the new period inside [task_scan_min(p), task_scan_max(p)] */
 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
 			task_scan_min(p), task_scan_max(p));
 	/* Start the next window with fresh local/remote counters */
 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 /*
  * Report how long @p has run since the previous NUMA placement pass.
  *
  * Returns the runtime and stores the length of the matching wall-clock
  * window in *@period; callers divide one by the other to get a fraction.
  * The scheduler's own runnable averages decay on a 32ms period, far shorter
  * than the dozens-of-seconds NUMA balancing period, so they are only used
  * as a stand-in while the task has no previous placement timestamp.
  *
  * The task's placement bookkeeping is advanced as a side effect.
  */
 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 {
 	/* The start of the current time slice is a cheap stand-in for "now" */
 	const u64 slice_start = p->se.exec_start;
 	const u64 total_exec = p->se.sum_exec_runtime;
 	u64 ran;
 	if (!p->last_task_numa_placement) {
 		/* First pass for this task: borrow the scheduler's averages */
 		ran = p->se.avg.runnable_avg_sum;
 		*period = p->se.avg.runnable_avg_period;
 	} else {
 		ran = total_exec - p->last_sum_exec_runtime;
 		*period = slice_start - p->last_task_numa_placement;
 	}
 	/* Remember where this window ended for the next call */
 	p->last_sum_exec_runtime = total_exec;
 	p->last_task_numa_placement = slice_start;
 	return ran;
 }
 /*
  * Fold the faults buffered since the last scan into @p's decayed per-node
  * statistics (and those of its NUMA group, if any), adapt the scan period,
  * and switch the preferred node when a different node now has the most
  * faults.
  *
  * Does nothing unless the mm's numa_scan_seq has advanced since the last
  * call for this task.
  */
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
 	unsigned long fault_types[2] = { 0, 0 };
 	unsigned long total_faults;
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 	/* Only run once per completed scan of the address space */
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 	total_faults = p->numa_faults_locality[0] +
 		       p->numa_faults_locality[1];
 	runtime = numa_get_avg_runtime(p, &period);
 	/* If the task is part of a group prevent parallel updates to group stats */
 	if (p->numa_group) {
 		group_lock = &p->numa_group->lock;
 		spin_lock_irq(group_lock);
 	}
 	/* Find the node with the highest number of faults */
 	for_each_online_node(nid) {
 		unsigned long faults = 0, group_faults = 0;
 		int priv, i;
 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
 			long diff, f_diff, f_weight;
 			i = task_faults_idx(nid, priv);
 			/* Decay existing window, copy faults since last scan */
 			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
 			fault_types[priv] += p->numa_faults_buffer_memory[i];
 			p->numa_faults_buffer_memory[i] = 0;
 			/*
 			 * Normalize the faults_from, so all tasks in a group
 			 * count according to CPU use, instead of by the raw
 			 * number of faults. Tasks with little runtime have
 			 * little over-all impact on throughput, and thus their
 			 * faults are less important.
 			 */
 			/* runtime/period as a 16-bit fixed point fraction; +1 avoids /0 */
 			f_weight = div64_u64(runtime << 16, period + 1);
 			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
 				   (total_faults + 1);
 			f_diff = f_weight - p->numa_faults_cpu[i] / 2;
 			p->numa_faults_buffer_cpu[i] = 0;
 			p->numa_faults_memory[i] += diff;
 			p->numa_faults_cpu[i] += f_diff;
 			faults += p->numa_faults_memory[i];
 			p->total_numa_faults += diff;
 			if (p->numa_group) {
 				/* safe because we can only change our own group */
 				p->numa_group->faults[i] += diff;
 				p->numa_group->faults_cpu[i] += f_diff;
 				p->numa_group->total_faults += diff;
 				group_faults += p->numa_group->faults[i];
 			}
 		}
 		/* Track the busiest node for the task ... */
 		if (faults > max_faults) {
 			max_faults = faults;
 			max_nid = nid;
 		}
 		/* ... and, separately, for its group */
 		if (group_faults > max_group_faults) {
 			max_group_faults = group_faults;
 			max_group_nid = nid;
 		}
 	}
 	/* fault_types[0] counts shared faults, fault_types[1] private ones */
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 	if (p->numa_group) {
 		update_numa_active_node_mask(p->numa_group);
 		/*
 		 * If the preferred task and group nids are different,
 		 * iterate over the nodes again to find the best place.
 		 */
 		if (max_nid != max_group_nid) {
 			unsigned long weight, max_weight = 0;
 			for_each_online_node(nid) {
 				weight = task_weight(p, nid) + group_weight(p, nid);
 				if (weight > max_weight) {
 					max_weight = weight;
 					max_nid = nid;
 				}
 			}
 		}
 		/* Pairs with the spin_lock_irq() taken before the node loop */
 		spin_unlock_irq(group_lock);
 	}
 	/* Preferred node as the node with the most faults */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
 		/* Update the preferred nid and migrate task if possible */
 		sched_setnuma(p, max_nid);
 		numa_migrate_preferred(p);
 	}
 }
 /*
  * Take a reference on @grp unless its refcount has already dropped to zero.
  * Returns non-zero when the reference was obtained.
  */
 static inline int get_numa_group(struct numa_group *grp)
 {
 	int got = atomic_inc_not_zero(&grp->refcount);
 	return got;
 }
 /*
  * Drop a reference on @grp; the last reference hands the group to
  * kfree_rcu() for freeing.
  */
 static inline void put_numa_group(struct numa_group *grp)
 {
 	/* Other references remain: nothing more to do */
 	if (!atomic_dec_and_test(&grp->refcount))
 		return;
 	kfree_rcu(grp, rcu);
 }
 /*
  * Consider moving @p into the NUMA group of the task that last touched the
  * faulting page.
  *
  * @p:      the faulting task
  * @cpupid: cpu+pid cookie identifying the previous accessor of the page
  * @flags:  TNF_* fault flags; TNF_SHARED forces a join
  * @priv:   out parameter, set to !join once the join decision is reached
  *
  * If @p has no group yet, a single-task group is allocated for it first.
  * When a join happens, @p's fault statistics are moved from its old group
  * to the new one and the reference on the old group is dropped.
  */
 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			int *priv)
 {
 	struct numa_group *grp, *my_grp;
 	struct task_struct *tsk;
 	bool join = false;
 	int cpu = cpupid_to_cpu(cpupid);
 	int i;
 	if (unlikely(!p->numa_group)) {
 		/* Room for the struct plus the faults and faults_cpu arrays */
 		unsigned int size = sizeof(struct numa_group) +
 				    4*nr_node_ids*sizeof(unsigned long);
 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
 		if (!grp)
 			return;
 		atomic_set(&grp->refcount, 1);
 		spin_lock_init(&grp->lock);
 		INIT_LIST_HEAD(&grp->task_list);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 		node_set(task_node(current), grp->active_nodes);
 		/* Seed the group statistics with the task's own counts */
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults_memory[i];
 		grp->total_faults = p->total_numa_faults;
 		list_add(&p->numa_entry, &grp->task_list);
 		grp->nr_tasks++;
 		rcu_assign_pointer(p->numa_group, grp);
 	}
 	rcu_read_lock();
 	/* Look at whatever task is currently running on the recorded CPU */
 	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
 	grp = rcu_dereference(tsk->numa_group);
 	if (!grp)
 		goto no_join;
 	my_grp = p->numa_group;
 	if (grp == my_grp)
 		goto no_join;
 	/*
 	 * Only join the other group if its bigger; if we're the bigger group,
 	 * the other task will join us.
 	 */
 	if (my_grp->nr_tasks > grp->nr_tasks)
 		goto no_join;
 	/*
 	 * Tie-break on the grp address.
 	 */
 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
 		goto no_join;
 	/* Always join threads in the same process. */
 	if (tsk->mm == current->mm)
 		join = true;
 	/* Simple filter to avoid false positives due to PID collisions */
 	if (flags & TNF_SHARED)
 		join = true;
 	/* Update priv based on whether false sharing was detected */
 	*priv = !join;
 	/* The target group may be on its way out; give up if so */
 	if (join && !get_numa_group(grp))
 		goto no_join;
 	rcu_read_unlock();
 	if (!join)
 		return;
 	BUG_ON(irqs_disabled());
 	double_lock_irq(&my_grp->lock, &grp->lock);
 	/* Transfer this task's contribution from the old group to the new */
 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
 		my_grp->faults[i] -= p->numa_faults_memory[i];
 		grp->faults[i] += p->numa_faults_memory[i];
 	}
 	my_grp->total_faults -= p->total_numa_faults;
 	grp->total_faults += p->total_numa_faults;
 	list_move(&p->numa_entry, &grp->task_list);
 	my_grp->nr_tasks--;
 	grp->nr_tasks++;
 	spin_unlock(&my_grp->lock);
 	spin_unlock_irq(&grp->lock);
 	rcu_assign_pointer(p->numa_group, grp);
 	/* Release the reference the task held on its previous group */
 	put_numa_group(my_grp);
 	return;
 no_join:
 	rcu_read_unlock();
 	return;
 }
 /*
  * Detach @p from its NUMA group (if any), subtracting the task's fault
  * counts from the group totals, then free the task's fault statistics.
  */
 void task_numa_free(struct task_struct *p)
 {
 	struct numa_group *grp = p->numa_group;
-	int i;
 	void *numa_faults = p->numa_faults_memory;
+	unsigned long flags;
+	int i;
 	if (grp) {
 		/*
 		 * The irqsave/irqrestore pair puts the interrupt state back
 		 * exactly as it was found on entry.
 		 */
-		spin_lock_irq(&grp->lock);
+		spin_lock_irqsave(&grp->lock, flags);
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] -= p->numa_faults_memory[i];
 		grp->total_faults -= p->total_numa_faults;
 		list_del(&p->numa_entry);
 		grp->nr_tasks--;
-		spin_unlock_irq(&grp->lock);
+		spin_unlock_irqrestore(&grp->lock, flags);
 		rcu_assign_pointer(p->numa_group, NULL);
 		put_numa_group(grp);
 	}
 	/*
 	 * The four pointers below all point into the single allocation made
 	 * in task_numa_fault(), so one kfree() of its base releases them all.
 	 */
 	p->numa_faults_memory = NULL;
 	p->numa_faults_buffer_memory = NULL;
 	p->numa_faults_cpu= NULL;
 	p->numa_faults_buffer_cpu = NULL;
 	kfree(numa_faults);
 }
 /*
  * Got a PROT_NONE (NUMA hinting) fault for a page on @mem_node.
  *
  * @last_cpupid: cpu+pid cookie of the previous accessor of the page
  * @mem_node:    node the faulting page resides on
  * @pages:       number of pages covered by this fault
  * @flags:       TNF_* flags describing the fault
  *
  * Records the fault in current's per-node buffers, possibly joins a NUMA
  * group, and runs task placement / preferred-node migration.
  */
 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 {
 	struct task_struct *p = current;
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int priv;
 	if (!numabalancing_enabled)
 		return;
 	/* for example, ksmd faulting in a user's mm */
 	if (!p->mm)
 		return;
 	/* Do not worry about placement if exiting */
 	if (p->state == TASK_DEAD)
 		return;
 	/* Allocate buffer to track faults on a per-node basis */
 	if (unlikely(!p->numa_faults_memory)) {
 		int size = sizeof(*p->numa_faults_memory) *
 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
 		p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
 		if (!p->numa_faults_memory)
 			return;
 		BUG_ON(p->numa_faults_buffer_memory);
 		/*
 		 * The averaged statistics, shared & private, memory & cpu,
 		 * occupy the first half of the array. The second half of the
 		 * array is for current counters, which are averaged into the
 		 * first set by task_numa_placement.
 		 */
 		p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
 		p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
 		p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
 		p->total_numa_faults = 0;
 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 	}
 	/*
 	 * First accesses are treated as private, otherwise consider accesses
 	 * to be private if the accessing pid has not changed
 	 */
 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
 		priv = 1;
 	} else {
 		priv = cpupid_match_pid(p, last_cpupid);
 		/* A shared access may be a reason to join the other task's group */
 		if (!priv && !(flags & TNF_NO_GROUP))
 			task_numa_group(p, last_cpupid, flags, &priv);
 	}
 	task_numa_placement(p);
 	/*
 	 * Retry task to preferred node migration periodically, in case it
 	 * previously failed, or the scheduler moved us.
 	 */
 	if (time_after(jiffies, p->numa_migrate_retry))
 		numa_migrate_preferred(p);
 	if (migrated)
 		p->numa_pages_migrated += pages;
 	/* Buffer the fault; task_numa_placement() folds these in later */
 	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
 	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
 	/* Index 1 counts local faults, index 0 everything else */
 	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 static void reset_ptenuma_scan(struct task_struct *p)
 {
 	ACCESS_ONCE(p->mm->numa_scan_seq)++;
 	p->mm->numa_scan_offset = 0;
 }
 /*
  * The expensive part of numa migration is done from task_work context.
  * Triggered from task_tick_numa().
  *
  * Marks up to sysctl_numa_balancing_scan_size MB of current's address space
  * via change_prot_numa(), resuming from mm->numa_scan_offset, so that later
  * accesses raise NUMA hinting faults. Only one task per mm wins the right
  * to scan in a given period (see the cmpxchg below).
  */
 void task_numa_work(struct callback_head *work)
 {
 	unsigned long migrate, next_scan, now = jiffies;
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
 	struct vm_area_struct *vma;
 	unsigned long start, end;
 	unsigned long nr_pte_updates = 0;
 	long pages;
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
 	work->next = work; /* protect against double add */
 	/*
 	 * Who cares about NUMA placement when they're dying.
 	 *
 	 * NOTE: make sure not to dereference p->mm before this check,
 	 * exit_task_work() happens _after_ exit_mm() so we could be called
 	 * without p->mm even though we still had it when we enqueued this
 	 * work.
 	 */
 	if (p->flags & PF_EXITING)
 		return;
 	/* First scan for this mm: honour the configured initial delay */
 	if (!mm->numa_next_scan) {
 		mm->numa_next_scan = now +
 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 	}
 	/*
 	 * Enforce maximal scan/migration frequency..
 	 */
 	migrate = mm->numa_next_scan;
 	if (time_before(now, migrate))
 		return;
 	if (p->numa_scan_period == 0) {
 		p->numa_scan_period_max = task_scan_max(p);
 		p->numa_scan_period = task_scan_min(p);
 	}
 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 	/* Whoever swaps in the new deadline does the scan; others back off */
 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
 		return;
 	/*
 	 * Delay this task enough that another task of this mm will likely win
 	 * the next time around.
 	 */
 	p->node_stamp += 2 * TICK_NSEC;
 	start = mm->numa_scan_offset;
 	pages = sysctl_numa_balancing_scan_size;
 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 	if (!pages)
 		return;
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, start);
 	/* Ran off the end of the address space: wrap around to the start */
 	if (!vma) {
 		reset_ptenuma_scan(p);
 		start = 0;
 		vma = mm->mmap;
 	}
 	for (; vma; vma = vma->vm_next) {
 		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
 			continue;
 		/*
 		 * Shared library pages mapped by multiple processes are not
 		 * migrated as it is expected they are cache replicated. Avoid
 		 * hinting faults in read-only file-backed mappings or the vdso
 		 * as migrating the pages will be of marginal benefit.
 		 */
 		if (!vma->vm_mm ||
 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 		/*
 		 * Skip inaccessible VMAs to avoid any confusion between
 		 * PROT_NONE and NUMA hinting ptes
 		 */
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 			continue;
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 			end = min(end, vma->vm_end);
 			nr_pte_updates += change_prot_numa(vma, start, end);
 			/*
 			 * Scan sysctl_numa_balancing_scan_size but ensure that
 			 * at least one PTE is updated so that unused virtual
 			 * address space is quickly skipped.
 			 */
 			if (nr_pte_updates)
 				pages -= (end - start) >> PAGE_SHIFT;
 			start = end;
 			if (pages <= 0)
 				goto out;
 			cond_resched();
 		} while (end != vma->vm_end);
 	}
 out:
 	/*
 	 * It is possible to reach the end of the VMA list but the last few
 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
 	 * would find the !migratable VMA on the next scan but not reset the
 	 * scanner to the start so check it now.
 	 */
 	if (vma)
 		mm->numa_scan_offset = start;
 	else
 		reset_ptenuma_scan(p);
 	up_read(&mm->mmap_sem);
 }
 /*
  * Scheduler-tick hook that drives the periodic NUMA hinting faults: once
  * @curr has accumulated a full scan period of runtime, queue
  * task_numa_work() to run in task_work context.
  */
 void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 	struct callback_head *work = &curr->numa_work;
 	u64 window, ran;
 	/* We don't care about NUMA placement if we don't have memory. */
 	if (!curr->mm)
 		return;
 	/* Nor when the task is on its way out */
 	if (curr->flags & PF_EXITING)
 		return;
 	/* work->next only points back at work while it is not queued */
 	if (work->next != work)
 		return;
 	/*
 	 * Runtime, not wall time, is the yardstick: busy threads (mostly)
 	 * drive the scanning, and a task has to do real work before any
 	 * effort is spent on its NUMA placement.
 	 */
 	ran = curr->se.sum_exec_runtime;
 	window = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 	if (ran - curr->node_stamp <= window)
 		return;
 	/* Very first window for this task: start at the minimum period */
 	if (!curr->node_stamp)
 		curr->numa_scan_period = task_scan_min(curr);
 	curr->node_stamp += window;
 	/* The mm-wide rate limit has not expired yet */
 	if (time_before(jiffies, curr->mm->numa_next_scan))
 		return;
 	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
 	task_work_add(curr, work, true);
 }
 #else
 /*
  * CONFIG_NUMA_BALANCING is disabled: the NUMA hooks below are empty stubs
  * so that callers need no #ifdefs of their own.
  */
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
 /* No NUMA accounting on enqueue without CONFIG_NUMA_BALANCING */
 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 }
 /* No NUMA accounting on dequeue without CONFIG_NUMA_BALANCING */
 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 #endif /* CONFIG_NUMA_BALANCING */
 /*
  * Account @se as enqueued on @cfs_rq: add its weight to the cfs_rq load
  * (and to the rq load for top-level entities), and on SMP link tasks into
  * the rq's cfs_tasks list.
  */
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
 	/* Only entities without a parent contribute to the rq-wide load */
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
 	if (entity_is_task(se)) {
 		struct rq *owner = rq_of(cfs_rq);
 		struct task_struct *tsk = task_of(se);
 		account_numa_enqueue(owner, tsk);
 		list_add(&se->group_node, &owner->cfs_tasks);
 	}
 #endif
 	cfs_rq->nr_running += 1;
 }
 /*
  * Undo account_entity_enqueue(): remove @se's weight from the cfs_rq load
  * (and from the rq load for top-level entities) and unlink tasks from the
  * list they were put on via group_node.
  */
 static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	/* Only entities without a parent contributed to the rq-wide load */
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
 	if (entity_is_task(se)) {
 		struct task_struct *tsk = task_of(se);
 		account_numa_dequeue(rq_of(cfs_rq), tsk);
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running -= 1;
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
 /*
  * Estimate the task group's current total weight as seen from @cfs_rq.
  */
 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 {
 	long total = atomic_long_read(&tg->load_avg);
 	/*
 	 * Swap this CPU's last published contribution for its live weight,
 	 * which gives a more accurate current total. See
 	 * update_cfs_rq_load_contribution().
 	 */
 	total += cfs_rq->load.weight;
 	total -= cfs_rq->tg_load_contrib;
 	return total;
 }
 /*
  * Compute the portion of tg->shares that belongs to @cfs_rq: shares scaled
  * by this cfs_rq's weight over the group's total weight, kept at or above
  * MIN_SHARES and never above tg->shares.
  */
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long group_total = calc_tg_weight(tg, cfs_rq);
 	long local = cfs_rq->load.weight;
 	long portion = (tg->shares * local);
 	/* A zero group weight would divide by zero; leave the product as is */
 	if (group_total)
 		portion /= group_total;
 	if (portion < MIN_SHARES)
 		portion = MIN_SHARES;
 	if (portion > tg->shares)
 		portion = tg->shares;
 	return portion;
 }
 # else /* CONFIG_SMP */
 /* Uniprocessor build: the single cfs_rq receives all of the group's shares. */
 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long all = tg->shares;
 	return all;
 }
 # endif /* CONFIG_SMP */
 /*
  * Change the load weight of @se to @weight. A queued entity is taken out
  * of the cfs_rq's load accounting first and put back afterwards, so the
  * accounting always sees a consistent weight.
  */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
 	if (se->on_rq) {
 		/* Charge the running entity for time spent at the old weight */
 		if (se == cfs_rq->curr)
 			update_curr(cfs_rq);
 		account_entity_dequeue(cfs_rq, se);
 	}
 	update_load_set(&se->load, weight);
 	if (se->on_rq) {
 		account_entity_enqueue(cfs_rq, se);
 	}
 }
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 /*
  * Recompute the weight of the group entity that represents @cfs_rq on this
  * CPU and apply it on the entity's own cfs_rq.
  */
 static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct sched_entity *se = tg->se[cpu_of(rq_of(cfs_rq))];
 	/* No group entity for this CPU: nothing to reweight */
 	if (!se)
 		return;
 	if (throttled_hierarchy(cfs_rq))
 		return;
 #ifndef CONFIG_SMP
 	/* On UP the entity weight is simply tg->shares; skip if already set */
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
 	reweight_entity(cfs_rq_of(se), se, calc_cfs_shares(cfs_rq, tg));
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
 /* Without CONFIG_FAIR_GROUP_SCHED there are no group shares to update. */
 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_SMP
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
  */
 #define LOAD_AVG_PERIOD 32
 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
 /*
  * Precomputed fixed inverse multiplies for multiplication by y^n.
  * Indexed by n in 0..LOAD_AVG_PERIOD-1; decay_load() multiplies by an entry
  * and shifts right by 32.
  */
 static const u32 runnable_avg_yN_inv[] = {
 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
 	0x85aac367, 0x82cd8698,
 };
 /*
  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
  * over-estimates when re-combining.
  * Indexed by n in 0..LOAD_AVG_PERIOD (inclusive).
  */
 static const u32 runnable_avg_yN_sum[] = {
 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
 };
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  *
  * Returns @val unchanged for n == 0 and 0 once n exceeds
  * LOAD_AVG_PERIOD * 63.
  */
 static __always_inline u64 decay_load(u64 val, u64 n)
 {
 	unsigned int periods;
 	if (n == 0)
 		return val;
 	if (unlikely(n > LOAD_AVG_PERIOD * 63))
 		return 0;
 	/* The bound checked above lets n fit into 32 bits from here on */
 	periods = n;
 	/*
 	 * Since y^PERIOD = 1/2, y^n splits into
 	 *    1/2^(n/PERIOD) * y^(n%PERIOD)
 	 * The first factor is a shift, the second a table look-up
 	 * (n%PERIOD < PERIOD), which makes decay_load constant time.
 	 */
 	if (unlikely(periods >= LOAD_AVG_PERIOD)) {
 		val >>= periods / LOAD_AVG_PERIOD;
 		periods %= LOAD_AVG_PERIOD;
 	}
 	/* Plain shift rather than SRR: the result must always round down */
 	return (val * runnable_avg_yN_inv[periods]) >> 32;
 }
 /*
  * For updates fully spanning n periods, the contribution to runnable
  * average will be: \Sum 1024*y^n
  *
  * This is computed by combining y^PERIOD = 1/2 with the precomputed
  * partial sums \Sum 1024*y^n {for n <= PERIOD}. The result saturates at
  * LOAD_AVG_MAX once n reaches LOAD_AVG_MAX_N.
  */
 static u32 __compute_runnable_contrib(u64 n)
 {
 	u32 acc = 0;
 	/* Common case: the answer comes straight out of the table */
 	if (likely(n <= LOAD_AVG_PERIOD))
 		return runnable_avg_yN_sum[n];
 	if (unlikely(n >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 	/*
 	 * Peel off whole PERIODs: each one halves what has been accumulated
 	 * so far (y^LOAD_AVG_PERIOD = 1/2) and adds a full-period sum.
 	 * n > LOAD_AVG_PERIOD holds on entry, so the body runs at least once.
 	 */
 	while (n > LOAD_AVG_PERIOD) {
 		acc = (acc >> 1) + runnable_avg_yN_sum[LOAD_AVG_PERIOD];
 		n -= LOAD_AVG_PERIOD;
 	}
 	/* Age the accumulated part by the remaining n periods, add the rest */
 	acc = decay_load(acc, n);
 	return acc + runnable_avg_yN_sum[n];
 }
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
  *
  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
  *      p0            p1           p2
  *     (now)       (~1ms ago)  (~2ms ago)
  *
  * Let u_i denote the fraction of p_i that the entity was runnable.
  *
  * We then designate the fractions u_i as our co-efficients, yielding the
  * following representation of historical load:
  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
  *
  * We choose y based on the width of a reasonable scheduling period, fixing:
  *   y^32 = 0.5
  *
  * This means that the contribution to load ~32ms ago (u_32) will be weighted
  * approximately half as much as the contribution to load within the last ms
  * (u_0).
  *
  * When a period "rolls over" and we have new u_0`, multiplying the previous
  * sum again by y is sufficient to update:
  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  *
  * @now:      current timestamp, in the same units as sa->last_runnable_update
  * @sa:       the averages to update
  * @runnable: non-zero if the elapsed time counts as runnable time
  *
  * Returns 1 if at least one period boundary was crossed (the sums were
  * decayed), 0 otherwise.
  */
 static __always_inline int __update_entity_runnable_avg(u64 now,
 							struct sched_avg *sa,
 							int runnable)
 {
 	u64 delta, periods;
 	u32 runnable_contrib;
 	int delta_w, decayed = 0;
 	delta = now - sa->last_runnable_update;
 	/*
 	 * This should only happen when time goes backwards, which it
 	 * unfortunately does during sched clock init when we swap over to TSC.
 	 */
 	if ((s64)delta < 0) {
 		sa->last_runnable_update = now;
 		return 0;
 	}
 	/*
 	 * Use 1024ns as the unit of measurement since it's a reasonable
 	 * approximation of 1us and fast to compute.
 	 */
 	delta >>= 10;
 	/* Less than one unit elapsed: leave the timestamp so it accumulates */
 	if (!delta)
 		return 0;
 	sa->last_runnable_update = now;
 	/* delta_w is the amount already accumulated against our next period */
 	delta_w = sa->runnable_avg_period % 1024;
 	if (delta + delta_w >= 1024) {
 		/* period roll-over */
 		decayed = 1;
 		/*
 		 * Now that we know we're crossing a period boundary, figure
 		 * out how much from delta we need to complete the current
 		 * period and accrue it.
 		 */
 		delta_w = 1024 - delta_w;
 		if (runnable)
 			sa->runnable_avg_sum += delta_w;
 		sa->runnable_avg_period += delta_w;
 		delta -= delta_w;
 		/* Figure out how many additional periods this update spans */
 		periods = delta / 1024;
 		delta %= 1024;
 		/* Age the history by the completed period plus the full ones */
 		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
 						  periods + 1);
 		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
 						     periods + 1);
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 		runnable_contrib = __compute_runnable_contrib(periods);
 		if (runnable)
 			sa->runnable_avg_sum += runnable_contrib;
 		sa->runnable_avg_period += runnable_contrib;
 	}
 	/* Remainder of delta accrued against u_0` */
 	if (runnable)
 		sa->runnable_avg_sum += delta;
 	sa->runnable_avg_period += delta;
 	return decayed;
 }
 /*
  * Synchronize an entity's decay with its parenting cfs_rq.
  *
  * Applies to se->avg.load_avg_contrib the decay periods the cfs_rq has
  * advanced beyond se->avg.decay_count, and clears decay_count.
  *
  * Returns the number of decay periods applied (0 if none were pending).
  */
 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
 	decays -= se->avg.decay_count;
 	/*
 	 * Clear the count even when no decay is pending; otherwise a stale
 	 * non-zero value stays on the entity after it has been synchronized.
 	 * NOTE(review): presumably other paths treat a non-zero decay_count
 	 * as "entity is blocked" -- confirm against the users of decay_count.
 	 */
 	se->avg.decay_count = 0;
 	if (!decays)
 		return 0;
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
 	return decays;
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Publish @cfs_rq's current load (runnable + blocked) into the task group's
  * load_avg. Small changes, at most 1/8 of the last published contribution,
  * are skipped unless @force_update is set.
  */
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update)
 {
 	struct task_group *tg = cfs_rq->tg;
 	long change;
 	/* Difference between the live load and what was last published */
 	change = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
 	change -= cfs_rq->tg_load_contrib;
 	if (!force_update && abs(change) <= cfs_rq->tg_load_contrib / 8)
 		return;
 	atomic_long_add(change, &tg->load_avg);
 	cfs_rq->tg_load_contrib += change;
 }
 /*
  * Fold a cfs_rq's runnable average into its task group's runnable_avg, so
  * the group-wide figure can be used when computing load contributions.
  * Changes of at most 1/64 of the last published value are not propagated.
  */
 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 						  struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	long change;
 	/* The fraction of a cpu used by this cfs_rq; +1 avoids dividing by 0 */
 	change = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
 			 sa->runnable_avg_period + 1);
 	change -= cfs_rq->tg_runnable_contrib;
 	if (abs(change) <= cfs_rq->tg_runnable_contrib / 64)
 		return;
 	atomic_add(change, &tg->runnable_avg);
 	cfs_rq->tg_runnable_contrib += change;
 }
 /*
  * Recompute the load contribution of a group entity.
  *
  * The base value is the group cfs_rq's published load contribution weighted
  * by the group's shares and divided by (group load_avg + 1).  If the group as
  * a whole is using less than one cpu, the value is scaled down further.
  */
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 	struct task_group *tg = gcfs_rq->tg;
 	u64 weighted;
 	int tg_runnable;

 	weighted = gcfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div_u64(weighted,
 					   atomic_long_read(&tg->load_avg) + 1);

 	/*
 	 * For group entities we need to compute a correction term in the case
 	 * that they are consuming <1 cpu so that we would contribute the same
 	 * load as a task of equal weight.
 	 *
 	 * Explicitly co-ordinating this measurement would be expensive, but
 	 * fortunately the sum of each cpu's contribution forms a usable
 	 * lower-bound on the true value.
 	 *
 	 * Consider the aggregate of 2 contributions.  Either they are disjoint
 	 * (and the sum represents the true value) or they overlap and we are
 	 * understating by the aggregate of their overlap.
 	 *
 	 * Extending this to N cpus, for a given overlap, the maximum amount we
 	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
 	 * cpus that overlap for this interval and w_i is the interval width.
 	 *
 	 * On a small machine the first term is well-bounded, which bounds the
 	 * total error since w_i is a subset of the period.  Whereas on a
 	 * larger machine, while this first term can be larger, if w_i is of
 	 * consequential size we are guaranteed to see n_i*w_i quickly converge
 	 * to our upper bound of 1-cpu.
 	 */
 	tg_runnable = atomic_read(&tg->runnable_avg);
 	if (tg_runnable >= NICE_0_LOAD)
 		return;

 	se->avg.load_avg_contrib *= tg_runnable;
 	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
 }
 /*
  * Update the runnable average kept in rq->avg up to rq_clock_task(rq), then
  * feed the refreshed average into the task-group accounting of rq->cfs.
  *
  * @rq:       runqueue whose average is updated
  * @runnable: non-zero if the elapsed time should be accounted as runnable
  */
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
 	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
 /*
  * Empty stand-ins for the task-group load tracking helpers, used when
  * CONFIG_FAIR_GROUP_SCHED is not set so that callers need no #ifdefs.
  */
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 						  struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 /*
  * Recompute the load contribution of a task entity: its (scaled-down) weight
  * multiplied by runnable_avg_sum and divided by (runnable_avg_period + 1).
  * The result is scaled back up with scale_load() before being stored.
  */
 static inline void __update_task_entity_contrib(struct sched_entity *se)
 {
 	/* intermediate math is deliberately done in 32 bits */
 	u32 contrib;
 	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
 	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 	/* NOTE(review): the +1 presumably guards against a zero period */
 	contrib /= (se->avg.runnable_avg_period + 1);
 	se->avg.load_avg_contrib = scale_load(contrib);
 }
 /*
  * Refresh se's contribution to load_avg.
  *
  * Returns the signed change: new contribution minus the previous one.
  */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
 	const long before = se->avg.load_avg_contrib;

 	if (!entity_is_task(se)) {
 		/* group entity: publish its runnable average first */
 		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	} else {
 		__update_task_entity_contrib(se);
 	}

 	return se->avg.load_avg_contrib - before;
 }
 /*
  * Remove @load_contrib from cfs_rq->blocked_load_avg, clamping the result at
  * zero instead of letting the sum wrap.
  */
 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
 						 long load_contrib)
 {
 	if (unlikely(load_contrib >= cfs_rq->blocked_load_avg)) {
 		cfs_rq->blocked_load_avg = 0;
 		return;
 	}

 	cfs_rq->blocked_load_avg -= load_contrib;
 }
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 /*
  * Bring a sched_entity's runnable average up to date.
  *
  * @se:            entity to update
  * @update_cfs_rq: when non-zero, also apply the resulting change in load
  *                 contribution to the owning cfs_rq (to runnable_load_avg if
  *                 the entity is on the runqueue, otherwise to the blocked
  *                 load).
  *
  * Nothing further happens if __update_entity_runnable_avg() returns 0.
  */
 static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct cfs_rq *clock_rq;
 	long delta;

 	/*
 	 * A group entity is timed against the cfs_rq it owns, in case it is
 	 * the parent of a throttled hierarchy; a task uses the cfs_rq it is
 	 * queued on.
 	 */
 	clock_rq = entity_is_task(se) ? cfs_rq : group_cfs_rq(se);

 	if (!__update_entity_runnable_avg(cfs_rq_clock_task(clock_rq),
 					  &se->avg, se->on_rq))
 		return;

 	delta = __update_entity_load_avg_contrib(se);
 	if (!update_cfs_rq)
 		return;

 	if (!se->on_rq) {
 		subtract_blocked_load_contrib(cfs_rq, -delta);
 		return;
 	}

 	cfs_rq->runnable_load_avg += delta;
 }
 /*
  * Age the load contributed by blocked children of @cfs_rq.
  *
  * Time is measured in units of (cfs_rq_clock_task() >> 20).  If no such unit
  * has elapsed since last_decay and @force_update is zero, nothing is done.
  * Otherwise any load queued in removed_load is taken off the blocked sum,
  * the blocked sum is decayed by the elapsed units (which are also added to
  * decay_counter so entities can later catch up), and the task-group
  * contribution is refreshed.
  */
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
 	u64 period_now = cfs_rq_clock_task(cfs_rq) >> 20;
 	u64 elapsed = period_now - cfs_rq->last_decay;

 	if (!(elapsed || force_update))
 		return;

 	if (atomic_long_read(&cfs_rq->removed_load)) {
 		unsigned long gone = atomic_long_xchg(&cfs_rq->removed_load, 0);

 		subtract_blocked_load_contrib(cfs_rq, gone);
 	}

 	if (elapsed) {
 		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
 						      elapsed);
 		atomic64_add(elapsed, &cfs_rq->decay_counter);
 		cfs_rq->last_decay = period_now;
 	}

 	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
 }
 /*
  * Add the load generated by se into cfs_rq's child load-average.
  *
  * @cfs_rq: runqueue the entity is being enqueued on
  * @se:     entity being enqueued
  * @wakeup: non-zero when the enqueue is a wake-up; cleared locally when the
  *          entity is recognised as migrated/new (decay_count <= 0)
  *
  * On exit se's load_avg_contrib has been added to runnable_load_avg and the
  * cfs_rq's blocked load has been considered for update (forced when this is
  * not a plain wake-up).
  */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  struct sched_entity *se,
 						  int wakeup)
 {
 	/*
 	 * We track migrations using entity decay_count <= 0, on a wake-up
 	 * migration we use a negative decay count to track the remote decays
 	 * accumulated while sleeping.
 	 *
 	 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
 	 * are seen by enqueue_entity_load_avg() as a migration with an already
 	 * constructed load_avg_contrib.
 	 */
 	if (unlikely(se->avg.decay_count <= 0)) {
 		se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
 		if (se->avg.decay_count) {
 			/*
 			 * In a wake-up migration we have to approximate the
 			 * time sleeping.  This is because we can't synchronize
 			 * clock_task between the two cpus, and it is not
 			 * guaranteed to be read-safe.  Instead, we can
 			 * approximate this using our carried decays, which are
 			 * explicitly atomically readable.
 			 */
 			/* each carried decay is worth 1 << 20 clock units */
 			se->avg.last_runnable_update -= (-se->avg.decay_count)
 							<< 20;
 			update_entity_load_avg(se, 0);
 			/* Indicate that we're now synchronized and on-rq */
 			se->avg.decay_count = 0;
 		}
 		/* treat as a non-wakeup from here on */
 		wakeup = 0;
 	} else {
 		/* local wake-up: apply the decays missed while blocked */
 		__synchronize_entity_decay(se);
 	}
 	/* migrated tasks did not contribute to our blocked load */
 	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 		update_entity_load_avg(se, 0);
 	}
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
 /*
  * Take se's load out of this cfs_rq's runnable load-average.
  *
  * @sleep: non-zero when the entity is going to block.  In that case its
  *         contribution is moved to blocked_load_avg and the cfs_rq's current
  *         decay_counter is recorded in the entity so the time spent blocked
  *         can be decayed later.
  */
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  struct sched_entity *se,
 						  int sleep)
 {
 	update_entity_load_avg(se, 1);
 	/* we force update consideration on load-balancer moves */
 	update_cfs_rq_blocked_load(cfs_rq, !sleep);

 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;

 	/* migrations, e.g. sleep=0 leave decay_count == 0 */
 	if (!sleep)
 		return;

 	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 }
 /*
  * Update the rq's load with the elapsed running time before entering
  * idle. If the last scheduled task is not a CFS task, idle_enter will
  * be the only way to update the runnable statistic.
  *
  * The elapsed time is accounted as runnable (runnable == 1).
  */
 void idle_enter_fair(struct rq *this_rq)
 {
 	update_rq_runnable_avg(this_rq, 1);
 }
 /*
  * Update the rq's load with the elapsed idle time before a task is
  * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
  * be the only way to update the runnable statistic.
  *
  * The elapsed time is accounted as not runnable (runnable == 0).
  */
 void idle_exit_fair(struct rq *this_rq)
 {
 	update_rq_runnable_avg(this_rq, 0);
 }
 /* forward declaration; the definition is not in this part of the file */
 static int idle_balance(struct rq *this_rq);
 #else /* CONFIG_SMP */
 /*
  * Empty stand-ins for the load-tracking entry points, used when CONFIG_SMP
  * is not set so that the enqueue/dequeue/tick paths need no #ifdefs.
  */
 static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 					   struct sched_entity *se,
 					   int wakeup) {}
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 					   struct sched_entity *se,
 					   int sleep) {}
 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 					      int force_update) {}
 /* without SMP there is nothing to balance: always reports 0 */
 static inline int idle_balance(struct rq *rq)
 {
 	return 0;
 }
 #endif /* CONFIG_SMP */
 /*
  * Account the time an entity spent sleeping and/or blocked, at the moment it
  * is enqueued again.  Only does anything when CONFIG_SCHEDSTATS is set.
  *
  * A non-zero statistics.sleep_start / statistics.block_start marks a pending
  * sleep / block interval; each is measured against rq_clock(), folded into
  * the entity's statistics and then cleared.  Task-only reporting (latency
  * accounting, tracepoints, iowait and sleep profiling) is skipped for group
  * entities.
  */
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
 	/* stays NULL for group entities */
 	struct task_struct *tsk = NULL;
 	if (entity_is_task(se))
 		tsk = task_of(se);
 	if (se->statistics.sleep_start) {
 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
 		/* clock went backwards relative to the stamp: count nothing */
 		if ((s64)delta < 0)
 			delta = 0;
 		if (unlikely(delta > se->statistics.sleep_max))
 			se->statistics.sleep_max = delta;
 		se->statistics.sleep_start = 0;
 		se->statistics.sum_sleep_runtime += delta;
 		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
 			trace_sched_stat_sleep(tsk, delta);
 		}
 	}
 	if (se->statistics.block_start) {
 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
 		/* clock went backwards relative to the stamp: count nothing */
 		if ((s64)delta < 0)
 			delta = 0;
 		if (unlikely(delta > se->statistics.block_max))
 			se->statistics.block_max = delta;
 		se->statistics.block_start = 0;
 		se->statistics.sum_sleep_runtime += delta;
 		if (tsk) {
 			if (tsk->in_iowait) {
 				se->statistics.iowait_sum += delta;
 				se->statistics.iowait_count++;
 				trace_sched_stat_iowait(tsk, delta);
 			}
 			trace_sched_stat_blocked(tsk, delta);
 			/*
 			 * Blocking time is in units of nanosecs, so shift by
 			 * 20 to get a milliseconds-range estimation of the
 			 * amount of time that the task spent sleeping:
 			 */
 			if (unlikely(prof_on == SLEEP_PROFILING)) {
 				profile_hits(SLEEP_PROFILING,
 						(void *)get_wchan(tsk),
 						delta >> 20);
 			}
 			account_scheduler_latency(tsk, delta >> 10, 0);
 		}
 	}
 #endif
 }
 /*
  * Debug aid (CONFIG_SCHED_DEBUG only): bump the nr_spread_over statistic
  * when se's vruntime is more than three scheduling latencies away from the
  * cfs_rq's min_vruntime, in either direction.
  */
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
 	s64 spread = se->vruntime - cfs_rq->min_vruntime;

 	/* only the distance matters, not the sign */
 	spread = spread < 0 ? -spread : spread;

 	if (spread > 3*sysctl_sched_latency)
 		schedstat_inc(cfs_rq, nr_spread_over);
 #endif
 }
 /*
  * Choose the vruntime an entity starts from when it is (re)inserted.
  *
  * @initial: non-zero for the first placement of a new entity, zero for an
  *           entity coming back from sleep.
  *
  * The candidate starts at cfs_rq->min_vruntime and is pushed forward for new
  * entities (START_DEBIT) or pulled back for sleepers.  The entity's vruntime
  * is never moved backwards: the larger of its current value and the
  * candidate wins.
  */
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 target = cfs_rq->min_vruntime;

 	if (initial) {
 		/*
 		 * The 'current' period is already promised to the current
 		 * tasks, however the extra weight of the new task will slow
 		 * them down a little, place the new task so that it fits in
 		 * the slot that stays open at the end.
 		 */
 		if (sched_feat(START_DEBIT))
 			target += sched_vslice(cfs_rq, se);
 	} else {
 		/* sleeps up to a single latency don't count. */
 		unsigned long credit = sysctl_sched_latency;

 		/*
 		 * Halve their sleep time's effect, to allow
 		 * for a gentler effect of sleepers:
 		 */
 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
 			credit >>= 1;

 		target -= credit;
 	}

 	/* ensure we never gain time by being placed backwards. */
 	se->vruntime = max_vruntime(se->vruntime, target);
 }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 /*
  * Put @se on @cfs_rq.
  *
  * @flags: ENQUEUE_* bits.  ENQUEUE_WAKEUP selects sleeper placement and sleep
  *         accounting; a non-wakeup enqueue (or one with ENQUEUE_WAKING) first
  *         turns the entity's relative vruntime back into an absolute one.
  *
  * The order of the steps below matters: vruntime is de-normalized before
  * update_curr() can move min_vruntime, and load/shares are updated before
  * the entity is placed and inserted into the tree.
  */
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	/*
 	 * Update the normalized vruntime before updating min_vruntime
 	 * through calling update_curr().
 	 */
 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
 		se->vruntime += cfs_rq->min_vruntime;
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
 		enqueue_sleeper(cfs_rq, se);
 	}
 	update_stats_enqueue(cfs_rq, se);
 	check_spread(cfs_rq, se);
 	/* the running entity is not kept in the tree */
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 	/* first entity on this cfs_rq: link it as a leaf and check throttling */
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
 		check_enqueue_throttle(cfs_rq);
 	}
 }
 /*
  * Walk up from @se and clear the 'last' buddy hint on every level where it
  * points at the entity being visited; stop at the first level where it
  * does not.
  */
 static void __clear_buddies_last(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *owner = cfs_rq_of(se);

 		if (owner->last == se) {
 			owner->last = NULL;
 			continue;
 		}
 		break;
 	}
 }
 /*
  * Walk up from @se and clear the 'next' buddy hint on every level where it
  * points at the entity being visited; stop at the first level where it
  * does not.
  */
 static void __clear_buddies_next(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *owner = cfs_rq_of(se);

 		if (owner->next == se) {
 			owner->next = NULL;
 			continue;
 		}
 		break;
 	}
 }
 /*
  * Walk up from @se and clear the 'skip' buddy hint on every level where it
  * points at the entity being visited; stop at the first level where it
  * does not.
  */
 static void __clear_buddies_skip(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *owner = cfs_rq_of(se);

 		if (owner->skip == se) {
 			owner->skip = NULL;
 			continue;
 		}
 		break;
 	}
 }
 /*
  * Drop every buddy hint (last, next, skip) on @cfs_rq that currently refers
  * to @se.  Each hint is checked independently and, when it matches, cleared
  * via the corresponding __clear_buddies_*() walker.
  */
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->last == se)
 		__clear_buddies_last(se);
 	if (cfs_rq->next == se)
 		__clear_buddies_next(se);
 	if (cfs_rq->skip == se)
 		__clear_buddies_skip(se);
 }
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 /*
  * Take @se off @cfs_rq.
  *
  * @flags: DEQUEUE_* bits.  With DEQUEUE_SLEEP the entity is going to sleep:
  *         its load moves to the blocked sum and (with CONFIG_SCHEDSTATS) the
  *         start of the sleep/block interval is stamped.  Without it the
  *         entity's vruntime is made relative to min_vruntime.
  */
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
 			/* stamps are consumed later by enqueue_sleeper() */
 			if (tsk->state & TASK_INTERRUPTIBLE)
 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
 		}
 #endif
 	}
 	clear_buddies(cfs_rq, se);
 	/* the running entity is not kept in the tree */
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
 	account_entity_dequeue(cfs_rq, se);
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
 	 * update can refer to the ->curr item and we need to reflect this
 	 * movement in our normalized position.
 	 */
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
 }
 /*
  * Tick-time preemption check for the running entity @curr.
  *
  * A reschedule is requested when either
  *  - curr has run longer than its ideal slice since it was last picked, or
  *  - curr has run for at least sysctl_sched_min_granularity and its vruntime
  *    is ahead of the leftmost queued entity by more than the ideal slice.
  *
  * NOTE(review): the leftmost entity is dereferenced without a NULL check;
  * the caller visible in this file (entity_tick) only calls this when
  * cfs_rq->nr_running > 1.
  */
 static void
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	unsigned long ideal_runtime, delta_exec;
 	struct sched_entity *se;
 	s64 delta;
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	/* runtime consumed since curr was last selected to run */
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 	if (delta_exec > ideal_runtime) {
 		resched_task(rq_of(cfs_rq)->curr);
 		/*
 		 * The current task ran long enough, ensure it doesn't get
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
 		return;
 	}
 	/*
 	 * Ensure that a task that missed wakeup preemption by a
 	 * narrow margin doesn't have to wait for a full slice.
 	 * This also mitigates buddy induced latencies under load.
 	 */
 	if (delta_exec < sysctl_sched_min_granularity)
 		return;
 	se = __pick_first_entity(cfs_rq);
 	delta = curr->vruntime - se->vruntime;
 	/* negative deltas must be filtered before the mixed-sign compare below */
 	if (delta < 0)
 		return;
 	if (delta > ideal_runtime)
 		resched_task(rq_of(cfs_rq)->curr);
 }
 /*
  * Make @se the running entity of @cfs_rq.
  *
  * If the entity is queued it is removed from the tree (the running entity is
  * not kept there) after its wait time has been accounted.  The entity's
  * prev_sum_exec_runtime is then snapshotted so that later code can tell how
  * long it has run since being picked.
  */
 static void
 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	/* 'current' is not kept within the tree. */
 	if (se->on_rq) {
 		/*
 		 * Any task has to be enqueued before it get to execute on
 		 * a CPU. So account for the time it spent waiting on the
 		 * runqueue.
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
 	}
 	update_stats_curr_start(cfs_rq, se);
 	cfs_rq->curr = se;
 #ifdef CONFIG_SCHEDSTATS
 	/*
 	 * Track our maximum slice length, if the CPU's load is at
 	 * least twice that of our own weight (i.e. dont track it
 	 * when there are only lesser-weight tasks around):
 	 */
 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 		/* length of the previous stint, measured before the snapshot below */
 		se->statistics.slice_max = max(se->statistics.slice_max,
 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
 	}
 #endif
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 /*
  * Pick the next process, keeping these things in mind, in this order:
  * 1) keep things fair between processes/task groups
  * 2) pick the "next" process, since someone really wants that to run
  * 3) pick the "last" process, for cache locality
  * 4) do not run the "skip" process, if something else is available
  *
  * @curr: the currently running entity, or NULL; it is not in the tree and so
  *        has to be considered separately.
  *
  * Returns the chosen entity; any buddy hints referring to it are cleared.
  * The checks below run skip -> last -> next, so a later (higher priority)
  * hint overrides an earlier choice.
  */
 static struct sched_entity *
 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
 	struct sched_entity *left = __pick_first_entity(cfs_rq);
 	struct sched_entity *se;
 	/*
 	 * If curr is set we have to see if its left of the leftmost entity
 	 * still in the tree, provided there was anything in the tree at all.
 	 */
 	if (!left || (curr && entity_before(curr, left)))
 		left = curr;
 	se = left; /* ideally we run the leftmost entity */
 	/*
 	 * Avoid running the skip buddy, if running something else can
 	 * be done without getting too unfair.
 	 */
 	if (cfs_rq->skip == se) {
 		struct sched_entity *second;
 		if (se == curr) {
 			/* curr is outside the tree, so the tree's first is second */
 			second = __pick_first_entity(cfs_rq);
 		} else {
 			second = __pick_next_entity(se);
 			if (!second || (curr && entity_before(curr, second)))
 				second = curr;
 		}
 		if (second && wakeup_preempt_entity(second, left) < 1)
 			se = second;
 	}
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
 	 */
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 	/*
 	 * Someone really wants this to run. If it's not unfair, run it.
 	 */
 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 		se = cfs_rq->next;
 	clear_buddies(cfs_rq, se);
 	return se;
 }
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 /*
  * Retire @prev as the running entity of @cfs_rq.
  *
  * If prev is still on the runqueue it is put back into the tree, its wait
  * accounting is restarted and its load average refreshed.  In all cases
  * cfs_rq->curr ends up NULL.
  */
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
 	 * If still on the runqueue then deactivate_task()
 	 * was not called and update_curr() has to be done:
 	 */
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 	check_spread(cfs_rq, prev);
 	/* on_rq is tested again on purpose: the throttle check sits in between */
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
 		update_entity_load_avg(prev, 1);
 	}
 	cfs_rq->curr = NULL;
 }
 /*
  * Per-tick work for the running entity @curr on @cfs_rq: refresh runtime
  * statistics, load averages and group shares, then decide whether curr
  * should be preempted.
  *
  * @queued: non-zero when the tick comes from the hrtick timer (only looked
  *          at with CONFIG_SCHED_HRTICK); such ticks reschedule immediately.
  */
 static void
 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_entity_load_avg(curr, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
 	update_cfs_shares(cfs_rq);
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
 		resched_task(rq_of(cfs_rq)->curr);
 		return;
 	}
 	/*
 	 * don't let the period tick interfere with the hrtick preemption
 	 */
 	if (!sched_feat(DOUBLE_TICK) &&
 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
 		return;
 #endif
 	/* with a single runnable entity there is nobody to preempt for */
 	if (cfs_rq->nr_running > 1)
 		check_preempt_tick(cfs_rq, curr);
 }
 /**************************************************
  * CFS bandwidth control machinery
  */
 #ifdef CONFIG_CFS_BANDWIDTH
 #ifndef HAVE_JUMP_LABEL
 /*
  * Without jump labels there is no cheap runtime switch: bandwidth control is
  * always reported as in use and the usage counters are no-ops.
  */
 static bool cfs_bandwidth_used(void)
 {
 	return true;
 }
 void cfs_bandwidth_usage_inc(void) {}
 void cfs_bandwidth_usage_dec(void) {}
 #else /* HAVE_JUMP_LABEL */
 /*
  * With jump labels, whether bandwidth control is in use is tracked by a
  * static key that cfs_bandwidth_usage_inc()/_dec() raise and lower.
  */
 static struct static_key __cfs_bandwidth_used;
 static inline bool cfs_bandwidth_used(void)
 {
 	return static_key_false(&__cfs_bandwidth_used);
 }
 void cfs_bandwidth_usage_inc(void)
 {
 	static_key_slow_inc(&__cfs_bandwidth_used);
 }
 void cfs_bandwidth_usage_dec(void)
 {
 	static_key_slow_dec(&__cfs_bandwidth_used);
 }
 #endif /* HAVE_JUMP_LABEL */
 /*
  * default period for cfs group bandwidth.
  * default: 0.1s, units: nanoseconds
  *
  * Returns the constant 100000000 (100 ms expressed in ns).
  */
 static inline u64 default_cfs_period(void)
 {
 	return 100000000ULL;
 }
 /*
  * Size of one bandwidth slice in nanoseconds: the
  * sysctl_sched_cfs_bandwidth_slice tunable multiplied by NSEC_PER_USEC.
  * The cast widens to 64 bits before the multiplication.
  */
 static inline u64 sched_cfs_bandwidth_slice(void)
 {
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 }
 /*
  * Start a new bandwidth period for @cfs_b: reset the available runtime to
  * the full quota and move the expiration one period ahead of "now".
  *
  * sched_clock_cpu() is used directly instead of rq->clock to avoid adding
  * additional synchronization around rq->lock.
  *
  * Does nothing for groups with unlimited quota (RUNTIME_INF).
  *
  * requires cfs_b->lock
  */
 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 {
 	if (cfs_b->quota != RUNTIME_INF) {
 		cfs_b->runtime = cfs_b->quota;
 		cfs_b->runtime_expires = sched_clock_cpu(smp_processor_id()) +
 					 ktime_to_ns(cfs_b->period);
 	}
 }
 /* Accessor: the cfs_bandwidth structure embedded in task group @tg. */
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 {
 	return &tg->cfs_bandwidth;
 }
 /*
  * rq_clock_task() as seen by @cfs_rq, with time spent throttled taken out.
  *
  * While the cfs_rq is inside a throttled hierarchy (throttle_count != 0) the
  * clock stands still at the value saved in throttled_clock_task; otherwise
  * the accumulated throttled time is subtracted from the rq's task clock.
  */
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
 	if (likely(!cfs_rq->throttle_count))
 		return rq_clock_task(rq_of(cfs_rq)) -
 		       cfs_rq->throttled_clock_task_time;

 	return cfs_rq->throttled_clock_task;
 }
 /*
  * Try to top up @cfs_rq's local runtime from its task group's global pool.
  *
  * The request is one bandwidth slice plus whatever deficit the cfs_rq has.
  * With unlimited quota the full request is granted; otherwise at most the
  * runtime left in the pool is handed out.  The pool is refilled and its
  * period timer started first if the timer had gone inactive.
  *
  * Takes and releases cfs_b->lock internally.
  *
  * returns 0 on failure to allocate runtime
  */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 	u64 amount = 0, min_amount, expires;
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
 	raw_spin_lock(&cfs_b->lock);
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
 		/*
 		 * If the bandwidth pool has become inactive, then at least one
 		 * period must have elapsed since the last consumption.
 		 * Refresh the global state and ensure bandwidth timer becomes
 		 * active.
 		 */
 		if (!cfs_b->timer_active) {
 			__refill_cfs_bandwidth_runtime(cfs_b);
 			__start_cfs_bandwidth(cfs_b);
 		}
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
 			cfs_b->runtime -= amount;
 			/* the pool was used this period */
 			cfs_b->idle = 0;
 		}
 	}
 	/* sample the global deadline while still holding the lock */
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 	cfs_rq->runtime_remaining += amount;
 	/*
 	 * we may have advanced our local expiration to account for allowed
 	 * spread between our sched_clock and the one on which runtime was
 	 * issued.
 	 */
 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
 		cfs_rq->runtime_expires = expires;
 	return cfs_rq->runtime_remaining > 0;
 }
 /*
  * Expire @cfs_rq's locally held runtime once its deadline has passed.
  *
  * Note: This depends on the synchronization provided by sched_clock and the
  * fact that rq->clock snapshots this value.
  */
 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

 	/* if the deadline is ahead of our clock, nothing to do */
 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
 		return;

 	/* already in deficit: there is nothing left to expire */
 	if (cfs_rq->runtime_remaining < 0)
 		return;

 	/*
 	 * If the local deadline has passed we have to consider the
 	 * possibility that our sched_clock is 'fast' and the global deadline
 	 * has not truly expired.
 	 *
 	 * Fortunately we can determine whether this is the case by checking
 	 * whether the global deadline has advanced.
 	 */
 	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) < 0) {
 		/* global deadline is ahead, expiration has passed */
 		cfs_rq->runtime_remaining = 0;
 		return;
 	}

 	/* extend local deadline, drift is bounded above by 2 ticks */
 	cfs_rq->runtime_expires += TICK_NSEC;
 }
 /*
  * Charge @delta_exec of execution time against @cfs_rq's local runtime and,
  * if that leaves nothing, try to obtain more from the group's pool.  When no
  * more can be obtained and the cfs_rq has a running entity, a reschedule is
  * requested so the hierarchy can be throttled.
  */
 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
 	expire_cfs_rq_runtime(cfs_rq);

 	if (unlikely(cfs_rq->runtime_remaining <= 0)) {
 		/*
 		 * if we're unable to extend our runtime we resched so that
 		 * the active hierarchy can be throttled
 		 */
 		if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
 			resched_task(rq_of(cfs_rq)->curr);
 	}
 }
 /*
  * Fast-path wrapper: only charge runtime when bandwidth control is in use
  * at all and enabled for this particular cfs_rq.
  */
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 {
 	if (cfs_bandwidth_used() && cfs_rq->runtime_enabled)
 		__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 /*
  * Non-zero when bandwidth control is in use and this cfs_rq itself is marked
  * throttled.  (See throttled_hierarchy() for the check that also covers
  * parents.)
  */
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
 	return cfs_bandwidth_used() && cfs_rq->throttled;
 }
 /*
  * check whether cfs_rq, or any parent, is throttled
  *
  * Non-zero when bandwidth control is in use and throttle_count is non-zero.
  */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
 }
 /*
  * Load-balance guard for a task group: returns non-zero if the group's
  * cfs_rq on either @src_cpu or @dest_cpu belongs to a throttled hierarchy,
  * in which case group load-balance operations between the two should not be
  * performed.
  */
 static inline int throttled_lb_pair(struct task_group *tg,
 				    int src_cpu, int dest_cpu)
 {
 	return throttled_hierarchy(tg->cfs_rq[src_cpu]) ||
 	       throttled_hierarchy(tg->cfs_rq[dest_cpu]);
 }
 /*
  * Tree-walk callback used when unthrottling: drops one throttle reference on
  * @tg's cfs_rq for the cpu of the rq passed in @data.
  *
  * updated child weight may affect parent so we have to do this bottom up
  *
  * Always returns 0.
  */
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 	cfs_rq->throttle_count--;
 #ifdef CONFIG_SMP
 	if (!cfs_rq->throttle_count) {
 		/* adjust cfs_rq_clock_task() */
 		/* add the span since tg_throttle_down() stopped the clock */
 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 					     cfs_rq->throttled_clock_task;
 	}
 #endif
 	return 0;
 }
 /*
  * Tree-walk callback used when throttling: takes one throttle reference on
  * @tg's cfs_rq for the cpu of the rq passed in @data.  On the 0 -> 1
  * transition the current task clock is saved, which is the value
  * cfs_rq_clock_task() reports while throttled.
  *
  * Always returns 0.
  */
 static int tg_throttle_down(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 	/* group is entering throttled state, stop time */
 	if (!cfs_rq->throttle_count)
 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
 	cfs_rq->throttle_count++;
 	return 0;
 }
 /*
  * Throttle @cfs_rq: freeze the clocks of its hierarchy, dequeue its group
  * entity from the parents (for as long as a parent is left without weight),
  * remove its tasks from the h_nr_running counts up the tree, and put it on
  * the group's list of throttled cfs_rqs.  The bandwidth timer is started if
  * it is not already active so that the cfs_rq will be unthrottled again.
  */
 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, dequeue = 1;
 	/* the group entity that represents this cfs_rq on this cpu */
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 	task_delta = cfs_rq->h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			break;
 		if (dequeue)
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 		qcfs_rq->h_nr_running -= task_delta;
 		/* parent still has weight: keep its entity queued further up */
 		if (qcfs_rq->load.weight)
 			dequeue = 0;
 	}
 	/* walked all the way to the top: adjust the rq-wide count as well */
 	if (!se)
 		rq->nr_running -= task_delta;
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 	if (!cfs_b->timer_active)
 		__start_cfs_bandwidth(cfs_b);
 	raw_spin_unlock(&cfs_b->lock);
 }
 /*
  * Undo throttle_cfs_rq(): account the time spent throttled, take @cfs_rq
  * off the throttled list, release the hierarchy's throttle references and,
  * if the cfs_rq has any load, re-enqueue its group entity and restore the
  * h_nr_running counts up the tree.
  *
  * NOTE(review): the caller visible in this file (distribute_cfs_runtime)
  * holds rq->lock and is inside an RCU read-side section; other callers are
  * not visible from here.
  */
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	int enqueue = 1;
 	long task_delta;
 	se = cfs_rq->tg->se[cpu_of(rq)];
 	cfs_rq->throttled = 0;
 	update_rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
 	/* update hierarchical throttle state */
 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
 	/* nothing queued below us: no entity to put back */
 	if (!cfs_rq->load.weight)
 		return;
 	task_delta = cfs_rq->h_nr_running;
 	for_each_sched_entity(se) {
 		/* from the first already-queued ancestor on, only fix counts */
 		if (se->on_rq)
 			enqueue = 0;
 		cfs_rq = cfs_rq_of(se);
 		if (enqueue)
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
 		cfs_rq->h_nr_running += task_delta;
 		/* a throttled ancestor hides everything above it */
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 	}
 	/* walked all the way to the top: adjust the rq-wide count as well */
 	if (!se)
 		rq->nr_running += task_delta;
 	/* determine whether we need to wake up potentially idle cpu */
 	if (rq->curr == rq->idle && rq->cfs.nr_running)
 		resched_task(rq->curr);
 }
 /*
  * Hand out up to @remaining runtime to the throttled cfs_rqs of @cfs_b.
  *
  * Each throttled cfs_rq is given just enough to bring its runtime_remaining
  * to +1 (or whatever is left, if less), gets @expires as its new deadline,
  * and is unthrottled if it ends up with positive runtime.  The walk stops as
  * soon as nothing is left to distribute.
  *
  * The list is traversed under rcu_read_lock() and every cfs_rq is updated
  * with its rq->lock held.
  *
  * Returns the runtime that was not handed out.
  */
 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
 		u64 remaining, u64 expires)
 {
 	struct cfs_rq *cfs_rq;

 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
 				throttled_list) {
 		struct rq *rq = rq_of(cfs_rq);

 		raw_spin_lock(&rq->lock);
 		if (cfs_rq_throttled(cfs_rq)) {
 			/* deficit plus one, capped by what is left in hand */
 			u64 grant = -cfs_rq->runtime_remaining + 1;

 			if (grant > remaining)
 				grant = remaining;
 			remaining -= grant;

 			cfs_rq->runtime_remaining += grant;
 			cfs_rq->runtime_expires = expires;

 			/* we check whether we're throttled above */
 			if (cfs_rq->runtime_remaining > 0)
 				unthrottle_cfs_rq(cfs_rq);
 		}
 		raw_spin_unlock(&rq->lock);

 		if (!remaining)
 			break;
 	}
 	rcu_read_unlock();

 	return remaining;
 }
 /*
  * Responsible for refilling a task_group's bandwidth and unthrottling its
  * cfs_rqs as appropriate. If there has been no activity within the last
  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  * used to track this state.
  *
  * @cfs_b:   bandwidth pool whose period has elapsed
  * @overrun: number of periods to account (added to nr_periods and, when
  *           throttling occurred, to nr_throttled)
  *
  * Takes cfs_b->lock internally; the lock is dropped around each call to
  * distribute_cfs_runtime().
  *
  * Returns non-zero if the pool is idle, in which case timer_active has been
  * cleared; returns 0 if the timer has to keep running.
  */
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
 	u64 runtime, runtime_expires;
 	/* idle defaults to 1 so the unlimited-quota exit deactivates the timer */
 	int idle = 1, throttled;
 	raw_spin_lock(&cfs_b->lock);
 	/* no need to continue the timer with no bandwidth constraint */
 	if (cfs_b->quota == RUNTIME_INF)
 		goto out_unlock;
 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	/* idle depends on !throttled (for the case of a large deficit) */
 	idle = cfs_b->idle && !throttled;
 	cfs_b->nr_periods += overrun;
 	/* if we're going inactive then everything else can be deferred */
 	if (idle)
 		goto out_unlock;
 	/*
 	 * if we have relooped after returning idle once, we need to update our
 	 * status as actually running, so that other cpus doing
 	 * __start_cfs_bandwidth will stop trying to cancel us.
 	 */
 	cfs_b->timer_active = 1;
 	__refill_cfs_bandwidth_runtime(cfs_b);
 	if (!throttled) {
 		/* mark as potentially idle for the upcoming period */
 		cfs_b->idle = 1;
 		goto out_unlock;
 	}
 	/* account preceding periods in which throttling occurred */
 	cfs_b->nr_throttled += overrun;
 	/*
 	 * There are throttled entities so we must first use the new bandwidth
 	 * to unthrottle them before making it generally available.  This
 	 * ensures that all existing debts will be paid before a new cfs_rq is
 	 * allowed to run.
 	 */
 	runtime = cfs_b->runtime;
 	runtime_expires = cfs_b->runtime_expires;
 	/* hold the refill privately; the pool looks empty meanwhile */
 	cfs_b->runtime = 0;
 	/*
 	 * This check is repeated as we are holding onto the new bandwidth
 	 * while we unthrottle.  This can potentially race with an unthrottled
 	 * group trying to acquire new bandwidth from the global pool.
 	 */
 	while (throttled && runtime > 0) {
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 	}
 	/* return (any) remaining runtime */
 	cfs_b->runtime = runtime;
 	/*
 	 * While we are ensured activity in the period following an
 	 * unthrottle, this also covers the case in which the new bandwidth is
 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
 	 * timer to remain active while there are any throttled entities.)
 	 */
 	cfs_b->idle = 0;
 out_unlock:
 	if (idle)
 		cfs_b->timer_active = 0;
 	raw_spin_unlock(&cfs_b->lock);
 	return idle;
 }
 /*
  * Tunables for returning unused ("slack") runtime to the global pool.
  * All three are expressed in nanoseconds.
  */
 /* a cfs_rq won't donate quota below this amount (1ms) */
 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
 /* minimum remaining period time to redistribute slack quota (2ms) */
 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
 /* how long we wait to gather additional slack before distributing (5ms) */
 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 /*
  * Are we near the end of the current quota period?
  *
  * Returns 1 when the period timer's call-back is currently running or when
  * the timer expires in less than @min_expire nanoseconds; 0 otherwise.
  *
  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
  * migrate_hrtimers, base is never cleared, so we are fine.
  */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 {
 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
 	s64 remaining;
 	/* if the call-back is running a quota refresh is already occurring */
 	if (hrtimer_callback_running(refresh_timer))
 		return 1;
 	/*
 	 * is a quota refresh about to occur?
 	 *
 	 * The remaining time is a signed quantity: it is negative once the
 	 * expiry has passed but the call-back has not started yet.  Compare
 	 * as signed so an overdue refresh counts as imminent instead of
 	 * wrapping around to a huge unsigned value.
 	 */
 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
 	if (remaining < (s64)min_expire)
 		return 1;
 	return 0;
 }
 /*
  * Arm the slack timer so that returned runtime gets redistributed after
  * cfs_bandwidth_slack_period, unless the regular period refresh is due soon
  * enough that waiting for slack would be pointless.
  */
 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	/* slack gathering time plus the minimum time needed to redistribute */
 	u64 needed = cfs_bandwidth_slack_period + min_bandwidth_expiration;
 	/* only bother with slack when no quota refresh is imminent */
 	if (!runtime_refresh_within(cfs_b, needed))
 		start_bandwidth_timer(&cfs_b->slack_timer,
 				ns_to_ktime(cfs_bandwidth_slack_period));
 }
 /*
  * Hand local runtime above min_cfs_rq_runtime back to the global pool.
  *
  * we know any runtime found here is valid as update_curr() precedes return
  */
 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	s64 surplus = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
 	/* nothing above the keep-back threshold, nothing to return */
 	if (surplus <= 0)
 		return;
 	raw_spin_lock(&cfs_b->lock);
 	/* only quota from the current period of a limited group is returnable */
 	if (cfs_b->quota == RUNTIME_INF ||
 	    cfs_rq->runtime_expires != cfs_b->runtime_expires)
 		goto unlock;
 	cfs_b->runtime += surplus;
 	/* we are under rq->lock, defer unthrottling using a timer */
 	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
 	    !list_empty(&cfs_b->throttled_cfs_rq))
 		start_cfs_slack_bandwidth(cfs_b);
 unlock:
 	raw_spin_unlock(&cfs_b->lock);
 	/* even if it's not valid for return we don't want to try again */
 	cfs_rq->runtime_remaining -= surplus;
 }
 /*
  * Return slack runtime for @cfs_rq, but only when bandwidth control is in
  * use, the cfs_rq is runtime-enabled and it has no running entities left.
  */
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
 		return;
 	if (cfs_rq->runtime_enabled && !cfs_rq->nr_running)
 		__return_cfs_rq_runtime(cfs_rq);
 }
 /*
  * Slack timer work: take the accumulated global runtime (if it exceeds one
  * slice) and distribute it to throttled cfs_rqs, putting back what is left.
  *
  * This is done with a timer (instead of inline with bandwidth return) since
  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
  */
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
 	u64 slice = sched_cfs_bandwidth_slice();
 	u64 pool = 0;
 	u64 expires;
 	raw_spin_lock(&cfs_b->lock);
 	/* confirm we're still not at a refresh boundary */
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
 		goto out_unlock;
 	/* claim the whole pool, but only if it is worth more than a slice */
 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
 		pool = cfs_b->runtime;
 		cfs_b->runtime = 0;
 	}
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 	if (!pool)
 		return;
 	/* distribution runs without cfs_b->lock held */
 	pool = distribute_cfs_runtime(cfs_b, pool, expires);
 	raw_spin_lock(&cfs_b->lock);
 	/* the leftover is only valid if no refresh happened in between */
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime = pool;
 out_unlock:
 	raw_spin_unlock(&cfs_b->lock);
 }
 /*
  * When a group wakes up we want to make sure that its quota is not already
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
  * runtime as update_curr() throttling cannot trigger until it's on-rq.
  */
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
 		return;
 	/*
 	 * Nothing to do for an unconstrained group, for an active group
 	 * (that is handled by the update_curr()->put() path), or for a
 	 * group that is already throttled.
 	 */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr ||
 	    cfs_rq_throttled(cfs_rq))
 		return;
 	/* update runtime allocation */
 	account_cfs_rq_runtime(cfs_rq, 0);
 	if (cfs_rq->runtime_remaining > 0)
 		return;
 	throttle_cfs_rq(cfs_rq);
 }
 /*
  * conditionally throttle active cfs_rq's from put_prev_entity()
  *
  * Returns true when @cfs_rq is out of runtime (and hence throttled on
  * return), false when it may keep running.
  */
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
 		return false;
 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
 		return false;
 	/*
 	 * it's possible for a throttled entity to be forced into a running
 	 * state (e.g. set_curr_task), in this case there is nothing left to
 	 * throttle.
 	 */
 	if (!cfs_rq_throttled(cfs_rq))
 		throttle_cfs_rq(cfs_rq);
 	return true;
 }
 /* hrtimer call-back of the slack timer: one-shot, never re-arms itself */
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	do_sched_cfs_slack_timer(container_of(timer, struct cfs_bandwidth,
 					      slack_timer));
 	return HRTIMER_NORESTART;
 }
 /*
  * hrtimer call-back of the period timer: forward the timer by whole periods
  * and run the period work for every batch of overrun periods.  The timer is
  * restarted unless the last period work reported the group as idle.
  */
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
 	ktime_t now = hrtimer_cb_get_time(timer);
 	int idle = 0;
 	int overrun;
 	/* keep going until the expiry lies in the future again */
 	while ((overrun = hrtimer_forward(timer, now, cfs_b->period))) {
 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 		now = hrtimer_cb_get_time(timer);
 	}
 	if (idle)
 		return HRTIMER_NORESTART;
 	return HRTIMER_RESTART;
 }
 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	raw_spin_lock_init(&cfs_b->lock);
 	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 /* A fresh cfs_rq is on no throttled list and has runtime control disabled. */
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 	cfs_rq->runtime_enabled = 0;
 }
 /*
  * (Re)program the period timer of @cfs_b and mark it active.
  *
  * requires cfs_b->lock, may release to reprogram timer
  */
 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	/*
 	 * The timer may be active because we're trying to set a new bandwidth
 	 * period or because we're racing with the tear-down path
 	 * (timer_active==0 becomes visible before the hrtimer call-back
 	 * terminates).  In either case we ensure that it's re-programmed
 	 */
 	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
 	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
 		/* bounce the lock to allow do_sched_cfs_period_timer to run */
 		raw_spin_unlock(&cfs_b->lock);
 		cpu_relax();
 		raw_spin_lock(&cfs_b->lock);
 		/* if someone else restarted the timer then we're done */
 		if (cfs_b->timer_active)
 			return;
 	}
 	/* the timer is now inactive (or was cancelled): arm it for one period */
 	cfs_b->timer_active = 1;
 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
 }
 /* Tear down @cfs_b by cancelling both of its timers (period and slack). */
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	hrtimer_cancel(&cfs_b->period_timer);
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 /*
  * Walk every leaf cfs_rq of @rq, top its local runtime up to the group's
  * quota and unthrottle it if it is currently throttled.  Only cfs_rqs with
  * runtime control enabled are touched.
  */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 		/*
 		 * clock_task is not advancing so we just need to make sure
 		 * there's some valid quota amount
 		 */
 		cfs_rq->runtime_remaining = tg_cfs_bandwidth(cfs_rq->tg)->quota;
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
 }
 #else /* CONFIG_CFS_BANDWIDTH */
 /*
  * !CONFIG_CFS_BANDWIDTH variants: bandwidth control is compiled out, so the
  * helpers below are no-ops or return constant "not throttled" answers.
  */
 /* the cfs_rq task clock is simply the task clock of the owning rq */
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 {
 	return rq_clock_task(rq_of(cfs_rq));
 }
 /* no runtime accounting, throttling or slack return without bandwidth */
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 /* a cfs_rq is never reported as throttled */
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
 	return 0;
 }
 /* nor is any part of its hierarchy */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
 }
 /* neither side of a load-balance pair is ever reported as throttled */
 static inline int throttled_lb_pair(struct task_group *tg,
 				    int src_cpu, int dest_cpu)
 {
 	return 0;
 }
 /* nothing to initialise */
 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 #endif
 /* there is no bandwidth structure to hand out */
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 {
 	return NULL;
 }
 /* nothing to tear down or unthrottle */
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 #endif /* CONFIG_CFS_BANDWIDTH */
 /**************************************************
  * CFS operations on tasks:
  */
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Program the high-resolution tick for @p on @rq so that it fires when the
  * remainder of @p's slice is used up.  Nothing is done unless @p's cfs_rq
  * has more than one runnable entity.
  */
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 slice, ran;
 	s64 delta;
 	WARN_ON(task_rq(p) != rq);
 	if (cfs_rq->nr_running <= 1)
 		return;
 	slice = sched_slice(cfs_rq, se);
 	ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
 	delta = slice - ran;
 	/* slice already exhausted: reschedule right away if @p is running */
 	if (delta < 0) {
 		if (rq->curr == p)
 			resched_task(p);
 		return;
 	}
 	/*
 	 * Don't schedule slices shorter than 10000ns, that just
 	 * doesn't make sense. Rely on vruntime for fairness.
 	 */
 	if (rq->curr != p)
 		delta = max_t(s64, 10000LL, delta);
 	hrtick_start(rq, delta);
 }
 /*
  * called from enqueue/dequeue and updates the hrtick when the
  * current task is from our class and nr_running is low enough
  * to matter.
  */
 static void hrtick_update(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	if (!hrtick_enabled(rq))
 		return;
 	/* only tasks of the fair class are of interest here */
 	if (curr->sched_class != &fair_sched_class)
 		return;
 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
 		hrtick_start_fair(rq, curr);
 }
 #else /* !CONFIG_SCHED_HRTICK */
 /* !CONFIG_SCHED_HRTICK: there is no high-resolution tick to program */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
 }
 /* !CONFIG_SCHED_HRTICK: nothing to update on enqueue/dequeue */
 static inline void hrtick_update(struct rq *rq)
 {
 }
 #endif
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
  * then put the task into the rbtree:
  *
  * The hierarchy is walked in two passes: the first enqueues every entity
  * that is not yet on a runqueue, the second only updates accounting for
  * the ancestors that already were.  Either pass stops at a throttled
  * cfs_rq; rq->nr_running is only bumped when the walk reached the top.
  */
 static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	for_each_sched_entity(se) {
 		/* this entity (and thus its ancestors) is already queued */
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running increment below.
 		 */
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		cfs_rq->h_nr_running++;
 		/* parent entities are always enqueued with ENQUEUE_WAKEUP */
 		flags = ENQUEUE_WAKEUP;
 	}
 	/* continue from where the first pass stopped (se may be NULL here) */
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running++;
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
 	/* se == NULL: no throttled cfs_rq cut the walk short */
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
 		inc_nr_running(rq);
 	}
 	hrtick_update(rq);
 }
 static void set_next_buddy(struct sched_entity *se);
 /*
  * The dequeue_task method is called before nr_running is
  * decreased. We remove the task from the rbtree and
  * update the fair scheduling stats:
  *
  * The hierarchy is walked in two passes: the first dequeues entities until
  * it meets a cfs_rq that still has other load (or is throttled), the second
  * only updates accounting for the remaining ancestors.  rq->nr_running is
  * only decremented when the walk reached the top.
  */
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int task_sleep = flags & DEQUEUE_SLEEP;
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running decrement below.
 		 */
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		cfs_rq->h_nr_running--;
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
 			if (task_sleep && parent_entity(se))
 				set_next_buddy(parent_entity(se));
 			/* avoid re-evaluating load for this entity */
 			se = parent_entity(se);
 			break;
 		}
 		/* parent entities are always dequeued with DEQUEUE_SLEEP set */
 		flags |= DEQUEUE_SLEEP;
 	}
 	/* continue from where the first pass stopped (se may be NULL here) */
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running--;
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
 	/* se == NULL: no throttled cfs_rq cut the walk short */
 	if (!se) {
 		dec_nr_running(rq);
 		update_rq_runnable_avg(rq, 1);
 	}
 	hrtick_update(rq);
 }
 #ifdef CONFIG_SMP
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
 	return cpu_rq(cpu)->cfs.runnable_load_avg;
 }
 /*
  * Return a low guess at the load of a migration-source cpu weighted
  * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
 static unsigned long source_load(int cpu, int type)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 	return min(rq->cpu_load[type-1], total);
 }
 /*
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
 static unsigned long target_load(int cpu, int type)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 	return max(rq->cpu_load[type-1], total);
 }
 static unsigned long power_of(int cpu)
 {
 	return cpu_rq(cpu)->cpu_power;
 }
 /*
  * Average runnable load per running task on @cpu; 0 when the runqueue has
  * no running tasks.
  */
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	/* sample nr_running once so the test and the division agree */
 	unsigned long nr = ACCESS_ONCE(rq->nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 	return nr ? load_avg / nr : 0;
 }
 /*
  * Track how often the current task switches between different wakees.
  *
  * @p: the task being woken by current.
  *
  * current->wakee_flips is bumped every time @p differs from the previously
  * recorded wakee, and wiped once more than HZ jiffies have passed since the
  * last wipe.
  */
 static void record_wakee(struct task_struct *p)
 {
 	/*
 	 * Rough decay (wiping) for cost saving, don't worry
 	 * about the boundary, really active task won't care
 	 * about the loss.
 	 *
 	 * time_after() is used instead of a plain '>' so that the window
 	 * is still evaluated correctly when jiffies wraps around.
 	 */
 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
 		current->wakee_flips = 0;
 		current->wakee_flip_decay_ts = jiffies;
 	}
 	if (current->last_wakee != p) {
 		current->last_wakee = p;
 		current->wakee_flips++;
 	}
 }
 /*
  * Wake-up hook of the fair class: make @p's vruntime relative to the
  * min_vruntime of the cfs_rq it is currently attached to and record @p as
  * the latest wakee of current.
  */
 static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 base;
 #ifdef CONFIG_64BIT
 	base = cfs_rq->min_vruntime;
 #else
 	u64 snapshot;
 	/*
 	 * Re-read until min_vruntime and its copy agree (presumably because
 	 * a 64-bit value cannot be loaded in one go here -- TODO confirm
 	 * against the writer side).
 	 */
 	do {
 		snapshot = cfs_rq->min_vruntime_copy;
 		smp_rmb();
 		base = cfs_rq->min_vruntime;
 	} while (base != snapshot);
 #endif
 	se->vruntime -= base;
 	record_wakee(p);
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * effective_load() calculates the load change as seen from the root_task_group
  *
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
  *
  * Calculate the effective load difference if @wl is added (subtracted) to @tg
  * on this @cpu and results in a total addition (subtraction) of @wg to the
  * total group weight.
  *
  * Given a runqueue weight distribution (rw_i) we can compute a shares
  * distribution (s_i) using:
  *
  *   s_i = rw_i / \Sum rw_j						(1)
  *
  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
  * shares distribution (s_i):
  *
  *   rw_i = {   2,   4,   1,   0 }
  *   s_i  = { 2/7, 4/7, 1/7,   0 }
  *
  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
  * task used to run on and the CPU the waker is running on), we need to
  * compute the effect of waking a task on either CPU and, in case of a sync
  * wakeup, compute the effect of the current task going to sleep.
  *
  * So for a change of @wl to the local @cpu with an overall group weight change
  * of @wg we can compute the new shares distribution (s'_i) using:
  *
  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
  *
  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
  * differences in waking a task to CPU 0. The additional task changes the
  * weight and shares distributions like:
  *
  *   rw'_i = {   3,   4,   1,   0 }
  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
  *
  * We can then compute the difference in effective weight by using:
  *
  *   dw_i = S * (s'_i - s_i)						(3)
  *
  * Where 'S' is the group weight as seen by its parent.
  *
  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
  * 4/7) times the weight of the group.
  *
  * Returns the resulting load change; for a @tg without parent that is
  * simply @wl.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 	if (!tg->parent)	/* the trivial, non-cgroup case */
 		return wl;
 	/* walk from @tg's entity on @cpu up through its parent entities */
 	for_each_sched_entity(se) {
 		long w, W;
 		tg = se->my_q->tg;
 		/*
 		 * W = @wg + \Sum rw_j
 		 */
 		W = wg + calc_tg_weight(tg, se->my_q);
 		/*
 		 * w = rw_i + @wl
 		 */
 		w = se->my_q->load.weight + wl;
 		/*
 		 * wl = S * s'_i; see (2)
 		 *
 		 * If W is not positive, or this cpu would carry the entire
 		 * group weight, the group's full shares are used instead.
 		 */
 		if (W > 0 && w < W)
 			wl = (w * tg->shares) / W;
 		else
 			wl = tg->shares;
 		/*
 		 * Per the above, wl is the new se->load.weight value; since
 		 * those are clipped to [MIN_SHARES, ...) do so now. See
 		 * calc_cfs_shares().
 		 */
 		if (wl < MIN_SHARES)
 			wl = MIN_SHARES;
 		/*
 		 * wl = dw_i = S * (s'_i - s_i); see (3)
 		 */
 		wl -= se->load.weight;
 		/*
 		 * Recursively apply this logic to all parent groups to compute
 		 * the final effective load change on the root group. Since
 		 * only the @tg group gets extra weight, all parent groups can
 		 * only redistribute existing shares. @wl is the shift in shares
 		 * resulting from this level per the above.
 		 */
 		wg = 0;
 	}
 	return wl;
 }
 #else
 /*
  * !CONFIG_FAIR_GROUP_SCHED: there is no group hierarchy to propagate
  * through, so the load change is @wl itself; @tg, @cpu and @wg are unused.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	return wl;
 }
 #endif
 /*
  * Decide whether the waker/wakee relationship looks "wide", i.e. whether an
  * affine wakeup should be skipped.  Returns 1 when the wakee @p flips wakees
  * more often than sd_llc_size and current flips more than sd_llc_size times
  * as often as @p; 0 otherwise.
  */
 static int wake_wide(struct task_struct *p)
 {
 	/*
 	 * Yeah, it's the switching-frequency, could means many wakee or
 	 * rapidly switch, use factor here will just help to automatically
 	 * adjust the loose-degree, so bigger node will lead to more pull.
 	 */
 	int factor = this_cpu_read(sd_llc_size);
 	if (p->wakee_flips <= factor)
 		return 0;
 	/*
 	 * wakee is somewhat hot, it needs certain amount of cpu
 	 * resource, so if waker is far more hot, prefer to leave
 	 * it alone.
 	 */
 	if (current->wakee_flips > (factor * p->wakee_flips))
 		return 1;
 	return 0;
 }
 /*
  * Decide whether the waking task @p may be placed on the waking cpu instead
  * of its previous cpu.
  *
  * @sd:   the affine domain; supplies wake_idx and imbalance_pct
  * @p:    the task being woken
  * @sync: non-zero for a sync wakeup
  *
  * Returns 1 when the wakeup should be affine to this cpu, 0 otherwise.
  */
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
 	s64 this_load, load;
 	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 	/*
 	 * If we wake multiple tasks be careful to not bounce
 	 * ourselves around too much.
 	 */
 	if (wake_wide(p))
 		return 0;
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
 	/* low guess for the previous cpu, high guess for this cpu */
 	load	  = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
 		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 	tg = task_group(p);
 	weight = p->se.load.weight;
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
 	 * due to the sync cause above having dropped this_load to 0, we'll
 	 * always have an imbalance, but there's really nothing you can do
 	 * about that, so that's good too.
 	 *
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
 	if (this_load > 0) {
 		s64 this_eff_load, prev_eff_load;
 		/* load of this cpu with @p added, scaled by prev_cpu's power */
 		this_eff_load = 100;
 		this_eff_load *= power_of(prev_cpu);
 		this_eff_load *= this_load +
 			effective_load(tg, this_cpu, weight, weight);
 		/*
 		 * load of prev_cpu with @p added, scaled by this cpu's power
 		 * and by half of the domain's imbalance percentage
 		 */
 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
 		prev_eff_load *= power_of(this_cpu);
 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
 		balanced = this_eff_load <= prev_eff_load;
 	} else
 		balanced = true;
 	/*
 	 * If the currently running task will sleep within
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
 	if (sync && balanced)
 		return 1;
 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 	if (balanced ||
 	    (this_load <= load &&
 	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
 		 * there is no bad imbalance.
 		 */
 		schedstat_inc(sd, ttwu_move_affine);
 		schedstat_inc(p, se.statistics.nr_wakeups_affine);
 		return 1;
 	}
 	return 0;
 }
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
  *
  * Groups without any CPU allowed for @p are ignored.  Returns NULL when no
  * non-local candidate exists or when the local group (the one containing
  * @this_cpu) is not loaded enough, relative to the domain's imbalance
  * percentage, to justify moving away from it.
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 	/* wakeups use their own load index instead of the fork/exec one */
 	if (sd_flag & SD_BALANCE_WAKE)
 		load_idx = sd->wake_idx;
 	do {
 		unsigned long load, avg_load;
 		int local_group;
 		int i;
 		/* Skip over this group if it has no CPUs allowed */
 		if (!cpumask_intersects(sched_group_cpus(group),
 					tsk_cpus_allowed(p)))
 			continue;
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
 			else
 				load = target_load(i, load_idx);
 			avg_load += load;
 		}
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
 		if (local_group) {
 			this_load = avg_load;
 		} else if (avg_load < min_load) {
 			min_load = avg_load;
 			idlest = group;
 		}
 	} while (group = group->next, group != sd->groups);
 	/* stay local unless the local load exceeds the best remote one by the margin */
 	if (!idlest || 100*this_load < imbalance*min_load)
 		return NULL;
 	return idlest;
 }
 /*
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  *
  * Only cpus allowed for @p are considered.  On equal load @this_cpu wins.
  * Returns -1 when the group holds no allowed cpu.
  */
 static int
 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long best_load = ULONG_MAX;
 	int best_cpu = -1;
 	int cpu;
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(cpu, sched_group_cpus(group), tsk_cpus_allowed(p)) {
 		unsigned long load = weighted_cpuload(cpu);
 		if (load > best_load)
 			continue;
 		/* a tie only displaces the current best in favour of this_cpu */
 		if (load == best_load && cpu != this_cpu)
 			continue;
 		best_load = load;
 		best_cpu = cpu;
 	}
 	return best_cpu;
 }
 /*
  * Try and locate an idle CPU in the sched_domain.
  *
  * Preference order: @target itself if idle; @p's current cpu if it is idle
  * and shares cache with @target; otherwise the first allowed cpu of a group
  * (in the sd_llc domain of @target or below) whose cpus are all idle and
  * which does not contain @target.  Falls back to @target.
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
 	struct sched_domain *sd;
 	struct sched_group *sg;
 	int i = task_cpu(p);
 	if (idle_cpu(target))
 		return target;
 	/*
 	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
 		return i;
 	/*
 	 * Otherwise, iterate the domains and find an eligible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
 		sg = sd->groups;
 		do {
 			if (!cpumask_intersects(sched_group_cpus(sg),
 						tsk_cpus_allowed(p)))
 				goto next;
 			/* reject the group on the first busy cpu, or if it holds target */
 			for_each_cpu(i, sched_group_cpus(sg)) {
 				if (i == target || !idle_cpu(i))
 					goto next;
 			}
 			target = cpumask_first_and(sched_group_cpus(sg),
 					tsk_cpus_allowed(p));
 			goto done;
 next:
 			sg = sg->next;
 		} while (sg != sd->groups);
 	}
 done:
 	return target;
 }
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
  *
  * Balances load by selecting the idlest cpu in the idlest group, or under
  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
  *
  * Returns the target cpu number.
  *
  * preempt must be disabled.
  */
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = cpu;
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
 	/* a task pinned to a single cpu has nowhere else to go */
 	if (p->nr_cpus_allowed == 1)
 		return prev_cpu;
 	if (sd_flag & SD_BALANCE_WAKE) {
 		/* an affine wakeup is only an option if @p may run on this cpu */
 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 			want_affine = 1;
 		new_cpu = prev_cpu;
 	}
 	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			continue;
 		/*
 		 * If both cpu and prev_cpu are part of this domain,
 		 * cpu is a valid SD_WAKE_AFFINE target.
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
 			affine_sd = tmp;
 			break;
 		}
 		/* remember the last domain visited that carries sd_flag */
 		if (tmp->flags & sd_flag)
 			sd = tmp;
 	}
 	if (affine_sd) {
 		/* choose between cpu and prev_cpu, then look for an idle sibling */
 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
 			prev_cpu = cpu;
 		new_cpu = select_idle_sibling(p, prev_cpu);
 		goto unlock;
 	}
 	/* descend the domain hierarchy, picking the idlest group/cpu per level */
 	while (sd) {
 		struct sched_group *group;
 		int weight;
 		if (!(sd->flags & sd_flag)) {
 			sd = sd->child;
 			continue;
 		}
 		group = find_idlest_group(sd, p, cpu, sd_flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 		new_cpu = find_idlest_cpu(group, p, cpu);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
 			continue;
 		}
 		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
 		weight = sd->span_weight;
 		sd = NULL;
 		/* find new_cpu's widest sd_flag domain that is narrower than the one just used */
 		for_each_domain(cpu, tmp) {
 			if (weight <= tmp->span_weight)
 				break;
 			if (tmp->flags & sd_flag)
 				sd = tmp;
 		}
 		/* while loop will break here if sd == NULL */
 	}
 unlock:
 	rcu_read_unlock();
 	return new_cpu;
 }
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
  * other assumptions, including the state of rq->lock, should be made.
  *
  * @next_cpu is not used here.
  */
 static void
 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	/*
 	 * Load tracking: accumulate removed load so that it can be processed
 	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
 	 * to blocked load iff they have a positive decay-count.  It can never
 	 * be negative here since on-rq tasks have decay-count == 0.
 	 */
 	if (se->avg.decay_count) {
 		/* store the negated result of synchronizing the entity's decay */
 		se->avg.decay_count = -__synchronize_entity_decay(se);
 		/* atomic add: rq->lock is not guaranteed to be held (see above) */
 		atomic_long_add(se->avg.load_avg_contrib,
 						&cfs_rq->removed_load);
 	}
 }
 #endif /* CONFIG_SMP */
 /*
  * Wakeup granularity, converted from real time to virtual time in units of
  * @se.  @curr is not consulted.
  */
 static unsigned long
 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 {
 	/*
 	 * Since its curr running now, convert the gran from real-time
 	 * to virtual-time in his units.
 	 *
 	 * By using 'se' instead of 'curr' we penalize light tasks, so
 	 * they get preempted easier. That is, if 'se' < 'curr' then
 	 * the resulting gran will be larger, therefore penalizing the
 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
 	 * be smaller, again penalizing the lighter task.
 	 *
 	 * This is especially important for buddies when the leftmost
 	 * task is higher priority than the buddy.
 	 */
 	return calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 }
 /*
  * Should 'se' preempt 'curr'.
  *
  *             |s1
  *        |s2
  *   |s3
  *         g
  *      |<--->|c
  *
  *  w(c, s1) = -1
  *  w(c, s2) =  0
  *  w(c, s3) =  1
  *
  */
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
 	s64 gran, vdiff = curr->vruntime - se->vruntime;
 	if (vdiff <= 0)
 		return -1;
 	gran = wakeup_gran(curr, se);
 	if (vdiff > gran)
 		return 1;
 	return 0;
 }
 static void set_last_buddy(struct sched_entity *se)
 {
 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
 		return;
 	for_each_sched_entity(se)
 		cfs_rq_of(se)->last = se;
 }
 static void set_next_buddy(struct sched_entity *se)
 {
 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
 		return;
 	for_each_sched_entity(se)
 		cfs_rq_of(se)->next = se;
 }
 static void set_skip_buddy(struct sched_entity *se)
 {
 	for_each_sched_entity(se)
 		cfs_rq_of(se)->skip = se;
 }
 /*
  * Preempt the current task with a newly woken task if needed:
  *
  * @rq:         runqueue whose current task may get preempted
  * @p:          the newly woken task
  * @wake_flags: WF_* flags of the wakeup (only WF_FORK is examined here)
  */
 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	/* buddies are only used once the cfs_rq holds sched_nr_latency entities */
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 	int next_buddy_marked = 0;
 	if (unlikely(se == pse))
 		return;
 	/*
 	 * This is possible from callers such as move_task(), in which we
 	 * unconditionally check_preempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
 	 */
 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
 		return;
 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
 		next_buddy_marked = 1;
 	}
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
 	 * wake up path.
 	 *
 	 * Note: this also catches the edge-case of curr being in a throttled
 	 * group (e.g. via set_curr_task), since update_curr() (in the
 	 * enqueue of curr) will have resulted in resched being set.  This
 	 * prevents us from potentially nominating it as a false LAST_BUDDY
 	 * below.
 	 */
 	if (test_tsk_need_resched(curr))
 		return;
 	/* Idle tasks are by definition preempted by non-idle tasks. */
 	if (unlikely(curr->policy == SCHED_IDLE) &&
 	    likely(p->policy != SCHED_IDLE))
 		goto preempt;
 	/*
 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 	 * is driven by the tick):
 	 */
 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
 		return;
 	/* from here on se/pse are the entities returned by find_matching_se() */
 	find_matching_se(&se, &pse);
 	update_curr(cfs_rq_of(se));
 	BUG_ON(!pse);
 	if (wakeup_preempt_entity(se, pse) == 1) {
 		/*
 		 * Bias pick_next to pick the sched entity that is
 		 * triggering this preemption.
 		 */
 		if (!next_buddy_marked)
 			set_next_buddy(pse);
 		goto preempt;
 	}
 	return;
 preempt:
 	resched_task(curr);
 	/*
 	 * Only set the backward buddy when the current task is still
 	 * on the rq. This can happen when a wakeup gets interleaved
 	 * with schedule on the ->pre_schedule() or idle_balance()
 	 * point, either of which can drop the rq lock.
 	 *
 	 * Also, during early boot the idle thread is in the fair class,
 	 * for obvious reasons its a bad idea to schedule back to it.
 	 */
 	if (unlikely(!se->on_rq || curr == rq->idle))
 		return;
 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
 		set_last_buddy(se);
 }
 /*
  * Pick the next fair-class task to run on @rq; @prev is the task that was
  * running so far.
  *
  * With CONFIG_FAIR_GROUP_SCHED and a fair-class @prev, the pick is done
  * first and only the parts of the hierarchy that differ between @prev and
  * the picked task are put/set.  Otherwise (label 'simple') @prev is put
  * completely and the hierarchy is walked top-down.
  *
  * Returns the picked task, NULL when nothing is runnable even after
  * idle_balance(), or RETRY_TASK when idle_balance() returns a negative
  * value.
  */
 static struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 	struct task_struct *p;
 	int new_tasks;
 again:
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (!cfs_rq->nr_running)
 		goto idle;
 	/* the optimized path needs prev's entities to still be 'current' */
 	if (prev->sched_class != &fair_sched_class)
 		goto simple;
 	/*
 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
 	 * likely that a next task is from the same cgroup as the current.
 	 *
 	 * Therefore attempt to avoid putting and setting the entire cgroup
 	 * hierarchy, only change the part that actually changes.
 	 */
 	do {
 		struct sched_entity *curr = cfs_rq->curr;
 		/*
 		 * Since we got here without doing put_prev_entity() we also
 		 * have to consider cfs_rq->curr. If it is still a runnable
 		 * entity, update_curr() will update its vruntime, otherwise
 		 * forget we've ever seen it.
 		 */
 		if (curr && curr->on_rq)
 			update_curr(cfs_rq);
 		else
 			curr = NULL;
 		/*
 		 * This call to check_cfs_rq_runtime() will do the throttle and
 		 * dequeue its entity in the parent(s). Therefore the 'simple'
 		 * nr_running test will indeed be correct.
 		 */
 		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 			goto simple;
 		se = pick_next_entity(cfs_rq, curr);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 	p = task_of(se);
 	/*
 	 * Since we haven't yet done put_prev_entity and if the selected task
 	 * is a different task than we started out with, try and touch the
 	 * least amount of cfs_rqs.
 	 */
 	if (prev != p) {
 		struct sched_entity *pse = &prev->se;
 		/* climb both chains until se and pse share a cfs_rq */
 		while (!(cfs_rq = is_same_group(se, pse))) {
 			int se_depth = se->depth;
 			int pse_depth = pse->depth;
 			/* step whichever side is deeper; both when equally deep */
 			if (se_depth <= pse_depth) {
 				put_prev_entity(cfs_rq_of(pse), pse);
 				pse = parent_entity(pse);
 			}
 			if (se_depth >= pse_depth) {
 				set_next_entity(cfs_rq_of(se), se);
 				se = parent_entity(se);
 			}
 		}
 		put_prev_entity(cfs_rq, pse);
 		set_next_entity(cfs_rq, se);
 	}
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 	return p;
 simple:
 	cfs_rq = &rq->cfs;
 #endif
 	if (!cfs_rq->nr_running)
 		goto idle;
 	put_prev_task(rq, prev);
 	/* walk down from the root cfs_rq until a task entity is reached */
 	do {
 		se = pick_next_entity(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 	p = task_of(se);
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 	return p;
 idle:
 	new_tasks = idle_balance(rq);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
 	 * must re-start the pick_next_entity() loop.
 	 */
 	if (new_tasks < 0)
 		return RETRY_TASK;
 	if (new_tasks > 0)
 		goto again;
 	return NULL;
 }
 /*
  * Account for a descheduled task:
  */
 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct sched_entity *se = &prev->se;
 	struct cfs_rq *cfs_rq;
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		put_prev_entity(cfs_rq, se);
 	}
 }
 /*
  * sched_yield() for the fair class.
  *
  * All this does is drop the yielding entity's buddy hints and mark it as the
  * skip buddy; the actual handling of ->skip happens in pick_next_entity.
  */
 static void yield_task_fair(struct rq *rq)
 {
 	struct task_struct *yielder = rq->curr;
 	struct cfs_rq *cfs = task_cfs_rq(yielder);
 	struct sched_entity *yse = &yielder->se;
 	/* Nothing to yield to when we are the only runnable task. */
 	if (unlikely(rq->nr_running == 1))
 		return;
 	clear_buddies(cfs, yse);
 	if (yielder->policy != SCHED_BATCH) {
 		/* Bring the clock and the current entity's runtime stats up to date. */
 		update_rq_clock(rq);
 		update_curr(cfs);
 		/*
 		 * The clock was refreshed just now; flag that so schedule()
 		 * does not pay for another, microscopic, update on its
 		 * fastpath.
 		 */
 		rq->skip_clock_update = 1;
 	}
 	set_skip_buddy(yse);
 }
 /*
  * yield_to_task_fair - yield @rq's current task in favour of @p.
  *
  * Returns false, doing nothing, when @p's entity is not queued or sits in a
  * throttled hierarchy; otherwise marks @p as next buddy, yields and returns
  * true. @preempt is not used here.
  */
 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
 {
 	struct sched_entity *target = &p->se;
 	if (!target->on_rq)
 		return false;
 	/* throttled hierarchies are not runnable */
 	if (throttled_hierarchy(cfs_rq_of(target)))
 		return false;
 	/* Tell the scheduler that we'd really like the target to run next. */
 	set_next_buddy(target);
 	yield_task_fair(rq);
 	return true;
 }
 #ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods.
  *
  * BASICS
  *
  * The purpose of load-balancing is to achieve the same basic fairness the
  * per-cpu scheduler provides, namely provide a proportional amount of compute
  * time to each task. This is expressed in the following equation:
  *
  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
  *
  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
  * W_i,0 is defined as:
  *
  *   W_i,0 = \Sum_j w_i,j                                             (2)
  *
  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
  * is derived from the nice value as per prio_to_weight[].
  *
  * The weight average is an exponential decay average of the instantaneous
  * weight:
  *
  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
  *
  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
  * can also include other factors [XXX].
  *
  * To achieve this balance we define a measure of imbalance which follows
  * directly from (1):
  *
  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
  *
  * We then move tasks around to minimize the imbalance. In the continuous
  * function space it is obvious this converges, in the discrete case we get
  * a few fun cases generally called infeasible weight scenarios.
  *
  * [XXX expand on:
  *     - infeasible weights;
  *     - local vs global optima in the discrete case. ]
  *
  *
  * SCHED DOMAINS
  *
  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
  * for all i,j solution, we create a tree of cpus that follows the hardware
  * topology where each level pairs two lower groups (or better). This results
  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
  * tree to only the first of the previous level and we decrease the frequency
  * of load-balance at each level inv. proportional to the number of cpus in
  * the groups.
  *
  * This yields:
  *
  *     log_2 n     1     n
  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
  *     i = 0      2^i   2^i
  *                               `- size of each group
  *         |         |     `- number of cpus doing load-balance
  *         |         `- freq
  *         `- sum over all levels
  *
  * Coupled with a limit on how many tasks we can migrate every balance pass,
  * this makes (5) the runtime complexity of the balancer.
  *
  * An important property here is that each CPU is still (indirectly) connected
  * to every other cpu in at most O(log n) steps:
  *
  * The adjacency matrix of the resulting graph is given by:
  *
  *             log_2 n
  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
  *             k = 0
  *
  * And you'll find that:
  *
  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
  *
  * Showing there's indeed a path between every cpu in at most O(log n) steps.
  * The task movement gives a factor of O(m), giving a convergence complexity
  * of:
  *
  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
  *
  *
  * WORK CONSERVING
  *
  * In order to avoid CPUs going idle while there's still work to do, new idle
  * balancing is more aggressive and has the newly idle cpu iterate up the domain
  * tree itself instead of relying on other CPUs to bring it work.
  *
  * This adds some complexity to both (5) and (8) but it reduces the total idle
  * time.
  *
  * [XXX more?]
  *
  *
  * CGROUPS
  *
  * Cgroups make a horror show out of (2), instead of a simple sum we get:
  *
  *                                s_k,i
  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
  *                                 S_k
  *
  * Where
  *
  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
  *
  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
  *
  * The big problem is S_k, it's a global sum needed to compute a local (W_i)
  * property.
  *
  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
  *      rewrite all of this once again.]
  */
 /* Upper bound for the interval (in jiffies) clamped in update_group_power(). */
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 /* Classification produced by fbq_classify_group() and fbq_classify_rq(). */
 enum fbq_type { regular, remote, all };
 /*
  * Bits for lb_env::flags:
  *   LBF_ALL_PINNED  - cleared by can_migrate_task() once a task allowed on
  *                     dst_cpu is seen.
  *   LBF_NEED_BREAK  - set by move_tasks() when it stops for a breather.
  *   LBF_DST_PINNED  - set by can_migrate_task() when it records new_dst_cpu.
  *   LBF_SOME_PINNED - set by can_migrate_task() when a task is not allowed
  *                     on dst_cpu.
  */
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED	0x08
 /*
  * lb_env - state shared by the helpers of one load-balance attempt.
  */
 struct lb_env {
 	/* Domain being balanced; its flags and failure counters are consulted. */
 	struct sched_domain	*sd;
 	/* Runqueue and cpu that tasks are taken from. */
 	struct rq		*src_rq;
 	int			src_cpu;
 	/* Cpu and runqueue that tasks are moved to. */
 	int			dst_cpu;
 	struct rq		*dst_rq;
 	/* Cpus scanned for an alternative destination when a task is pinned. */
 	struct cpumask		*dst_grpmask;
 	/* Alternative destination, recorded together with LBF_DST_PINNED. */
 	int			new_dst_cpu;
 	/* Idle state; indexes per-idle-type schedstats and selects the load index. */
 	enum cpu_idle_type	idle;
 	/* Weighted load still to move; move_tasks() subtracts each pulled load. */
 	long			imbalance;
 	/* The set of CPUs under consideration for load-balancing */
 	struct cpumask		*cpus;
 	/* LBF_* bits. */
 	unsigned int		flags;
 	/* move_tasks() iteration counter, next breather point and hard limit. */
 	unsigned int		loop;
 	unsigned int		loop_break;
 	unsigned int		loop_max;
 	/* Set by update_sd_lb_stats() for SD_NUMA domains. */
 	enum fbq_type		fbq_type;
 };
 /*
  * move_task - migrate @p from env->src_rq to env->dst_rq.
  * Both runqueues must be locked.
  */
 static void move_task(struct task_struct *p, struct lb_env *env)
 {
 	struct rq *from = env->src_rq;
 	struct rq *to = env->dst_rq;
 	int target_cpu = env->dst_cpu;
 	/* Take the task off the source queue before retargeting its cpu. */
 	deactivate_task(from, p, 0);
 	set_task_cpu(p, target_cpu);
 	/* Queue it on the destination, then run check_preempt_curr() there. */
 	activate_task(to, p, 0);
 	check_preempt_curr(to, p, 0);
 }
 /*
  * Is this task likely cache-hot?
  *
  * Returns 1 when @p should be treated as cache-hot at time @now, else 0.
  */
 static int
 task_hot(struct task_struct *p, u64 now)
 {
 	struct sched_entity *se = &p->se;
 	s64 ran_ago;
 	/* Tasks outside the fair class and SCHED_IDLE tasks are never hot. */
 	if (p->sched_class != &fair_sched_class || unlikely(p->policy == SCHED_IDLE))
 		return 0;
 	/*
 	 * Buddy candidates are cache hot:
 	 */
 	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 		if (se == cfs_rq->next || se == cfs_rq->last)
 			return 1;
 	}
 	/* A migration cost of -1 makes every task hot, 0 makes none hot. */
 	if (sysctl_sched_migration_cost == -1)
 		return 1;
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 	/* Hot when the last execution start is more recent than the cost. */
 	ran_ago = now - se->exec_start;
 	return ran_ago < (s64)sysctl_sched_migration_cost;
 }
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Returns true if moving @p from env->src_cpu's node to env->dst_cpu's node
  * is a locality win: the destination is the preferred node, or both the
  * task weight and the group weight are higher there.
  */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
 	int from_nid, to_nid;
 	if (!sched_feat(NUMA_FAVOUR_HIGHER))
 		return false;
 	if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
 		return false;
 	from_nid = cpu_to_node(env->src_cpu);
 	to_nid = cpu_to_node(env->dst_cpu);
 	/* Staying on the same node cannot improve anything. */
 	if (from_nid == to_nid)
 		return false;
 	/* Always encourage migration to the preferred node. */
 	if (to_nid == p->numa_preferred_nid)
 		return true;
 	/* If both task and group weight improve, this move is a winner. */
 	return task_weight(p, to_nid) > task_weight(p, from_nid) &&
 	       group_weight(p, to_nid) > group_weight(p, from_nid);
 }
 /*
  * Returns true if moving @p from env->src_cpu's node to env->dst_cpu's node
  * hurts locality: the source is the preferred node, or either the task
  * weight or the group weight is lower on the destination.
  */
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	int from_nid, to_nid;
 	if (!(sched_feat(NUMA) && sched_feat(NUMA_RESIST_LOWER)))
 		return false;
 	if (!p->numa_faults_memory)
 		return false;
 	if (!(env->sd->flags & SD_NUMA))
 		return false;
 	from_nid = cpu_to_node(env->src_cpu);
 	to_nid = cpu_to_node(env->dst_cpu);
 	if (from_nid == to_nid)
 		return false;
 	/* Migrating away from the preferred node is always bad. */
 	if (from_nid == p->numa_preferred_nid)
 		return true;
 	/* If either task or group weight get worse, don't do it. */
 	return task_weight(p, to_nid) < task_weight(p, from_nid) ||
 	       group_weight(p, to_nid) < group_weight(p, from_nid);
 }
 #else
 /* !CONFIG_NUMA_BALANCING stub: a migration never counts as a locality gain. */
 static inline bool migrate_improves_locality(struct task_struct *p,
 					     struct lb_env *env)
 {
 	return false;
 }
 /* !CONFIG_NUMA_BALANCING stub: a migration never counts as a locality loss. */
 static inline bool migrate_degrades_locality(struct task_struct *p,
 					     struct lb_env *env)
 {
 	return false;
 }
 #endif
 /*
  * can_migrate_task - may task p be migrated from env->src_rq to env->dst_cpu?
  *
  * Returns 1 when the task may be moved, 0 otherwise.
  *
  * Side effects on @env: sets LBF_SOME_PINNED when @p is not allowed on
  * dst_cpu, may record new_dst_cpu together with LBF_DST_PINNED, and clears
  * LBF_ALL_PINNED as soon as a task allowed on dst_cpu is seen. Failure and
  * forced-migration schedstats are bumped along the way.
  */
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) running (obviously), or
 	 * 4) are cache-hot on their current CPU.
 	 */
 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 		return 0;
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
 		int cpu;
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		env->flags |= LBF_SOME_PINNED;
 		/*
 		 * Remember if this task can be migrated to any other cpu in
 		 * our sched_group. We may want to revisit it if we couldn't
 		 * meet load balance goals by pulling other tasks on src_cpu.
 		 *
 		 * Also avoid computing new_dst_cpu if we have already computed
 		 * one in current iteration.
 		 */
 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
 			return 0;
 		/* Prevent to re-select dst_cpu via env's cpus */
 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
 				env->flags |= LBF_DST_PINNED;
 				env->new_dst_cpu = cpu;
 				break;
 			}
 		}
 		return 0;
 	}
 	/* Record that we found atleast one task that could run on dst_cpu */
 	env->flags &= ~LBF_ALL_PINNED;
 	if (task_running(env->src_rq, p)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
 	/*
 	 * Aggressive migration if:
 	 * 1) destination numa is preferred
 	 * 2) task is cache cold, or
 	 * 3) too many balance attempts have failed.
 	 */
 	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
 	/* A move that degrades NUMA locality is treated like a cache-hot task. */
 	if (!tsk_cache_hot)
 		tsk_cache_hot = migrate_degrades_locality(p, env);
 	if (migrate_improves_locality(p, env)) {
 #ifdef CONFIG_SCHEDSTATS
 		if (tsk_cache_hot) {
 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
 #endif
 		return 1;
 	}
 	/* Cold task, or hot but balancing has failed more than cache_nice_tries. */
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 		if (tsk_cache_hot) {
 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
 		return 1;
 	}
 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
 	return 0;
 }
 /*
  * move_one_task - move the first migratable task found on env->src_rq's
  * cfs_tasks list to env->dst_rq, as part of active balancing within
  * env->sd.
  * Returns 1 if a task was moved and 0 otherwise.
  *
  * Called with both runqueues locked.
  */
 static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *task, *tmp;
 	list_for_each_entry_safe(task, tmp, &env->src_rq->cfs_tasks, se.group_node) {
 		if (can_migrate_task(task, env)) {
 			move_task(task, env);
 			/*
 			 * move_task() has only two call sites, so its stats
 			 * can be collected here instead of inside
 			 * move_task() itself.
 			 */
 			schedstat_inc(env->sd, lb_gained[env->idle]);
 			return 1;
 		}
 	}
 	return 0;
 }
 /* Step by which move_tasks() advances env->loop_break after each breather. */
 static const unsigned int sched_nr_migrate_break = 32;
 /*
  * move_tasks tries to move up to imbalance weighted load from busiest to
  * this_rq, as part of a balancing operation within domain "sd".
  * Returns the number of tasks pulled (0 if none were moved).
  *
  * Called with both runqueues locked.
  */
 static int move_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
 	int pulled = 0;
 	if (env->imbalance <= 0)
 		return 0;
 	while (!list_empty(tasks)) {
 		p = list_first_entry(tasks, struct task_struct, se.group_node);
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
 		if (env->loop > env->loop_max)
 			break;
 		/* take a breather every nr_migrate tasks */
 		if (env->loop > env->loop_break) {
 			env->loop_break += sched_nr_migrate_break;
 			env->flags |= LBF_NEED_BREAK;
 			break;
 		}
 		if (!can_migrate_task(p, env))
 			goto next;
 		load = task_h_load(p);
 		/* With LB_MIN, skip very light tasks unless balancing has failed. */
 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 			goto next;
 		/* Skip tasks whose half load already exceeds what is left to move. */
 		if ((load / 2) > env->imbalance)
 			goto next;
 		move_task(p, env);
 		pulled++;
 		env->imbalance -= load;
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
 		 * kernels will stop after the first task is pulled to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
 			break;
 #endif
 		/*
 		 * We only want to steal up to the prescribed amount of
 		 * weighted load.
 		 */
 		if (env->imbalance <= 0)
 			break;
 		continue;
 next:
 		/* Rotate the rejected task to the tail so the next one is examined. */
 		list_move_tail(&p->se.group_node, tasks);
 	}
 	/*
 	 * Right now, this is one of only two places move_task() is called,
 	 * so we can safely collect move_task() stats here rather than
 	 * inside move_task().
 	 */
 	schedstat_add(env->sd, lb_gained[env->idle], pulled);
 	return pulled;
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * update tg->load_weight by folding this cpu's load_avg
  *
  * @tg: task group whose per-cpu cfs_rq and entity are refreshed.
  * @cpu: index into tg->se[] and tg->cfs_rq[].
  *
  * Does nothing for a throttled hierarchy. A group without an entity on
  * @cpu (tg->se[cpu] is NULL) updates the runqueue's runnable average
  * instead.
  */
 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
 {
 	struct sched_entity *se = tg->se[cpu];
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
 	/* throttled entities do not contribute to load */
 	if (throttled_hierarchy(cfs_rq))
 		return;
 	update_cfs_rq_blocked_load(cfs_rq, 1);
 	if (se) {
 		update_entity_load_avg(se, 1);
 		/*
 		 * We pivot on our runnable average having decayed to zero for
 		 * list removal.  This generally implies that all our children
 		 * have also been removed (modulo rounding error or bandwidth
 		 * control); however, such cases are rare and we can fix these
 		 * at enqueue.
 		 *
 		 * TODO: fix up out-of-order children on enqueue.
 		 */
 		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
 			list_del_leaf_cfs_rq(cfs_rq);
 	} else {
 		struct rq *rq = rq_of(cfs_rq);
 		update_rq_runnable_avg(rq, rq->nr_running);
 	}
 }
 /*
  * update_blocked_averages - refresh blocked load for every leaf cfs_rq of
  * @cpu's runqueue.
  *
  * Takes rq->lock with interrupts disabled for the whole walk and updates
  * the rq clock before touching any cfs_rq.
  */
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq;
 	unsigned long flags;
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	update_rq_clock(rq);
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
 	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		/*
 		 * Note: We may want to consider periodically releasing
 		 * rq->lock about these updates so that creating many task
 		 * groups does not result in continually extending hold time.
 		 */
 		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  *
  * Results are cached per jiffy: a cfs_rq whose last_h_load_update equals
  * the current jiffies value is not recomputed.
  */
 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long now = jiffies;
 	unsigned long load;
 	if (cfs_rq->last_h_load_update == now)
 		return;
 	/*
 	 * Walk up, chaining each level to the entity below it through
 	 * h_load_next, until an already up-to-date cfs_rq is found or the
 	 * entity chain ends.
 	 */
 	cfs_rq->h_load_next = NULL;
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_load_next = se;
 		if (cfs_rq->last_h_load_update == now)
 			break;
 	}
 	/* Ran off the top of the chain: seed h_load from the cfs_rq reached. */
 	if (!se) {
 		cfs_rq->h_load = cfs_rq->runnable_load_avg;
 		cfs_rq->last_h_load_update = now;
 	}
 	/*
 	 * Walk back down the recorded chain, scaling the parent's h_load by
 	 * the child entity's share of the parent's runnable load (the +1
 	 * keeps the divisor non-zero).
 	 */
 	while ((se = cfs_rq->h_load_next) != NULL) {
 		load = cfs_rq->h_load;
 		load = div64_ul(load * se->avg.load_avg_contrib,
 				cfs_rq->runnable_load_avg + 1);
 		cfs_rq = group_cfs_rq(se);
 		cfs_rq->h_load = load;
 		cfs_rq->last_h_load_update = now;
 	}
 }
 /*
  * task_h_load - hierarchical load of @p.
  *
  * Refreshes the h_load of @p's cfs_rq, then scales it by the task's
  * load_avg_contrib relative to the cfs_rq's runnable_load_avg (the +1
  * keeps the divisor non-zero).
  */
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	update_cfs_rq_h_load(cfs_rq);
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
 #else
 /* !CONFIG_FAIR_GROUP_SCHED stub: nothing to update. */
 static inline void update_blocked_averages(int cpu)
 {
 }
 /*
  * !CONFIG_FAIR_GROUP_SCHED variant: the task's own load_avg_contrib is
  * returned without any hierarchical scaling.
  */
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
 }
 #endif
 /********** Helpers for find_busiest_group ************************/
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  *
  * Filled in by update_sg_lb_stats(), which zeroes the whole structure first.
  */
 struct sg_lb_stats {
 	unsigned long avg_load; /*Avg load across the CPUs of the group */
 	unsigned long group_load; /* Total load over the CPUs of the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long load_per_task; /* sum_weighted_load / sum_nr_running */
 	unsigned long group_power; /* Copy of the group's sgp->power */
 	unsigned int sum_nr_running; /* Nr tasks running in the group */
 	unsigned int group_capacity; /* Result of sg_capacity() */
 	unsigned int idle_cpus; /* Nr of cpus for which idle_cpu() is true */
 	unsigned int group_weight; /* Copy of the group's group_weight */
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running; /* Sum of rq->nr_numa_running */
 	unsigned int nr_preferred_running; /* Sum of rq->nr_preferred_running */
 #endif
 };
 /*
  * sd_lb_stats - Structure to store the statistics of a sched_domain
  *		 during load balancing.
  *
  * Initialised by init_sd_lb_stats() and filled in by update_sd_lb_stats().
  */
 struct sd_lb_stats {
 	struct sched_group *busiest;	/* Busiest group in this sd */
 	struct sched_group *local;	/* Local group in this sd */
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_pwr;	/* Total power of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
 };
 /*
  * init_sd_lb_stats - reset @sds before a new statistics pass.
  *
  * Only the members named in the initializer below are spelled out; see the
  * comment in the body for why that is sufficient.
  */
 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 {
 	/*
 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
 	 * We must however clear busiest_stat::avg_load because
 	 * update_sd_pick_busiest() reads this before assignment.
 	 */
 	*sds = (struct sd_lb_stats){
 		.busiest = NULL,
 		.local = NULL,
 		.total_load = 0UL,
 		.total_pwr = 0UL,
 		.busiest_stat = {
 			.avg_load = 0UL,
 		},
 	};
 }
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
  *
  * Return: @sd's busy_idx for CPU_NOT_IDLE, newidle_idx for CPU_NEWLY_IDLE
  * and idle_idx for every other idle status.
  */
 static inline int get_sd_load_idx(struct sched_domain *sd,
 					enum cpu_idle_type idle)
 {
 	switch (idle) {
 	case CPU_NOT_IDLE:
 		return sd->busy_idx;
 	case CPU_NEWLY_IDLE:
 		return sd->newidle_idx;
 	default:
 		return sd->idle_idx;
 	}
 }
 /*
  * Default frequency scaling factor: always SCHED_POWER_SCALE, independent
  * of @sd and @cpu.
  */
 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
 {
 	return SCHED_POWER_SCALE;
 }
 /*
  * Weak definition that an architecture may override; falls back to
  * default_scale_freq_power().
  */
 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
 {
 	return default_scale_freq_power(sd, cpu);
 }
 /*
  * Default SMT scaling factor: the domain's smt_gain divided by the number
  * of cpus it spans. @cpu is not used.
  */
 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long gain = sd->smt_gain;
 	unsigned long nr_siblings = sd->span_weight;
 	return gain / nr_siblings;
 }
 /*
  * Weak definition that an architecture may override; falls back to
  * default_scale_smt_power().
  */
 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 {
 	return default_scale_smt_power(sd, cpu);
 }
 static unsigned long scale_rt_power(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	u64 total, available, age_stamp, avg;
 	/*
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
 	age_stamp = ACCESS_ONCE(rq->age_stamp);
 	avg = ACCESS_ONCE(rq->rt_avg);
 	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
 	if (unlikely(total < avg)) {
 		/* Ensures that power won't end up being negative */
 		available = 0;
 	} else {
 		available = total - avg;
 	}
 	if (unlikely((s64)total < SCHED_POWER_SCALE))
 		total = SCHED_POWER_SCALE;
 	total >>= SCHED_POWER_SHIFT;
 	return div_u64(available, total);
 }
 /*
  * update_cpu_power - recompute the power of @cpu and store it in the rq and
  * in the first group of @sd.
  *
  * Starting from SCHED_POWER_SCALE the value is scaled, in order, by the SMT
  * factor (only for SD_SHARE_CPUPOWER domains spanning more than one cpu),
  * the frequency factor and scale_rt_power(). The value after the SMT step
  * is recorded as power_orig. The final power is never stored as 0.
  */
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = sd->span_weight;
 	unsigned long power = SCHED_POWER_SCALE;
 	struct sched_group *sdg = sd->groups;
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 		if (sched_feat(ARCH_POWER))
 			power *= arch_scale_smt_power(sd, cpu);
 		else
 			power *= default_scale_smt_power(sd, cpu);
 		power >>= SCHED_POWER_SHIFT;
 	}
 	sdg->sgp->power_orig = power;
 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
 	else
 		power *= default_scale_freq_power(sd, cpu);
 	power >>= SCHED_POWER_SHIFT;
 	power *= scale_rt_power(cpu);
 	power >>= SCHED_POWER_SHIFT;
 	/* Power is used as a divisor elsewhere, so never let it reach zero. */
 	if (!power)
 		power = 1;
 	cpu_rq(cpu)->cpu_power = power;
 	sdg->sgp->power = power;
 }
 /*
  * update_group_power - recompute power and power_orig of @sd's first group.
  * @sd: domain whose first group is updated.
  * @cpu: cpu passed on to update_cpu_power() when @sd has no child domain.
  *
  * Also schedules the next update: sgp->next_update is set to jiffies plus
  * the domain's balance interval, clamped to [1, max_load_balance_interval].
  *
  * For a domain with a child, the group's values are the sums over either
  * the cpus of the group (SD_OVERLAP child) or the child's groups.
  */
 void update_group_power(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
 	unsigned long power, power_orig;
 	unsigned long interval;
 	interval = msecs_to_jiffies(sd->balance_interval);
 	interval = clamp(interval, 1UL, max_load_balance_interval);
 	sdg->sgp->next_update = jiffies + interval;
 	/* Lowest level: the group's power is the cpu's own power. */
 	if (!child) {
 		update_cpu_power(sd, cpu);
 		return;
 	}
 	power_orig = power = 0;
 	if (child->flags & SD_OVERLAP) {
 		/*
 		 * SD_OVERLAP domains cannot assume that child groups
 		 * span the current group.
 		 */
 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
 			struct sched_group_power *sgp;
 			struct rq *rq = cpu_rq(cpu);
 			/*
 			 * build_sched_domains() -> init_sched_groups_power()
 			 * gets here before we've attached the domains to the
 			 * runqueues.
 			 *
 			 * Use power_of(), which is set irrespective of domains
 			 * in update_cpu_power().
 			 *
 			 * This avoids power/power_orig from being 0 and
 			 * causing divide-by-zero issues on boot.
 			 *
 			 * Runtime updates will correct power_orig.
 			 */
 			if (unlikely(!rq->sd)) {
 				power_orig += power_of(cpu);
 				power += power_of(cpu);
 				continue;
 			}
 			sgp = rq->sd->groups->sgp;
 			power_orig += sgp->power_orig;
 			power += sgp->power;
 		}
 	} else  {
 		/*
 		 * !SD_OVERLAP domains can assume that child groups
 		 * span the current group.
 		 */
 		group = child->groups;
 		/* The child's groups form a circular list; visit each once. */
 		do {
 			power_orig += group->sgp->power_orig;
 			power += group->sgp->power;
 			group = group->next;
 		} while (group != child->groups);
 	}
 	sdg->sgp->power_orig = power_orig;
 	sdg->sgp->power = power;
 }
 /*
  * Try and fix up capacity for tiny siblings, this is needed when
  * things like SD_ASYM_PACKING need f_b_g to select another sibling
  * which on its own isn't powerful enough.
  *
  * Returns 1 when @sd shares cpu power between siblings and @group still
  * has more than 29/32 (~90%) of its original power, 0 otherwise.
  *
  * See update_sd_pick_busiest() and check_asym_packing().
  */
 static inline int
 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 {
 	/*
 	 * Only siblings can have significantly less than SCHED_POWER_SCALE
 	 */
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
 	/*
 	 * If ~90% of the cpu_power is still there, we're good.
 	 */
 	return group->sgp->power * 32 > group->sgp->power_orig * 29;
 }
 /*
  * Group imbalance indicates (and tries to solve) the problem where balancing
  * groups is inadequate due to tsk_cpus_allowed() constraints.
  *
  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
  * Something like:
  *
  * 	{ 0 1 2 3 } { 4 5 6 7 }
  * 	        *     * * *
  *
  * If we were to balance group-wise we'd place two tasks in the first group and
  * two tasks in the second group. Clearly this is undesired as it will overload
  * cpu 3 and leave one of the cpus in the second group unused.
  *
  * The current solution to this issue is detecting the skew in the first group
  * by noticing the lower domain failed to reach balance and had difficulty
  * moving tasks due to affinity constraints.
  *
  * When this is so detected; this group becomes a candidate for busiest; see
  * update_sd_pick_busiest(). And calculate_imbalance() and
  * find_busiest_group() avoid some of the usual balance conditions to allow it
  * to create an effective group imbalance.
  *
  * This is a somewhat tricky proposition since the next run might not find the
  * group imbalance and decide the groups need to be balanced again. A most
  * subtle and fragile situation.
  *
  * Returns the imbalance marker stored in the group's sched_group_power.
  */
 static inline int sg_imbalanced(struct sched_group *group)
 {
 	return group->sgp->imbalance;
 }
 /*
  * Compute the group capacity.
  *
  * N*frac(smt_power) >= 1 would otherwise create 'phantom' cores, so the smt
  * factor is divided out first to obtain the real number of cores, and the
  * capacity derived from power is capped by that number. A result of zero is
  * handed to fix_small_capacity() for a second opinion.
  */
 static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
 {
 	unsigned int nr_cpus = group->group_weight;
 	unsigned int cur_power = group->sgp->power;
 	unsigned int full_power = group->sgp->power_orig;
 	unsigned int threads_per_core, nr_cores;
 	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
 	threads_per_core = DIV_ROUND_UP(SCHED_POWER_SCALE * nr_cpus, full_power);
 	nr_cores = nr_cpus / threads_per_core;
 	nr_cores = min_t(unsigned, nr_cores, DIV_ROUND_CLOSEST(cur_power, SCHED_POWER_SCALE));
 	if (nr_cores)
 		return nr_cores;
 	return fix_small_capacity(env->sd, group);
 }
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
  *
  * @sgs is zeroed first. Only cpus present in both @group and env->cpus
  * contribute to the per-cpu sums.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
 			int local_group, struct sg_lb_stats *sgs)
 {
 	unsigned long load;
 	int i;
 	memset(sgs, 0, sizeof(*sgs));
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
 			load = target_load(i, load_idx);
 		else
 			load = source_load(i, load_idx);
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
 		sgs->sum_weighted_load += weighted_cpuload(i);
 		if (idle_cpu(i))
 			sgs->idle_cpus++;
 	}
 	/* Adjust by relative CPU power of the group */
 	sgs->group_power = group->sgp->power;
 	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 	/* load_per_task stays 0 (from the memset) when nothing is running. */
 	if (sgs->sum_nr_running)
 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 	sgs->group_weight = group->group_weight;
 	sgs->group_imb = sg_imbalanced(group);
 	sgs->group_capacity = sg_capacity(env, group);
 	if (sgs->group_capacity > sgs->sum_nr_running)
 		sgs->group_has_capacity = 1;
 }
 /**
  * update_sd_pick_busiest - decide whether a group becomes the new busiest
  * @env: The load balancing environment.
  * @sds: sched_domain statistics
  * @sg: sched_group candidate to be checked for being the busiest
  * @sgs: sched_group statistics
  *
  * Determine if @sg is a busier group than the previously selected
  * busiest group.
  *
  * Return: %true if @sg is a busier group than the previously selected
  * busiest group. %false otherwise.
  */
 static bool update_sd_pick_busiest(struct lb_env *env,
 				   struct sd_lb_stats *sds,
 				   struct sched_group *sg,
 				   struct sg_lb_stats *sgs)
 {
 	/* A candidate must carry strictly more average load than the incumbent. */
 	if (sgs->avg_load <= sds->busiest_stat.avg_load)
 		return false;
 	/* Over capacity, or flagged as imbalanced: take it. */
 	if (sgs->sum_nr_running > sgs->group_capacity || sgs->group_imb)
 		return true;
 	/*
 	 * ASYM_PACKING needs to move all the work to the lowest
 	 * numbered CPUs in the group, therefore mark all groups
 	 * higher than ourself as busy.
 	 */
 	if (!(env->sd->flags & SD_ASYM_PACKING) || !sgs->sum_nr_running)
 		return false;
 	if (env->dst_cpu >= group_first_cpu(sg))
 		return false;
 	return !sds->busiest ||
 	       group_first_cpu(sds->busiest) > group_first_cpu(sg);
 }
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Classify a group from its task counts: 'regular' when more tasks run than
  * are counted in nr_numa_running, else 'remote' when more run than are
  * counted in nr_preferred_running, else 'all'.
  */
 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 {
 	unsigned int running = sgs->sum_nr_running;
 	if (running > sgs->nr_numa_running)
 		return regular;
 	return (running > sgs->nr_preferred_running) ? remote : all;
 }
 /*
  * Classify a runqueue from its task counts: 'regular' when more tasks run
  * than are counted in nr_numa_running, else 'remote' when more run than are
  * counted in nr_preferred_running, else 'all'.
  */
 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 {
 	if (rq->nr_running > rq->nr_numa_running)
 		return regular;
 	return (rq->nr_running > rq->nr_preferred_running) ? remote : all;
 }
 #else
 /* !CONFIG_NUMA_BALANCING stub: every group is classified as 'all'. */
 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 {
 	return all;
 }
 /*
  * !CONFIG_NUMA_BALANCING stub: every runqueue is classified as 'regular'
  * (note that the group stub returns 'all', not 'regular').
  */
 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 {
 	return regular;
 }
 #endif /* CONFIG_NUMA_BALANCING */
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @sds: variable to hold the statistics for this sched_domain.
  *
  * Walks every group of env->sd once. The group containing env->dst_cpu is
  * recorded as the local group and its stats go straight into
  * @sds->local_stat; every other group is evaluated in a temporary and copied
  * into @sds->busiest_stat when update_sd_pick_busiest() selects it.
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
 	if (child && child->flags & SD_PREFER_SIBLING)
 		prefer_sibling = 1;
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 	do {
 		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
 		if (local_group) {
 			sds->local = sg;
 			sgs = &sds->local_stat;
 			/*
 			 * Newly-idle balancing refreshes the group power only
 			 * once next_update has been reached; every other idle
 			 * type refreshes it unconditionally.
 			 */
 			if (env->idle != CPU_NEWLY_IDLE ||
 			    time_after_eq(jiffies, sg->sgp->next_update))
 				update_group_power(env->sd, env->dst_cpu);
 		}
 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 		/* The local group is never a candidate for busiest. */
 		if (local_group)
 			goto next_group;
 		/*
 		 * In case the child domain prefers tasks go to siblings
 		 * first, lower the sg capacity to one so that we'll try
 		 * and move all the excess tasks away. We lower the capacity
 		 * of a group only if the local group has the capacity to fit
 		 * these excess tasks, i.e. nr_running < group_capacity. The
 		 * extra check prevents the case where you always pull from the
 		 * heaviest group when it is already under-utilized (possible
 		 * with a large weight task outweighs the tasks on the system).
 		 */
 		if (prefer_sibling && sds->local &&
 		    sds->local_stat.group_has_capacity)
 			sgs->group_capacity = min(sgs->group_capacity, 1U);
 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}
 next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_pwr += sgs->group_power;
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 	/* For NUMA domains, classify the busiest group for later filtering. */
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 /**
  * check_asym_packing - Check to see if the group is packed into the
  *			sched domain.
  *
  * This is primarily intended to be used at the sibling level.  Some
  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
  * case of POWER7, it can move to lower SMT modes only when higher
  * threads are idle.  When in lower SMT modes, the threads will
  * perform better since they share less core resources.  Hence when we
  * have idle threads, we want them to be the higher ones.
  *
  * This packing function is run on idle threads.  It checks whether the
  * busiest group in this domain (core in the P7 case) starts at a higher
  * CPU number than the one the packing function is being run on.  A lower
  * CPU number is assumed to be equivalent to a lower SMT thread number.
  *
  * @env: The load balancing environment.
  * @sds: Statistics of the sched_domain which is to be packed
  *
  * Return: 1 when packing is required and a task should be moved to
  * this CPU; the amount of the imbalance is then stored in env->imbalance.
  * 0 otherwise, with env->imbalance untouched.
  */
 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	int first_busy_cpu;
 	/* Nothing to do without ASYM_PACKING or without a busiest group. */
 	if (!(env->sd->flags & SD_ASYM_PACKING) || !sds->busiest)
 		return 0;
 	first_busy_cpu = group_first_cpu(sds->busiest);
 	/* We sit above the busiest group's first CPU: do not pull. */
 	if (env->dst_cpu > first_busy_cpu)
 		return 0;
 	env->imbalance = DIV_ROUND_CLOSEST(
 		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
 		SCHED_POWER_SCALE);
 	return 1;
 }
 /**
  * fix_small_imbalance - Calculate the minor imbalance that exists
  *			amongst the groups of a sched_domain, during
  *			load balancing.
  * @env: The load balancing environment.
  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
  *
  * Sets env->imbalance to the busiest group's load_per_task when moving one
  * such task looks worthwhile; otherwise env->imbalance keeps the value the
  * caller left in it. May also overwrite sds->local_stat.load_per_task when
  * the local group has no running tasks.
  */
 static inline
 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
 	unsigned long scaled_busy_load_per_task;
 	struct sg_lb_stats *local, *busiest;
 	local = &sds->local_stat;
 	busiest = &sds->busiest_stat;
 	/*
 	 * With nothing running locally, fall back to the destination cpu's
 	 * average load per task. Otherwise, if the busiest group's tasks are
 	 * heavier than ours, lower the multiplier used in the test below.
 	 */
 	if (!local->sum_nr_running)
 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
 	else if (busiest->load_per_task > local->load_per_task)
 		imbn = 1;
 	/* The busiest group's per-task load, scaled by its group power. */
 	scaled_busy_load_per_task =
 		(busiest->load_per_task * SCHED_POWER_SCALE) /
 		busiest->group_power;
 	/*
 	 * The gap between the two groups is at least (imbn - 1) scaled
 	 * tasks: request moving one task's worth of load.
 	 */
 	if (busiest->avg_load + scaled_busy_load_per_task >=
 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
 		env->imbalance = busiest->load_per_task;
 		return;
 	}
 	/*
 	 * OK, we don't have enough imbalance to justify moving tasks,
 	 * however we may be able to increase total CPU power used by
 	 * moving them.
 	 */
 	/* pwr_now: power-weighted load figure for the current placement. */
 	pwr_now += busiest->group_power *
 			min(busiest->load_per_task, busiest->avg_load);
 	pwr_now += local->group_power *
 			min(local->load_per_task, local->avg_load);
 	pwr_now /= SCHED_POWER_SCALE;
 	/* Amount of load we'd subtract */
 	if (busiest->avg_load > scaled_busy_load_per_task) {
 		pwr_move += busiest->group_power *
 			    min(busiest->load_per_task,
 				busiest->avg_load - scaled_busy_load_per_task);
 	}
 	/* Amount of load we'd add */
 	if (busiest->avg_load * busiest->group_power <
 	    busiest->load_per_task * SCHED_POWER_SCALE) {
 		tmp = (busiest->avg_load * busiest->group_power) /
 		      local->group_power;
 	} else {
 		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
 		      local->group_power;
 	}
 	/* pwr_move: the same figure as pwr_now, after moving one task over. */
 	pwr_move += local->group_power *
 		    min(local->load_per_task, local->avg_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
 		env->imbalance = busiest->load_per_task;
 }
 /**
  * calculate_imbalance - Calculate the amount of imbalance present within the
  *			 groups of a given sched_domain during load balance.
  * @env: load balance environment
  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
  *
  * The result is stored in env->imbalance. When the computed value is too
  * small to guarantee that a task moves, fix_small_imbalance() gets a chance
  * to bump it.
  */
 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long max_pull, load_above_capacity = ~0UL;
 	struct sg_lb_stats *local, *busiest;
 	local = &sds->local_stat;
 	busiest = &sds->busiest_stat;
 	if (busiest->group_imb) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
 		 */
 		busiest->load_per_task =
 			min(busiest->load_per_task, sds->avg_load);
 	}
 	/*
 	 * In the presence of smp nice balancing, certain scenarios can have
 	 * max load less than avg load (as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
 	if (busiest->avg_load <= sds->avg_load ||
 	    local->avg_load >= sds->avg_load) {
 		env->imbalance = 0;
 		/*
 		 * A void function must not return an expression, so call the
 		 * helper and return separately.
 		 */
 		fix_small_imbalance(env, sds);
 		return;
 	}
 	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
 		 * Except of course for the group_imb case, since then we might
 		 * have to drop below capacity to reach cpu-load equilibrium.
 		 *
 		 * NOTE(review): if sum_nr_running can be smaller than
 		 * group_capacity here and both are unsigned (presumably they
 		 * are - confirm against struct sg_lb_stats), the subtraction
 		 * wraps to a huge value and the min() below then effectively
 		 * ignores this limit.
 		 */
 		load_above_capacity =
 			(busiest->sum_nr_running - busiest->group_capacity);
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
 		load_above_capacity /= busiest->group_power;
 	}
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
 	 * reduce the max loaded cpu below the average load. At the same time,
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
 	 */
 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 	/* How much load to actually move to equalise the imbalance */
 	env->imbalance = min(
 		max_pull * busiest->group_power,
 		(sds->avg_load - local->avg_load) * local->group_power
 	) / SCHED_POWER_SCALE;
 	/*
 	 * If the imbalance is less than the average load per runnable task
 	 * there is no guarantee that any tasks will be moved, so consider
 	 * bumping its value to force at least one task to be moved.
 	 */
 	if (env->imbalance < busiest->load_per_task)
 		fix_small_imbalance(env, sds);
 }
 /******* find_busiest_group() helpers end here *********************/
 /**
  * find_busiest_group - Returns the busiest group within the sched_domain
  * if there is an imbalance. If there isn't an imbalance, and
  * the user has opted for power-savings, it returns a group whose
  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
  * such a group exists.
  *
  * Also calculates the amount of weighted load which should be moved
  * to restore balance.
  *
  * NOTE(review): no power-savings path is visible in the body below; every
  * "balanced" outcome returns NULL with env->imbalance set to 0. The
  * power-savings wording here may be stale - confirm.
  *
  * @env: The load balancing environment.
  *
  * Return:	- The busiest group if imbalance exists.
  *		- If no imbalance and user has opted for power-savings balance,
  *		   return the least loaded group whose CPUs can be
  *		   put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *find_busiest_group(struct lb_env *env)
 {
 	struct sg_lb_stats *local, *busiest;
 	struct sd_lb_stats sds;
 	init_sd_lb_stats(&sds);
 	/*
 	 * Compute the various statistics relevant for load balancing at
 	 * this level.
 	 */
 	update_sd_lb_stats(env, &sds);
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
 	/*
 	 * An idle or newly idle cpu first checks for asymmetric packing;
 	 * on success check_asym_packing() has already set env->imbalance.
 	 */
 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
 	    check_asym_packing(env, &sds))
 		return sds.busiest;
 	/* There is no busy sibling group to pull tasks from */
 	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 	/* Domain-wide average: total load relative to total power. */
 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 	/*
 	 * If the busiest group is imbalanced the below checks don't
 	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
 	if (busiest->group_imb)
 		goto force_balance;
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
 	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
 	    !busiest->group_has_capacity)
 		goto force_balance;
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
 	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
 	if (local->avg_load >= sds.avg_load)
 		goto out_balanced;
 	if (env->idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group load doesn't
 		 * have more tasks than the number of available cpu's and
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
 		if ((local->idle_cpus < busiest->idle_cpus) &&
 		    busiest->sum_nr_running <= busiest->group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
 		if (100 * busiest->avg_load <=
 				env->sd->imbalance_pct * local->avg_load)
 			goto out_balanced;
 	}
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(env, &sds);
 	return sds.busiest;
 out_balanced:
 	/* Nothing to pull: report no imbalance and no group. */
 	env->imbalance = 0;
 	return NULL;
 }
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  *
  * Only cpus present in both the group and env->cpus are considered. Returns
  * the runqueue with the highest load relative to its cpu power, or NULL
  * when every candidate was filtered out.
  */
 static struct rq *find_busiest_queue(struct lb_env *env,
 				     struct sched_group *group)
 {
 	unsigned long max_load = 0, max_load_power = 1;
 	struct rq *winner = NULL;
 	int cpu;
 	for_each_cpu_and(cpu, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(cpu);
 		unsigned long power, capacity, wl;
 		/*
 		 * Groups and runqueues fall into three classes:
 		 *  - regular: there are !numa tasks
 		 *  - remote:  there are numa tasks that run on the 'wrong' node
 		 *  - all:     there is no distinction
 		 *
 		 * To avoid migrating ideally placed numa tasks, runqueues of
 		 * a higher class than the one chosen for this pass are
 		 * skipped while better options exist.
 		 *
 		 * If that makes us pass over the actual busiest queue, the
 		 * next balance pass can still reduce it by moving tasks
 		 * around inside the node. If it keeps us from moving enough
 		 * load, the next pass adjusts the group classification and
 		 * allows more tasks to migrate.
 		 *
 		 * Both cases only affect the total convergence complexity.
 		 */
 		if (fbq_classify_rq(rq) > env->fbq_type)
 			continue;
 		power = power_of(cpu);
 		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 		wl = weighted_cpuload(cpu);
 		/*
 		 * The comparison against the imbalance uses the raw
 		 * weighted_cpuload(), which is not scaled with the cpu power.
 		 */
 		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
 			continue;
 		/*
 		 * Between cpus the load is compared scaled by cpu power, so
 		 * that load moves away from a cpu that is potentially running
 		 * at a lower capacity.
 		 *
 		 * We want max(wl_i / power_i); cross-multiplying gets rid of
 		 * the division: wl_i * power_j > wl_j * power_i, where j is
 		 * the previous maximum.
 		 */
 		if (wl * max_load_power > max_load * power) {
 			max_load = wl;
 			max_load_power = power;
 			winner = rq;
 		}
 	}
 	return winner;
 }
 /*
  * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
  * it only has to be large enough. load_balance() keeps doubling
  * sd->balance_interval up to this ceiling when all tasks were pinned.
  */
 #define MAX_PINNED_INTERVAL	512
 /*
  * Working cpumask for load_balance and load_balance_newidle.
  * load_balance() fetches this cpu's copy and initialises it from
  * cpu_active_mask on every call.
  */
 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 /*
  * Decide whether load_balance() should resort to active balancing.
  * Returns non-zero when it should.
  */
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
 	/*
 	 * ASYM_PACKING needs to force migrate tasks from busy but
 	 * higher numbered CPUs in order to pack all tasks in the
 	 * lowest numbered CPUs. This only applies to newly idle balancing.
 	 */
 	if (env->idle == CPU_NEWLY_IDLE &&
 	    (sd->flags & SD_ASYM_PACKING) &&
 	    env->src_cpu > env->dst_cpu)
 		return 1;
 	/* Otherwise only after enough consecutive failed balance attempts. */
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 static int active_load_balance_cpu_stop(void *data);
 /*
  * Decide whether env->dst_cpu is the cpu that should run load balancing
  * for this domain. Returns non-zero if it is.
  */
 static int should_we_balance(struct lb_env *env)
 {
 	struct sched_group *sg = env->sd->groups;
 	struct cpumask *sg_cpus, *sg_mask;
 	int cpu, chosen = -1;
 	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
 	if (env->idle == CPU_NEWLY_IDLE)
 		return 1;
 	sg_cpus = sched_group_cpus(sg);
 	sg_mask = sched_group_mask(sg);
 	/* Pick the first idle cpu of the group that is also in sg_mask. */
 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
 		if (cpumask_test_cpu(cpu, sg_mask) && idle_cpu(cpu)) {
 			chosen = cpu;
 			break;
 		}
 	}
 	/* No idle cpu found: fall back to the group's balance cpu. */
 	if (chosen == -1)
 		chosen = group_balance_cpu(sg);
 	/*
 	 * First idle cpu or the first cpu(busiest) in this sched group
 	 * is eligible for doing load balancing at this and above domains.
 	 */
 	return chosen == env->dst_cpu;
 }
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  *
  * @this_cpu, @this_rq: the destination cpu and its runqueue.
  * @sd: the sched_domain to balance.
  * @idle: idle state of this_cpu as seen by the caller.
  * @continue_balancing: set to 0 when should_we_balance() decides that this
  *                      cpu is not the one to balance this domain.
  *
  * Returns the accumulated result of move_tasks(), or 0 when nothing was
  * moved (balanced, all tasks pinned, or only an active balance was kicked).
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *continue_balancing)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
 	struct sched_domain *sd_parent = sd->parent;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
 	struct lb_env env = {
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
 		.dst_grpmask    = sched_group_cpus(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
 	};
 	/*
 	 * For NEWLY_IDLE load_balancing, we don't need to consider
 	 * other cpus in our group
 	 */
 	if (idle == CPU_NEWLY_IDLE)
 		env.dst_grpmask = NULL;
 	/* Start with every active cpu; unusable ones get cleared below. */
 	cpumask_copy(cpus, cpu_active_mask);
 	schedstat_inc(sd, lb_count[idle]);
 redo:
 	if (!should_we_balance(&env)) {
 		*continue_balancing = 0;
 		goto out_balanced;
 	}
 	group = find_busiest_group(&env);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
 	}
 	busiest = find_busiest_queue(&env, group);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
 	}
 	/* The source runqueue must never be our own destination runqueue. */
 	BUG_ON(busiest == env.dst_rq);
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 	ld_moved = 0;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
 		 * an imbalance but busiest->nr_running <= 1, the group is
 		 * still unbalanced. ld_moved simply stays zero, so it is
 		 * correctly treated as an imbalance.
 		 */
 		env.flags |= LBF_ALL_PINNED;
 		env.src_cpu   = busiest->cpu;
 		env.src_rq    = busiest;
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 more_balance:
 		local_irq_save(flags);
 		double_rq_lock(env.dst_rq, busiest);
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
 		cur_ld_moved = move_tasks(&env);
 		ld_moved += cur_ld_moved;
 		double_rq_unlock(env.dst_rq, busiest);
 		local_irq_restore(flags);
 		/*
 		 * some other cpu did the load balance for us.
 		 */
 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
 			resched_cpu(env.dst_cpu);
 		/* move_tasks() asked for a breather: continue where it stopped. */
 		if (env.flags & LBF_NEED_BREAK) {
 			env.flags &= ~LBF_NEED_BREAK;
 			goto more_balance;
 		}
 		/*
 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
 		 * us and move them to an alternate dst_cpu in our sched_group
 		 * where they can run. The upper limit on how many times we
 		 * iterate on same src_cpu is dependent on number of cpus in our
 		 * sched_group.
 		 *
 		 * This changes load balance semantics a bit on who can move
 		 * load to a given_cpu. In addition to the given_cpu itself
 		 * (or a ilb_cpu acting on its behalf where given_cpu is
 		 * nohz-idle), we now have balance_cpu in a position to move
 		 * load to given_cpu. In rare situations, this may cause
 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
 		 * _independently_ and at _same_ time to move some load to
 		 * given_cpu) causing excess load to be moved to given_cpu.
 		 * This however should not happen so much in practice and
 		 * moreover subsequent load balance cycles should correct the
 		 * excess load moved.
 		 */
 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 			/* Prevent to re-select dst_cpu via env's cpus */
 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
 			env.dst_cpu	 = env.new_dst_cpu;
 			env.flags	&= ~LBF_DST_PINNED;
 			env.loop	 = 0;
 			env.loop_break	 = sched_nr_migrate_break;
 			/*
 			 * Go back to "more_balance" rather than "redo" since we
 			 * need to continue with same src_cpu.
 			 */
 			goto more_balance;
 		}
 		/*
 		 * We failed to reach balance because of affinity.
 		 * Record that in the parent domain's first group, or clear a
 		 * previously recorded imbalance when it no longer applies.
 		 */
 		if (sd_parent) {
 			int *group_imbalance = &sd_parent->groups->sgp->imbalance;
 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
 				*group_imbalance = 1;
 			} else if (*group_imbalance)
 				*group_imbalance = 0;
 		}
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			/* Drop this source cpu and retry with what is left. */
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
 			if (!cpumask_empty(cpus)) {
 				env.loop = 0;
 				env.loop_break = sched_nr_migrate_break;
 				goto redo;
 			}
 			goto out_balanced;
 		}
 	}
 	if (!ld_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
 		/*
 		 * Increment the failure counter only on periodic balance.
 		 * We do not want newidle balance, which can be very
 		 * frequent, pollute the failure counter causing
 		 * excessive cache_hot migrations and active balances.
 		 */
 		if (idle != CPU_NEWLY_IDLE)
 			sd->nr_balance_failed++;
 		if (need_active_balance(&env)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 			/* don't kick the active_load_balance_cpu_stop,
 			 * if the curr task on busiest cpu can't be
 			 * moved to this_cpu
 			 */
 			if (!cpumask_test_cpu(this_cpu,
 					tsk_cpus_allowed(busiest->curr))) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
 				env.flags |= LBF_ALL_PINNED;
 				goto out_one_pinned;
 			}
 			/*
 			 * ->active_balance synchronizes accesses to
 			 * ->active_balance_work.  Once set, it's cleared
 			 * only after active load balance is finished.
 			 */
 			if (!busiest->active_balance) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
 			}
 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
 			/* Queue the stopper work only after dropping the lock. */
 			if (active_balance) {
 				stop_one_cpu_nowait(cpu_of(busiest),
 					active_load_balance_cpu_stop, busiest,
 					&busiest->active_balance_work);
 			}
 			/*
 			 * We've kicked active balancing, reset the failure
 			 * counter.
 			 */
 			sd->nr_balance_failed = sd->cache_nice_tries+1;
 		}
 	} else
 		sd->nr_balance_failed = 0;
 	if (likely(!active_balance)) {
 		/* We were unbalanced, so reset the balancing interval */
 		sd->balance_interval = sd->min_interval;
 	} else {
 		/*
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
 		 * move_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;
 	}
 	goto out;
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
 	sd->nr_balance_failed = 0;
 out_one_pinned:
 	/*
 	 * tune up the balancing interval: double it while it is below
 	 * max_interval, or - when everything was pinned - while it is below
 	 * MAX_PINNED_INTERVAL.
 	 */
 	if (((env.flags & LBF_ALL_PINNED) &&
 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 	ld_moved = 0;
 out:
 	return ld_moved;
 }
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  *
  * Entered with this_rq->lock held; the lock is dropped while the domains
  * are walked and re-taken before returning.
  *
  * Returns the result of the last load_balance() call, 1 if nothing was
  * pulled but CFS tasks showed up on this_rq in the meantime, -1 if a task
  * of a higher priority class is runnable, and 0 otherwise.
  */
 static int idle_balance(struct rq *this_rq)
 {
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 	u64 curr_cost = 0;
 	int this_cpu = this_rq->cpu;
 	idle_enter_fair(this_rq);
 	/*
 	 * idle_stamp is set before the balancing work below, such that the
 	 * duration of idle_balance() is measured as idle time.
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 	/* Expected idle period is too short to be worth balancing. */
 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 		goto out;
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
 	 */
 	raw_spin_unlock(&this_rq->lock);
 	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int continue_balancing = 1;
 		u64 t0, domain_cost;
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 		/*
 		 * Stop once the cost spent so far plus this domain's worst
 		 * observed newidle cost exceeds the expected idle time.
 		 */
 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
 			break;
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			t0 = sched_clock_cpu(this_cpu);
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
 						   &continue_balancing);
 			/* Track the per-domain maximum and the running total. */
 			domain_cost = sched_clock_cpu(this_cpu) - t0;
 			if (domain_cost > sd->max_newidle_lb_cost)
 				sd->max_newidle_lb_cost = domain_cost;
 			curr_cost += domain_cost;
 		}
 		/* Keep the earliest next-balance deadline of the domains seen. */
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
 		if (pulled_task)
 			break;
 	}
 	rcu_read_unlock();
 	raw_spin_lock(&this_rq->lock);
 	if (curr_cost > this_rq->max_idle_balance_cost)
 		this_rq->max_idle_balance_cost = curr_cost;
 	/*
 	 * While browsing the domains, we released the rq lock, a task could
 	 * have been enqueued in the meantime. Since we're not going idle,
 	 * pretend we pulled a task.
 	 */
 	if (this_rq->cfs.h_nr_running && !pulled_task)
 		pulled_task = 1;
 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
 		/*
 		 * We are going idle. next_balance may be set based on
 		 * a busy processor. So reset next_balance.
 		 */
 		this_rq->next_balance = next_balance;
 	}
 out:
 	/* Is there a task of a high priority class? */
 	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
 	    ((this_rq->stop && this_rq->stop->on_rq) ||
 	     this_rq->dl.dl_nr_running ||
 	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
 		pulled_task = -1;
 	/* Not going idle after all: undo the idle-entry bookkeeping. */
 	if (pulled_task) {
 		idle_exit_fair(this_rq);
 		this_rq->idle_stamp = 0;
 	}
 	return pulled_task;
 }
 /*
  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
  * running tasks off the busiest CPU onto idle CPUs. It requires at
  * least 1 task to be running on each physical CPU where possible, and
  * avoids physical / logical imbalances.
  *
  * @data: the busiest runqueue (struct rq *), as handed to
  *        stop_one_cpu_nowait() by load_balance(). The target cpu is taken
  *        from its ->push_cpu field.
  *
  * Always returns 0 and always clears ->active_balance on the way out.
  */
 static int active_load_balance_cpu_stop(void *data)
 {
 	struct rq *busiest_rq = data;
 	int busiest_cpu = cpu_of(busiest_rq);
 	int target_cpu = busiest_rq->push_cpu;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
 	raw_spin_lock_irq(&busiest_rq->lock);
 	/* make sure the requested cpu hasn't gone down in the meantime */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
 		     !busiest_rq->active_balance))
 		goto out_unlock;
 	/* Is there any task to move? */
 	if (busiest_rq->nr_running <= 1)
 		goto out_unlock;
 	/*
 	 * This condition is "impossible", if it occurs
 	 * we need to fix it. Originally reported by
 	 * Bjorn Helgaas on a 128-cpu setup.
 	 */
 	BUG_ON(busiest_rq == target_rq);
 	/* move a task from busiest_rq to target_rq */
 	double_lock_balance(busiest_rq, target_rq);
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
 				break;
 	}
 	/* sd is NULL here when no such domain was found. */
 	if (likely(sd)) {
 		struct lb_env env = {
 			.sd		= sd,
 			.dst_cpu	= target_cpu,
 			.dst_rq		= target_rq,
 			.src_cpu	= busiest_rq->cpu,
 			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
 		};
 		schedstat_inc(sd, alb_count);
 		if (move_one_task(&env))
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
 	}
 	rcu_read_unlock();
 	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	/* Allow load_balance() to kick a new active balance on this rq. */
 	busiest_rq->active_balance = 0;
 	raw_spin_unlock_irq(&busiest_rq->lock);
 	return 0;
 }
 /* Returns 1 when @rq has no sched_domain attached, 0 otherwise. */
 static inline int on_null_domain(struct rq *rq)
 {
 	if (unlikely(!rcu_dereference_sched(rq->sd)))
 		return 1;
 	return 0;
 }
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * idle load balancing details
  * - When one of the busy CPUs notices that there may be an idle rebalancing
  *   needed, it will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
 static struct {
 	/* CPUs recorded by nohz_balance_enter_idle() as tick-stopped idle */
 	cpumask_var_t idle_cpus_mask;
 	/* Number of CPUs in idle_cpus_mask; updated together with the mask */
 	atomic_t nr_cpus;
 	unsigned long next_balance;     /* in jiffy units */
 } nohz ____cacheline_aligned;
 /*
  * Pick the cpu to act as idle load balancer: the first cpu in
  * nohz.idle_cpus_mask, provided it is actually idle. Returns nr_cpu_ids
  * when there is no suitable cpu.
  */
 static inline int find_new_ilb(void)
 {
 	int candidate = cpumask_first(nohz.idle_cpus_mask);
 	if (candidate >= nr_cpu_ids || !idle_cpu(candidate))
 		return nr_cpu_ids;
 	return candidate;
 }
 /*
  * Kick a CPU to do the nohz balancing, if it is time for it. The CPU is
  * the one chosen by find_new_ilb(); if there is none, or if that CPU has
  * already been kicked, nothing is sent.
  */
 static void nohz_balancer_kick(void)
 {
 	int ilb_cpu;
 	nohz.next_balance++;
 	ilb_cpu = find_new_ilb();
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
 	 * This way we generate a sched IPI on the target cpu which
 	 * is idle. And the softirq performing nohz idle load balance
 	 * will be run before returning from the IPI.
 	 */
 	if (ilb_cpu < nr_cpu_ids &&
 	    !test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
 		smp_send_reschedule(ilb_cpu);
 }
 /*
  * Undo nohz_balance_enter_idle() for @cpu: take it out of the nohz idle
  * mask (if it is in there) and clear its NOHZ_TICK_STOPPED flag.
  */
 static inline void nohz_balance_exit_idle(int cpu)
 {
 	if (likely(!test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))))
 		return;
 	/*
 	 * Completely isolated CPUs never get set in the mask, so we must
 	 * test before clearing and decrementing.
 	 */
 	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
 		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 		atomic_dec(&nohz.nr_cpus);
 	}
 	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 /*
  * Mark the current cpu's sd_busy domain as no longer nohz-idle and count
  * the cpu as busy in that domain's first group. Does nothing when there is
  * no such domain or it is already marked busy.
  */
 static inline void set_cpu_sd_state_busy(void)
 {
 	int cpu = smp_processor_id();
 	struct sched_domain *sd;
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 	if (sd && sd->nohz_idle) {
 		sd->nohz_idle = 0;
 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 	}
 	rcu_read_unlock();
 }
 /*
  * Mark the current cpu's sd_busy domain as nohz-idle and drop the cpu from
  * the busy count of that domain's first group. Does nothing when there is
  * no such domain or it is already marked idle.
  */
 void set_cpu_sd_state_idle(void)
 {
 	int cpu = smp_processor_id();
 	struct sched_domain *sd;
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 	if (sd && !sd->nohz_idle) {
 		sd->nohz_idle = 1;
 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 	}
 	rcu_read_unlock();
 }
 /*
  * This routine will record that the cpu is going idle with tick stopped.
  * This info will be used in performing idle load balancing in the future.
  */
 void nohz_balance_enter_idle(int cpu)
 {
 	/*
 	 * Nothing needs to be done when this cpu is going down, when it is
 	 * already recorded as tick-stopped, or when it is a completely
 	 * isolated CPU (those don't play).
 	 */
 	if (!cpu_active(cpu) ||
 	    test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)) ||
 	    on_null_domain(cpu_rq(cpu)))
 		return;
 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 	atomic_inc(&nohz.nr_cpus);
 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 /*
  * CPU notifier callback: on CPU_DYING (with or without CPU_TASKS_FROZEN)
  * take the current cpu out of the nohz idle state. All other actions are
  * ignored.
  */
 static int sched_ilb_notifier(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	if ((action & ~CPU_TASKS_FROZEN) != CPU_DYING)
 		return NOTIFY_DONE;
 	nohz_balance_exit_idle(smp_processor_id());
 	return NOTIFY_OK;
 }
 #endif
 /*
  * Serializes load balancing of SD_SERIALIZE domains: rebalance_domains()
  * takes it with spin_trylock() and skips the domain when it is contended.
  */
 static DEFINE_SPINLOCK(balancing);
 /*
  * Scale the max load_balance interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
  *
  * The result, HZ/10 jiffies per online cpu, is stored in
  * max_load_balance_interval, which rebalance_domains() uses as the upper
  * clamp for each domain's balance interval.
  */
 void update_max_interval(void)
 {
 	max_load_balance_interval = HZ*num_online_cpus()/10;
 }
 /*
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
  * Balancing parameters are set up in init_sched_domains.
  *
  * @rq: runqueue whose domains are walked (from rq->cpu upwards).
  * @idle: idle state of that cpu; may be re-evaluated after a successful
  *        load_balance().
  *
  * Also decays each domain's max_newidle_lb_cost and updates
  * rq->next_balance when at least one domain was considered.
  */
 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
 	int continue_balancing = 1;
 	int cpu = rq->cpu;
 	unsigned long interval;
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
 	int need_serialize, need_decay = 0;
 	u64 max_cost = 0;
 	update_blocked_averages(cpu);
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		/*
 		 * Decay the newidle max times here because this is a regular
 		 * visit to all the domains. Decay ~1% per second.
 		 */
 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
 			sd->max_newidle_lb_cost =
 				(sd->max_newidle_lb_cost * 253) / 256;
 			sd->next_decay_max_lb_cost = jiffies + HZ;
 			need_decay = 1;
 		}
 		max_cost += sd->max_newidle_lb_cost;
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 		/*
 		 * Stop the load balance at this level. There is another
 		 * CPU in our sched group which is doing load balancing more
 		 * actively.
 		 *
 		 * If a decay is in progress, keep walking so the remaining
 		 * domains still get decayed and summed into max_cost.
 		 */
 		if (!continue_balancing) {
 			if (need_decay)
 				continue;
 			break;
 		}
 		interval = sd->balance_interval;
 		if (idle != CPU_IDLE)
 			interval *= sd->busy_factor;
 		/* scale ms to jiffies */
 		interval = msecs_to_jiffies(interval);
 		interval = clamp(interval, 1UL, max_load_balance_interval);
 		need_serialize = sd->flags & SD_SERIALIZE;
 		if (need_serialize) {
 			/*
 			 * Someone else holds the lock: skip balancing this
 			 * domain, but still account for its deadline below.
 			 */
 			if (!spin_trylock(&balancing))
 				goto out;
 		}
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
 				 * The LBF_DST_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
 				 * state even if we migrated tasks. Update it.
 				 */
 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
 			}
 			sd->last_balance = jiffies;
 		}
 		if (need_serialize)
 			spin_unlock(&balancing);
 out:
 		/* Track the earliest deadline among the domains considered. */
 		if (time_after(next_balance, sd->last_balance + interval)) {
 			next_balance = sd->last_balance + interval;
 			update_next_balance = 1;
 		}
 	}
 	if (need_decay) {
 		/*
 		 * Ensure the rq-wide value also decays but keep it at a
 		 * reasonable floor to avoid funnies with rq->avg_idle.
 		 */
 		rq->max_idle_balance_cost =
 			max((u64)sysctl_sched_migration_cost, max_cost);
 	}
 	rcu_read_unlock();
 	/*
 	 * next_balance will be updated only when there is a need.
 	 * When the cpu is attached to null domain for ex, it will not be
 	 * updated.
 	 */
 	if (likely(update_next_balance))
 		rq->next_balance = next_balance;
 }
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the CONFIG_NO_HZ_COMMON case the cpu that received the idle balance
  * kick rebalances on behalf of every cpu in nohz.idle_cpus_mask.
  *
  * @this_rq: runqueue of the cpu running the balance.
  * @idle:    idle state of that cpu; work is only done for CPU_IDLE.
  *
  * The NOHZ_BALANCE_KICK flag of this cpu is cleared on every path.
  */
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
 	int this_cpu = this_rq->cpu;
 	int balance_cpu;

 	if (idle == CPU_IDLE &&
 	    test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) {
 		for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 			struct rq *idle_rq;

 			/* Skip ourselves and cpus that are no longer idle. */
 			if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 				continue;

 			/*
 			 * Once this cpu has work of its own, stop balancing
 			 * for the others; the next load balancing owner
 			 * will pick it up.
 			 */
 			if (need_resched())
 				break;

 			idle_rq = cpu_rq(balance_cpu);

 			/* Refresh clock and idle cpu load under the rq lock. */
 			raw_spin_lock_irq(&idle_rq->lock);
 			update_rq_clock(idle_rq);
 			update_idle_cpu_load(idle_rq);
 			raw_spin_unlock_irq(&idle_rq->lock);

 			rebalance_domains(idle_rq, CPU_IDLE);

 			/* Track the earliest next_balance seen so far. */
 			if (time_after(this_rq->next_balance,
 				       idle_rq->next_balance))
 				this_rq->next_balance = idle_rq->next_balance;
 		}
 		nohz.next_balance = this_rq->next_balance;
 	}

 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 }
 /*
  * Current heuristic for kicking the idle load balancer in the presence
  * of an idle cpu is the system.
  *   - This rq has more than one task.
  *   - At any scheduler domain level, this cpu's scheduler group has multiple
  *     busy cpu's exceeding the group's power.
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
 static inline int nohz_kick_needed(struct rq *rq)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
 	struct sched_group_power *sgp;
 	int nr_busy, cpu = rq->cpu;
 	if (unlikely(rq->idle_balance))
 		return 0;
        /*
 	* We may be recently in ticked or tickless idle mode. At the first
 	* busy tick after returning from idle, we will update the busy stats.
 	*/
 	set_cpu_sd_state_busy();
 	nohz_balance_exit_idle(cpu);
 	/*
 	 * None are in tickless mode and hence no need for NOHZ idle load
 	 * balancing.
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return 0;
 	if (time_before(now, nohz.next_balance))
 		return 0;
 	if (rq->nr_running >= 2)
 		goto need_kick;
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 	if (sd) {
 		sgp = sd->groups->sgp;
 		nr_busy = atomic_read(&sgp->nr_busy_cpus);
 		if (nr_busy > 1)
 			goto need_kick_unlock;
 	}
 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
 				  sched_domain_span(sd)) < cpu))
 		goto need_kick_unlock;
 	rcu_read_unlock();
 	return 0;
 need_kick_unlock:
 	rcu_read_unlock();
 need_kick:
 	return 1;
 }
 #else
 /* No-op variant used when CONFIG_NO_HZ_COMMON is not set. */
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
 #endif
 /*
  * SCHED_SOFTIRQ handler (registered in init_sched_fair_class()).
  * Triggered when needed from the scheduler tick, and also for nohz idle
  * balancing (with nohz_balancing_kick set).
  */
 static void run_rebalance_domains(struct softirq_action *h)
 {
 	struct rq *this_rq = this_rq();
 	enum cpu_idle_type idle;

 	if (this_rq->idle_balance)
 		idle = CPU_IDLE;
 	else
 		idle = CPU_NOT_IDLE;

 	/* Balance this cpu's own domains first. */
 	rebalance_domains(this_rq, idle);

 	/*
 	 * If this cpu has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle cpus whose ticks are
 	 * stopped.
 	 */
 	nohz_idle_balance(this_rq, idle);
 }
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  *
  * @rq: runqueue to check.
  *
  * Nothing is done while @rq is attached to the NULL domain.  Otherwise the
  * softirq is raised once jiffies has reached rq->next_balance and, with
  * CONFIG_NO_HZ_COMMON, nohz_balancer_kick() is called whenever
  * nohz_kick_needed() returns non-zero.
  */
 void trigger_load_balance(struct rq *rq)
 {
 	/* Don't need to rebalance while attached to NULL domain */
 	if (unlikely(on_null_domain(rq)))
 		return;
 	if (time_after_eq(jiffies, rq->next_balance))
 		raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
 	if (nohz_kick_needed(rq))
 		nohz_balancer_kick();
 #endif
 }
 /* rq_online hook of the fair class: calls update_sysctl(). */
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
 }
 /*
  * rq_offline hook of the fair class: calls update_sysctl() and then
  * unthrottles the cfs_rqs of the runqueue going offline.
  */
 static void rq_offline_fair(struct rq *rq)
 {
 	update_sysctl();
 	/* Ensure any throttled groups are reachable by pick_next_task */
 	unthrottle_offline_cfs_rqs(rq);
 }
 #endif /* CONFIG_SMP */
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
 	if (numabalancing_enabled)
 		task_tick_numa(rq, curr);
 	update_rq_runnable_avg(rq, 1);
 }
 /*
  * called on fork with the child task as argument from the parent's context
  *  - child not yet on the tasklist
  *  - preemption disabled
  *
  * @p: the newly forked child.
  *
  * Takes and releases this cpu's rq->lock (irqsave/irqrestore) around the
  * whole update.
  */
 static void task_fork_fair(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se, *curr;
 	int this_cpu = smp_processor_id();
 	struct rq *rq = this_rq();
 	unsigned long flags;
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	update_rq_clock(rq);
 	/* Work on the cfs_rq of the forking task ('current'). */
 	cfs_rq = task_cfs_rq(current);
 	curr = cfs_rq->curr;
 	/*
 	 * Not only the cpu but also the task_group of the parent might have
 	 * been changed after parent->se.parent,cfs_rq were copied to
 	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
 	 * of child point to valid ones.
 	 */
 	rcu_read_lock();
 	__set_task_cpu(p, this_cpu);
 	rcu_read_unlock();
 	update_curr(cfs_rq);
 	/* Seed the child with the vruntime of the entity running on cfs_rq. */
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
 		resched_task(rq->curr);
 	}
 	/* Leave the child's vruntime as an offset from cfs_rq->min_vruntime. */
 	se->vruntime -= cfs_rq->min_vruntime;
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
  * Priority of the task has changed. Check to see if we preempt
  * the current task.
  *
  * @rq:      runqueue of the task.
  * @p:       task whose priority changed.
  * @oldprio: priority value before the change.
  */
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
 	/* Nothing to do for a task that is not queued. */
 	if (!p->se.on_rq)
 		return;

 	/*
 	 * Not currently running on this runqueue: let check_preempt_curr()
 	 * decide whether the task now preempts the current one.
 	 */
 	if (rq->curr != p) {
 		check_preempt_curr(rq, p, 0);
 		return;
 	}

 	/* Running here and our priority decreased: reschedule. */
 	if (p->prio > oldprio)
 		resched_task(rq->curr);
 }
 /*
  * switched_from hook of the fair class.
  *
  * @rq: runqueue of the task.
  * @p:  task that is leaving the fair class.
  */
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	/*
 	 * Ensure the task's vruntime is normalized, so that when it's
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
 	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
 	 * have normalized the vruntime, if it's !on_rq, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
 	if (!p->on_rq && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
 		 */
 		place_entity(cfs_rq, se, 0);
 		se->vruntime -= cfs_rq->min_vruntime;
 	}
 #ifdef CONFIG_SMP
 	/*
 	 * Remove our load from contribution when we leave sched_fair
 	 * and ensure we don't carry in an old decay_count if we
 	 * switch back.
 	 */
 	if (se->avg.decay_count) {
 		__synchronize_entity_decay(se);
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 	}
 #endif
 }
 /*
  * We switched to the sched_fair class.
  *
  * @rq: runqueue of the task.
  * @p:  task that has just become a fair-class task.
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * Since the real-depth could have been changed (only FAIR
 	 * class maintain depth value), reset depth properly.
 	 */
 	if (se->parent)
 		se->depth = se->parent->depth + 1;
 	else
 		se->depth = 0;
 #endif
 	if (!se->on_rq)
 		return;

 	/*
 	 * We were most likely switched from sched_rt: if the task is not
 	 * the one running, just see if it can preempt the current task ...
 	 */
 	if (rq->curr != p) {
 		check_preempt_curr(rq, p, 0);
 		return;
 	}

 	/* ... otherwise kick off the schedule. */
 	resched_task(rq->curr);
 }
 /*
  * Account for a task changing its policy or group.
  *
  * This routine is mostly called to set cfs_rq->curr field when a task
  * migrates between groups/classes.
  */
 static void set_curr_task_fair(struct rq *rq)
 {
 	struct sched_entity *se = &rq->curr->se;
 	struct cfs_rq *cfs_rq;

 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);

 		set_next_entity(cfs_rq, se);

 		/* ensure bandwidth has been allocated on our new cfs_rq */
 		account_cfs_rq_runtime(cfs_rq, 0);
 	}
 }
 /*
  * Initialise a cfs_rq: empty rbtree timeline, starting min_vruntime and,
  * on SMP, the decay counter and the removed-load accumulator.
  */
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
 	/*
 	 * Start 2^20 below the point where the u64 wraps.
 	 * NOTE(review): presumably chosen so that wraparound handling is
 	 * exercised early - confirm.
 	 */
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	/* Builds without CONFIG_64BIT keep a second copy of min_vruntime. */
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
 	atomic64_set(&cfs_rq->decay_counter, 1);
 	atomic_long_set(&cfs_rq->removed_load, 0);
 #endif
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * task_move_group hook of the fair class (CONFIG_FAIR_GROUP_SCHED only).
  *
  * @p:     task being moved to another task group.
  * @on_rq: non-zero if the task was on the runqueue at the time of the move.
  *
  * Re-points the task at its new group's runqueue via set_task_rq(),
  * recomputes the entity depth and, for a task that was not queued,
  * re-bases its vruntime from the old cfs_rq's min_vruntime to the new one.
  */
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
 	 * absolute on their old rq until wakeup (needed for the fair sleeper
 	 * bonus in place_entity()).
 	 *
 	 * If it was on the rq, we've just 'preempted' it, which does convert
 	 * ->vruntime to a relative base.
 	 *
 	 * Make sure both cases convert their relative position when migrating
 	 * to another cgroup's rq. This does somewhat interfere with the
 	 * fair sleeper stuff for the first placement, but who cares.
 	 */
 	/*
 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
 	 * But there are some cases where it has already been normalized:
 	 *
 	 * - Moving a forked child which is waiting for being woken up by
 	 *   wake_up_new_task().
 	 * - Moving a task which has been woken up by try_to_wake_up() and
 	 *   waiting for actually being woken up by sched_ttwu_pending().
 	 *
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
 	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 	/* Still on the old cfs_rq here: make vruntime relative to it. */
 	if (!on_rq)
 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 	if (!on_rq) {
 		/* cfs_rq_of() now yields the new group's cfs_rq: re-base on it. */
 		cfs_rq = cfs_rq_of(se);
 		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP
 		/*
 		 * migrate_task_rq_fair() will have removed our previous
 		 * contribution, but we must synchronize for ongoing future
 		 * decay.
 		 */
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 #endif
 	}
 }
 /*
  * Release the fair-class resources of a task group: its cfs bandwidth
  * state, every per-cpu cfs_rq and sched_entity, and the two pointer
  * arrays themselves.  Either array may be NULL.
  */
 void free_fair_sched_group(struct task_group *tg)
 {
 	struct cfs_rq **cfs_rqs;
 	struct sched_entity **entities;
 	int cpu;

 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

 	cfs_rqs = tg->cfs_rq;
 	entities = tg->se;

 	for_each_possible_cpu(cpu) {
 		if (cfs_rqs)
 			kfree(cfs_rqs[cpu]);
 		if (entities)
 			kfree(entities[cpu]);
 	}

 	kfree(cfs_rqs);
 	kfree(entities);
 }
 /*
  * Allocate and initialise the fair-class state of a new task group.
  *
  * @tg:     task group being set up.
  * @parent: parent task group; parent->se[cpu] becomes the parent entity
  *          of each new per-cpu entity.
  *
  * Allocates the two per-cpu pointer arrays, then one cfs_rq and one
  * sched_entity per possible cpu (on that cpu's memory node), and wires
  * them up with init_tg_cfs_entry().
  *
  * Returns 1 on success and 0 on allocation failure.  On failure only the
  * cfs_rq not yet stored in @tg is freed here.
  * NOTE(review): presumably the caller releases everything already stored
  * in @tg with free_fair_sched_group() - confirm against the caller.
  */
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
 	int i;

 	/*
 	 * kcalloc() zeroes the array like kzalloc() did, and additionally
 	 * fails cleanly if count * size would overflow.
 	 */
 	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
 	if (!tg->cfs_rq)
 		goto err;
 	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
 	if (!tg->se)
 		goto err;

 	tg->shares = NICE_0_LOAD;

 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
 			goto err;

 		se = kzalloc_node(sizeof(struct sched_entity),
 				  GFP_KERNEL, cpu_to_node(i));
 		if (!se)
 			goto err_free_rq;

 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 	}

 	return 1;

 err_free_rq:
 	/* This cfs_rq was never stored in tg->cfs_rq[], so free it here. */
 	kfree(cfs_rq);
 err:
 	return 0;
 }
 void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 	/*
 	* Only empty task groups can be destroyed; so we can speculatively
 	* check on_list without danger of it being re-added.
 	*/
 	if (!tg->cfs_rq[cpu]->on_list)
 		return;
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
  * Wire one per-cpu cfs_rq / sched_entity pair into a task group.
  *
  * @tg:     owning task group.
  * @cfs_rq: the group's cfs_rq for @cpu.
  * @se:     the group's entity for @cpu; may be NULL (root_task_group).
  * @cpu:    cpu the pair belongs to.
  * @parent: parent entity, or NULL to queue @se directly on rq->cfs.
  */
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 			struct sched_entity *se, int cpu,
 			struct sched_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);

 	cfs_rq->tg = tg;
 	cfs_rq->rq = rq;
 	init_cfs_rq_runtime(cfs_rq);

 	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;

 	/* se could be NULL for root_task_group */
 	if (se) {
 		/* Queue on the parent's group rq, or on the cpu's root cfs_rq. */
 		se->cfs_rq = parent ? parent->my_q : &rq->cfs;
 		se->depth = parent ? parent->depth + 1 : 0;

 		se->my_q = cfs_rq;
 		/* guarantee group entities always have weight */
 		update_load_set(&se->load, NICE_0_LOAD);
 		se->parent = parent;
 	}
 }
 /* Held across the tg->shares update in sched_group_set_shares(). */
 static DEFINE_MUTEX(shares_mutex);

 /*
  * Set the shares of a task group.
  *
  * @tg:     task group to update.
  * @shares: requested value; clamped to
  *          [scale_load(MIN_SHARES), scale_load(MAX_SHARES)].
  *
  * Returns 0 on success, or -EINVAL if tg->se[0] is NULL (the root cgroup,
  * whose weight cannot be changed).
  */
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int cpu;

 	/*
 	 * We can't change the weight of the root cgroup.
 	 */
 	if (!tg->se[0])
 		return -EINVAL;

 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

 	mutex_lock(&shares_mutex);
 	if (tg->shares != shares) {
 		tg->shares = shares;

 		for_each_possible_cpu(cpu) {
 			struct rq *rq = cpu_rq(cpu);
 			struct sched_entity *se = tg->se[cpu];
 			unsigned long flags;

 			/* Propagate contribution to hierarchy */
 			raw_spin_lock_irqsave(&rq->lock, flags);

 			/* Possible calls to update_curr() need rq clock */
 			update_rq_clock(rq);
 			for_each_sched_entity(se)
 				update_cfs_shares(group_cfs_rq(se));

 			raw_spin_unlock_irqrestore(&rq->lock, flags);
 		}
 	}
 	mutex_unlock(&shares_mutex);

 	return 0;
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
 /* Stubs used when CONFIG_FAIR_GROUP_SCHED is not set. */
 void free_fair_sched_group(struct task_group *tg) { }
 /* Nothing to allocate; always reports success (1). */
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	return 1;
 }
 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 /*
  * get_rr_interval hook of the fair class.
  *
  * Returns the task's sched_slice() converted with NS_TO_JIFFIES(), or 0
  * when rq->cfs carries no load weight.
  */
 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
 	struct sched_entity *se = &task->se;

 	/*
 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 	 * idle runqueue:
 	 */
 	if (!rq->cfs.load.weight)
 		return 0;

 	return NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
 }
 /*
  * All the scheduling class methods:
  *
  * Method table of the fair class.  The hooks between the CONFIG_SMP guards
  * exist only on SMP builds, and task_move_group only with
  * CONFIG_FAIR_GROUP_SCHED.
  */
 const struct sched_class fair_sched_class = {
 	/* The class linked after this one is the idle class. */
 	.next			= &idle_sched_class,
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 	.check_preempt_curr	= check_preempt_wakeup,
 	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 #ifdef CONFIG_SMP
 	/* SMP-only hooks. */
 	.select_task_rq		= select_task_rq_fair,
 	.migrate_task_rq	= migrate_task_rq_fair,
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 	.task_waking		= task_waking_fair,
 #endif
 	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
 	.task_fork		= task_fork_fair,
 	.prio_changed		= prio_changed_fair,
 	.switched_from		= switched_from_fair,
 	.switched_to		= switched_to_fair,
 	.get_rr_interval	= get_rr_interval_fair,
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* Group-scheduling-only hook. */
 	.task_move_group	= task_move_group_fair,
 #endif
 };
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
 	struct cfs_rq *cfs_rq;
 	rcu_read_lock();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
 #endif
 /*
  * Init-time setup of the fair class.
  *
  * On SMP, registers run_rebalance_domains() as the SCHED_SOFTIRQ handler.
  * With CONFIG_NO_HZ_COMMON it also sets nohz.next_balance to the current
  * jiffies, allocates a zeroed nohz.idle_cpus_mask and registers
  * sched_ilb_notifier as a cpu notifier.
  *
  * NOTE(review): the result of zalloc_cpumask_var() is not checked here.
  */
 __init void init_sched_fair_class(void)
 {
 #ifdef CONFIG_SMP
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 	cpu_notifier(sched_ilb_notifier, 0);
 #endif
 #endif /* SMP */
 }