Eric Lee / smarc-ti-linux-kernel

1

/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *
 * Interactivity improvements by Mike Galbraith
 *
 * Various enhancements by Dmitry Adamushko.
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 */

22

23

#include <linux/latencytop.h>

23

#include <linux/latencytop.h>

24

#include <linux/sched.h>

24

#include <linux/sched.h>

25

#include <linux/cpumask.h>

25

#include <linux/cpumask.h>

26

#include <linux/cpuidle.h>

26

#include <linux/cpuidle.h>

27

#include <linux/slab.h>

27

#include <linux/slab.h>

28

#include <linux/profile.h>

28

#include <linux/profile.h>

29

#include <linux/interrupt.h>

29

#include <linux/interrupt.h>

30

#include <linux/mempolicy.h>

30

#include <linux/mempolicy.h>

31

#include <linux/migrate.h>

31

#include <linux/migrate.h>

32

#include <linux/task_work.h>

32

#include <linux/task_work.h>

33

34

#include <trace/events/sched.h>

34

#include <trace/events/sched.h>

35

36

#include "sched.h"

36

#include "sched.h"

37

38

/*

38

/*

39

* Targeted preemption latency for CPU-bound tasks:

39

* Targeted preemption latency for CPU-bound tasks:

40

* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)

40

* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)

41

*

41

*

42

* NOTE: this latency value is not the same as the concept of

42

* NOTE: this latency value is not the same as the concept of

43

* 'timeslice length' - timeslices in CFS are of variable length

43

* 'timeslice length' - timeslices in CFS are of variable length

44

* and have no persistent notion like in traditional, time-slice

44

* and have no persistent notion like in traditional, time-slice

45

* based scheduling concepts.

45

* based scheduling concepts.

46

*

46

*

47

* (to see the precise effective timeslice length of your workload,

47

* (to see the precise effective timeslice length of your workload,

48

* run vmstat and monitor the context-switches (cs) field)

48

* run vmstat and monitor the context-switches (cs) field)

49

*/

49

*/

50

unsigned int sysctl_sched_latency = 6000000ULL;

50

unsigned int sysctl_sched_latency = 6000000ULL;

51

unsigned int normalized_sysctl_sched_latency = 6000000ULL;

51

unsigned int normalized_sysctl_sched_latency = 6000000ULL;

52

53

/*

53

/*

54

* The initial- and re-scaling of tunables is configurable

54

* The initial- and re-scaling of tunables is configurable

55

* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))

55

* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))

56

*

56

*

57

* Options are:

57

* Options are:

58

* SCHED_TUNABLESCALING_NONE - unscaled, always *1

58

* SCHED_TUNABLESCALING_NONE - unscaled, always *1

59

* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)

59

* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)

60

* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus

60

* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus

61

*/

61

*/

62

enum sched_tunable_scaling sysctl_sched_tunable_scaling

62

enum sched_tunable_scaling sysctl_sched_tunable_scaling

63

= SCHED_TUNABLESCALING_LOG;

63

= SCHED_TUNABLESCALING_LOG;

64

65

/*

65

/*

66

* Minimal preemption granularity for CPU-bound tasks:

66

* Minimal preemption granularity for CPU-bound tasks:

67

* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)

67

* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)

68

*/

68

*/

69

unsigned int sysctl_sched_min_granularity = 750000ULL;

69

unsigned int sysctl_sched_min_granularity = 750000ULL;

70

unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

70

unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

71

72

/*

72

/*

73

* is kept at sysctl_sched_latency / sysctl_sched_min_granularity

73

* is kept at sysctl_sched_latency / sysctl_sched_min_granularity

74

*/

74

*/

75

static unsigned int sched_nr_latency = 8;

75

static unsigned int sched_nr_latency = 8;

76

77

/*

77

/*

78

* After fork, child runs first. If set to 0 (default) then

78

* After fork, child runs first. If set to 0 (default) then

79

* parent will (try to) run first.

79

* parent will (try to) run first.

80

*/

80

*/

81

unsigned int sysctl_sched_child_runs_first __read_mostly;

81

unsigned int sysctl_sched_child_runs_first __read_mostly;

82

83

/*

83

/*

84

* SCHED_OTHER wake-up granularity.

84

* SCHED_OTHER wake-up granularity.

85

* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)

85

* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)

86

*

86

*

87

* This option delays the preemption effects of decoupled workloads

87

* This option delays the preemption effects of decoupled workloads

88

* and reduces their over-scheduling. Synchronous workloads will still

88

* and reduces their over-scheduling. Synchronous workloads will still

89

* have immediate wakeup/sleep latencies.

89

* have immediate wakeup/sleep latencies.

90

*/

90

*/

91

unsigned int sysctl_sched_wakeup_granularity = 1000000UL;

91

unsigned int sysctl_sched_wakeup_granularity = 1000000UL;

92

unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

92

unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

93

94

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

94

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

95

96

/*

96

/*

97

* The exponential sliding window over which load is averaged for shares

97

* The exponential sliding window over which load is averaged for shares

98

* distribution.

98

* distribution.

99

* (default: 10msec)

99

* (default: 10msec)

100

*/

100

*/

101

unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

101

unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

102

103

#ifdef CONFIG_CFS_BANDWIDTH

103

#ifdef CONFIG_CFS_BANDWIDTH

104

/*

104

/*

105

* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool

105

* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool

106

* each time a cfs_rq requests quota.

106

* each time a cfs_rq requests quota.

107

*

107

*

108

* Note: in the case that the slice exceeds the runtime remaining (either due

108

* Note: in the case that the slice exceeds the runtime remaining (either due

109

* to consumption or the quota being specified to be smaller than the slice)

109

* to consumption or the quota being specified to be smaller than the slice)

110

* we will always only issue the remaining available time.

110

* we will always only issue the remaining available time.

111

*

111

*

112

* default: 5 msec, units: microseconds

112

* default: 5 msec, units: microseconds

113

*/

113

*/

114

unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;

114

unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;

115

#endif

115

#endif

116

117

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

117

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

118

{

118

{

119

lw->weight += inc;

119

lw->weight += inc;

120

lw->inv_weight = 0;

120

lw->inv_weight = 0;

121

}

121

}

122

123

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

123

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

124

{

124

{

125

lw->weight -= dec;

125

lw->weight -= dec;

126

lw->inv_weight = 0;

126

lw->inv_weight = 0;

127

}

127

}

128

129

static inline void update_load_set(struct load_weight *lw, unsigned long w)

129

static inline void update_load_set(struct load_weight *lw, unsigned long w)

130

{

130

{

131

lw->weight = w;

131

lw->weight = w;

132

lw->inv_weight = 0;

132

lw->inv_weight = 0;

133

}

133

}

134

135

/*

135

/*

136

* Increase the granularity value when there are more CPUs,

136

* Increase the granularity value when there are more CPUs,

137

* because with more CPUs the 'effective latency' as visible

137

* because with more CPUs the 'effective latency' as visible

138

* to users decreases. But the relationship is not linear,

138

* to users decreases. But the relationship is not linear,

139

* so pick a second-best guess by going with the log2 of the

139

* so pick a second-best guess by going with the log2 of the

140

* number of CPUs.

140

* number of CPUs.

141

*

141

*

142

* This idea comes from the SD scheduler of Con Kolivas:

142

* This idea comes from the SD scheduler of Con Kolivas:

143

*/

143

*/

144

static int get_update_sysctl_factor(void)

144

static int get_update_sysctl_factor(void)

145

{

145

{

146

unsigned int cpus = min_t(int, num_online_cpus(), 8);

146

unsigned int cpus = min_t(int, num_online_cpus(), 8);

147

unsigned int factor;

147

unsigned int factor;

148

149

switch (sysctl_sched_tunable_scaling) {

149

switch (sysctl_sched_tunable_scaling) {

150

case SCHED_TUNABLESCALING_NONE:

150

case SCHED_TUNABLESCALING_NONE:

151

factor = 1;

151

factor = 1;

152

break;

152

break;

153

case SCHED_TUNABLESCALING_LINEAR:

153

case SCHED_TUNABLESCALING_LINEAR:

154

factor = cpus;

154

factor = cpus;

155

break;

155

break;

156

case SCHED_TUNABLESCALING_LOG:

156

case SCHED_TUNABLESCALING_LOG:

157

default:

157

default:

158

factor = 1 + ilog2(cpus);

158

factor = 1 + ilog2(cpus);

159

break;

159

break;

160

}

160

}

161

162

return factor;

162

return factor;

163

}

163

}

164

165

static void update_sysctl(void)

165

static void update_sysctl(void)

166

{

166

{

167

unsigned int factor = get_update_sysctl_factor();

167

unsigned int factor = get_update_sysctl_factor();

168

169

#define SET_SYSCTL(name) \

169

#define SET_SYSCTL(name) \

170

(sysctl_##name = (factor) * normalized_sysctl_##name)

170

(sysctl_##name = (factor) * normalized_sysctl_##name)

171

SET_SYSCTL(sched_min_granularity);

171

SET_SYSCTL(sched_min_granularity);

172

SET_SYSCTL(sched_latency);

172

SET_SYSCTL(sched_latency);

173

SET_SYSCTL(sched_wakeup_granularity);

173

SET_SYSCTL(sched_wakeup_granularity);

174

#undef SET_SYSCTL

174

#undef SET_SYSCTL

175

}

175

}

176

177

/* Apply the scaling factor to the granularity/latency tunables. */
void sched_init_granularity(void)
{
	update_sysctl();
}

181

182

#define WMULT_CONST (~0U)

182

#define WMULT_CONST (~0U)

183

#define WMULT_SHIFT 32

183

#define WMULT_SHIFT 32

184

185

static void __update_inv_weight(struct load_weight *lw)

185

static void __update_inv_weight(struct load_weight *lw)

186

{

186

{

187

unsigned long w;

187

unsigned long w;

188

189

if (likely(lw->inv_weight))

189

if (likely(lw->inv_weight))

190

return;

190

return;

191

192

w = scale_load_down(lw->weight);

192

w = scale_load_down(lw->weight);

193

194

if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))

194

if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))

195

lw->inv_weight = 1;

195

lw->inv_weight = 1;

196

else if (unlikely(!w))

196

else if (unlikely(!w))

197

lw->inv_weight = WMULT_CONST;

197

lw->inv_weight = WMULT_CONST;

198

else

198

else

199

lw->inv_weight = WMULT_CONST / w;

199

lw->inv_weight = WMULT_CONST / w;

200

}

200

}

201

202

/*

202

/*

203

* delta_exec * weight / lw.weight

203

* delta_exec * weight / lw.weight

204

* OR

204

* OR

205

* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT

205

* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT

206

*

206

*

207

* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case

207

* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case

208

* we're guaranteed shift stays positive because inv_weight is guaranteed to

208

* we're guaranteed shift stays positive because inv_weight is guaranteed to

209

* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.

209

* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.

210

*

210

*

211

* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus

211

* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus

212

* weight/lw.weight <= 1, and therefore our shift will also be positive.

212

* weight/lw.weight <= 1, and therefore our shift will also be positive.

213

*/

213

*/

214

static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)

214

static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)

215

{

215

{

216

u64 fact = scale_load_down(weight);

216

u64 fact = scale_load_down(weight);

217

int shift = WMULT_SHIFT;

217

int shift = WMULT_SHIFT;

218

219

__update_inv_weight(lw);

219

__update_inv_weight(lw);

220

221

if (unlikely(fact >> 32)) {

221

if (unlikely(fact >> 32)) {

222

while (fact >> 32) {

222

while (fact >> 32) {

223

fact >>= 1;

223

fact >>= 1;

224

shift--;

224

shift--;

225

}

225

}

226

}

226

}

227

228

/* hint to use a 32x32->64 mul */

228

/* hint to use a 32x32->64 mul */

229

fact = (u64)(u32)fact * lw->inv_weight;

229

fact = (u64)(u32)fact * lw->inv_weight;

230

231

while (fact >> 32) {

231

while (fact >> 32) {

232

fact >>= 1;

232

fact >>= 1;

233

shift--;

233

shift--;

234

}

234

}

235

236

return mul_u64_u32_shr(delta_exec, fact, shift);

236

return mul_u64_u32_shr(delta_exec, fact, shift);

237

}

237

}

238

239

240

/* Tentative declaration; the initialized definition is not in this section. */
const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

247

248

/* cpu runqueue to which this cfs_rq is attached */

248

/* cpu runqueue to which this cfs_rq is attached */

249

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

249

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

250

{

250

{

251

return cfs_rq->rq;

251

return cfs_rq->rq;

252

}

252

}

253

254

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!(se)->my_q)

256

257

static inline struct task_struct *task_of(struct sched_entity *se)

257

static inline struct task_struct *task_of(struct sched_entity *se)

258

{

258

{

259

#ifdef CONFIG_SCHED_DEBUG

259

#ifdef CONFIG_SCHED_DEBUG

260

WARN_ON_ONCE(!entity_is_task(se));

260

WARN_ON_ONCE(!entity_is_task(se));

261

#endif

261

#endif

262

return container_of(se, struct task_struct, se);

262

return container_of(se, struct task_struct, se);

263

}

263

}

264

265

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

268

269

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

269

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

270

{

270

{

271

return p->se.cfs_rq;

271

return p->se.cfs_rq;

272

}

272

}

273

274

/* runqueue on which this entity is (to be) queued */

274

/* runqueue on which this entity is (to be) queued */

275

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

275

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

276

{

276

{

277

return se->cfs_rq;

277

return se->cfs_rq;

278

}

278

}

279

280

/* runqueue "owned" by this group */

280

/* runqueue "owned" by this group */

281

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

281

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

282

{

282

{

283

return grp->my_q;

283

return grp->my_q;

284

}

284

}

285

286

/* Forward declaration; called by list_add_leaf_cfs_rq() before the definition. */
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

288

289

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)

289

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)

290

{

290

{

291

if (!cfs_rq->on_list) {

291

if (!cfs_rq->on_list) {

292

/*

292

/*

293

* Ensure we either appear before our parent (if already

293

* Ensure we either appear before our parent (if already

294

* enqueued) or force our parent to appear after us when it is

294

* enqueued) or force our parent to appear after us when it is

295

* enqueued. The fact that we always enqueue bottom-up

295

* enqueued. The fact that we always enqueue bottom-up

296

* reduces this to two cases.

296

* reduces this to two cases.

297

*/

297

*/

298

if (cfs_rq->tg->parent &&

298

if (cfs_rq->tg->parent &&

299

cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {

299

cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {

300

list_add_rcu(&cfs_rq->leaf_cfs_rq_list,

300

list_add_rcu(&cfs_rq->leaf_cfs_rq_list,

301

&rq_of(cfs_rq)->leaf_cfs_rq_list);

301

&rq_of(cfs_rq)->leaf_cfs_rq_list);

302

} else {

302

} else {

303

list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,

303

list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,

304

&rq_of(cfs_rq)->leaf_cfs_rq_list);

304

&rq_of(cfs_rq)->leaf_cfs_rq_list);

305

}

305

}

306

307

cfs_rq->on_list = 1;

307

cfs_rq->on_list = 1;

308

/* We should have no load, but we need to update last_decay. */

308

/* We should have no load, but we need to update last_decay. */

309

update_cfs_rq_blocked_load(cfs_rq, 0);

309

update_cfs_rq_blocked_load(cfs_rq, 0);

310

}

310

}

311

}

311

}

312

313

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)

313

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)

314

{

314

{

315

if (cfs_rq->on_list) {

315

if (cfs_rq->on_list) {

316

list_del_rcu(&cfs_rq->leaf_cfs_rq_list);

316

list_del_rcu(&cfs_rq->leaf_cfs_rq_list);

317

cfs_rq->on_list = 0;

317

cfs_rq->on_list = 0;

318

}

318

}

319

}

319

}

320

321

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

324

325

/* Do the two (enqueued) entities belong to the same group ? */

325

/* Do the two (enqueued) entities belong to the same group ? */

326

static inline struct cfs_rq *

326

static inline struct cfs_rq *

327

is_same_group(struct sched_entity *se, struct sched_entity *pse)

327

is_same_group(struct sched_entity *se, struct sched_entity *pse)

328

{

328

{

329

if (se->cfs_rq == pse->cfs_rq)

329

if (se->cfs_rq == pse->cfs_rq)

330

return se->cfs_rq;

330

return se->cfs_rq;

331

332

return NULL;

332

return NULL;

333

}

333

}

334

335

static inline struct sched_entity *parent_entity(struct sched_entity *se)

335

static inline struct sched_entity *parent_entity(struct sched_entity *se)

336

{

336

{

337

return se->parent;

337

return se->parent;

338

}

338

}

339

340

static void

340

static void

341

find_matching_se(struct sched_entity **se, struct sched_entity **pse)

341

find_matching_se(struct sched_entity **se, struct sched_entity **pse)

342

{

342

{

343

int se_depth, pse_depth;

343

int se_depth, pse_depth;

344

345

/*

345

/*

346

* preemption test can be made between sibling entities who are in the

346

* preemption test can be made between sibling entities who are in the

347

* same cfs_rq i.e who have a common parent. Walk up the hierarchy of

347

* same cfs_rq i.e who have a common parent. Walk up the hierarchy of

348

* both tasks until we find their ancestors who are siblings of common

348

* both tasks until we find their ancestors who are siblings of common

349

* parent.

349

* parent.

350

*/

350

*/

351

352

/* First walk up until both entities are at same depth */

352

/* First walk up until both entities are at same depth */

353

se_depth = (*se)->depth;

353

se_depth = (*se)->depth;

354

pse_depth = (*pse)->depth;

354

pse_depth = (*pse)->depth;

355

356

while (se_depth > pse_depth) {

356

while (se_depth > pse_depth) {

357

se_depth--;

357

se_depth--;

358

*se = parent_entity(*se);

358

*se = parent_entity(*se);

359

}

359

}

360

361

while (pse_depth > se_depth) {

361

while (pse_depth > se_depth) {

362

pse_depth--;

362

pse_depth--;

363

*pse = parent_entity(*pse);

363

*pse = parent_entity(*pse);

364

}

364

}

365

366

while (!is_same_group(*se, *pse)) {

366

while (!is_same_group(*se, *pse)) {

367

*se = parent_entity(*se);

367

*se = parent_entity(*se);

368

*pse = parent_entity(*pse);

368

*pse = parent_entity(*pse);

369

}

369

}

370

}

370

}

371

372

#else	/* !CONFIG_FAIR_GROUP_SCHED */

373

374

static inline struct task_struct *task_of(struct sched_entity *se)

374

static inline struct task_struct *task_of(struct sched_entity *se)

375

{

375

{

376

return container_of(se, struct task_struct, se);

376

return container_of(se, struct task_struct, se);

377

}

377

}

378

379

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

379

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)

380

{

380

{

381

return container_of(cfs_rq, struct rq, cfs);

381

return container_of(cfs_rq, struct rq, cfs);

382

}

382

}

383

384

/* This configuration treats every entity as a task. */
#define entity_is_task(se)	1

385

386

/* No hierarchy in this configuration: the body runs at most once. */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)

388

389

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

389

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)

390

{

390

{

391

return &task_rq(p)->cfs;

391

return &task_rq(p)->cfs;

392

}

392

}

393

394

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

394

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)

395

{

395

{

396

struct task_struct *p = task_of(se);

396

struct task_struct *p = task_of(se);

397

struct rq *rq = task_rq(p);

397

struct rq *rq = task_rq(p);

398

399

return &rq->cfs;

399

return &rq->cfs;

400

}

400

}

401

402

/* runqueue "owned" by this group */

402

/* runqueue "owned" by this group */

403

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

403

static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)

404

{

404

{

405

return NULL;

405

return NULL;

406

}

406

}

407

408

/* No-op stub for the !CONFIG_FAIR_GROUP_SCHED build. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

411

412

/* No-op stub for the !CONFIG_FAIR_GROUP_SCHED build. */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

415

416

/* Single iteration over the runqueue's embedded cfs_rq. */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

418

419

static inline struct sched_entity *parent_entity(struct sched_entity *se)

419

static inline struct sched_entity *parent_entity(struct sched_entity *se)

420

{

420

{

421

return NULL;

421

return NULL;

422

}

422

}

423

424

/* No-op stub for the !CONFIG_FAIR_GROUP_SCHED build: *se and *pse are left untouched. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

428

429

#endif /* CONFIG_FAIR_GROUP_SCHED */

429

#endif /* CONFIG_FAIR_GROUP_SCHED */

430

431

static __always_inline

431

static __always_inline

432

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

432

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

433

434

/**************************************************************

434

/**************************************************************

435

* Scheduling class tree data structure manipulation methods:

435

* Scheduling class tree data structure manipulation methods:

436

*/

436

*/

437

438

/*
 * Return the later of two vruntime values. The comparison is done on the
 * signed difference, so it stays correct when the u64 counter wraps.
 */
static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);

	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

446

447

/*
 * Return the earlier of two vruntime values. The comparison is done on the
 * signed difference, so it stays correct when the u64 counter wraps.
 */
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);

	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

455

456

static inline int entity_before(struct sched_entity *a,

456

static inline int entity_before(struct sched_entity *a,

457

struct sched_entity *b)

457

struct sched_entity *b)

458

{

458

{

459

return (s64)(a->vruntime - b->vruntime) < 0;

459

return (s64)(a->vruntime - b->vruntime) < 0;

460

}

460

}

461

462

static void update_min_vruntime(struct cfs_rq *cfs_rq)

462

static void update_min_vruntime(struct cfs_rq *cfs_rq)

463

{

463

{

464

u64 vruntime = cfs_rq->min_vruntime;

464

u64 vruntime = cfs_rq->min_vruntime;

465

466

if (cfs_rq->curr)

466

if (cfs_rq->curr)

467

vruntime = cfs_rq->curr->vruntime;

467

vruntime = cfs_rq->curr->vruntime;

468

469

if (cfs_rq->rb_leftmost) {

469

if (cfs_rq->rb_leftmost) {

470

struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,

470

struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,

471

struct sched_entity,

471

struct sched_entity,

472

run_node);

472

run_node);

473

474

if (!cfs_rq->curr)

474

if (!cfs_rq->curr)

475

vruntime = se->vruntime;

475

vruntime = se->vruntime;

476

else

476

else

477

vruntime = min_vruntime(vruntime, se->vruntime);

477

vruntime = min_vruntime(vruntime, se->vruntime);

478

}

478

}

479

480

/* ensure we never gain time by being placed backwards. */

480

/* ensure we never gain time by being placed backwards. */

481

cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);

481

cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);

482

#ifndef CONFIG_64BIT

482

#ifndef CONFIG_64BIT

483

smp_wmb();

483

smp_wmb();

484

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

484

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

485

#endif

485

#endif

486

}

486

}

487

488

/*

488

/*

489

* Enqueue an entity into the rb-tree:

489

* Enqueue an entity into the rb-tree:

490

*/

490

*/

491

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

491

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

492

{

492

{

493

struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;

493

struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;

494

struct rb_node *parent = NULL;

494

struct rb_node *parent = NULL;

495

struct sched_entity *entry;

495

struct sched_entity *entry;

496

int leftmost = 1;

496

int leftmost = 1;

497

498

/*

498

/*

499

* Find the right place in the rbtree:

499

* Find the right place in the rbtree:

500

*/

500

*/

501

while (*link) {

501

while (*link) {

502

parent = *link;

502

parent = *link;

503

entry = rb_entry(parent, struct sched_entity, run_node);

503

entry = rb_entry(parent, struct sched_entity, run_node);

504

/*

504

/*

505

* We dont care about collisions. Nodes with

505

* We dont care about collisions. Nodes with

506

* the same key stay together.

506

* the same key stay together.

507

*/

507

*/

508

if (entity_before(se, entry)) {

508

if (entity_before(se, entry)) {

509

link = &parent->rb_left;

509

link = &parent->rb_left;

510

} else {

510

} else {

511

link = &parent->rb_right;

511

link = &parent->rb_right;

512

leftmost = 0;

512

leftmost = 0;

513

}

513

}

514

}

514

}

515

516

/*

516

/*

517

* Maintain a cache of leftmost tree entries (it is frequently

517

* Maintain a cache of leftmost tree entries (it is frequently

518

* used):

518

* used):

519

*/

519

*/

520

if (leftmost)

520

if (leftmost)

521

cfs_rq->rb_leftmost = &se->run_node;

521

cfs_rq->rb_leftmost = &se->run_node;

522

523

rb_link_node(&se->run_node, parent, link);

523

rb_link_node(&se->run_node, parent, link);

524

rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);

524

rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);

525

}

525

}

526

527

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

527

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

528

{

528

{

529

if (cfs_rq->rb_leftmost == &se->run_node) {

529

if (cfs_rq->rb_leftmost == &se->run_node) {

530

struct rb_node *next_node;

530

struct rb_node *next_node;

531

532

next_node = rb_next(&se->run_node);

532

next_node = rb_next(&se->run_node);

533

cfs_rq->rb_leftmost = next_node;

533

cfs_rq->rb_leftmost = next_node;

534

}

534

}

535

536

rb_erase(&se->run_node, &cfs_rq->tasks_timeline);

536

rb_erase(&se->run_node, &cfs_rq->tasks_timeline);

537

}

537

}

538

539

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)

539

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)

540

{

540

{

541

struct rb_node *left = cfs_rq->rb_leftmost;

541

struct rb_node *left = cfs_rq->rb_leftmost;

542

543

if (!left)

543

if (!left)

544

return NULL;

544

return NULL;

545

546

return rb_entry(left, struct sched_entity, run_node);

546

return rb_entry(left, struct sched_entity, run_node);

547

}

547

}

548

549

static struct sched_entity *__pick_next_entity(struct sched_entity *se)

549

static struct sched_entity *__pick_next_entity(struct sched_entity *se)

550

{

550

{

551

struct rb_node *next = rb_next(&se->run_node);

551

struct rb_node *next = rb_next(&se->run_node);

552

553

if (!next)

553

if (!next)

554

return NULL;

554

return NULL;

555

556

return rb_entry(next, struct sched_entity, run_node);

556

return rb_entry(next, struct sched_entity, run_node);

557

}

557

}

558

559

#ifdef CONFIG_SCHED_DEBUG

559

#ifdef CONFIG_SCHED_DEBUG

560

struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)

560

struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)

561

{

561

{

562

struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

562

struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

563

564

if (!last)

564

if (!last)

565

return NULL;

565

return NULL;

566

567

return rb_entry(last, struct sched_entity, run_node);

567

return rb_entry(last, struct sched_entity, run_node);

568

}

568

}

569

570

/**************************************************************

570

/**************************************************************

571

* Scheduling class statistics methods:

571

* Scheduling class statistics methods:

572

*/

572

*/

573

574

int sched_proc_update_handler(struct ctl_table *table, int write,

574

int sched_proc_update_handler(struct ctl_table *table, int write,

575

void __user *buffer, size_t *lenp,

575

void __user *buffer, size_t *lenp,

576

loff_t *ppos)

576

loff_t *ppos)

577

{

577

{

578

int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

578

int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

579

int factor = get_update_sysctl_factor();

579

int factor = get_update_sysctl_factor();

580

581

if (ret || !write)

581

if (ret || !write)

582

return ret;

582

return ret;

583

584

sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,

584

sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,

585

sysctl_sched_min_granularity);

585

sysctl_sched_min_granularity);

586

587

#define WRT_SYSCTL(name) \

587

#define WRT_SYSCTL(name) \

588

(normalized_sysctl_##name = sysctl_##name / (factor))

588

(normalized_sysctl_##name = sysctl_##name / (factor))

589

WRT_SYSCTL(sched_min_granularity);

589

WRT_SYSCTL(sched_min_granularity);

590

WRT_SYSCTL(sched_latency);

590

WRT_SYSCTL(sched_latency);

591

WRT_SYSCTL(sched_wakeup_granularity);

591

WRT_SYSCTL(sched_wakeup_granularity);

592

#undef WRT_SYSCTL

592

#undef WRT_SYSCTL

593

594

return 0;

594

return 0;

595

}

595

}

596

#endif

596

#endif

597

598

/*

598

/*

599

* delta /= w

599

* delta /= w

600

*/

600

*/

601

static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)

601

static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)

602

{

602

{

603

if (unlikely(se->load.weight != NICE_0_LOAD))

603

if (unlikely(se->load.weight != NICE_0_LOAD))

604

delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

604

delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

605

606

return delta;

606

return delta;

607

}

607

}

608

609

/*

609

/*

610

* The idea is to set a period in which each task runs once.

610

* The idea is to set a period in which each task runs once.

611

*

611

*

612

* When there are too many tasks (sched_nr_latency) we have to stretch

612

* When there are too many tasks (sched_nr_latency) we have to stretch

613

* this period because otherwise the slices get too small.

613

* this period because otherwise the slices get too small.

614

*

614

*

615

* p = (nr <= nl) ? l : l*nr/nl

615

* p = (nr <= nl) ? l : l*nr/nl

616

*/

616

*/

617

static u64 __sched_period(unsigned long nr_running)

617

static u64 __sched_period(unsigned long nr_running)

618

{

618

{

619

u64 period = sysctl_sched_latency;

619

u64 period = sysctl_sched_latency;

620

unsigned long nr_latency = sched_nr_latency;

620

unsigned long nr_latency = sched_nr_latency;

621

622

if (unlikely(nr_running > nr_latency)) {

622

if (unlikely(nr_running > nr_latency)) {

623

period = sysctl_sched_min_granularity;

623

period = sysctl_sched_min_granularity;

624

period *= nr_running;

624

period *= nr_running;

625

}

625

}

626

627

return period;

627

return period;

628

}

628

}

629

630

/*

630

/*

631

* We calculate the wall-time slice from the period by taking a part

631

* We calculate the wall-time slice from the period by taking a part

632

* proportional to the weight.

632

* proportional to the weight.

633

*

633

*

634

* s = p*P[w/rw]

634

* s = p*P[w/rw]

635

*/

635

*/

636

static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)

636

static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)

637

{

637

{

638

u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

638

u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

639

640

for_each_sched_entity(se) {

640

for_each_sched_entity(se) {

641

struct load_weight *load;

641

struct load_weight *load;

642

struct load_weight lw;

642

struct load_weight lw;

643

644

cfs_rq = cfs_rq_of(se);

644

cfs_rq = cfs_rq_of(se);

645

load = &cfs_rq->load;

645

load = &cfs_rq->load;

646

647

if (unlikely(!se->on_rq)) {

647

if (unlikely(!se->on_rq)) {

648

lw = cfs_rq->load;

648

lw = cfs_rq->load;

649

650

update_load_add(&lw, se->load.weight);

650

update_load_add(&lw, se->load.weight);

651

load = &lw;

651

load = &lw;

652

}

652

}

653

slice = __calc_delta(slice, se->load.weight, load);

653

slice = __calc_delta(slice, se->load.weight, load);

654

}

654

}

655

return slice;

655

return slice;

656

}

656

}

657

658

/*

658

/*

659

* We calculate the vruntime slice of a to-be-inserted task.

659

* We calculate the vruntime slice of a to-be-inserted task.

660

*

660

*

661

* vs = s/w

661

* vs = s/w

662

*/

662

*/

663

static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)

663

static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)

664

{

664

{

665

return calc_delta_fair(sched_slice(cfs_rq, se), se);

665

return calc_delta_fair(sched_slice(cfs_rq, se), se);

666

}

666

}

667

668

#ifdef CONFIG_SMP

668

#ifdef CONFIG_SMP

669

static int select_idle_sibling(struct task_struct *p, int cpu);

669

static int select_idle_sibling(struct task_struct *p, int cpu);

670

static unsigned long task_h_load(struct task_struct *p);

670

static unsigned long task_h_load(struct task_struct *p);

671

672

static inline void __update_task_entity_contrib(struct sched_entity *se);

672

static inline void __update_task_entity_contrib(struct sched_entity *se);

673

674

/* Give new task start runnable values to heavy its load in infant time */

674

/* Give new task start runnable values to heavy its load in infant time */

675

void init_task_runnable_average(struct task_struct *p)

675

void init_task_runnable_average(struct task_struct *p)

676

{

676

{

677

u32 slice;

677

u32 slice;

678

679

p->se.avg.decay_count = 0;

679

p->se.avg.decay_count = 0;

680

slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;

680

slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;

681

p->se.avg.runnable_avg_sum = slice;

681

p->se.avg.runnable_avg_sum = slice;

682

p->se.avg.runnable_avg_period = slice;

682

p->se.avg.runnable_avg_period = slice;

683

__update_task_entity_contrib(&p->se);

683

__update_task_entity_contrib(&p->se);

684

}

684

}

685

#else

685

#else

686

/* Stub used when CONFIG_SMP is not set: there is nothing to seed. */
void init_task_runnable_average(struct task_struct *p)
{
}

689

#endif

689

#endif

690

691

/*

691

/*

692

* Update the current task's runtime statistics.

692

* Update the current task's runtime statistics.

693

*/

693

*/

694

static void update_curr(struct cfs_rq *cfs_rq)

694

static void update_curr(struct cfs_rq *cfs_rq)

695

{

695

{

696

struct sched_entity *curr = cfs_rq->curr;

696

struct sched_entity *curr = cfs_rq->curr;

697

u64 now = rq_clock_task(rq_of(cfs_rq));

697

u64 now = rq_clock_task(rq_of(cfs_rq));

698

u64 delta_exec;

698

u64 delta_exec;

699

700

if (unlikely(!curr))

700

if (unlikely(!curr))

701

return;

701

return;

702

703

delta_exec = now - curr->exec_start;

703

delta_exec = now - curr->exec_start;

704

if (unlikely((s64)delta_exec <= 0))

704

if (unlikely((s64)delta_exec <= 0))

705

return;

705

return;

706

707

curr->exec_start = now;

707

curr->exec_start = now;

708

709

schedstat_set(curr->statistics.exec_max,

709

schedstat_set(curr->statistics.exec_max,

710

max(delta_exec, curr->statistics.exec_max));

710

max(delta_exec, curr->statistics.exec_max));

711

712

curr->sum_exec_runtime += delta_exec;

712

curr->sum_exec_runtime += delta_exec;

713

schedstat_add(cfs_rq, exec_clock, delta_exec);

713

schedstat_add(cfs_rq, exec_clock, delta_exec);

714

715

curr->vruntime += calc_delta_fair(delta_exec, curr);

715

curr->vruntime += calc_delta_fair(delta_exec, curr);

716

update_min_vruntime(cfs_rq);

716

update_min_vruntime(cfs_rq);

717

718

if (entity_is_task(curr)) {

718

if (entity_is_task(curr)) {

719

struct task_struct *curtask = task_of(curr);

719

struct task_struct *curtask = task_of(curr);

720

721

trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);

721

trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);

722

cpuacct_charge(curtask, delta_exec);

722

cpuacct_charge(curtask, delta_exec);

723

account_group_exec_runtime(curtask, delta_exec);

723

account_group_exec_runtime(curtask, delta_exec);

724

}

724

}

725

726

account_cfs_rq_runtime(cfs_rq, delta_exec);

726

account_cfs_rq_runtime(cfs_rq, delta_exec);

727

}

727

}

728

729

static void update_curr_fair(struct rq *rq)

729

static void update_curr_fair(struct rq *rq)

730

{

730

{

731

update_curr(cfs_rq_of(&rq->curr->se));

731

update_curr(cfs_rq_of(&rq->curr->se));

732

}

732

}

733

734

static inline void

734

static inline void

735

update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

735

update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

736

{

736

{

737

schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));

737

schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));

738

}

738

}

739

740

/*

740

/*

741

* Task is being enqueued - update stats:

741

* Task is being enqueued - update stats:

742

*/

742

*/

743

static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

743

static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

744

{

744

{

745

/*

745

/*

746

* Are we enqueueing a waiting task? (for current tasks

746

* Are we enqueueing a waiting task? (for current tasks

747

* a dequeue/enqueue event is a NOP)

747

* a dequeue/enqueue event is a NOP)

748

*/

748

*/

749

if (se != cfs_rq->curr)

749

if (se != cfs_rq->curr)

750

update_stats_wait_start(cfs_rq, se);

750

update_stats_wait_start(cfs_rq, se);

751

}

751

}

752

753

static void

753

static void

754

update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)

754

update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)

755

{

755

{

756

schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,

756

schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,

757

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));

757

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));

758

schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);

758

schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);

759

schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +

759

schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +

760

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

760

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

761

#ifdef CONFIG_SCHEDSTATS

761

#ifdef CONFIG_SCHEDSTATS

762

if (entity_is_task(se)) {

762

if (entity_is_task(se)) {

763

trace_sched_stat_wait(task_of(se),

763

trace_sched_stat_wait(task_of(se),

764

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

764

rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);

765

}

765

}

766

#endif

766

#endif

767

schedstat_set(se->statistics.wait_start, 0);

767

schedstat_set(se->statistics.wait_start, 0);

768

}

768

}

769

770

static inline void

770

static inline void

771

update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

771

update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

772

{

772

{

773

/*

773

/*

774

* Mark the end of the wait period if dequeueing a

774

* Mark the end of the wait period if dequeueing a

775

* waiting task:

775

* waiting task:

776

*/

776

*/

777

if (se != cfs_rq->curr)

777

if (se != cfs_rq->curr)

778

update_stats_wait_end(cfs_rq, se);

778

update_stats_wait_end(cfs_rq, se);

779

}

779

}

780

781

/*

781

/*

782

* We are picking a new current task - update its stats:

782

* We are picking a new current task - update its stats:

783

*/

783

*/

784

static inline void

784

static inline void

785

update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

785

update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)

786

{

786

{

787

/*

787

/*

788

* We are starting a new run period:

788

* We are starting a new run period:

789

*/

789

*/

790

se->exec_start = rq_clock_task(rq_of(cfs_rq));

790

se->exec_start = rq_clock_task(rq_of(cfs_rq));

791

}

791

}

792

793

/**************************************************

793

/**************************************************

794

* Scheduling class queueing methods:

794

* Scheduling class queueing methods:

795

*/

795

*/

796

797

#ifdef CONFIG_NUMA_BALANCING

797

#ifdef CONFIG_NUMA_BALANCING

798

/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

811

812

static unsigned int task_nr_scan_windows(struct task_struct *p)

812

static unsigned int task_nr_scan_windows(struct task_struct *p)

813

{

813

{

814

unsigned long rss = 0;

814

unsigned long rss = 0;

815

unsigned long nr_scan_pages;

815

unsigned long nr_scan_pages;

816

817

/*

817

/*

818

* Calculations based on RSS as non-present and empty pages are skipped

818

* Calculations based on RSS as non-present and empty pages are skipped

819

* by the PTE scanner and NUMA hinting faults should be trapped based

819

* by the PTE scanner and NUMA hinting faults should be trapped based

820

* on resident pages

820

* on resident pages

821

*/

821

*/

822

nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);

822

nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);

823

rss = get_mm_rss(p->mm);

823

rss = get_mm_rss(p->mm);

824

if (!rss)

824

if (!rss)

825

rss = nr_scan_pages;

825

rss = nr_scan_pages;

826

827

rss = round_up(rss, nr_scan_pages);

827

rss = round_up(rss, nr_scan_pages);

828

return rss / nr_scan_pages;

828

return rss / nr_scan_pages;

829

}

829

}

830

831

/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */

831

/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */

832

#define MAX_SCAN_WINDOW 2560

832

#define MAX_SCAN_WINDOW 2560

833

834

static unsigned int task_scan_min(struct task_struct *p)

834

static unsigned int task_scan_min(struct task_struct *p)

835

{

835

{

836

unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);

836

unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);

837

unsigned int scan, floor;

837

unsigned int scan, floor;

838

unsigned int windows = 1;

838

unsigned int windows = 1;

839

840

if (scan_size < MAX_SCAN_WINDOW)

840

if (scan_size < MAX_SCAN_WINDOW)

841

windows = MAX_SCAN_WINDOW / scan_size;

841

windows = MAX_SCAN_WINDOW / scan_size;

842

floor = 1000 / windows;

842

floor = 1000 / windows;

843

844

scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);

844

scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);

845

return max_t(unsigned int, floor, scan);

845

return max_t(unsigned int, floor, scan);

846

}

846

}

847

848

static unsigned int task_scan_max(struct task_struct *p)

848

static unsigned int task_scan_max(struct task_struct *p)

849

{

849

{

850

unsigned int smin = task_scan_min(p);

850

unsigned int smin = task_scan_min(p);

851

unsigned int smax;

851

unsigned int smax;

852

853

/* Watch for min being lower than max due to floor calculations */

853

/* Watch for min being lower than max due to floor calculations */

854

smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

854

smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

855

return max(smin, smax);

855

return max(smin, smax);

856

}

856

}

857

858

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)

858

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)

859

{

859

{

860

rq->nr_numa_running += (p->numa_preferred_nid != -1);

860

rq->nr_numa_running += (p->numa_preferred_nid != -1);

861

rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));

861

rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));

862

}

862

}

863

864

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)

864

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)

865

{

865

{

866

rq->nr_numa_running -= (p->numa_preferred_nid != -1);

866

rq->nr_numa_running -= (p->numa_preferred_nid != -1);

867

rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));

867

rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));

868

}

868

}

869

870

struct numa_group {

870

struct numa_group {

871

atomic_t refcount;

871

atomic_t refcount;

872

873

spinlock_t lock; /* nr_tasks, tasks */

873

spinlock_t lock; /* nr_tasks, tasks */

874

int nr_tasks;

874

int nr_tasks;

875

pid_t gid;

875

pid_t gid;

876

877

struct rcu_head rcu;

877

struct rcu_head rcu;

878

nodemask_t active_nodes;

878

nodemask_t active_nodes;

879

unsigned long total_faults;

879

unsigned long total_faults;

880

/*

880

/*

881

* Faults_cpu is used to decide whether memory should move

881

* Faults_cpu is used to decide whether memory should move

882

* towards the CPU. As a consequence, these stats are weighted

882

* towards the CPU. As a consequence, these stats are weighted

883

* more by CPU use than by memory faults.

883

* more by CPU use than by memory faults.

884

*/

884

*/

885

unsigned long *faults_cpu;

885

unsigned long *faults_cpu;

886

unsigned long faults[0];

886

unsigned long faults[0];

887

};

887

};

888

889

/* Fault types tracked per node: shared or private. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* One set of fault types each for memory locality and CPU locality. */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, plus an equally sized set of temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

897

898

pid_t task_numa_group_id(struct task_struct *p)

898

pid_t task_numa_group_id(struct task_struct *p)

899

{

899

{

900

return p->numa_group ? p->numa_group->gid : 0;

900

return p->numa_group ? p->numa_group->gid : 0;

901

}

901

}

902

903

/*

903

/*

904

* The averaged statistics, shared & private, memory & cpu,

904

* The averaged statistics, shared & private, memory & cpu,

905

* occupy the first half of the array. The second half of the

905

* occupy the first half of the array. The second half of the

906

* array is for current counters, which are averaged into the

906

* array is for current counters, which are averaged into the

907

* first set by task_numa_placement.

907

* first set by task_numa_placement.

908

*/

908

*/

909

static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)

909

static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)

910

{

910

{

911

return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;

911

return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;

912

}

912

}

913

914

static inline unsigned long task_faults(struct task_struct *p, int nid)

914

static inline unsigned long task_faults(struct task_struct *p, int nid)

915

{

915

{

916

if (!p->numa_faults)

916

if (!p->numa_faults)

917

return 0;

917

return 0;

918

919

return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +

919

return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +

920

p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];

920

p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];

921

}

921

}

922

923

static inline unsigned long group_faults(struct task_struct *p, int nid)

923

static inline unsigned long group_faults(struct task_struct *p, int nid)

924

{

924

{

925

if (!p->numa_group)

925

if (!p->numa_group)

926

return 0;

926

return 0;

927

928

return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +

928

return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +

929

p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];

929

p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];

930

}

930

}

931

932

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)

932

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)

933

{

933

{

934

return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +

934

return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +

935

group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];

935

group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];

936

}

936

}

937

938

/* Handle placement on systems where not all nodes are directly connected. */

938

/* Handle placement on systems where not all nodes are directly connected. */

939

static unsigned long score_nearby_nodes(struct task_struct *p, int nid,

939

static unsigned long score_nearby_nodes(struct task_struct *p, int nid,

940

int maxdist, bool task)

940

int maxdist, bool task)

941

{

941

{

942

unsigned long score = 0;

942

unsigned long score = 0;

943

int node;

943

int node;

944

945

/*

945

/*

946

* All nodes are directly connected, and the same distance

946

* All nodes are directly connected, and the same distance

947

* from each other. No need for fancy placement algorithms.

947

* from each other. No need for fancy placement algorithms.

948

*/

948

*/

949

if (sched_numa_topology_type == NUMA_DIRECT)

949

if (sched_numa_topology_type == NUMA_DIRECT)

950

return 0;

950

return 0;

951

952

/*

952

/*

953

* This code is called for each node, introducing N^2 complexity,

953

* This code is called for each node, introducing N^2 complexity,

954

* which should be ok given the number of nodes rarely exceeds 8.

954

* which should be ok given the number of nodes rarely exceeds 8.

955

*/

955

*/

956

for_each_online_node(node) {

956

for_each_online_node(node) {

957

unsigned long faults;

957

unsigned long faults;

958

int dist = node_distance(nid, node);

958

int dist = node_distance(nid, node);

959

960

/*

960

/*

961

* The furthest away nodes in the system are not interesting

961

* The furthest away nodes in the system are not interesting

962

* for placement; nid was already counted.

962

* for placement; nid was already counted.

963

*/

963

*/

964

if (dist == sched_max_numa_distance || node == nid)

964

if (dist == sched_max_numa_distance || node == nid)

965

continue;

965

continue;

966

967

/*

967

/*

968

* On systems with a backplane NUMA topology, compare groups

968

* On systems with a backplane NUMA topology, compare groups

969

* of nodes, and move tasks towards the group with the most

969

* of nodes, and move tasks towards the group with the most

970

* memory accesses. When comparing two nodes at distance

970

* memory accesses. When comparing two nodes at distance

971

* "hoplimit", only nodes closer by than "hoplimit" are part

971

* "hoplimit", only nodes closer by than "hoplimit" are part

972

* of each group. Skip other nodes.

972

* of each group. Skip other nodes.

973

*/

973

*/

974

if (sched_numa_topology_type == NUMA_BACKPLANE &&

974

if (sched_numa_topology_type == NUMA_BACKPLANE &&

975

dist > maxdist)

975

dist > maxdist)

976

continue;

976

continue;

977

978

/* Add up the faults from nearby nodes. */

978

/* Add up the faults from nearby nodes. */

979

if (task)

979

if (task)

980

faults = task_faults(p, node);

980

faults = task_faults(p, node);

981

else

981

else

982

faults = group_faults(p, node);

982

faults = group_faults(p, node);

983

984

/*

984

/*

985

* On systems with a glueless mesh NUMA topology, there are

985

* On systems with a glueless mesh NUMA topology, there are

986

* no fixed "groups of nodes". Instead, nodes that are not

986

* no fixed "groups of nodes". Instead, nodes that are not

987

* directly connected bounce traffic through intermediate

987

* directly connected bounce traffic through intermediate

988

* nodes; a numa_group can occupy any set of nodes.

988

* nodes; a numa_group can occupy any set of nodes.

989

* The further away a node is, the less the faults count.

989

* The further away a node is, the less the faults count.

990

* This seems to result in good task placement.

990

* This seems to result in good task placement.

991

*/

991

*/

992

if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {

992

if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {

993

faults *= (sched_max_numa_distance - dist);

993

faults *= (sched_max_numa_distance - dist);

994

faults /= (sched_max_numa_distance - LOCAL_DISTANCE);

994

faults /= (sched_max_numa_distance - LOCAL_DISTANCE);

995

}

995

}

996

997

score += faults;

997

score += faults;

998

}

998

}

999

1000

return score;

1000

return score;

1001

}

1001

}

1002

1003

/*

1003

/*

1004

* These return the fraction of accesses done by a particular task, or

1004

* These return the fraction of accesses done by a particular task, or

1005

* task group, on a particular numa node. The group weight is given a

1005

* task group, on a particular numa node. The group weight is given a

1006

* larger multiplier, in order to group tasks together that are almost

1006

* larger multiplier, in order to group tasks together that are almost

1007

* evenly spread out between numa nodes.

1007

* evenly spread out between numa nodes.

1008

*/

1008

*/

1009

static inline unsigned long task_weight(struct task_struct *p, int nid,

1009

static inline unsigned long task_weight(struct task_struct *p, int nid,

1010

int dist)

1010

int dist)

1011

{

1011

{

1012

unsigned long faults, total_faults;

1012

unsigned long faults, total_faults;

1013

1014

if (!p->numa_faults)

1014

if (!p->numa_faults)

1015

return 0;

1015

return 0;

1016

1017

total_faults = p->total_numa_faults;

1017

total_faults = p->total_numa_faults;

1018

1019

if (!total_faults)

1019

if (!total_faults)

1020

return 0;

1020

return 0;

1021

1022

faults = task_faults(p, nid);

1022

faults = task_faults(p, nid);

1023

faults += score_nearby_nodes(p, nid, dist, true);

1023

faults += score_nearby_nodes(p, nid, dist, true);

1024

1025

return 1000 * faults / total_faults;

1025

return 1000 * faults / total_faults;

1026

}

1026

}

1027

1028

static inline unsigned long group_weight(struct task_struct *p, int nid,

1028

static inline unsigned long group_weight(struct task_struct *p, int nid,

1029

int dist)

1029

int dist)

1030

{

1030

{

1031

unsigned long faults, total_faults;

1031

unsigned long faults, total_faults;

1032

1033

if (!p->numa_group)

1033

if (!p->numa_group)

1034

return 0;

1034

return 0;

1035

1036

total_faults = p->numa_group->total_faults;

1036

total_faults = p->numa_group->total_faults;

1037

1038

if (!total_faults)

1038

if (!total_faults)

1039

return 0;

1039

return 0;

1040

1041

faults = group_faults(p, nid);

1041

faults = group_faults(p, nid);

1042

faults += score_nearby_nodes(p, nid, dist, false);

1042

faults += score_nearby_nodes(p, nid, dist, false);

1043

1044

return 1000 * faults / total_faults;

1044

return 1000 * faults / total_faults;

1045

}

1045

}

1046

1047

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,

1047

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,

1048

int src_nid, int dst_cpu)

1048

int src_nid, int dst_cpu)

1049

{

1049

{

1050

struct numa_group *ng = p->numa_group;

1050

struct numa_group *ng = p->numa_group;

1051

int dst_nid = cpu_to_node(dst_cpu);

1051

int dst_nid = cpu_to_node(dst_cpu);

1052

int last_cpupid, this_cpupid;

1052

int last_cpupid, this_cpupid;

1053

1054

this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);

1054

this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);

1055

1056

/*

1056

/*

1057

* Multi-stage node selection is used in conjunction with a periodic

1057

* Multi-stage node selection is used in conjunction with a periodic

1058

* migration fault to build a temporal task<->page relation. By using

1058

* migration fault to build a temporal task<->page relation. By using

1059

* a two-stage filter we remove short/unlikely relations.

1059

* a two-stage filter we remove short/unlikely relations.

1060

*

1060

*

1061

* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate

1061

* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate

1062

* a task's usage of a particular page (n_p) per total usage of this

1062

* a task's usage of a particular page (n_p) per total usage of this

1063

* page (n_t) (in a given time-span) to a probability.

1063

* page (n_t) (in a given time-span) to a probability.

1064

*

1064

*

1065

* Our periodic faults will sample this probability and getting the

1065

* Our periodic faults will sample this probability and getting the

1066

* same result twice in a row, given these samples are fully

1066

* same result twice in a row, given these samples are fully

1067

* independent, is then given by P(n)^2, provided our sample period

1067

* independent, is then given by P(n)^2, provided our sample period

1068

* is sufficiently short compared to the usage pattern.

1068

* is sufficiently short compared to the usage pattern.

1069

*

1069

*

1070

* This quadric squishes small probabilities, making it less likely we

1070

* This quadric squishes small probabilities, making it less likely we

1071

* act on an unlikely task<->page relation.

1071

* act on an unlikely task<->page relation.

1072

*/

1072

*/

1073

last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

1073

last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

1074

if (!cpupid_pid_unset(last_cpupid) &&

1074

if (!cpupid_pid_unset(last_cpupid) &&

1075

cpupid_to_nid(last_cpupid) != dst_nid)

1075

cpupid_to_nid(last_cpupid) != dst_nid)

1076

return false;

1076

return false;

1077

1078

/* Always allow migrate on private faults */

1078

/* Always allow migrate on private faults */

1079

if (cpupid_match_pid(p, last_cpupid))

1079

if (cpupid_match_pid(p, last_cpupid))

1080

return true;

1080

return true;

1081

1082

/* A shared fault, but p->numa_group has not been set up yet. */

1082

/* A shared fault, but p->numa_group has not been set up yet. */

1083

if (!ng)

1083

if (!ng)

1084

return true;

1084

return true;

1085

1086

/*

1086

/*

1087

* Do not migrate if the destination is not a node that

1087

* Do not migrate if the destination is not a node that

1088

* is actively used by this numa group.

1088

* is actively used by this numa group.

1089

*/

1089

*/

1090

if (!node_isset(dst_nid, ng->active_nodes))

1090

if (!node_isset(dst_nid, ng->active_nodes))

1091

return false;

1091

return false;

1092

1093

/*

1093

/*

1094

* Source is a node that is not actively used by this

1094

* Source is a node that is not actively used by this

1095

* numa group, while the destination is. Migrate.

1095

* numa group, while the destination is. Migrate.

1096

*/

1096

*/

1097

if (!node_isset(src_nid, ng->active_nodes))

1097

if (!node_isset(src_nid, ng->active_nodes))

1098

return true;

1098

return true;

1099

1100

/*

1100

/*

1101

* Both source and destination are nodes in active

1101

* Both source and destination are nodes in active

1102

* use by this numa group. Maximize memory bandwidth

1102

* use by this numa group. Maximize memory bandwidth

1103

* by migrating from more heavily used groups, to less

1103

* by migrating from more heavily used groups, to less

1104

* heavily used ones, spreading the load around.

1104

* heavily used ones, spreading the load around.

1105

* Use a 1/4 hysteresis to avoid spurious page movement.

1105

* Use a 1/4 hysteresis to avoid spurious page movement.

1106

*/

1106

*/

1107

return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);

1107

return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);

1108

}

1108

}

1109

1110

/*
 * Forward declarations of static helpers; weighted_cpuload() and
 * capacity_of() are needed by update_numa_stats() below.
 */
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);

1115

1116

/*
 * Per-node statistics, aggregated over all CPUs of the node by
 * update_numa_stats().
 */
struct numa_stats {
	/* Sum of rq->nr_running over the node's CPUs */
	unsigned long nr_running;
	/* Sum of weighted_cpuload() over the node's CPUs */
	unsigned long load;

	/* Sum of capacity_of() over the node's CPUs */
	unsigned long compute_capacity;

	/* Approximate number of runnable tasks the node can take */
	unsigned long task_capacity;
	/* Non-zero while nr_running is below task_capacity */
	int has_free_capacity;
};

1128

1129

/*

1129

/*

1130

* XXX borrowed from update_sg_lb_stats

1130

* XXX borrowed from update_sg_lb_stats

1131

*/

1131

*/

1132

static void update_numa_stats(struct numa_stats *ns, int nid)

1132

static void update_numa_stats(struct numa_stats *ns, int nid)

1133

{

1133

{

1134

int smt, cpu, cpus = 0;

1134

int smt, cpu, cpus = 0;

1135

unsigned long capacity;

1135

unsigned long capacity;

1136

1137

memset(ns, 0, sizeof(*ns));

1137

memset(ns, 0, sizeof(*ns));

1138

for_each_cpu(cpu, cpumask_of_node(nid)) {

1138

for_each_cpu(cpu, cpumask_of_node(nid)) {

1139

struct rq *rq = cpu_rq(cpu);

1139

struct rq *rq = cpu_rq(cpu);

1140

1141

ns->nr_running += rq->nr_running;

1141

ns->nr_running += rq->nr_running;

1142

ns->load += weighted_cpuload(cpu);

1142

ns->load += weighted_cpuload(cpu);

1143

ns->compute_capacity += capacity_of(cpu);

1143

ns->compute_capacity += capacity_of(cpu);

1144

1145

cpus++;

1145

cpus++;

1146

}

1146

}

1147

1148

/*

1148

/*

1149

* If we raced with hotplug and there are no CPUs left in our mask

1149

* If we raced with hotplug and there are no CPUs left in our mask

1150

* the @ns structure is NULL'ed and task_numa_compare() will

1150

* the @ns structure is NULL'ed and task_numa_compare() will

1151

* not find this node attractive.

1151

* not find this node attractive.

1152

*

1152

*

1153

* We'll either bail at !has_free_capacity, or we'll detect a huge

1153

* We'll either bail at !has_free_capacity, or we'll detect a huge

1154

* imbalance and bail there.

1154

* imbalance and bail there.

1155

*/

1155

*/

1156

if (!cpus)

1156

if (!cpus)

1157

return;

1157

return;

1158

1159

/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */

1159

/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */

1160

smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);

1160

smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);

1161

capacity = cpus / smt; /* cores */

1161

capacity = cpus / smt; /* cores */

1162

1163

ns->task_capacity = min_t(unsigned, capacity,

1163

ns->task_capacity = min_t(unsigned, capacity,

1164

DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));

1164

DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));

1165

ns->has_free_capacity = (ns->nr_running < ns->task_capacity);

1165

ns->has_free_capacity = (ns->nr_running < ns->task_capacity);

1166

}

1166

}

1167

1168

struct task_numa_env {

1168

struct task_numa_env {

1169

struct task_struct *p;

1169

struct task_struct *p;

1170

1171

int src_cpu, src_nid;

1171

int src_cpu, src_nid;

1172

int dst_cpu, dst_nid;

1172

int dst_cpu, dst_nid;

1173

1174

struct numa_stats src_stats, dst_stats;

1174

struct numa_stats src_stats, dst_stats;

1175

1176

int imbalance_pct;

1176

int imbalance_pct;

1177

int dist;

1177

int dist;

1178

1179

struct task_struct *best_task;

1179

struct task_struct *best_task;

1180

long best_imp;

1180

long best_imp;

1181

int best_cpu;

1181

int best_cpu;

1182

};

1182

};

1183

1184

static void task_numa_assign(struct task_numa_env *env,

1184

static void task_numa_assign(struct task_numa_env *env,

1185

struct task_struct *p, long imp)

1185

struct task_struct *p, long imp)

1186

{

1186

{

1187

if (env->best_task)

1187

if (env->best_task)

1188

put_task_struct(env->best_task);

1188

put_task_struct(env->best_task);

1189

if (p)

1189

if (p)

1190

get_task_struct(p);

1190

get_task_struct(p);

1191

1192

env->best_task = p;

1192

env->best_task = p;

1193

env->best_imp = imp;

1193

env->best_imp = imp;

1194

env->best_cpu = env->dst_cpu;

1194

env->best_cpu = env->dst_cpu;

1195

}

1195

}

1196

1197

static bool load_too_imbalanced(long src_load, long dst_load,

1197

static bool load_too_imbalanced(long src_load, long dst_load,

1198

struct task_numa_env *env)

1198

struct task_numa_env *env)

1199

{

1199

{

1200

long imb, old_imb;

1200

long imb, old_imb;

1201

long orig_src_load, orig_dst_load;

1201

long orig_src_load, orig_dst_load;

1202

long src_capacity, dst_capacity;

1202

long src_capacity, dst_capacity;

1203

1204

/*

1204

/*

1205

* The load is corrected for the CPU capacity available on each node.

1205

* The load is corrected for the CPU capacity available on each node.

1206

*

1206

*

1207

* src_load dst_load

1207

* src_load dst_load

1208

* ------------ vs ---------

1208

* ------------ vs ---------

1209

* src_capacity dst_capacity

1209

* src_capacity dst_capacity

1210

*/

1210

*/

1211

src_capacity = env->src_stats.compute_capacity;

1211

src_capacity = env->src_stats.compute_capacity;

1212

dst_capacity = env->dst_stats.compute_capacity;

1212

dst_capacity = env->dst_stats.compute_capacity;

1213

1214

/* We care about the slope of the imbalance, not the direction. */

1214

/* We care about the slope of the imbalance, not the direction. */

1215

if (dst_load < src_load)

1215

if (dst_load < src_load)

1216

swap(dst_load, src_load);

1216

swap(dst_load, src_load);

1217

1218

/* Is the difference below the threshold? */

1218

/* Is the difference below the threshold? */

1219

imb = dst_load * src_capacity * 100 -

1219

imb = dst_load * src_capacity * 100 -

1220

src_load * dst_capacity * env->imbalance_pct;

1220

src_load * dst_capacity * env->imbalance_pct;

1221

if (imb <= 0)

1221

if (imb <= 0)

1222

return false;

1222

return false;

1223

1224

/*

1224

/*

1225

* The imbalance is above the allowed threshold.

1225

* The imbalance is above the allowed threshold.

1226

* Compare it with the old imbalance.

1226

* Compare it with the old imbalance.

1227

*/

1227

*/

1228

orig_src_load = env->src_stats.load;

1228

orig_src_load = env->src_stats.load;

1229

orig_dst_load = env->dst_stats.load;

1229

orig_dst_load = env->dst_stats.load;

1230

1231

if (orig_dst_load < orig_src_load)

1231

if (orig_dst_load < orig_src_load)

1232

swap(orig_dst_load, orig_src_load);

1232

swap(orig_dst_load, orig_src_load);

1233

1234

old_imb = orig_dst_load * src_capacity * 100 -

1234

old_imb = orig_dst_load * src_capacity * 100 -

1235

orig_src_load * dst_capacity * env->imbalance_pct;

1235

orig_src_load * dst_capacity * env->imbalance_pct;

1236

1237

/* Would this change make things worse? */

1237

/* Would this change make things worse? */

1238

return (imb > old_imb);

1238

return (imb > old_imb);

1239

}

1239

}

1240

1241

/*

1241

/*

1242

* This checks if the overall compute and NUMA accesses of the system would

1242

* This checks if the overall compute and NUMA accesses of the system would

1243

* be improved if the source tasks was migrated to the target dst_cpu taking

1243

* be improved if the source tasks was migrated to the target dst_cpu taking

1244

* into account that it might be best if task running on the dst_cpu should

1244

* into account that it might be best if task running on the dst_cpu should

1245

* be exchanged with the source task

1245

* be exchanged with the source task

1246

*/

1246

*/

1247

static void task_numa_compare(struct task_numa_env *env,

1247

static void task_numa_compare(struct task_numa_env *env,

1248

long taskimp, long groupimp)

1248

long taskimp, long groupimp)

1249

{

1249

{

1250

struct rq *src_rq = cpu_rq(env->src_cpu);

1250

struct rq *src_rq = cpu_rq(env->src_cpu);

1251

struct rq *dst_rq = cpu_rq(env->dst_cpu);

1251

struct rq *dst_rq = cpu_rq(env->dst_cpu);

1252

struct task_struct *cur;

1252

struct task_struct *cur;

1253

long src_load, dst_load;

1253

long src_load, dst_load;

1254

long load;

1254

long load;

1255

long imp = env->p->numa_group ? groupimp : taskimp;

1255

long imp = env->p->numa_group ? groupimp : taskimp;

1256

long moveimp = imp;

1256

long moveimp = imp;

1257

int dist = env->dist;

1257

int dist = env->dist;

1258

1259

rcu_read_lock();

1259

rcu_read_lock();

1260

1261

raw_spin_lock_irq(&dst_rq->lock);

1261

raw_spin_lock_irq(&dst_rq->lock);

1262

cur = dst_rq->curr;

1262

cur = dst_rq->curr;

1263

/*

1263

/*

1264

* No need to move the exiting task, and this ensures that ->curr

1264

* No need to move the exiting task, and this ensures that ->curr

1265

* wasn't reaped and thus get_task_struct() in task_numa_assign()

1265

* wasn't reaped and thus get_task_struct() in task_numa_assign()

1266

* is safe under RCU read lock.

1266

* is safe under RCU read lock.

1267

* Note that rcu_read_lock() itself can't protect from the final

1267

* Note that rcu_read_lock() itself can't protect from the final

1268

* put_task_struct() after the last schedule().

1268

* put_task_struct() after the last schedule().

1269

*/

1269

*/

1270

if ((cur->flags & PF_EXITING) || is_idle_task(cur))

1270

if ((cur->flags & PF_EXITING) || is_idle_task(cur))

1271

cur = NULL;

1271

cur = NULL;

1272

raw_spin_unlock_irq(&dst_rq->lock);

1272

raw_spin_unlock_irq(&dst_rq->lock);

1273

1274

/*

1274

/*

1275

* Because we have preemption enabled we can get migrated around and

1275

* Because we have preemption enabled we can get migrated around and

1276

* end try selecting ourselves (current == env->p) as a swap candidate.

1276

* end try selecting ourselves (current == env->p) as a swap candidate.

1277

*/

1277

*/

1278

if (cur == env->p)

1278

if (cur == env->p)

1279

goto unlock;

1279

goto unlock;

1280

1281

/*

1281

/*

1282

* "imp" is the fault differential for the source task between the

1282

* "imp" is the fault differential for the source task between the

1283

* source and destination node. Calculate the total differential for

1283

* source and destination node. Calculate the total differential for

1284

* the source task and potential destination task. The more negative

1284

* the source task and potential destination task. The more negative

1285

* the value is, the more rmeote accesses that would be expected to

1285

* the value is, the more rmeote accesses that would be expected to

1286

* be incurred if the tasks were swapped.

1286

* be incurred if the tasks were swapped.

1287

*/

1287

*/

1288

if (cur) {

1288

if (cur) {

1289

/* Skip this swap candidate if cannot move to the source cpu */

1289

/* Skip this swap candidate if cannot move to the source cpu */

1290

if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))

1290

if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))

1291

goto unlock;

1291

goto unlock;

1292

1293

/*

1293

/*

1294

* If dst and source tasks are in the same NUMA group, or not

1294

* If dst and source tasks are in the same NUMA group, or not

1295

* in any group then look only at task weights.

1295

* in any group then look only at task weights.

1296

*/

1296

*/

1297

if (cur->numa_group == env->p->numa_group) {

1297

if (cur->numa_group == env->p->numa_group) {

1298

imp = taskimp + task_weight(cur, env->src_nid, dist) -

1298

imp = taskimp + task_weight(cur, env->src_nid, dist) -

1299

task_weight(cur, env->dst_nid, dist);

1299

task_weight(cur, env->dst_nid, dist);

1300

/*

1300

/*

1301

* Add some hysteresis to prevent swapping the

1301

* Add some hysteresis to prevent swapping the

1302

* tasks within a group over tiny differences.

1302

* tasks within a group over tiny differences.

1303

*/

1303

*/

1304

if (cur->numa_group)

1304

if (cur->numa_group)

1305

imp -= imp/16;

1305

imp -= imp/16;

1306

} else {

1306

} else {

1307

/*

1307

/*

1308

* Compare the group weights. If a task is all by

1308

* Compare the group weights. If a task is all by

1309

* itself (not part of a group), use the task weight

1309

* itself (not part of a group), use the task weight

1310

* instead.

1310

* instead.

1311

*/

1311

*/

1312

if (cur->numa_group)

1312

if (cur->numa_group)

1313

imp += group_weight(cur, env->src_nid, dist) -

1313

imp += group_weight(cur, env->src_nid, dist) -

1314

group_weight(cur, env->dst_nid, dist);

1314

group_weight(cur, env->dst_nid, dist);

1315

else

1315

else

1316

imp += task_weight(cur, env->src_nid, dist) -

1316

imp += task_weight(cur, env->src_nid, dist) -

1317

task_weight(cur, env->dst_nid, dist);

1317

task_weight(cur, env->dst_nid, dist);

1318

}

1318

}

1319

}

1319

}

1320

1321

if (imp <= env->best_imp && moveimp <= env->best_imp)

1321

if (imp <= env->best_imp && moveimp <= env->best_imp)

1322

goto unlock;

1322

goto unlock;

1323

1324

if (!cur) {

1324

if (!cur) {

1325

/* Is there capacity at our destination? */

1325

/* Is there capacity at our destination? */

1326

if (env->src_stats.nr_running <= env->src_stats.task_capacity &&

1326

if (env->src_stats.nr_running <= env->src_stats.task_capacity &&

1327

!env->dst_stats.has_free_capacity)

1327

!env->dst_stats.has_free_capacity)

1328

goto unlock;

1328

goto unlock;

1329

1330

goto balance;

1330

goto balance;

1331

}

1331

}

1332

1333

/* Balance doesn't matter much if we're running a task per cpu */

1333

/* Balance doesn't matter much if we're running a task per cpu */

1334

if (imp > env->best_imp && src_rq->nr_running == 1 &&

1334

if (imp > env->best_imp && src_rq->nr_running == 1 &&

1335

dst_rq->nr_running == 1)

1335

dst_rq->nr_running == 1)

1336

goto assign;

1336

goto assign;

1337

1338

/*

1338

/*

1339

* In the overloaded case, try and keep the load balanced.

1339

* In the overloaded case, try and keep the load balanced.

1340

*/

1340

*/

1341

balance:

1341

balance:

1342

load = task_h_load(env->p);

1342

load = task_h_load(env->p);

1343

dst_load = env->dst_stats.load + load;

1343

dst_load = env->dst_stats.load + load;

1344

src_load = env->src_stats.load - load;

1344

src_load = env->src_stats.load - load;

1345

1346

if (moveimp > imp && moveimp > env->best_imp) {

1346

if (moveimp > imp && moveimp > env->best_imp) {

1347

/*

1347

/*

1348

* If the improvement from just moving env->p direction is

1348

* If the improvement from just moving env->p direction is

1349

* better than swapping tasks around, check if a move is

1349

* better than swapping tasks around, check if a move is

1350

* possible. Store a slightly smaller score than moveimp,

1350

* possible. Store a slightly smaller score than moveimp,

1351

* so an actually idle CPU will win.

1351

* so an actually idle CPU will win.

1352

*/

1352

*/

1353

if (!load_too_imbalanced(src_load, dst_load, env)) {

1353

if (!load_too_imbalanced(src_load, dst_load, env)) {

1354

imp = moveimp - 1;

1354

imp = moveimp - 1;

1355

cur = NULL;

1355

cur = NULL;

1356

goto assign;

1356

goto assign;

1357

}

1357

}

1358

}

1358

}

1359

1360

if (imp <= env->best_imp)

1360

if (imp <= env->best_imp)

1361

goto unlock;

1361

goto unlock;

1362

1363

if (cur) {

1363

if (cur) {

1364

load = task_h_load(cur);

1364

load = task_h_load(cur);

1365

dst_load -= load;

1365

dst_load -= load;

1366

src_load += load;

1366

src_load += load;

1367

}

1367

}

1368

1369

if (load_too_imbalanced(src_load, dst_load, env))

1369

if (load_too_imbalanced(src_load, dst_load, env))

1370

goto unlock;

1370

goto unlock;

1371

1372

/*

1372

/*

1373

* One idle CPU per node is evaluated for a task numa move.

1373

* One idle CPU per node is evaluated for a task numa move.

1374

* Call select_idle_sibling to maybe find a better one.

1374

* Call select_idle_sibling to maybe find a better one.

1375

*/

1375

*/

1376

if (!cur)

1376

if (!cur)

1377

env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);

1377

env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);

1378

1379

assign:

1379

assign:

1380

task_numa_assign(env, cur, imp);

1380

task_numa_assign(env, cur, imp);

1381

unlock:

1381

unlock:

1382

rcu_read_unlock();

1382

rcu_read_unlock();

1383

}

1383

}

1384

1385

static void task_numa_find_cpu(struct task_numa_env *env,

1385

static void task_numa_find_cpu(struct task_numa_env *env,

1386

long taskimp, long groupimp)

1386

long taskimp, long groupimp)

1387

{

1387

{

1388

int cpu;

1388

int cpu;

1389

1390

for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {

1390

for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {

1391

/* Skip this CPU if the source task cannot migrate */

1391

/* Skip this CPU if the source task cannot migrate */

1392

if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))

1392

if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))

1393

continue;

1393

continue;

1394

1395

env->dst_cpu = cpu;

1395

env->dst_cpu = cpu;

1396

task_numa_compare(env, taskimp, groupimp);

1396

task_numa_compare(env, taskimp, groupimp);

1397

}

1397

}

1398

}

1398

}

1399

1400

static int task_numa_migrate(struct task_struct *p)

1400

static int task_numa_migrate(struct task_struct *p)

1401

{

1401

{

1402

struct task_numa_env env = {

1402

struct task_numa_env env = {

1403

.p = p,

1403

.p = p,

1404

1405

.src_cpu = task_cpu(p),

1405

.src_cpu = task_cpu(p),

1406

.src_nid = task_node(p),

1406

.src_nid = task_node(p),

1407

1408

.imbalance_pct = 112,

1408

.imbalance_pct = 112,

1409

1410

.best_task = NULL,

1410

.best_task = NULL,

1411

.best_imp = 0,

1411

.best_imp = 0,

1412

.best_cpu = -1

1412

.best_cpu = -1

1413

};

1413

};

1414

struct sched_domain *sd;

1414

struct sched_domain *sd;

1415

unsigned long taskweight, groupweight;

1415

unsigned long taskweight, groupweight;

1416

int nid, ret, dist;

1416

int nid, ret, dist;

1417

long taskimp, groupimp;

1417

long taskimp, groupimp;

1418

1419

/*

1419

/*

1420

* Pick the lowest SD_NUMA domain, as that would have the smallest

1420

* Pick the lowest SD_NUMA domain, as that would have the smallest

1421

* imbalance and would be the first to start moving tasks about.

1421

* imbalance and would be the first to start moving tasks about.

1422

*

1422

*

1423

* And we want to avoid any moving of tasks about, as that would create

1423

* And we want to avoid any moving of tasks about, as that would create

1424

* random movement of tasks -- counter the numa conditions we're trying

1424

* random movement of tasks -- counter the numa conditions we're trying

1425

* to satisfy here.

1425

* to satisfy here.

1426

*/

1426

*/

1427

rcu_read_lock();

1427

rcu_read_lock();

1428

sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));

1428

sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));

1429

if (sd)

1429

if (sd)

1430

env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

1430

env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

1431

rcu_read_unlock();

1431

rcu_read_unlock();

1432

1433

/*

1433

/*

1434

* Cpusets can break the scheduler domain tree into smaller

1434

* Cpusets can break the scheduler domain tree into smaller

1435

* balance domains, some of which do not cross NUMA boundaries.

1435

* balance domains, some of which do not cross NUMA boundaries.

1436

* Tasks that are "trapped" in such domains cannot be migrated

1436

* Tasks that are "trapped" in such domains cannot be migrated

1437

* elsewhere, so there is no point in (re)trying.

1437

* elsewhere, so there is no point in (re)trying.

1438

*/

1438

*/

1439

if (unlikely(!sd)) {

1439

if (unlikely(!sd)) {

1440

p->numa_preferred_nid = task_node(p);

1440

p->numa_preferred_nid = task_node(p);

1441

return -EINVAL;

1441

return -EINVAL;

1442

}

1442

}

1443

1444

env.dst_nid = p->numa_preferred_nid;

1444

env.dst_nid = p->numa_preferred_nid;

1445

dist = env.dist = node_distance(env.src_nid, env.dst_nid);

1445

dist = env.dist = node_distance(env.src_nid, env.dst_nid);

1446

taskweight = task_weight(p, env.src_nid, dist);

1446

taskweight = task_weight(p, env.src_nid, dist);

1447

groupweight = group_weight(p, env.src_nid, dist);

1447

groupweight = group_weight(p, env.src_nid, dist);

1448

update_numa_stats(&env.src_stats, env.src_nid);

1448

update_numa_stats(&env.src_stats, env.src_nid);

1449

taskimp = task_weight(p, env.dst_nid, dist) - taskweight;

1449

taskimp = task_weight(p, env.dst_nid, dist) - taskweight;

1450

groupimp = group_weight(p, env.dst_nid, dist) - groupweight;

1450

groupimp = group_weight(p, env.dst_nid, dist) - groupweight;

1451

update_numa_stats(&env.dst_stats, env.dst_nid);

1451

update_numa_stats(&env.dst_stats, env.dst_nid);

1452

1453

/* Try to find a spot on the preferred nid. */

1453

/* Try to find a spot on the preferred nid. */

1454

task_numa_find_cpu(&env, taskimp, groupimp);

1454

task_numa_find_cpu(&env, taskimp, groupimp);

1455

1456

/*

1456

/*

1457

* Look at other nodes in these cases:

1457

* Look at other nodes in these cases:

1458

* - there is no space available on the preferred_nid

1458

* - there is no space available on the preferred_nid

1459

* - the task is part of a numa_group that is interleaved across

1459

* - the task is part of a numa_group that is interleaved across

1460

* multiple NUMA nodes; in order to better consolidate the group,

1460

* multiple NUMA nodes; in order to better consolidate the group,

1461

* we need to check other locations.

1461

* we need to check other locations.

1462

*/

1462

*/

1463

if (env.best_cpu == -1 || (p->numa_group &&

1463

if (env.best_cpu == -1 || (p->numa_group &&

1464

nodes_weight(p->numa_group->active_nodes) > 1)) {

1464

nodes_weight(p->numa_group->active_nodes) > 1)) {

1465

for_each_online_node(nid) {

1465

for_each_online_node(nid) {

1466

if (nid == env.src_nid || nid == p->numa_preferred_nid)

1466

if (nid == env.src_nid || nid == p->numa_preferred_nid)

1467

continue;

1467

continue;

1468

1469

dist = node_distance(env.src_nid, env.dst_nid);

1469

dist = node_distance(env.src_nid, env.dst_nid);

1470

if (sched_numa_topology_type == NUMA_BACKPLANE &&

1470

if (sched_numa_topology_type == NUMA_BACKPLANE &&

1471

dist != env.dist) {

1471

dist != env.dist) {

1472

taskweight = task_weight(p, env.src_nid, dist);

1472

taskweight = task_weight(p, env.src_nid, dist);

1473

groupweight = group_weight(p, env.src_nid, dist);

1473

groupweight = group_weight(p, env.src_nid, dist);

1474

}

1474

}

1475

1476

/* Only consider nodes where both task and groups benefit */

1476

/* Only consider nodes where both task and groups benefit */

1477

taskimp = task_weight(p, nid, dist) - taskweight;

1477

taskimp = task_weight(p, nid, dist) - taskweight;

1478

groupimp = group_weight(p, nid, dist) - groupweight;

1478

groupimp = group_weight(p, nid, dist) - groupweight;

1479

if (taskimp < 0 && groupimp < 0)

1479

if (taskimp < 0 && groupimp < 0)

1480

continue;

1480

continue;

1481

1482

env.dist = dist;

1482

env.dist = dist;

1483

env.dst_nid = nid;

1483

env.dst_nid = nid;

1484

update_numa_stats(&env.dst_stats, env.dst_nid);

1484

update_numa_stats(&env.dst_stats, env.dst_nid);

1485

task_numa_find_cpu(&env, taskimp, groupimp);

1485

task_numa_find_cpu(&env, taskimp, groupimp);

1486

}

1486

}

1487

}

1487

}

1488

1489

/*

1489

/*

1490

* If the task is part of a workload that spans multiple NUMA nodes,

1490

* If the task is part of a workload that spans multiple NUMA nodes,

1491

* and is migrating into one of the workload's active nodes, remember

1491

* and is migrating into one of the workload's active nodes, remember

1492

* this node as the task's preferred numa node, so the workload can

1492

* this node as the task's preferred numa node, so the workload can

1493

* settle down.

1493

* settle down.

1494

* A task that migrated to a second choice node will be better off

1494

* A task that migrated to a second choice node will be better off

1495

* trying for a better one later. Do not set the preferred node here.

1495

* trying for a better one later. Do not set the preferred node here.

1496

*/

1496

*/

1497

if (p->numa_group) {

1497

if (p->numa_group) {

1498

if (env.best_cpu == -1)

1498

if (env.best_cpu == -1)

1499

nid = env.src_nid;

1499

nid = env.src_nid;

1500

else

1500

else

1501

nid = env.dst_nid;

1501

nid = env.dst_nid;

1502

1503

if (node_isset(nid, p->numa_group->active_nodes))

1503

if (node_isset(nid, p->numa_group->active_nodes))

1504

sched_setnuma(p, env.dst_nid);

1504

sched_setnuma(p, env.dst_nid);

1505

}

1505

}

1506

1507

/* No better CPU than the current one was found. */

1507

/* No better CPU than the current one was found. */

1508

if (env.best_cpu == -1)

1508

if (env.best_cpu == -1)

1509

return -EAGAIN;

1509

return -EAGAIN;

1510

1511

/*

1511

/*

1512

* Reset the scan period if the task is being rescheduled on an

1512

* Reset the scan period if the task is being rescheduled on an

1513

* alternative node to recheck if the tasks is now properly placed.

1513

* alternative node to recheck if the tasks is now properly placed.

1514

*/

1514

*/

1515

p->numa_scan_period = task_scan_min(p);

1515

p->numa_scan_period = task_scan_min(p);

1516

1517

if (env.best_task == NULL) {

1517

if (env.best_task == NULL) {

1518

ret = migrate_task_to(p, env.best_cpu);

1518

ret = migrate_task_to(p, env.best_cpu);

1519

if (ret != 0)

1519

if (ret != 0)

1520

trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);

1520

trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);

1521

return ret;

1521

return ret;

1522

}

1522

}

1523

1524

ret = migrate_swap(p, env.best_task);

1524

ret = migrate_swap(p, env.best_task);

1525

if (ret != 0)

1525

if (ret != 0)

1526

trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));

1526

trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));

1527

put_task_struct(env.best_task);

1527

put_task_struct(env.best_task);

1528

return ret;

1528

return ret;

1529

}

1529

}

1530

1531

/* Attempt to migrate a task to a CPU on the preferred node. */

1531

/* Attempt to migrate a task to a CPU on the preferred node. */

1532

static void numa_migrate_preferred(struct task_struct *p)

1532

static void numa_migrate_preferred(struct task_struct *p)

1533

{

1533

{

1534

unsigned long interval = HZ;

1534

unsigned long interval = HZ;

1535

1536

/* This task has no NUMA fault statistics yet */

1536

/* This task has no NUMA fault statistics yet */

1537

if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))

1537

if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))

1538

return;

1538

return;

1539

1540

/* Periodically retry migrating the task to the preferred node */

1540

/* Periodically retry migrating the task to the preferred node */

1541

interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);

1541

interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);

1542

p->numa_migrate_retry = jiffies + interval;

1542

p->numa_migrate_retry = jiffies + interval;

1543

1544

/* Success if task is already running on preferred CPU */

1544

/* Success if task is already running on preferred CPU */

1545

if (task_node(p) == p->numa_preferred_nid)

1545

if (task_node(p) == p->numa_preferred_nid)

1546

return;

1546

return;

1547

1548

/* Otherwise, try migrate to a CPU on the preferred node */

1548

/* Otherwise, try migrate to a CPU on the preferred node */

1549

task_numa_migrate(p);

1549

task_numa_migrate(p);

1550

}

1550

}

1551

1552

/*

1552

/*

1553

* Find the nodes on which the workload is actively running. We do this by

1553

* Find the nodes on which the workload is actively running. We do this by

1554

* tracking the nodes from which NUMA hinting faults are triggered. This can

1554

* tracking the nodes from which NUMA hinting faults are triggered. This can

1555

* be different from the set of nodes where the workload's memory is currently

1555

* be different from the set of nodes where the workload's memory is currently

1556

* located.

1556

* located.

1557

*

1557

*

1558

* The bitmask is used to make smarter decisions on when to do NUMA page

1558

* The bitmask is used to make smarter decisions on when to do NUMA page

1559

* migrations, To prevent flip-flopping, and excessive page migrations, nodes

1559

* migrations, To prevent flip-flopping, and excessive page migrations, nodes

1560

* are added when they cause over 6/16 of the maximum number of faults, but

1560

* are added when they cause over 6/16 of the maximum number of faults, but

1561

* only removed when they drop below 3/16.

1561

* only removed when they drop below 3/16.

1562

*/

1562

*/

1563

static void update_numa_active_node_mask(struct numa_group *numa_group)

1563

static void update_numa_active_node_mask(struct numa_group *numa_group)

1564

{

1564

{

1565

unsigned long faults, max_faults = 0;

1565

unsigned long faults, max_faults = 0;

1566

int nid;

1566

int nid;

1567

1568

for_each_online_node(nid) {

1568

for_each_online_node(nid) {

1569

faults = group_faults_cpu(numa_group, nid);

1569

faults = group_faults_cpu(numa_group, nid);

1570

if (faults > max_faults)

1570

if (faults > max_faults)

1571

max_faults = faults;

1571

max_faults = faults;

1572

}

1572

}

1573

1574

for_each_online_node(nid) {

1574

for_each_online_node(nid) {

1575

faults = group_faults_cpu(numa_group, nid);

1575

faults = group_faults_cpu(numa_group, nid);

1576

if (!node_isset(nid, numa_group->active_nodes)) {

1576

if (!node_isset(nid, numa_group->active_nodes)) {

1577

if (faults > max_faults * 6 / 16)

1577

if (faults > max_faults * 6 / 16)

1578

node_set(nid, numa_group->active_nodes);

1578

node_set(nid, numa_group->active_nodes);

1579

} else if (faults < max_faults * 3 / 16)

1579

} else if (faults < max_faults * 3 / 16)

1580

node_clear(nid, numa_group->active_nodes);

1580

node_clear(nid, numa_group->active_nodes);

1581

}

1581

}

1582

}

1582

}

1583

1584

/*
 * Scan-rate adaptation works in steps of 1/NUMA_PERIOD_SLOTS of the current
 * scan period. The locality of the recorded faults is expressed as
 * local / (local + remote), scaled to 0..NUMA_PERIOD_SLOTS. The more local
 * the faults, the longer the scan period chosen for the next scan window;
 * a ratio below NUMA_PERIOD_THRESHOLD shortens the period instead.
 * The threshold of 7 out of 10 slots aims for 70% local accesses.
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7

1593

1594

/*

1594

/*

1595

* Increase the scan period (slow down scanning) if the majority of

1595

* Increase the scan period (slow down scanning) if the majority of

1596

* our memory is already on our local node, or if the majority of

1596

* our memory is already on our local node, or if the majority of

1597

* the page accesses are shared with other processes.

1597

* the page accesses are shared with other processes.

1598

* Otherwise, decrease the scan period.

1598

* Otherwise, decrease the scan period.

1599

*/

1599

*/

1600

static void update_task_scan_period(struct task_struct *p,

1600

static void update_task_scan_period(struct task_struct *p,

1601

unsigned long shared, unsigned long private)

1601

unsigned long shared, unsigned long private)

1602

{

1602

{

1603

unsigned int period_slot;

1603

unsigned int period_slot;

1604

int ratio;

1604

int ratio;

1605

int diff;

1605

int diff;

1606

1607

unsigned long remote = p->numa_faults_locality[0];

1607

unsigned long remote = p->numa_faults_locality[0];

1608

unsigned long local = p->numa_faults_locality[1];

1608

unsigned long local = p->numa_faults_locality[1];

1609

1610

/*

1610

/*

1611

* If there were no record hinting faults then either the task is

1611

* If there were no record hinting faults then either the task is

1612

* completely idle or all activity is areas that are not of interest

1612

* completely idle or all activity is areas that are not of interest

1613

* to automatic numa balancing. Scan slower

1613

* to automatic numa balancing. Scan slower

1614

*/

1614

*/

1615

if (local + shared == 0) {

1615

if (local + shared == 0) {

1616

p->numa_scan_period = min(p->numa_scan_period_max,

1616

p->numa_scan_period = min(p->numa_scan_period_max,

1617

p->numa_scan_period << 1);

1617

p->numa_scan_period << 1);

1618

1619

p->mm->numa_next_scan = jiffies +

1619

p->mm->numa_next_scan = jiffies +

1620

msecs_to_jiffies(p->numa_scan_period);

1620

msecs_to_jiffies(p->numa_scan_period);

1621

1622

return;

1622

return;

1623

}

1623

}

1624

1625

/*

1625

/*

1626

* Prepare to scale scan period relative to the current period.

1626

* Prepare to scale scan period relative to the current period.

1627

* == NUMA_PERIOD_THRESHOLD scan period stays the same

1627

* == NUMA_PERIOD_THRESHOLD scan period stays the same

1628

* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)

1628

* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)

1629

* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)

1629

* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)

1630

*/

1630

*/

1631

period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);

1631

period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);

1632

ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);

1632

ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);

1633

if (ratio >= NUMA_PERIOD_THRESHOLD) {

1633

if (ratio >= NUMA_PERIOD_THRESHOLD) {

1634

int slot = ratio - NUMA_PERIOD_THRESHOLD;

1634

int slot = ratio - NUMA_PERIOD_THRESHOLD;

1635

if (!slot)

1635

if (!slot)

1636

slot = 1;

1636

slot = 1;

1637

diff = slot * period_slot;

1637

diff = slot * period_slot;

1638

} else {

1638

} else {

1639

diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;

1639

diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;

1640

1641

/*

1641

/*

1642

* Scale scan rate increases based on sharing. There is an

1642

* Scale scan rate increases based on sharing. There is an

1643

* inverse relationship between the degree of sharing and

1643

* inverse relationship between the degree of sharing and

1644

* the adjustment made to the scanning period. Broadly

1644

* the adjustment made to the scanning period. Broadly

1645

* speaking the intent is that there is little point

1645

* speaking the intent is that there is little point

1646

* scanning faster if shared accesses dominate as it may

1646

* scanning faster if shared accesses dominate as it may

1647

* simply bounce migrations uselessly

1647

* simply bounce migrations uselessly

1648

*/

1648

*/

1649

ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));

1649

ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));

1650

diff = (diff * ratio) / NUMA_PERIOD_SLOTS;

1650

diff = (diff * ratio) / NUMA_PERIOD_SLOTS;

1651

}

1651

}

1652

1653

p->numa_scan_period = clamp(p->numa_scan_period + diff,

1653

p->numa_scan_period = clamp(p->numa_scan_period + diff,

1654

task_scan_min(p), task_scan_max(p));

1654

task_scan_min(p), task_scan_max(p));

1655

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

1655

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

1656

}

1656

}

1657

1658

/*

1658

/*

1659

* Get the fraction of time the task has been running since the last

1659

* Get the fraction of time the task has been running since the last

1660

* NUMA placement cycle. The scheduler keeps similar statistics, but

1660

* NUMA placement cycle. The scheduler keeps similar statistics, but

1661

* decays those on a 32ms period, which is orders of magnitude off

1661

* decays those on a 32ms period, which is orders of magnitude off

1662

* from the dozens-of-seconds NUMA balancing period. Use the scheduler

1662

* from the dozens-of-seconds NUMA balancing period. Use the scheduler

1663

* stats only if the task is so new there are no NUMA statistics yet.

1663

* stats only if the task is so new there are no NUMA statistics yet.

1664

*/

1664

*/

1665

static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)

1665

static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)

1666

{

1666

{

1667

u64 runtime, delta, now;

1667

u64 runtime, delta, now;

1668

/* Use the start of this time slice to avoid calculations. */

1668

/* Use the start of this time slice to avoid calculations. */

1669

now = p->se.exec_start;

1669

now = p->se.exec_start;

1670

runtime = p->se.sum_exec_runtime;

1670

runtime = p->se.sum_exec_runtime;

1671

1672

if (p->last_task_numa_placement) {

1672

if (p->last_task_numa_placement) {

1673

delta = runtime - p->last_sum_exec_runtime;

1673

delta = runtime - p->last_sum_exec_runtime;

1674

*period = now - p->last_task_numa_placement;

1674

*period = now - p->last_task_numa_placement;

1675

} else {

1675

} else {

1676

delta = p->se.avg.runnable_avg_sum;

1676

delta = p->se.avg.runnable_avg_sum;

1677

*period = p->se.avg.runnable_avg_period;

1677

*period = p->se.avg.runnable_avg_period;

1678

}

1678

}

1679

1680

p->last_sum_exec_runtime = runtime;

1680

p->last_sum_exec_runtime = runtime;

1681

p->last_task_numa_placement = now;

1681

p->last_task_numa_placement = now;

1682

1683

return delta;

1683

return delta;

1684

}

1684

}

1685

1686

/*

1686

/*

1687

* Determine the preferred nid for a task in a numa_group. This needs to

1687

* Determine the preferred nid for a task in a numa_group. This needs to

1688

* be done in a way that produces consistent results with group_weight,

1688

* be done in a way that produces consistent results with group_weight,

1689

* otherwise workloads might not converge.

1689

* otherwise workloads might not converge.

1690

*/

1690

*/

1691

static int preferred_group_nid(struct task_struct *p, int nid)

1691

static int preferred_group_nid(struct task_struct *p, int nid)

1692

{

1692

{

1693

nodemask_t nodes;

1693

nodemask_t nodes;

1694

int dist;

1694

int dist;

1695

1696

/* Direct connections between all NUMA nodes. */

1696

/* Direct connections between all NUMA nodes. */

1697

if (sched_numa_topology_type == NUMA_DIRECT)

1697

if (sched_numa_topology_type == NUMA_DIRECT)

1698

return nid;

1698

return nid;

1699

1700

/*

1700

/*

1701

* On a system with glueless mesh NUMA topology, group_weight

1701

* On a system with glueless mesh NUMA topology, group_weight

1702

* scores nodes according to the number of NUMA hinting faults on

1702

* scores nodes according to the number of NUMA hinting faults on

1703

* both the node itself, and on nearby nodes.

1703

* both the node itself, and on nearby nodes.

1704

*/

1704

*/

1705

if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {

1705

if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {

1706

unsigned long score, max_score = 0;

1706

unsigned long score, max_score = 0;

1707

int node, max_node = nid;

1707

int node, max_node = nid;

1708

1709

dist = sched_max_numa_distance;

1709

dist = sched_max_numa_distance;

1710

1711

for_each_online_node(node) {

1711

for_each_online_node(node) {

1712

score = group_weight(p, node, dist);

1712

score = group_weight(p, node, dist);

1713

if (score > max_score) {

1713

if (score > max_score) {

1714

max_score = score;

1714

max_score = score;

1715

max_node = node;

1715

max_node = node;

1716

}

1716

}

1717

}

1717

}

1718

return max_node;

1718

return max_node;

1719

}

1719

}

1720

1721

/*

1721

/*

1722

* Finding the preferred nid in a system with NUMA backplane

1722

* Finding the preferred nid in a system with NUMA backplane

1723

* interconnect topology is more involved. The goal is to locate

1723

* interconnect topology is more involved. The goal is to locate

1724

* tasks from numa_groups near each other in the system, and

1724

* tasks from numa_groups near each other in the system, and

1725

* untangle workloads from different sides of the system. This requires

1725

* untangle workloads from different sides of the system. This requires

1726

* searching down the hierarchy of node groups, recursively searching

1726

* searching down the hierarchy of node groups, recursively searching

1727

* inside the highest scoring group of nodes. The nodemask tricks

1727

* inside the highest scoring group of nodes. The nodemask tricks

1728

* keep the complexity of the search down.

1728

* keep the complexity of the search down.

1729

*/

1729

*/

1730

nodes = node_online_map;

1730

nodes = node_online_map;

1731

for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {

1731

for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {

1732

unsigned long max_faults = 0;

1732

unsigned long max_faults = 0;

1733

nodemask_t max_group;

1733

nodemask_t max_group;

1734

int a, b;

1734

int a, b;

1735

1736

/* Are there nodes at this distance from each other? */

1736

/* Are there nodes at this distance from each other? */

1737

if (!find_numa_distance(dist))

1737

if (!find_numa_distance(dist))

1738

continue;

1738

continue;

1739

1740

for_each_node_mask(a, nodes) {

1740

for_each_node_mask(a, nodes) {

1741

unsigned long faults = 0;

1741

unsigned long faults = 0;

1742

nodemask_t this_group;

1742

nodemask_t this_group;

1743

nodes_clear(this_group);

1743

nodes_clear(this_group);

1744

1745

/* Sum group's NUMA faults; includes a==b case. */

1745

/* Sum group's NUMA faults; includes a==b case. */

1746

for_each_node_mask(b, nodes) {

1746

for_each_node_mask(b, nodes) {

1747

if (node_distance(a, b) < dist) {

1747

if (node_distance(a, b) < dist) {

1748

faults += group_faults(p, b);

1748

faults += group_faults(p, b);

1749

node_set(b, this_group);

1749

node_set(b, this_group);

1750

node_clear(b, nodes);

1750

node_clear(b, nodes);

1751

}

1751

}

1752

}

1752

}

1753

1754

/* Remember the top group. */

1754

/* Remember the top group. */

1755

if (faults > max_faults) {

1755

if (faults > max_faults) {

1756

max_faults = faults;

1756

max_faults = faults;

1757

max_group = this_group;

1757

max_group = this_group;

1758

/*

1758

/*

1759

* subtle: at the smallest distance there is

1759

* subtle: at the smallest distance there is

1760

* just one node left in each "group", the

1760

* just one node left in each "group", the

1761

* winner is the preferred nid.

1761

* winner is the preferred nid.

1762

*/

1762

*/

1763

nid = a;

1763

nid = a;

1764

}

1764

}

1765

}

1765

}

1766

/* Next round, evaluate the nodes within max_group. */

1766

/* Next round, evaluate the nodes within max_group. */

1767

nodes = max_group;

1767

nodes = max_group;

1768

}

1768

}

1769

return nid;

1769

return nid;

1770

}

1770

}

1771

1772

static void task_numa_placement(struct task_struct *p)

1772

static void task_numa_placement(struct task_struct *p)

1773

{

1773

{

1774

int seq, nid, max_nid = -1, max_group_nid = -1;

1774

int seq, nid, max_nid = -1, max_group_nid = -1;

1775

unsigned long max_faults = 0, max_group_faults = 0;

1775

unsigned long max_faults = 0, max_group_faults = 0;

1776

unsigned long fault_types[2] = { 0, 0 };

1776

unsigned long fault_types[2] = { 0, 0 };

1777

unsigned long total_faults;

1777

unsigned long total_faults;

1778

u64 runtime, period;

1778

u64 runtime, period;

1779

spinlock_t *group_lock = NULL;

1779

spinlock_t *group_lock = NULL;

1780

1781

seq = ACCESS_ONCE(p->mm->numa_scan_seq);

1781

seq = ACCESS_ONCE(p->mm->numa_scan_seq);

1782

if (p->numa_scan_seq == seq)

1782

if (p->numa_scan_seq == seq)

1783

return;

1783

return;

1784

p->numa_scan_seq = seq;

1784

p->numa_scan_seq = seq;

1785

p->numa_scan_period_max = task_scan_max(p);

1785

p->numa_scan_period_max = task_scan_max(p);

1786

1787

total_faults = p->numa_faults_locality[0] +

1787

total_faults = p->numa_faults_locality[0] +

1788

p->numa_faults_locality[1];

1788

p->numa_faults_locality[1];

1789

runtime = numa_get_avg_runtime(p, &period);

1789

runtime = numa_get_avg_runtime(p, &period);

1790

1791

/* If the task is part of a group prevent parallel updates to group stats */

1791

/* If the task is part of a group prevent parallel updates to group stats */

1792

if (p->numa_group) {

1792

if (p->numa_group) {

1793

group_lock = &p->numa_group->lock;

1793

group_lock = &p->numa_group->lock;

1794

spin_lock_irq(group_lock);

1794

spin_lock_irq(group_lock);

1795

}

1795

}

1796

1797

/* Find the node with the highest number of faults */

1797

/* Find the node with the highest number of faults */

1798

for_each_online_node(nid) {

1798

for_each_online_node(nid) {

1799

/* Keep track of the offsets in numa_faults array */

1799

/* Keep track of the offsets in numa_faults array */

1800

int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;

1800

int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;

1801

unsigned long faults = 0, group_faults = 0;

1801

unsigned long faults = 0, group_faults = 0;

1802

int priv;

1802

int priv;

1803

1804

for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {

1804

for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {

1805

long diff, f_diff, f_weight;

1805

long diff, f_diff, f_weight;

1806

1807

mem_idx = task_faults_idx(NUMA_MEM, nid, priv);

1807

mem_idx = task_faults_idx(NUMA_MEM, nid, priv);

1808

membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);

1808

membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);

1809

cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);

1809

cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);

1810

cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

1810

cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

1811

1812

/* Decay existing window, copy faults since last scan */

1812

/* Decay existing window, copy faults since last scan */

1813

diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;

1813

diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;

1814

fault_types[priv] += p->numa_faults[membuf_idx];

1814

fault_types[priv] += p->numa_faults[membuf_idx];

1815

p->numa_faults[membuf_idx] = 0;

1815

p->numa_faults[membuf_idx] = 0;

1816

1817

/*

1817

/*

1818

* Normalize the faults_from, so all tasks in a group

1818

* Normalize the faults_from, so all tasks in a group

1819

* count according to CPU use, instead of by the raw

1819

* count according to CPU use, instead of by the raw

1820

* number of faults. Tasks with little runtime have

1820

* number of faults. Tasks with little runtime have

1821

* little over-all impact on throughput, and thus their

1821

* little over-all impact on throughput, and thus their

1822

* faults are less important.

1822

* faults are less important.

1823

*/

1823

*/

1824

f_weight = div64_u64(runtime << 16, period + 1);

1824

f_weight = div64_u64(runtime << 16, period + 1);

1825

f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /

1825

f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /

1826

(total_faults + 1);

1826

(total_faults + 1);

1827

f_diff = f_weight - p->numa_faults[cpu_idx] / 2;

1827

f_diff = f_weight - p->numa_faults[cpu_idx] / 2;

1828

p->numa_faults[cpubuf_idx] = 0;

1828

p->numa_faults[cpubuf_idx] = 0;

1829

1830

p->numa_faults[mem_idx] += diff;

1830

p->numa_faults[mem_idx] += diff;

1831

p->numa_faults[cpu_idx] += f_diff;

1831

p->numa_faults[cpu_idx] += f_diff;

1832

faults += p->numa_faults[mem_idx];

1832

faults += p->numa_faults[mem_idx];

1833

p->total_numa_faults += diff;

1833

p->total_numa_faults += diff;

1834

if (p->numa_group) {

1834

if (p->numa_group) {

1835

/*

1835

/*

1836

* safe because we can only change our own group

1836

* safe because we can only change our own group

1837

*

1837

*

1838

* mem_idx represents the offset for a given

1838

* mem_idx represents the offset for a given

1839

* nid and priv in a specific region because it

1839

* nid and priv in a specific region because it

1840

* is at the beginning of the numa_faults array.

1840

* is at the beginning of the numa_faults array.

1841

*/

1841

*/

1842

p->numa_group->faults[mem_idx] += diff;

1842

p->numa_group->faults[mem_idx] += diff;

1843

p->numa_group->faults_cpu[mem_idx] += f_diff;

1843

p->numa_group->faults_cpu[mem_idx] += f_diff;

1844

p->numa_group->total_faults += diff;

1844

p->numa_group->total_faults += diff;

1845

group_faults += p->numa_group->faults[mem_idx];

1845

group_faults += p->numa_group->faults[mem_idx];

1846

}

1846

}

1847

}

1847

}

1848

1849

if (faults > max_faults) {

1849

if (faults > max_faults) {

1850

max_faults = faults;

1850

max_faults = faults;

1851

max_nid = nid;

1851

max_nid = nid;

1852

}

1852

}

1853

1854

if (group_faults > max_group_faults) {

1854

if (group_faults > max_group_faults) {

1855

max_group_faults = group_faults;

1855

max_group_faults = group_faults;

1856

max_group_nid = nid;

1856

max_group_nid = nid;

1857

}

1857

}

1858

}

1858

}

1859

1860

update_task_scan_period(p, fault_types[0], fault_types[1]);

1860

update_task_scan_period(p, fault_types[0], fault_types[1]);

1861

1862

if (p->numa_group) {

1862

if (p->numa_group) {

1863

update_numa_active_node_mask(p->numa_group);

1863

update_numa_active_node_mask(p->numa_group);

1864

spin_unlock_irq(group_lock);

1864

spin_unlock_irq(group_lock);

1865

max_nid = preferred_group_nid(p, max_group_nid);

1865

max_nid = preferred_group_nid(p, max_group_nid);

1866

}

1866

}

1867

1868

if (max_faults) {

1868

if (max_faults) {

1869

/* Set the new preferred node */

1869

/* Set the new preferred node */

1870

if (max_nid != p->numa_preferred_nid)

1870

if (max_nid != p->numa_preferred_nid)

1871

sched_setnuma(p, max_nid);

1871

sched_setnuma(p, max_nid);

1872

1873

if (task_node(p) != p->numa_preferred_nid)

1873

if (task_node(p) != p->numa_preferred_nid)

1874

numa_migrate_preferred(p);

1874

numa_migrate_preferred(p);

1875

}

1875

}

1876

}

1876

}

1877

1878

static inline int get_numa_group(struct numa_group *grp)

1878

static inline int get_numa_group(struct numa_group *grp)

1879

{

1879

{

1880

return atomic_inc_not_zero(&grp->refcount);

1880

return atomic_inc_not_zero(&grp->refcount);

1881

}

1881

}

1882

1883

static inline void put_numa_group(struct numa_group *grp)

1883

static inline void put_numa_group(struct numa_group *grp)

1884

{

1884

{

1885

if (atomic_dec_and_test(&grp->refcount))

1885

if (atomic_dec_and_test(&grp->refcount))

1886

kfree_rcu(grp, rcu);

1886

kfree_rcu(grp, rcu);

1887

}

1887

}

1888

1889

static void task_numa_group(struct task_struct *p, int cpupid, int flags,

1889

static void task_numa_group(struct task_struct *p, int cpupid, int flags,

1890

int *priv)

1890

int *priv)

1891

{

1891

{

1892

struct numa_group *grp, *my_grp;

1892

struct numa_group *grp, *my_grp;

1893

struct task_struct *tsk;

1893

struct task_struct *tsk;

1894

bool join = false;

1894

bool join = false;

1895

int cpu = cpupid_to_cpu(cpupid);

1895

int cpu = cpupid_to_cpu(cpupid);

1896

int i;

1896

int i;

1897

1898

if (unlikely(!p->numa_group)) {

1898

if (unlikely(!p->numa_group)) {

1899

unsigned int size = sizeof(struct numa_group) +

1899

unsigned int size = sizeof(struct numa_group) +

1900

4*nr_node_ids*sizeof(unsigned long);

1900

4*nr_node_ids*sizeof(unsigned long);

1901

1902

grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

1902

grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

1903

if (!grp)

1903

if (!grp)

1904

return;

1904

return;

1905

1906

atomic_set(&grp->refcount, 1);

1906

atomic_set(&grp->refcount, 1);

1907

spin_lock_init(&grp->lock);

1907

spin_lock_init(&grp->lock);

1908

grp->gid = p->pid;

1908

grp->gid = p->pid;

1909

/* Second half of the array tracks nids where faults happen */

1909

/* Second half of the array tracks nids where faults happen */

1910

grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *

1910

grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *

1911

nr_node_ids;

1911

nr_node_ids;

1912

1913

node_set(task_node(current), grp->active_nodes);

1913

node_set(task_node(current), grp->active_nodes);

1914

1915

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

1915

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

1916

grp->faults[i] = p->numa_faults[i];

1916

grp->faults[i] = p->numa_faults[i];

1917

1918

grp->total_faults = p->total_numa_faults;

1918

grp->total_faults = p->total_numa_faults;

1919

1920

grp->nr_tasks++;

1920

grp->nr_tasks++;

1921

rcu_assign_pointer(p->numa_group, grp);

1921

rcu_assign_pointer(p->numa_group, grp);

1922

}

1922

}

1923

1924

rcu_read_lock();

1924

rcu_read_lock();

1925

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

1925

tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);

1926

1927

if (!cpupid_match_pid(tsk, cpupid))

1927

if (!cpupid_match_pid(tsk, cpupid))

1928

goto no_join;

1928

goto no_join;

1929

1930

grp = rcu_dereference(tsk->numa_group);

1930

grp = rcu_dereference(tsk->numa_group);

1931

if (!grp)

1931

if (!grp)

1932

goto no_join;

1932

goto no_join;

1933

1934

my_grp = p->numa_group;

1934

my_grp = p->numa_group;

1935

if (grp == my_grp)

1935

if (grp == my_grp)

1936

goto no_join;

1936

goto no_join;

1937

1938

/*

1938

/*

1939

* Only join the other group if its bigger; if we're the bigger group,

1939

* Only join the other group if its bigger; if we're the bigger group,

1940

* the other task will join us.

1940

* the other task will join us.

1941

*/

1941

*/

1942

if (my_grp->nr_tasks > grp->nr_tasks)

1942

if (my_grp->nr_tasks > grp->nr_tasks)

1943

goto no_join;

1943

goto no_join;

1944

1945

/*

1945

/*

1946

* Tie-break on the grp address.

1946

* Tie-break on the grp address.

1947

*/

1947

*/

1948

if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)

1948

if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)

1949

goto no_join;

1949

goto no_join;

1950

1951

/* Always join threads in the same process. */

1951

/* Always join threads in the same process. */

1952

if (tsk->mm == current->mm)

1952

if (tsk->mm == current->mm)

1953

join = true;

1953

join = true;

1954

1955

/* Simple filter to avoid false positives due to PID collisions */

1955

/* Simple filter to avoid false positives due to PID collisions */

1956

if (flags & TNF_SHARED)

1956

if (flags & TNF_SHARED)

1957

join = true;

1957

join = true;

1958

1959

/* Update priv based on whether false sharing was detected */

1959

/* Update priv based on whether false sharing was detected */

1960

*priv = !join;

1960

*priv = !join;

1961

1962

if (join && !get_numa_group(grp))

1962

if (join && !get_numa_group(grp))

1963

goto no_join;

1963

goto no_join;

1964

1965

rcu_read_unlock();

1965

rcu_read_unlock();

1966

1967

if (!join)

1967

if (!join)

1968

return;

1968

return;

1969

1970

BUG_ON(irqs_disabled());

1970

BUG_ON(irqs_disabled());

1971

double_lock_irq(&my_grp->lock, &grp->lock);

1971

double_lock_irq(&my_grp->lock, &grp->lock);

1972

1973

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {

1973

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {

1974

my_grp->faults[i] -= p->numa_faults[i];

1974

my_grp->faults[i] -= p->numa_faults[i];

1975

grp->faults[i] += p->numa_faults[i];

1975

grp->faults[i] += p->numa_faults[i];

1976

}

1976

}

1977

my_grp->total_faults -= p->total_numa_faults;

1977

my_grp->total_faults -= p->total_numa_faults;

1978

grp->total_faults += p->total_numa_faults;

1978

grp->total_faults += p->total_numa_faults;

1979

1980

my_grp->nr_tasks--;

1980

my_grp->nr_tasks--;

1981

grp->nr_tasks++;

1981

grp->nr_tasks++;

1982

1983

spin_unlock(&my_grp->lock);

1983

spin_unlock(&my_grp->lock);

1984

spin_unlock_irq(&grp->lock);

1984

spin_unlock_irq(&grp->lock);

1985

1986

rcu_assign_pointer(p->numa_group, grp);

1986

rcu_assign_pointer(p->numa_group, grp);

1987

1988

put_numa_group(my_grp);

1988

put_numa_group(my_grp);

1989

return;

1989

return;

1990

1991

no_join:

1991

no_join:

1992

rcu_read_unlock();

1992

rcu_read_unlock();

1993

return;

1993

return;

1994

}

1994

}

1995

1996

void task_numa_free(struct task_struct *p)

1996

void task_numa_free(struct task_struct *p)

1997

{

1997

{

1998

struct numa_group *grp = p->numa_group;

1998

struct numa_group *grp = p->numa_group;

1999

void *numa_faults = p->numa_faults;

1999

void *numa_faults = p->numa_faults;

2000

unsigned long flags;

2000

unsigned long flags;

2001

int i;

2001

int i;

2002

2003

if (grp) {

2003

if (grp) {

2004

spin_lock_irqsave(&grp->lock, flags);

2004

spin_lock_irqsave(&grp->lock, flags);

2005

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

2005

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)

2006

grp->faults[i] -= p->numa_faults[i];

2006

grp->faults[i] -= p->numa_faults[i];

2007

grp->total_faults -= p->total_numa_faults;

2007

grp->total_faults -= p->total_numa_faults;

2008

2009

grp->nr_tasks--;

2009

grp->nr_tasks--;

2010

spin_unlock_irqrestore(&grp->lock, flags);

2010

spin_unlock_irqrestore(&grp->lock, flags);

2011

RCU_INIT_POINTER(p->numa_group, NULL);

2011

RCU_INIT_POINTER(p->numa_group, NULL);

2012

put_numa_group(grp);

2012

put_numa_group(grp);

2013

}

2013

}

2014

2015

p->numa_faults = NULL;

2015

p->numa_faults = NULL;

2016

kfree(numa_faults);

2016

kfree(numa_faults);

2017

}

2017

}

2018

2019

/*

2019

/*

2020

* Got a PROT_NONE fault for a page on @node.

2020

* Got a PROT_NONE fault for a page on @node.

2021

*/

2021

*/

2022

void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

2022

void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)

2023

{

2023

{

2024

struct task_struct *p = current;

2024

struct task_struct *p = current;

2025

bool migrated = flags & TNF_MIGRATED;

2025

bool migrated = flags & TNF_MIGRATED;

2026

int cpu_node = task_node(current);

2026

int cpu_node = task_node(current);

2027

int local = !!(flags & TNF_FAULT_LOCAL);

2027

int local = !!(flags & TNF_FAULT_LOCAL);

2028

int priv;

2028

int priv;

2029

2030

if (!numabalancing_enabled)

2030

if (!numabalancing_enabled)

2031

return;

2031

return;

2032

2033

/* for example, ksmd faulting in a user's mm */

2033

/* for example, ksmd faulting in a user's mm */

2034

if (!p->mm)

2034

if (!p->mm)

2035

return;

2035

return;

2036

2037

/* Allocate buffer to track faults on a per-node basis */

2037

/* Allocate buffer to track faults on a per-node basis */

2038

if (unlikely(!p->numa_faults)) {

2038

if (unlikely(!p->numa_faults)) {

2039

int size = sizeof(*p->numa_faults) *

2039

int size = sizeof(*p->numa_faults) *

2040

NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

2040

NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

2041

2042

p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);

2042

p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);

2043

if (!p->numa_faults)

2043

if (!p->numa_faults)

2044

return;

2044

return;

2045

2046

p->total_numa_faults = 0;

2046

p->total_numa_faults = 0;

2047

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

2047

memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));

2048

}

2048

}

2049

2050

/*

2050

/*

2051

* First accesses are treated as private, otherwise consider accesses

2051

* First accesses are treated as private, otherwise consider accesses

2052

* to be private if the accessing pid has not changed

2052

* to be private if the accessing pid has not changed

2053

*/

2053

*/

2054

if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {

2054

if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {

2055

priv = 1;

2055

priv = 1;

2056

} else {

2056

} else {

2057

priv = cpupid_match_pid(p, last_cpupid);

2057

priv = cpupid_match_pid(p, last_cpupid);

2058

if (!priv && !(flags & TNF_NO_GROUP))

2058

if (!priv && !(flags & TNF_NO_GROUP))

2059

task_numa_group(p, last_cpupid, flags, &priv);

2059

task_numa_group(p, last_cpupid, flags, &priv);

2060

}

2060

}

2061

2062

/*

2062

/*

2063

* If a workload spans multiple NUMA nodes, a shared fault that

2063

* If a workload spans multiple NUMA nodes, a shared fault that

2064

* occurs wholly within the set of nodes that the workload is

2064

* occurs wholly within the set of nodes that the workload is

2065

* actively using should be counted as local. This allows the

2065

* actively using should be counted as local. This allows the

2066

* scan rate to slow down when a workload has settled down.

2066

* scan rate to slow down when a workload has settled down.

2067

*/

2067

*/

2068

if (!priv && !local && p->numa_group &&

2068

if (!priv && !local && p->numa_group &&

2069

node_isset(cpu_node, p->numa_group->active_nodes) &&

2069

node_isset(cpu_node, p->numa_group->active_nodes) &&

2070

node_isset(mem_node, p->numa_group->active_nodes))

2070

node_isset(mem_node, p->numa_group->active_nodes))

2071

local = 1;

2071

local = 1;

2072

2073

task_numa_placement(p);

2073

task_numa_placement(p);

2074

2075

/*

2075

/*

2076

* Retry task to preferred node migration periodically, in case it

2076

* Retry task to preferred node migration periodically, in case it

2077

* case it previously failed, or the scheduler moved us.

2077

* case it previously failed, or the scheduler moved us.

2078

*/

2078

*/

2079

if (time_after(jiffies, p->numa_migrate_retry))

2079

if (time_after(jiffies, p->numa_migrate_retry))

2080

numa_migrate_preferred(p);

2080

numa_migrate_preferred(p);

2081

2082

if (migrated)

2082

if (migrated)

2083

p->numa_pages_migrated += pages;

2083

p->numa_pages_migrated += pages;

2084

2085

p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;

2085

p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;

2086

p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;

2086

p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;

2087

p->numa_faults_locality[local] += pages;

2087

p->numa_faults_locality[local] += pages;

2088

}

2088

}

2089

2090

static void reset_ptenuma_scan(struct task_struct *p)

2090

static void reset_ptenuma_scan(struct task_struct *p)

2091

{

2091

{

2092

ACCESS_ONCE(p->mm->numa_scan_seq)++;

2092

ACCESS_ONCE(p->mm->numa_scan_seq)++;

2093

p->mm->numa_scan_offset = 0;

2093

p->mm->numa_scan_offset = 0;

2094

}

2094

}

2095

2096

/*

2096

/*

2097

* The expensive part of numa migration is done from task_work context.

2097

* The expensive part of numa migration is done from task_work context.

2098

* Triggered from task_tick_numa().

2098

* Triggered from task_tick_numa().

2099

*/

2099

*/

2100

void task_numa_work(struct callback_head *work)

2100

void task_numa_work(struct callback_head *work)

2101

{

2101

{

2102

unsigned long migrate, next_scan, now = jiffies;

2102

unsigned long migrate, next_scan, now = jiffies;

2103

struct task_struct *p = current;

2103

struct task_struct *p = current;

2104

struct mm_struct *mm = p->mm;

2104

struct mm_struct *mm = p->mm;

2105

struct vm_area_struct *vma;

2105

struct vm_area_struct *vma;

2106

unsigned long start, end;

2106

unsigned long start, end;

2107

unsigned long nr_pte_updates = 0;

2107

unsigned long nr_pte_updates = 0;

2108

long pages;

2108

long pages;

2109

2110

WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

2110

WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

2111

2112

work->next = work; /* protect against double add */

2112

work->next = work; /* protect against double add */

2113

/*

2113

/*

2114

* Who cares about NUMA placement when they're dying.

2114

* Who cares about NUMA placement when they're dying.

2115

*

2115

*

2116

* NOTE: make sure not to dereference p->mm before this check,

2116

* NOTE: make sure not to dereference p->mm before this check,

2117

* exit_task_work() happens _after_ exit_mm() so we could be called

2117

* exit_task_work() happens _after_ exit_mm() so we could be called

2118

* without p->mm even though we still had it when we enqueued this

2118

* without p->mm even though we still had it when we enqueued this

2119

* work.

2119

* work.

2120

*/

2120

*/

2121

if (p->flags & PF_EXITING)

2121

if (p->flags & PF_EXITING)

2122

return;

2122

return;

2123

2124

if (!mm->numa_next_scan) {

2124

if (!mm->numa_next_scan) {

2125

mm->numa_next_scan = now +

2125

mm->numa_next_scan = now +

2126

msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

2126

msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

2127

}

2127

}

2128

2129

/*

2129

/*

2130

* Enforce maximal scan/migration frequency..

2130

* Enforce maximal scan/migration frequency..

2131

*/

2131

*/

2132

migrate = mm->numa_next_scan;

2132

migrate = mm->numa_next_scan;

2133

if (time_before(now, migrate))

2133

if (time_before(now, migrate))

2134

return;

2134

return;

2135

2136

if (p->numa_scan_period == 0) {

2136

if (p->numa_scan_period == 0) {

2137

p->numa_scan_period_max = task_scan_max(p);

2137

p->numa_scan_period_max = task_scan_max(p);

2138

p->numa_scan_period = task_scan_min(p);

2138

p->numa_scan_period = task_scan_min(p);

2139

}

2139

}

2140

2141

next_scan = now + msecs_to_jiffies(p->numa_scan_period);

2141

next_scan = now + msecs_to_jiffies(p->numa_scan_period);

2142

if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)

2142

if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)

2143

return;

2143

return;

2144

2145

/*

2145

/*

2146

* Delay this task enough that another task of this mm will likely win

2146

* Delay this task enough that another task of this mm will likely win

2147

* the next time around.

2147

* the next time around.

2148

*/

2148

*/

2149

p->node_stamp += 2 * TICK_NSEC;

2149

p->node_stamp += 2 * TICK_NSEC;

2150

2151

start = mm->numa_scan_offset;

2151

start = mm->numa_scan_offset;

2152

pages = sysctl_numa_balancing_scan_size;

2152

pages = sysctl_numa_balancing_scan_size;

2153

pages <<= 20 - PAGE_SHIFT; /* MB in pages */

2153

pages <<= 20 - PAGE_SHIFT; /* MB in pages */

2154

if (!pages)

2154

if (!pages)

2155

return;

2155

return;

2156

2157

down_read(&mm->mmap_sem);

2157

down_read(&mm->mmap_sem);

2158

vma = find_vma(mm, start);

2158

vma = find_vma(mm, start);

2159

if (!vma) {

2159

if (!vma) {

2160

reset_ptenuma_scan(p);

2160

reset_ptenuma_scan(p);

2161

start = 0;

2161

start = 0;

2162

vma = mm->mmap;

2162

vma = mm->mmap;

2163

}

2163

}

2164

for (; vma; vma = vma->vm_next) {

2164

for (; vma; vma = vma->vm_next) {

2165

if (!vma_migratable(vma) || !vma_policy_mof(vma))

2165

if (!vma_migratable(vma) || !vma_policy_mof(vma))

2166

continue;

2166

continue;

2167

2168

/*

2168

/*

2169

* Shared library pages mapped by multiple processes are not

2169

* Shared library pages mapped by multiple processes are not

2170

* migrated as it is expected they are cache replicated. Avoid

2170

* migrated as it is expected they are cache replicated. Avoid

2171

* hinting faults in read-only file-backed mappings or the vdso

2171

* hinting faults in read-only file-backed mappings or the vdso

2172

* as migrating the pages will be of marginal benefit.

2172

* as migrating the pages will be of marginal benefit.

2173

*/

2173

*/

2174

if (!vma->vm_mm ||

2174

if (!vma->vm_mm ||

2175

(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))

2175

(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))

2176

continue;

2176

continue;

2177

2178

/*

2178

/*

2179

* Skip inaccessible VMAs to avoid any confusion between

2179

* Skip inaccessible VMAs to avoid any confusion between

2180

* PROT_NONE and NUMA hinting ptes

2180

* PROT_NONE and NUMA hinting ptes

2181

*/

2181

*/

2182

if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))

2182

if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))

2183

continue;

2183

continue;

2184

2185

do {

2185

do {

2186

start = max(start, vma->vm_start);

2186

start = max(start, vma->vm_start);

2187

end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);

2187

end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);

2188

end = min(end, vma->vm_end);

2188

end = min(end, vma->vm_end);

2189

nr_pte_updates += change_prot_numa(vma, start, end);

2189

nr_pte_updates += change_prot_numa(vma, start, end);

2190

2191

/*

2191

/*

2192

* Scan sysctl_numa_balancing_scan_size but ensure that

2192

* Scan sysctl_numa_balancing_scan_size but ensure that

2193

* at least one PTE is updated so that unused virtual

2193

* at least one PTE is updated so that unused virtual

2194

* address space is quickly skipped.

2194

* address space is quickly skipped.

2195

*/

2195

*/

2196

if (nr_pte_updates)

2196

if (nr_pte_updates)

2197

pages -= (end - start) >> PAGE_SHIFT;

2197

pages -= (end - start) >> PAGE_SHIFT;

2198

2199

start = end;

2199

start = end;

2200

if (pages <= 0)

2200

if (pages <= 0)

2201

goto out;

2201

goto out;

2202

2203

cond_resched();

2203

cond_resched();

2204

} while (end != vma->vm_end);

2204

} while (end != vma->vm_end);

2205

}

2205

}

2206

2207

out:

2207

out:

2208

/*

2208

/*

2209

* It is possible to reach the end of the VMA list but the last few

2209

* It is possible to reach the end of the VMA list but the last few

2210

* VMAs are not guaranteed to the vma_migratable. If they are not, we

2210

* VMAs are not guaranteed to the vma_migratable. If they are not, we

2211

* would find the !migratable VMA on the next scan but not reset the

2211

* would find the !migratable VMA on the next scan but not reset the

2212

* scanner to the start so check it now.

2212

* scanner to the start so check it now.

2213

*/

2213

*/

2214

if (vma)

2214

if (vma)

2215

mm->numa_scan_offset = start;

2215

mm->numa_scan_offset = start;

2216

else

2216

else

2217

reset_ptenuma_scan(p);

2217

reset_ptenuma_scan(p);

2218

up_read(&mm->mmap_sem);

2218

up_read(&mm->mmap_sem);

2219

}

2219

}

2220

2221

/*

2221

/*

2222

* Drive the periodic memory faults..

2222

* Drive the periodic memory faults..

2223

*/

2223

*/

2224

void task_tick_numa(struct rq *rq, struct task_struct *curr)

2224

void task_tick_numa(struct rq *rq, struct task_struct *curr)

2225

{

2225

{

2226

struct callback_head *work = &curr->numa_work;

2226

struct callback_head *work = &curr->numa_work;

2227

u64 period, now;

2227

u64 period, now;

2228

2229

/*

2229

/*

2230

* We don't care about NUMA placement if we don't have memory.

2230

* We don't care about NUMA placement if we don't have memory.

2231

*/

2231

*/

2232

if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)

2232

if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)

2233

return;

2233

return;

2234

2235

/*

2235

/*

2236

* Using runtime rather than walltime has the dual advantage that

2236

* Using runtime rather than walltime has the dual advantage that

2237

* we (mostly) drive the selection from busy threads and that the

2237

* we (mostly) drive the selection from busy threads and that the

2238

* task needs to have done some actual work before we bother with

2238

* task needs to have done some actual work before we bother with

2239

* NUMA placement.

2239

* NUMA placement.

2240

*/

2240

*/

2241

now = curr->se.sum_exec_runtime;

2241

now = curr->se.sum_exec_runtime;

2242

period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

2242

period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

2243

2244

if (now - curr->node_stamp > period) {

2244

if (now - curr->node_stamp > period) {

2245

if (!curr->node_stamp)

2245

if (!curr->node_stamp)

2246

curr->numa_scan_period = task_scan_min(curr);

2246

curr->numa_scan_period = task_scan_min(curr);

2247

curr->node_stamp += period;

2247

curr->node_stamp += period;

2248

2249

if (!time_before(jiffies, curr->mm->numa_next_scan)) {

2249

if (!time_before(jiffies, curr->mm->numa_next_scan)) {

2250

init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */

2250

init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */

2251

task_work_add(curr, work, true);

2251

task_work_add(curr, work, true);

2252

}

2252

}

2253

}

2253

}

2254

}

2254

}

2255

#else

2255

#else

2256

/* !CONFIG_NUMA_BALANCING: the periodic NUMA tick hook is a no-op. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}

/* !CONFIG_NUMA_BALANCING: no NUMA accounting is done on enqueue. */
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}

/* !CONFIG_NUMA_BALANCING: no NUMA accounting is done on dequeue. */
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}

2267

#endif /* CONFIG_NUMA_BALANCING */

2267

#endif /* CONFIG_NUMA_BALANCING */

2268

2269

static void

2269

static void

2270

account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

2270

account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)

2271

{

2271

{

2272

update_load_add(&cfs_rq->load, se->load.weight);

2272

update_load_add(&cfs_rq->load, se->load.weight);

2273

if (!parent_entity(se))

2273

if (!parent_entity(se))

2274

update_load_add(&rq_of(cfs_rq)->load, se->load.weight);

2274

update_load_add(&rq_of(cfs_rq)->load, se->load.weight);

2275

#ifdef CONFIG_SMP

2275

#ifdef CONFIG_SMP

2276

if (entity_is_task(se)) {

2276

if (entity_is_task(se)) {

2277

struct rq *rq = rq_of(cfs_rq);

2277

struct rq *rq = rq_of(cfs_rq);

2278

2279

account_numa_enqueue(rq, task_of(se));

2279

account_numa_enqueue(rq, task_of(se));

2280

list_add(&se->group_node, &rq->cfs_tasks);

2280

list_add(&se->group_node, &rq->cfs_tasks);

2281

}

2281

}

2282

#endif

2282

#endif

2283

cfs_rq->nr_running++;

2283

cfs_rq->nr_running++;

2284

}

2284

}

2285

2286

static void

2286

static void

2287

account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

2287

account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)

2288

{

2288

{

2289

update_load_sub(&cfs_rq->load, se->load.weight);

2289

update_load_sub(&cfs_rq->load, se->load.weight);

2290

if (!parent_entity(se))

2290

if (!parent_entity(se))

2291

update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);

2291

update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);

2292

if (entity_is_task(se)) {

2292

if (entity_is_task(se)) {

2293

account_numa_dequeue(rq_of(cfs_rq), task_of(se));

2293

account_numa_dequeue(rq_of(cfs_rq), task_of(se));

2294

list_del_init(&se->group_node);

2294

list_del_init(&se->group_node);

2295

}

2295

}

2296

cfs_rq->nr_running--;

2296

cfs_rq->nr_running--;

2297

}

2297

}

2298

2299

#ifdef CONFIG_FAIR_GROUP_SCHED

2299

#ifdef CONFIG_FAIR_GROUP_SCHED

2300

# ifdef CONFIG_SMP

2300

# ifdef CONFIG_SMP

2301

static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)

2301

static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)

2302

{

2302

{

2303

long tg_weight;

2303

long tg_weight;

2304

2305

/*

2305

/*

2306

* Use this CPU's actual weight instead of the last load_contribution

2306

* Use this CPU's actual weight instead of the last load_contribution

2307

* to gain a more accurate current total weight. See

2307

* to gain a more accurate current total weight. See

2308

* update_cfs_rq_load_contribution().

2308

* update_cfs_rq_load_contribution().

2309

*/

2309

*/

2310

tg_weight = atomic_long_read(&tg->load_avg);

2310

tg_weight = atomic_long_read(&tg->load_avg);

2311

tg_weight -= cfs_rq->tg_load_contrib;

2311

tg_weight -= cfs_rq->tg_load_contrib;

2312

tg_weight += cfs_rq->load.weight;

2312

tg_weight += cfs_rq->load.weight;

2313

2314

return tg_weight;

2314

return tg_weight;

2315

}

2315

}

2316

2317

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)

2317

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)

2318

{

2318

{

2319

long tg_weight, load, shares;

2319

long tg_weight, load, shares;

2320

2321

tg_weight = calc_tg_weight(tg, cfs_rq);

2321

tg_weight = calc_tg_weight(tg, cfs_rq);

2322

load = cfs_rq->load.weight;

2322

load = cfs_rq->load.weight;

2323

2324

shares = (tg->shares * load);

2324

shares = (tg->shares * load);

2325

if (tg_weight)

2325

if (tg_weight)

2326

shares /= tg_weight;

2326

shares /= tg_weight;

2327

2328

if (shares < MIN_SHARES)

2328

if (shares < MIN_SHARES)

2329

shares = MIN_SHARES;

2329

shares = MIN_SHARES;

2330

if (shares > tg->shares)

2330

if (shares > tg->shares)

2331

shares = tg->shares;

2331

shares = tg->shares;

2332

2333

return shares;

2333

return shares;

2334

}

2334

}

2335

# else /* CONFIG_SMP */

2335

# else /* CONFIG_SMP */

2336

/* !CONFIG_SMP: the cfs_rq simply gets the group's full configured shares. */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	return tg->shares;
}

2340

# endif /* CONFIG_SMP */

2340

# endif /* CONFIG_SMP */

2341

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,

2341

static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,

2342

unsigned long weight)

2342

unsigned long weight)

2343

{

2343

{

2344

if (se->on_rq) {

2344

if (se->on_rq) {

2345

/* commit outstanding execution time */

2345

/* commit outstanding execution time */

2346

if (cfs_rq->curr == se)

2346

if (cfs_rq->curr == se)

2347

update_curr(cfs_rq);

2347

update_curr(cfs_rq);

2348

account_entity_dequeue(cfs_rq, se);

2348

account_entity_dequeue(cfs_rq, se);

2349

}

2349

}

2350

2351

update_load_set(&se->load, weight);

2351

update_load_set(&se->load, weight);

2352

2353

if (se->on_rq)

2353

if (se->on_rq)

2354

account_entity_enqueue(cfs_rq, se);

2354

account_entity_enqueue(cfs_rq, se);

2355

}

2355

}

2356

2357

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

2357

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

2358

2359

static void update_cfs_shares(struct cfs_rq *cfs_rq)

2359

static void update_cfs_shares(struct cfs_rq *cfs_rq)

2360

{

2360

{

2361

struct task_group *tg;

2361

struct task_group *tg;

2362

struct sched_entity *se;

2362

struct sched_entity *se;

2363

long shares;

2363

long shares;

2364

2365

tg = cfs_rq->tg;

2365

tg = cfs_rq->tg;

2366

se = tg->se[cpu_of(rq_of(cfs_rq))];

2366

se = tg->se[cpu_of(rq_of(cfs_rq))];

2367

if (!se || throttled_hierarchy(cfs_rq))

2367

if (!se || throttled_hierarchy(cfs_rq))

2368

return;

2368

return;

2369

#ifndef CONFIG_SMP

2369

#ifndef CONFIG_SMP

2370

if (likely(se->load.weight == tg->shares))

2370

if (likely(se->load.weight == tg->shares))

2371

return;

2371

return;

2372

#endif

2372

#endif

2373

shares = calc_cfs_shares(cfs_rq, tg);

2373

shares = calc_cfs_shares(cfs_rq, tg);

2374

2375

reweight_entity(cfs_rq_of(se), se, shares);

2375

reweight_entity(cfs_rq_of(se), se, shares);

2376

}

2376

}

2377

#else /* CONFIG_FAIR_GROUP_SCHED */

2377

#else /* CONFIG_FAIR_GROUP_SCHED */

2378

/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to update. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}

2381

#endif /* CONFIG_FAIR_GROUP_SCHED */

2381

#endif /* CONFIG_FAIR_GROUP_SCHED */

2382

2383

#ifdef CONFIG_SMP

2383

#ifdef CONFIG_SMP

2384

/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */

/*
 * Precomputed fixed inverse multiplies for multiplication by y^n.
 * Indexed by n in [0, LOAD_AVG_PERIOD); decay_load() multiplies by the entry
 * and shifts the product right by 32.
 */
static const u32 runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

/*
 * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
 * over-estimates when re-combining.
 * Indexed by n in [0, LOAD_AVG_PERIOD], i.e. LOAD_AVG_PERIOD + 1 entries.
 */
static const u32 runnable_avg_yN_sum[] = {
	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};

2411

2412

/*

2412

/*

2413

* Approximate:

2413

* Approximate:

2414

* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)

2414

* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)

2415

*/

2415

*/

2416

static __always_inline u64 decay_load(u64 val, u64 n)

2416

static __always_inline u64 decay_load(u64 val, u64 n)

2417

{

2417

{

2418

unsigned int local_n;

2418

unsigned int local_n;

2419

2420

if (!n)

2420

if (!n)

2421

return val;

2421

return val;

2422

else if (unlikely(n > LOAD_AVG_PERIOD * 63))

2422

else if (unlikely(n > LOAD_AVG_PERIOD * 63))

2423

return 0;

2423

return 0;

2424

2425

/* after bounds checking we can collapse to 32-bit */

2425

/* after bounds checking we can collapse to 32-bit */

2426

local_n = n;

2426

local_n = n;

2427

2428

/*

2428

/*

2429

* As y^PERIOD = 1/2, we can combine

2429

* As y^PERIOD = 1/2, we can combine

2430

* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)

2430

* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)

2431

* With a look-up table which covers y^n (n<PERIOD)

2431

* With a look-up table which covers y^n (n<PERIOD)

2432

*

2432

*

2433

* To achieve constant time decay_load.

2433

* To achieve constant time decay_load.

2434

*/

2434

*/

2435

if (unlikely(local_n >= LOAD_AVG_PERIOD)) {

2435

if (unlikely(local_n >= LOAD_AVG_PERIOD)) {

2436

val >>= local_n / LOAD_AVG_PERIOD;

2436

val >>= local_n / LOAD_AVG_PERIOD;

2437

local_n %= LOAD_AVG_PERIOD;

2437

local_n %= LOAD_AVG_PERIOD;

2438

}

2438

}

2439

2440

val *= runnable_avg_yN_inv[local_n];

2440

val *= runnable_avg_yN_inv[local_n];

2441

/* We don't use SRR here since we always want to round down. */

2441

/* We don't use SRR here since we always want to round down. */

2442

return val >> 32;

2442

return val >> 32;

2443

}

2443

}

2444

2445

/*

2445

/*

2446

* For updates fully spanning n periods, the contribution to runnable

2446

* For updates fully spanning n periods, the contribution to runnable

2447

* average will be: \Sum 1024*y^n

2447

* average will be: \Sum 1024*y^n

2448

*

2448

*

2449

* We can compute this reasonably efficiently by combining:

2449

* We can compute this reasonably efficiently by combining:

2450

* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}

2450

* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}

2451

*/

2451

*/

2452

static u32 __compute_runnable_contrib(u64 n)

2452

static u32 __compute_runnable_contrib(u64 n)

2453

{

2453

{

2454

u32 contrib = 0;

2454

u32 contrib = 0;

2455

2456

if (likely(n <= LOAD_AVG_PERIOD))

2456

if (likely(n <= LOAD_AVG_PERIOD))

2457

return runnable_avg_yN_sum[n];

2457

return runnable_avg_yN_sum[n];

2458

else if (unlikely(n >= LOAD_AVG_MAX_N))

2458

else if (unlikely(n >= LOAD_AVG_MAX_N))

2459

return LOAD_AVG_MAX;

2459

return LOAD_AVG_MAX;

2460

2461

/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */

2461

/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */

2462

do {

2462

do {

2463

contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */

2463

contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */

2464

contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

2464

contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

2465

2466

n -= LOAD_AVG_PERIOD;

2466

n -= LOAD_AVG_PERIOD;

2467

} while (n > LOAD_AVG_PERIOD);

2467

} while (n > LOAD_AVG_PERIOD);

2468

2469

contrib = decay_load(contrib, n);

2469

contrib = decay_load(contrib, n);

2470

return contrib + runnable_avg_yN_sum[n];

2470

return contrib + runnable_avg_yN_sum[n];

2471

}

2471

}

2472

2473

/*

2473

/*

2474

* We can represent the historical contribution to runnable average as the

2474

* We can represent the historical contribution to runnable average as the

2475

* coefficients of a geometric series. To do this we sub-divide our runnable

2475

* coefficients of a geometric series. To do this we sub-divide our runnable

2476

* history into segments of approximately 1ms (1024us); label the segment that

2476

* history into segments of approximately 1ms (1024us); label the segment that

2477

* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.

2477

* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.

2478

*

2478

*

2479

* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...

2479

* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...

2480

* p0 p1 p2

2480

* p0 p1 p2

2481

* (now) (~1ms ago) (~2ms ago)

2481

* (now) (~1ms ago) (~2ms ago)

2482

*

2482

*

2483

* Let u_i denote the fraction of p_i that the entity was runnable.

2483

* Let u_i denote the fraction of p_i that the entity was runnable.

2484

*

2484

*

2485

* We then designate the fractions u_i as our co-efficients, yielding the

2485

* We then designate the fractions u_i as our co-efficients, yielding the

2486

* following representation of historical load:

2486

* following representation of historical load:

2487

* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...

2487

* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...

2488

*

2488

*

2489

* We choose y based on the width of a reasonable scheduling period, fixing:

2489

* We choose y based on the width of a reasonable scheduling period, fixing:

2490

* y^32 = 0.5

2490

* y^32 = 0.5

2491

*

2491

*

2492

* This means that the contribution to load ~32ms ago (u_32) will be weighted

2492

* This means that the contribution to load ~32ms ago (u_32) will be weighted

2493

* approximately half as much as the contribution to load within the last ms

2493

* approximately half as much as the contribution to load within the last ms

2494

* (u_0).

2494

* (u_0).

2495

*

2495

*

2496

* When a period "rolls over" and we have new u_0`, multiplying the previous

2496

* When a period "rolls over" and we have new u_0`, multiplying the previous

2497

* sum again by y is sufficient to update:

2497

* sum again by y is sufficient to update:

2498

* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )

2498

* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )

2499

* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]

2499

* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]

2500

*/

2500

*/

2501

static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 elapsed, full_periods;
	u32 span_contrib;
	int carry;

	elapsed = now - sa->last_runnable_update;
	/*
	 * A negative interval means the clock stepped backwards (this is
	 * known to happen while sched clock is switched over to TSC during
	 * init).  Just resynchronize and account nothing.
	 */
	if ((s64)elapsed < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Account in units of 1024ns: close enough to 1us and cheap to
	 * compute.  Intervals shorter than one unit are left pending (the
	 * timestamp is deliberately not advanced).
	 */
	elapsed >>= 10;
	if (!elapsed)
		return 0;
	sa->last_runnable_update = now;

	/* Portion of the current period that has already been accumulated. */
	carry = sa->runnable_avg_period % 1024;

	if (elapsed + carry < 1024) {
		/* Still inside the current period: plain accumulation. */
		if (runnable)
			sa->runnable_avg_sum += elapsed;
		sa->runnable_avg_period += elapsed;
		return 0;
	}

	/*
	 * Period roll-over.  First top up the current period with exactly
	 * the amount needed to complete it.
	 */
	carry = 1024 - carry;
	if (runnable)
		sa->runnable_avg_sum += carry;
	sa->runnable_avg_period += carry;
	elapsed -= carry;

	/* Whole periods spanned after that, and the left-over remainder. */
	full_periods = elapsed / 1024;
	elapsed %= 1024;

	/* Age the history by the completed period plus every full one. */
	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
					  full_periods + 1);
	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
					     full_periods + 1);

	/* Contribution of the full periods: \sum (1..n_period) 1024*y^i */
	span_contrib = __compute_runnable_contrib(full_periods);
	if (runnable)
		sa->runnable_avg_sum += span_contrib;
	sa->runnable_avg_period += span_contrib;

	/* The remainder opens the new current period (u_0`). */
	if (runnable)
		sa->runnable_avg_sum += elapsed;
	sa->runnable_avg_period += elapsed;

	/* Tell the caller that a decay took place. */
	return 1;
}

2569

2570

/* Synchronize an entity's decay with its parenting cfs_rq.*/

2570

/* Synchronize an entity's decay with its parenting cfs_rq.*/

2571

static inline u64 __synchronize_entity_decay(struct sched_entity *se)

2571

static inline u64 __synchronize_entity_decay(struct sched_entity *se)

2572

{

2572

{

2573

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2573

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2574

u64 decays = atomic64_read(&cfs_rq->decay_counter);

2574

u64 decays = atomic64_read(&cfs_rq->decay_counter);

2575

2576

decays -= se->avg.decay_count;

2576

decays -= se->avg.decay_count;

2577

if (!decays)

2577

if (!decays)

2578

return 0;

2578

return 0;

2579

2580

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);

2580

se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);

2581

se->avg.decay_count = 0;

2581

se->avg.decay_count = 0;

2582

2583

return decays;

2583

return decays;

2584

}

2584

}

2585

2586

#ifdef CONFIG_FAIR_GROUP_SCHED

2586

#ifdef CONFIG_FAIR_GROUP_SCHED

2587

static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,

2587

static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,

2588

int force_update)

2588

int force_update)

2589

{

2589

{

2590

struct task_group *tg = cfs_rq->tg;

2590

struct task_group *tg = cfs_rq->tg;

2591

long tg_contrib;

2591

long tg_contrib;

2592

2593

tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;

2593

tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;

2594

tg_contrib -= cfs_rq->tg_load_contrib;

2594

tg_contrib -= cfs_rq->tg_load_contrib;

2595

2596

if (!tg_contrib)

2596

if (!tg_contrib)

2597

return;

2597

return;

2598

2599

if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {

2599

if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {

2600

atomic_long_add(tg_contrib, &tg->load_avg);

2600

atomic_long_add(tg_contrib, &tg->load_avg);

2601

cfs_rq->tg_load_contrib += tg_contrib;

2601

cfs_rq->tg_load_contrib += tg_contrib;

2602

}

2602

}

2603

}

2603

}

2604

2605

/*

2605

/*

2606

* Aggregate cfs_rq runnable averages into an equivalent task_group

2606

* Aggregate cfs_rq runnable averages into an equivalent task_group

2607

* representation for computing load contributions.

2607

* representation for computing load contributions.

2608

*/

2608

*/

2609

static inline void __update_tg_runnable_avg(struct sched_avg *sa,

2609

static inline void __update_tg_runnable_avg(struct sched_avg *sa,

2610

struct cfs_rq *cfs_rq)

2610

struct cfs_rq *cfs_rq)

2611

{

2611

{

2612

struct task_group *tg = cfs_rq->tg;

2612

struct task_group *tg = cfs_rq->tg;

2613

long contrib;

2613

long contrib;

2614

2615

/* The fraction of a cpu used by this cfs_rq */

2615

/* The fraction of a cpu used by this cfs_rq */

2616

contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,

2616

contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,

2617

sa->runnable_avg_period + 1);

2617

sa->runnable_avg_period + 1);

2618

contrib -= cfs_rq->tg_runnable_contrib;

2618

contrib -= cfs_rq->tg_runnable_contrib;

2619

2620

if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {

2620

if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {

2621

atomic_add(contrib, &tg->runnable_avg);

2621

atomic_add(contrib, &tg->runnable_avg);

2622

cfs_rq->tg_runnable_contrib += contrib;

2622

cfs_rq->tg_runnable_contrib += contrib;

2623

}

2623

}

2624

}

2624

}

2625

2626

static inline void __update_group_entity_contrib(struct sched_entity *se)

2626

static inline void __update_group_entity_contrib(struct sched_entity *se)

2627

{

2627

{

2628

struct cfs_rq *cfs_rq = group_cfs_rq(se);

2628

struct cfs_rq *cfs_rq = group_cfs_rq(se);

2629

struct task_group *tg = cfs_rq->tg;

2629

struct task_group *tg = cfs_rq->tg;

2630

int runnable_avg;

2630

int runnable_avg;

2631

2632

u64 contrib;

2632

u64 contrib;

2633

2634

contrib = cfs_rq->tg_load_contrib * tg->shares;

2634

contrib = cfs_rq->tg_load_contrib * tg->shares;

2635

se->avg.load_avg_contrib = div_u64(contrib,

2635

se->avg.load_avg_contrib = div_u64(contrib,

2636

atomic_long_read(&tg->load_avg) + 1);

2636

atomic_long_read(&tg->load_avg) + 1);

2637

2638

/*

2638

/*

2639

* For group entities we need to compute a correction term in the case

2639

* For group entities we need to compute a correction term in the case

2640

* that they are consuming <1 cpu so that we would contribute the same

2640

* that they are consuming <1 cpu so that we would contribute the same

2641

* load as a task of equal weight.

2641

* load as a task of equal weight.

2642

*

2642

*

2643

* Explicitly co-ordinating this measurement would be expensive, but

2643

* Explicitly co-ordinating this measurement would be expensive, but

2644

* fortunately the sum of each cpus contribution forms a usable

2644

* fortunately the sum of each cpus contribution forms a usable

2645

* lower-bound on the true value.

2645

* lower-bound on the true value.

2646

*

2646

*

2647

* Consider the aggregate of 2 contributions. Either they are disjoint

2647

* Consider the aggregate of 2 contributions. Either they are disjoint

2648

* (and the sum represents true value) or they are disjoint and we are

2648

* (and the sum represents true value) or they are disjoint and we are

2649

* understating by the aggregate of their overlap.

2649

* understating by the aggregate of their overlap.

2650

*

2650

*

2651

* Extending this to N cpus, for a given overlap, the maximum amount we

2651

* Extending this to N cpus, for a given overlap, the maximum amount we

2652

* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of

2652

* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of

2653

* cpus that overlap for this interval and w_i is the interval width.

2653

* cpus that overlap for this interval and w_i is the interval width.

2654

*

2654

*

2655

* On a small machine; the first term is well-bounded which bounds the

2655

* On a small machine; the first term is well-bounded which bounds the

2656

* total error since w_i is a subset of the period. Whereas on a

2656

* total error since w_i is a subset of the period. Whereas on a

2657

* larger machine, while this first term can be larger, if w_i is the

2657

* larger machine, while this first term can be larger, if w_i is the

2658

* of consequential size guaranteed to see n_i*w_i quickly converge to

2658

* of consequential size guaranteed to see n_i*w_i quickly converge to

2659

* our upper bound of 1-cpu.

2659

* our upper bound of 1-cpu.

2660

*/

2660

*/

2661

runnable_avg = atomic_read(&tg->runnable_avg);

2661

runnable_avg = atomic_read(&tg->runnable_avg);

2662

if (runnable_avg < NICE_0_LOAD) {

2662

if (runnable_avg < NICE_0_LOAD) {

2663

se->avg.load_avg_contrib *= runnable_avg;

2663

se->avg.load_avg_contrib *= runnable_avg;

2664

se->avg.load_avg_contrib >>= NICE_0_SHIFT;

2664

se->avg.load_avg_contrib >>= NICE_0_SHIFT;

2665

}

2665

}

2666

}

2666

}

2667

2668

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)

2668

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)

2669

{

2669

{

2670

__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);

2670

__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);

2671

__update_tg_runnable_avg(&rq->avg, &rq->cfs);

2671

__update_tg_runnable_avg(&rq->avg, &rq->cfs);

2672

}

2672

}

2673

#else /* CONFIG_FAIR_GROUP_SCHED */

2673

#else /* CONFIG_FAIR_GROUP_SCHED */

2674

/*
 * !CONFIG_FAIR_GROUP_SCHED: there is no task_group state to maintain, so
 * the group-related load tracking hooks are empty.
 */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						   int force_update)
{
}

static inline void __update_tg_runnable_avg(struct sched_avg *sa,
					    struct cfs_rq *cfs_rq)
{
}

static inline void __update_group_entity_contrib(struct sched_entity *se)
{
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
}

2680

#endif /* CONFIG_FAIR_GROUP_SCHED */

2680

#endif /* CONFIG_FAIR_GROUP_SCHED */

2681

2682

static inline void __update_task_entity_contrib(struct sched_entity *se)

2682

static inline void __update_task_entity_contrib(struct sched_entity *se)

2683

{

2683

{

2684

u32 contrib;

2684

u32 contrib;

2685

2686

/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */

2686

/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */

2687

contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);

2687

contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);

2688

contrib /= (se->avg.runnable_avg_period + 1);

2688

contrib /= (se->avg.runnable_avg_period + 1);

2689

se->avg.load_avg_contrib = scale_load(contrib);

2689

se->avg.load_avg_contrib = scale_load(contrib);

2690

}

2690

}

2691

2692

/* Compute the current contribution to load_avg by se, return any delta */

2692

/* Compute the current contribution to load_avg by se, return any delta */

2693

static long __update_entity_load_avg_contrib(struct sched_entity *se)

2693

static long __update_entity_load_avg_contrib(struct sched_entity *se)

2694

{

2694

{

2695

long old_contrib = se->avg.load_avg_contrib;

2695

long old_contrib = se->avg.load_avg_contrib;

2696

2697

if (entity_is_task(se)) {

2697

if (entity_is_task(se)) {

2698

__update_task_entity_contrib(se);

2698

__update_task_entity_contrib(se);

2699

} else {

2699

} else {

2700

__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));

2700

__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));

2701

__update_group_entity_contrib(se);

2701

__update_group_entity_contrib(se);

2702

}

2702

}

2703

2704

return se->avg.load_avg_contrib - old_contrib;

2704

return se->avg.load_avg_contrib - old_contrib;

2705

}

2705

}

2706

2707

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,

2707

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,

2708

long load_contrib)

2708

long load_contrib)

2709

{

2709

{

2710

if (likely(load_contrib < cfs_rq->blocked_load_avg))

2710

if (likely(load_contrib < cfs_rq->blocked_load_avg))

2711

cfs_rq->blocked_load_avg -= load_contrib;

2711

cfs_rq->blocked_load_avg -= load_contrib;

2712

else

2712

else

2713

cfs_rq->blocked_load_avg = 0;

2713

cfs_rq->blocked_load_avg = 0;

2714

}

2714

}

2715

2716

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

2716

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

2717

2718

/* Update a sched_entity's runnable average */

2718

/* Update a sched_entity's runnable average */

2719

static inline void update_entity_load_avg(struct sched_entity *se,

2719

static inline void update_entity_load_avg(struct sched_entity *se,

2720

int update_cfs_rq)

2720

int update_cfs_rq)

2721

{

2721

{

2722

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2722

struct cfs_rq *cfs_rq = cfs_rq_of(se);

2723

long contrib_delta;

2723

long contrib_delta;

2724

u64 now;

2724

u64 now;

2725

2726

/*

2726

/*

2727

* For a group entity we need to use their owned cfs_rq_clock_task() in

2727

* For a group entity we need to use their owned cfs_rq_clock_task() in

2728

* case they are the parent of a throttled hierarchy.

2728

* case they are the parent of a throttled hierarchy.

2729

*/

2729

*/

2730

if (entity_is_task(se))

2730

if (entity_is_task(se))

2731

now = cfs_rq_clock_task(cfs_rq);

2731

now = cfs_rq_clock_task(cfs_rq);

2732

else

2732

else

2733

now = cfs_rq_clock_task(group_cfs_rq(se));

2733

now = cfs_rq_clock_task(group_cfs_rq(se));

2734

2735

if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))

2735

if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))

2736

return;

2736

return;

2737

2738

contrib_delta = __update_entity_load_avg_contrib(se);

2738

contrib_delta = __update_entity_load_avg_contrib(se);

2739

2740

if (!update_cfs_rq)

2740

if (!update_cfs_rq)

2741

return;

2741

return;

2742

2743

if (se->on_rq)

2743

if (se->on_rq)

2744

cfs_rq->runnable_load_avg += contrib_delta;

2744

cfs_rq->runnable_load_avg += contrib_delta;

2745

else

2745

else

2746

subtract_blocked_load_contrib(cfs_rq, -contrib_delta);

2746

subtract_blocked_load_contrib(cfs_rq, -contrib_delta);

2747

}

2747

}

2748

2749

/*

2749

/*

2750

* Decay the load contributed by all blocked children and account this so that

2750

* Decay the load contributed by all blocked children and account this so that

2751

* their contribution may appropriately discounted when they wake up.

2751

* their contribution may appropriately discounted when they wake up.

2752

*/

2752

*/

2753

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)

2753

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)

2754

{

2754

{

2755

u64 now = cfs_rq_clock_task(cfs_rq) >> 20;

2755

u64 now = cfs_rq_clock_task(cfs_rq) >> 20;

2756

u64 decays;

2756

u64 decays;

2757

2758

decays = now - cfs_rq->last_decay;

2758

decays = now - cfs_rq->last_decay;

2759

if (!decays && !force_update)

2759

if (!decays && !force_update)

2760

return;

2760

return;

2761

2762

if (atomic_long_read(&cfs_rq->removed_load)) {

2762

if (atomic_long_read(&cfs_rq->removed_load)) {

2763

unsigned long removed_load;

2763

unsigned long removed_load;

2764

removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);

2764

removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);

2765

subtract_blocked_load_contrib(cfs_rq, removed_load);

2765

subtract_blocked_load_contrib(cfs_rq, removed_load);

2766

}

2766

}

2767

2768

if (decays) {

2768

if (decays) {

2769

cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,

2769

cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,

2770

decays);

2770

decays);

2771

atomic64_add(decays, &cfs_rq->decay_counter);

2771

atomic64_add(decays, &cfs_rq->decay_counter);

2772

cfs_rq->last_decay = now;

2772

cfs_rq->last_decay = now;

2773

}

2773

}

2774

2775

__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);

2775

__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);

2776

}

2776

}

2777

2778

/* Add the load generated by se into cfs_rq's child load-average */

2778

/* Add the load generated by se into cfs_rq's child load-average */

2779

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,

2779

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,

2780

struct sched_entity *se,

2780

struct sched_entity *se,

2781

int wakeup)

2781

int wakeup)

2782

{

2782

{

2783

/*

2783

/*

2784

* We track migrations using entity decay_count <= 0, on a wake-up

2784

* We track migrations using entity decay_count <= 0, on a wake-up

2785

* migration we use a negative decay count to track the remote decays

2785

* migration we use a negative decay count to track the remote decays

2786

* accumulated while sleeping.

2786

* accumulated while sleeping.

2787

*

2787

*

2788

* Newly forked tasks are enqueued with se->avg.decay_count == 0, they

2788

* Newly forked tasks are enqueued with se->avg.decay_count == 0, they

2789

* are seen by enqueue_entity_load_avg() as a migration with an already

2789

* are seen by enqueue_entity_load_avg() as a migration with an already

2790

* constructed load_avg_contrib.

2790

* constructed load_avg_contrib.

2791

*/

2791

*/

2792

if (unlikely(se->avg.decay_count <= 0)) {

2792

if (unlikely(se->avg.decay_count <= 0)) {

2793

se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));

2793

se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));

2794

if (se->avg.decay_count) {

2794

if (se->avg.decay_count) {

2795

/*

2795

/*

2796

* In a wake-up migration we have to approximate the

2796

* In a wake-up migration we have to approximate the

2797

* time sleeping. This is because we can't synchronize

2797

* time sleeping. This is because we can't synchronize

2798

* clock_task between the two cpus, and it is not

2798

* clock_task between the two cpus, and it is not

2799

* guaranteed to be read-safe. Instead, we can

2799

* guaranteed to be read-safe. Instead, we can

2800

* approximate this using our carried decays, which are

2800

* approximate this using our carried decays, which are

2801

* explicitly atomically readable.

2801

* explicitly atomically readable.

2802

*/

2802

*/

2803

se->avg.last_runnable_update -= (-se->avg.decay_count)

2803

se->avg.last_runnable_update -= (-se->avg.decay_count)

2804

<< 20;

2804

<< 20;

2805

update_entity_load_avg(se, 0);

2805

update_entity_load_avg(se, 0);

2806

/* Indicate that we're now synchronized and on-rq */

2806

/* Indicate that we're now synchronized and on-rq */

2807

se->avg.decay_count = 0;

2807

se->avg.decay_count = 0;

2808

}

2808

}

2809

wakeup = 0;

2809

wakeup = 0;

2810

} else {

2810

} else {

2811

__synchronize_entity_decay(se);

2811

__synchronize_entity_decay(se);

2812

}

2812

}

2813

2814

/* migrated tasks did not contribute to our blocked load */

2814

/* migrated tasks did not contribute to our blocked load */

2815

if (wakeup) {

2815

if (wakeup) {

2816

subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);

2816

subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);

2817

update_entity_load_avg(se, 0);

2817

update_entity_load_avg(se, 0);

2818

}

2818

}

2819

2820

cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;

2820

cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;

2821

/* we force update consideration on load-balancer moves */

2821

/* we force update consideration on load-balancer moves */

2822

update_cfs_rq_blocked_load(cfs_rq, !wakeup);

2822

update_cfs_rq_blocked_load(cfs_rq, !wakeup);

2823

}

2823

}

2824

2825

/*

2825

/*

2826

* Remove se's load from this cfs_rq child load-average, if the entity is

2826

* Remove se's load from this cfs_rq child load-average, if the entity is

2827

* transitioning to a blocked state we track its projected decay using

2827

* transitioning to a blocked state we track its projected decay using

2828

* blocked_load_avg.

2828

* blocked_load_avg.

2829

*/

2829

*/

2830

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,

2830

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,

2831

struct sched_entity *se,

2831

struct sched_entity *se,

2832

int sleep)

2832

int sleep)

2833

{

2833

{

2834

update_entity_load_avg(se, 1);

2834

update_entity_load_avg(se, 1);

2835

/* we force update consideration on load-balancer moves */

2835

/* we force update consideration on load-balancer moves */

2836

update_cfs_rq_blocked_load(cfs_rq, !sleep);

2836

update_cfs_rq_blocked_load(cfs_rq, !sleep);

2837

2838

cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;

2838

cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;

2839

if (sleep) {

2839

if (sleep) {

2840

cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;

2840

cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;

2841

se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);

2841

se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);

2842

} /* migrations, e.g. sleep=0 leave decay_count == 0 */

2842

} /* migrations, e.g. sleep=0 leave decay_count == 0 */

2843

}

2843

}

2844

2845

/*
 * Called on the way into idle: account the time elapsed since the last
 * update as running time in the rq's load.  If the last scheduled task was
 * not a CFS task, this is the only place where the runnable statistic gets
 * updated.
 */
void idle_enter_fair(struct rq *this_rq)
{
	const int was_runnable = 1;

	update_rq_runnable_avg(this_rq, was_runnable);
}

2854

2855

/*
 * Called on the way out of idle, before a task is scheduled: account the
 * time elapsed since the last update as idle time in the rq's load.  If the
 * newly scheduled task is not a CFS task, this is the only place where the
 * runnable statistic gets updated.
 */
void idle_exit_fair(struct rq *this_rq)
{
	const int was_runnable = 0;

	update_rq_runnable_avg(this_rq, was_runnable);
}

2864

2865

static int idle_balance(struct rq *this_rq);

2865

static int idle_balance(struct rq *this_rq);

2866

2867

#else /* CONFIG_SMP */

2867

#else /* CONFIG_SMP */

2868

2869

/*
 * !CONFIG_SMP: the load tracking hooks are empty and idle balancing never
 * pulls anything.
 */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq)
{
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
}

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup)
{
}

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep)
{
}

static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update)
{
}

static inline int idle_balance(struct rq *rq)
{
	return 0;
}

2885

2886

#endif /* CONFIG_SMP */

2886

#endif /* CONFIG_SMP */

2887

2888

/*
 * Schedstats bookkeeping for an entity that is enqueued after sleeping or
 * blocking: close the open sleep/block interval, update the maxima and sums
 * and, for tasks, emit the matching tracepoints and latency accounting.
 * Compiles to nothing without CONFIG_SCHEDSTATS.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	/* Only task entities have a task_struct to report against. */
	struct task_struct *tsk = entity_is_task(se) ? task_of(se) : NULL;

	if (se->statistics.sleep_start) {
		u64 slept = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

		/* Guard against the clock having gone backwards. */
		if ((s64)slept < 0)
			slept = 0;

		if (unlikely(slept > se->statistics.sleep_max))
			se->statistics.sleep_max = slept;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += slept;

		if (tsk) {
			account_scheduler_latency(tsk, slept >> 10, 1);
			trace_sched_stat_sleep(tsk, slept);
		}
	}

	if (se->statistics.block_start) {
		u64 blocked = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

		/* Guard against the clock having gone backwards. */
		if ((s64)blocked < 0)
			blocked = 0;

		if (unlikely(blocked > se->statistics.block_max))
			se->statistics.block_max = blocked;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += blocked;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += blocked;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, blocked);
			}

			trace_sched_stat_blocked(tsk, blocked);

			/*
			 * Blocking time is in nanoseconds; shifting by 20
			 * gives a milliseconds-range estimate of how long the
			 * task spent sleeping.
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
					     (void *)get_wchan(tsk),
					     blocked >> 20);
			}
			account_scheduler_latency(tsk, blocked >> 10, 0);
		}
	}
#endif
}

2949

2950

/*
 * Debug statistic: count entities whose vruntime lies more than three
 * scheduling latencies away from the cfs_rq's min_vruntime.  Compiles to
 * nothing without CONFIG_SCHED_DEBUG.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 spread = se->vruntime - cfs_rq->min_vruntime;

	/* Only the distance matters, not the direction. */
	if (spread < 0)
		spread = -spread;

	if (spread > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

2962

2963

static void

2963

static void

2964

place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

2964

place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

2965

{

2965

{

2966

u64 vruntime = cfs_rq->min_vruntime;

2966

u64 vruntime = cfs_rq->min_vruntime;

2967

2968

/*

2968

/*

2969

* The 'current' period is already promised to the current tasks,

2969

* The 'current' period is already promised to the current tasks,

2970

* however the extra weight of the new task will slow them down a

2970

* however the extra weight of the new task will slow them down a

2971

* little, place the new task so that it fits in the slot that

2971

* little, place the new task so that it fits in the slot that

2972

* stays open at the end.

2972

* stays open at the end.

2973

*/

2973

*/

2974

if (initial && sched_feat(START_DEBIT))

2974

if (initial && sched_feat(START_DEBIT))

2975

vruntime += sched_vslice(cfs_rq, se);

2975

vruntime += sched_vslice(cfs_rq, se);

2976

2977

/* sleeps up to a single latency don't count. */

2977

/* sleeps up to a single latency don't count. */

2978

if (!initial) {

2978

if (!initial) {

2979

unsigned long thresh = sysctl_sched_latency;

2979

unsigned long thresh = sysctl_sched_latency;

2980

2981

/*

2981

/*

2982

* Halve their sleep time's effect, to allow

2982

* Halve their sleep time's effect, to allow

2983

* for a gentler effect of sleepers:

2983

* for a gentler effect of sleepers:

2984

*/

2984

*/

2985

if (sched_feat(GENTLE_FAIR_SLEEPERS))

2985

if (sched_feat(GENTLE_FAIR_SLEEPERS))

2986

thresh >>= 1;

2986

thresh >>= 1;

2987

2988

vruntime -= thresh;

2988

vruntime -= thresh;

2989

}

2989

}

2990

2991

/* ensure we never gain time by being placed backwards. */

2991

/* ensure we never gain time by being placed backwards. */

2992

se->vruntime = max_vruntime(se->vruntime, vruntime);

2992

se->vruntime = max_vruntime(se->vruntime, vruntime);

2993

}

2993

}

2994

2995

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

2995

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

2996

2997

static void

2997

static void

2998

enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

2998

enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

2999

{

2999

{

3000

/*

3000

/*

3001

* Update the normalized vruntime before updating min_vruntime

3001

* Update the normalized vruntime before updating min_vruntime

3002

* through calling update_curr().

3002

* through calling update_curr().

3003

*/

3003

*/

3004

if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))

3004

if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))

3005

se->vruntime += cfs_rq->min_vruntime;

3005

se->vruntime += cfs_rq->min_vruntime;

3006

3007

/*

3007

/*

3008

* Update run-time statistics of the 'current'.

3008

* Update run-time statistics of the 'current'.

3009

*/

3009

*/

3010

update_curr(cfs_rq);

3010

update_curr(cfs_rq);

3011

enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);

3011

enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);

3012

account_entity_enqueue(cfs_rq, se);

3012

account_entity_enqueue(cfs_rq, se);

3013

update_cfs_shares(cfs_rq);

3013

update_cfs_shares(cfs_rq);

3014

3015

if (flags & ENQUEUE_WAKEUP) {

3015

if (flags & ENQUEUE_WAKEUP) {

3016

place_entity(cfs_rq, se, 0);

3016

place_entity(cfs_rq, se, 0);

3017

enqueue_sleeper(cfs_rq, se);

3017

enqueue_sleeper(cfs_rq, se);

3018

}

3018

}

3019

3020

update_stats_enqueue(cfs_rq, se);

3020

update_stats_enqueue(cfs_rq, se);

3021

check_spread(cfs_rq, se);

3021

check_spread(cfs_rq, se);

3022

if (se != cfs_rq->curr)

3022

if (se != cfs_rq->curr)

3023

__enqueue_entity(cfs_rq, se);

3023

__enqueue_entity(cfs_rq, se);

3024

se->on_rq = 1;

3024

se->on_rq = 1;

3025

3026

if (cfs_rq->nr_running == 1) {

3026

if (cfs_rq->nr_running == 1) {

3027

list_add_leaf_cfs_rq(cfs_rq);

3027

list_add_leaf_cfs_rq(cfs_rq);

3028

check_enqueue_throttle(cfs_rq);

3028

check_enqueue_throttle(cfs_rq);

3029

}

3029

}

3030

}

3030

}

3031

3032

static void __clear_buddies_last(struct sched_entity *se)

3032

static void __clear_buddies_last(struct sched_entity *se)

3033

{

3033

{

3034

for_each_sched_entity(se) {

3034

for_each_sched_entity(se) {

3035

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3035

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3036

if (cfs_rq->last != se)

3036

if (cfs_rq->last != se)

3037

break;

3037

break;

3038

3039

cfs_rq->last = NULL;

3039

cfs_rq->last = NULL;

3040

}

3040

}

3041

}

3041

}

3042

3043

static void __clear_buddies_next(struct sched_entity *se)

3043

static void __clear_buddies_next(struct sched_entity *se)

3044

{

3044

{

3045

for_each_sched_entity(se) {

3045

for_each_sched_entity(se) {

3046

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3046

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3047

if (cfs_rq->next != se)

3047

if (cfs_rq->next != se)

3048

break;

3048

break;

3049

3050

cfs_rq->next = NULL;

3050

cfs_rq->next = NULL;

3051

}

3051

}

3052

}

3052

}

3053

3054

static void __clear_buddies_skip(struct sched_entity *se)

3054

static void __clear_buddies_skip(struct sched_entity *se)

3055

{

3055

{

3056

for_each_sched_entity(se) {

3056

for_each_sched_entity(se) {

3057

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3057

struct cfs_rq *cfs_rq = cfs_rq_of(se);

3058

if (cfs_rq->skip != se)

3058

if (cfs_rq->skip != se)

3059

break;

3059

break;

3060

3061

cfs_rq->skip = NULL;

3061

cfs_rq->skip = NULL;

3062

}

3062

}

3063

}

3063

}

3064

3065

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)

3065

static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)

3066

{

3066

{

3067

if (cfs_rq->last == se)

3067

if (cfs_rq->last == se)

3068

__clear_buddies_last(se);

3068

__clear_buddies_last(se);

3069

3070

if (cfs_rq->next == se)

3070

if (cfs_rq->next == se)

3071

__clear_buddies_next(se);

3071

__clear_buddies_next(se);

3072

3073

if (cfs_rq->skip == se)

3073

if (cfs_rq->skip == se)

3074

__clear_buddies_skip(se);

3074

__clear_buddies_skip(se);

3075

}

3075

}

3076

3077

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

3077

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);

3078

3079

static void

3079

static void

3080

dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

3080

dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)

3081

{

3081

{

3082

/*

3082

/*

3083

* Update run-time statistics of the 'current'.

3083

* Update run-time statistics of the 'current'.

3084

*/

3084

*/

3085

update_curr(cfs_rq);

3085

update_curr(cfs_rq);

3086

dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

3086

dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

3087

3088

update_stats_dequeue(cfs_rq, se);

3088

update_stats_dequeue(cfs_rq, se);

3089

if (flags & DEQUEUE_SLEEP) {

3089

if (flags & DEQUEUE_SLEEP) {

3090

#ifdef CONFIG_SCHEDSTATS

3090

#ifdef CONFIG_SCHEDSTATS

3091

if (entity_is_task(se)) {

3091

if (entity_is_task(se)) {

3092

struct task_struct *tsk = task_of(se);

3092

struct task_struct *tsk = task_of(se);

3093

3094

if (tsk->state & TASK_INTERRUPTIBLE)

3094

if (tsk->state & TASK_INTERRUPTIBLE)

3095

se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));

3095

se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));

3096

if (tsk->state & TASK_UNINTERRUPTIBLE)

3096

if (tsk->state & TASK_UNINTERRUPTIBLE)

3097

se->statistics.block_start = rq_clock(rq_of(cfs_rq));

3097

se->statistics.block_start = rq_clock(rq_of(cfs_rq));

3098

}

3098

}

3099

#endif

3099

#endif

3100

}

3100

}

3101

3102

clear_buddies(cfs_rq, se);

3102

clear_buddies(cfs_rq, se);

3103

3104

if (se != cfs_rq->curr)

3104

if (se != cfs_rq->curr)

3105

__dequeue_entity(cfs_rq, se);

3105

__dequeue_entity(cfs_rq, se);

3106

se->on_rq = 0;

3106

se->on_rq = 0;

3107

account_entity_dequeue(cfs_rq, se);

3107

account_entity_dequeue(cfs_rq, se);

3108

3109

/*

3109

/*

3110

* Normalize the entity after updating the min_vruntime because the

3110

* Normalize the entity after updating the min_vruntime because the

3111

* update can refer to the ->curr item and we need to reflect this

3111

* update can refer to the ->curr item and we need to reflect this

3112

* movement in our normalized position.

3112

* movement in our normalized position.

3113

*/

3113

*/

3114

if (!(flags & DEQUEUE_SLEEP))

3114

if (!(flags & DEQUEUE_SLEEP))

3115

se->vruntime -= cfs_rq->min_vruntime;

3115

se->vruntime -= cfs_rq->min_vruntime;

3116

3117

/* return excess runtime on last dequeue */

3117

/* return excess runtime on last dequeue */

3118

return_cfs_rq_runtime(cfs_rq);

3118

return_cfs_rq_runtime(cfs_rq);

3119

3120

update_min_vruntime(cfs_rq);

3120

update_min_vruntime(cfs_rq);

3121

update_cfs_shares(cfs_rq);

3121

update_cfs_shares(cfs_rq);

3122

}

3122

}

3123

3124

/*

3124

/*

3125

* Preempt the current task with a newly woken task if needed:

3125

* Preempt the current task with a newly woken task if needed:

3126

*/

3126

*/

3127

static void

3127

static void

3128

check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)

3128

check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)

3129

{

3129

{

3130

unsigned long ideal_runtime, delta_exec;

3130

unsigned long ideal_runtime, delta_exec;

3131

struct sched_entity *se;

3131

struct sched_entity *se;

3132

s64 delta;

3132

s64 delta;

3133

3134

ideal_runtime = sched_slice(cfs_rq, curr);

3134

ideal_runtime = sched_slice(cfs_rq, curr);

3135

delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

3135

delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

3136

if (delta_exec > ideal_runtime) {

3136

if (delta_exec > ideal_runtime) {

3137

resched_curr(rq_of(cfs_rq));

3137

resched_curr(rq_of(cfs_rq));

3138

/*

3138

/*

3139

* The current task ran long enough, ensure it doesn't get

3139

* The current task ran long enough, ensure it doesn't get

3140

* re-elected due to buddy favours.

3140

* re-elected due to buddy favours.

3141

*/

3141

*/

3142

clear_buddies(cfs_rq, curr);

3142

clear_buddies(cfs_rq, curr);

3143

return;

3143

return;

3144

}

3144

}

3145

3146

/*

3146

/*

3147

* Ensure that a task that missed wakeup preemption by a

3147

* Ensure that a task that missed wakeup preemption by a

3148

* narrow margin doesn't have to wait for a full slice.

3148

* narrow margin doesn't have to wait for a full slice.

3149

* This also mitigates buddy induced latencies under load.

3149

* This also mitigates buddy induced latencies under load.

3150

*/

3150

*/

3151

if (delta_exec < sysctl_sched_min_granularity)

3151

if (delta_exec < sysctl_sched_min_granularity)

3152

return;

3152

return;

3153

3154

se = __pick_first_entity(cfs_rq);

3154

se = __pick_first_entity(cfs_rq);

3155

delta = curr->vruntime - se->vruntime;

3155

delta = curr->vruntime - se->vruntime;

3156

3157

if (delta < 0)

3157

if (delta < 0)

3158

return;

3158

return;

3159

3160

if (delta > ideal_runtime)

3160

if (delta > ideal_runtime)

3161

resched_curr(rq_of(cfs_rq));

3161

resched_curr(rq_of(cfs_rq));

3162

}

3162

}

3163

3164

static void

3164

static void

3165

set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

3165

set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

3166

{

3166

{

3167

/* 'current' is not kept within the tree. */

3167

/* 'current' is not kept within the tree. */

3168

if (se->on_rq) {

3168

if (se->on_rq) {

3169

/*

3169

/*

3170

* Any task has to be enqueued before it get to execute on

3170

* Any task has to be enqueued before it get to execute on

3171

* a CPU. So account for the time it spent waiting on the

3171

* a CPU. So account for the time it spent waiting on the

3172

* runqueue.

3172

* runqueue.

3173

*/

3173

*/

3174

update_stats_wait_end(cfs_rq, se);

3174

update_stats_wait_end(cfs_rq, se);

3175

__dequeue_entity(cfs_rq, se);

3175

__dequeue_entity(cfs_rq, se);

3176

}

3176

}

3177

3178

update_stats_curr_start(cfs_rq, se);

3178

update_stats_curr_start(cfs_rq, se);

3179

cfs_rq->curr = se;

3179

cfs_rq->curr = se;

3180

#ifdef CONFIG_SCHEDSTATS

3180

#ifdef CONFIG_SCHEDSTATS

3181

/*

3181

/*

3182

* Track our maximum slice length, if the CPU's load is at

3182

* Track our maximum slice length, if the CPU's load is at

3183

* least twice that of our own weight (i.e. dont track it

3183

* least twice that of our own weight (i.e. dont track it

3184

* when there are only lesser-weight tasks around):

3184

* when there are only lesser-weight tasks around):

3185

*/

3185

*/

3186

if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {

3186

if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {

3187

se->statistics.slice_max = max(se->statistics.slice_max,

3187

se->statistics.slice_max = max(se->statistics.slice_max,

3188

se->sum_exec_runtime - se->prev_sum_exec_runtime);

3188

se->sum_exec_runtime - se->prev_sum_exec_runtime);

3189

}

3189

}

3190

#endif

3190

#endif

3191

se->prev_sum_exec_runtime = se->sum_exec_runtime;

3191

se->prev_sum_exec_runtime = se->sum_exec_runtime;

3192

}

3192

}

3193

3194

static int

3194

static int

3195

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

3195

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

3196

3197

/*

3197

/*

3198

* Pick the next process, keeping these things in mind, in this order:

3198

* Pick the next process, keeping these things in mind, in this order:

3199

* 1) keep things fair between processes/task groups

3199

* 1) keep things fair between processes/task groups

3200

* 2) pick the "next" process, since someone really wants that to run

3200

* 2) pick the "next" process, since someone really wants that to run

3201

* 3) pick the "last" process, for cache locality

3201

* 3) pick the "last" process, for cache locality

3202

* 4) do not run the "skip" process, if something else is available

3202

* 4) do not run the "skip" process, if something else is available

3203

*/

3203

*/

3204

static struct sched_entity *

3204

static struct sched_entity *

3205

pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)

3205

pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)

3206

{

3206

{

3207

struct sched_entity *left = __pick_first_entity(cfs_rq);

3207

struct sched_entity *left = __pick_first_entity(cfs_rq);

3208

struct sched_entity *se;

3208

struct sched_entity *se;

3209

3210

/*

3210

/*

3211

* If curr is set we have to see if its left of the leftmost entity

3211

* If curr is set we have to see if its left of the leftmost entity

3212

* still in the tree, provided there was anything in the tree at all.

3212

* still in the tree, provided there was anything in the tree at all.

3213

*/

3213

*/

3214

if (!left || (curr && entity_before(curr, left)))

3214

if (!left || (curr && entity_before(curr, left)))

3215

left = curr;

3215

left = curr;

3216

3217

se = left; /* ideally we run the leftmost entity */

3217

se = left; /* ideally we run the leftmost entity */

3218

3219

/*

3219

/*

3220

* Avoid running the skip buddy, if running something else can

3220

* Avoid running the skip buddy, if running something else can

3221

* be done without getting too unfair.

3221

* be done without getting too unfair.

3222

*/

3222

*/

3223

if (cfs_rq->skip == se) {

3223

if (cfs_rq->skip == se) {

3224

struct sched_entity *second;

3224

struct sched_entity *second;

3225

3226

if (se == curr) {

3226

if (se == curr) {

3227

second = __pick_first_entity(cfs_rq);

3227

second = __pick_first_entity(cfs_rq);

3228

} else {

3228

} else {

3229

second = __pick_next_entity(se);

3229

second = __pick_next_entity(se);

3230

if (!second || (curr && entity_before(curr, second)))

3230

if (!second || (curr && entity_before(curr, second)))

3231

second = curr;

3231

second = curr;

3232

}

3232

}

3233

3234

if (second && wakeup_preempt_entity(second, left) < 1)

3234

if (second && wakeup_preempt_entity(second, left) < 1)

3235

se = second;

3235

se = second;

3236

}

3236

}

3237

3238

/*

3238

/*

3239

* Prefer last buddy, try to return the CPU to a preempted task.

3239

* Prefer last buddy, try to return the CPU to a preempted task.

3240

*/

3240

*/

3241

if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)

3241

if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)

3242

se = cfs_rq->last;

3242

se = cfs_rq->last;

3243

3244

/*

3244

/*

3245

* Someone really wants this to run. If it's not unfair, run it.

3245

* Someone really wants this to run. If it's not unfair, run it.

3246

*/

3246

*/

3247

if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)

3247

if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)

3248

se = cfs_rq->next;

3248

se = cfs_rq->next;

3249

3250

clear_buddies(cfs_rq, se);

3250

clear_buddies(cfs_rq, se);

3251

3252

return se;

3252

return se;

3253

}

3253

}

3254

3255

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

3255

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

3256

3257

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)

3257

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)

3258

{

3258

{

3259

/*

3259

/*

3260

* If still on the runqueue then deactivate_task()

3260

* If still on the runqueue then deactivate_task()

3261

* was not called and update_curr() has to be done:

3261

* was not called and update_curr() has to be done:

3262

*/

3262

*/

3263

if (prev->on_rq)

3263

if (prev->on_rq)

3264

update_curr(cfs_rq);

3264

update_curr(cfs_rq);

3265

3266

/* throttle cfs_rqs exceeding runtime */

3266

/* throttle cfs_rqs exceeding runtime */

3267

check_cfs_rq_runtime(cfs_rq);

3267

check_cfs_rq_runtime(cfs_rq);

3268

3269

check_spread(cfs_rq, prev);

3269

check_spread(cfs_rq, prev);

3270

if (prev->on_rq) {

3270

if (prev->on_rq) {

3271

update_stats_wait_start(cfs_rq, prev);

3271

update_stats_wait_start(cfs_rq, prev);

3272

/* Put 'current' back into the tree. */

3272

/* Put 'current' back into the tree. */

3273

__enqueue_entity(cfs_rq, prev);

3273

__enqueue_entity(cfs_rq, prev);

3274

/* in !on_rq case, update occurred at dequeue */

3274

/* in !on_rq case, update occurred at dequeue */

3275

update_entity_load_avg(prev, 1);

3275

update_entity_load_avg(prev, 1);

3276

}

3276

}

3277

cfs_rq->curr = NULL;

3277

cfs_rq->curr = NULL;

3278

}

3278

}

3279

3280

static void

3280

static void

3281

entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)

3281

entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)

3282

{

3282

{

3283

/*

3283

/*

3284

* Update run-time statistics of the 'current'.

3284

* Update run-time statistics of the 'current'.

3285

*/

3285

*/

3286

update_curr(cfs_rq);

3286

update_curr(cfs_rq);

3287

3288

/*

3288

/*

3289

* Ensure that runnable average is periodically updated.

3289

* Ensure that runnable average is periodically updated.

3290

*/

3290

*/

3291

update_entity_load_avg(curr, 1);

3291

update_entity_load_avg(curr, 1);

3292

update_cfs_rq_blocked_load(cfs_rq, 1);

3292

update_cfs_rq_blocked_load(cfs_rq, 1);

3293

update_cfs_shares(cfs_rq);

3293

update_cfs_shares(cfs_rq);

3294

3295

#ifdef CONFIG_SCHED_HRTICK

3295

#ifdef CONFIG_SCHED_HRTICK

3296

/*

3296

/*

3297

* queued ticks are scheduled to match the slice, so don't bother

3297

* queued ticks are scheduled to match the slice, so don't bother

3298

* validating it and just reschedule.

3298

* validating it and just reschedule.

3299

*/

3299

*/

3300

if (queued) {

3300

if (queued) {

3301

resched_curr(rq_of(cfs_rq));

3301

resched_curr(rq_of(cfs_rq));

3302

return;

3302

return;

3303

}

3303

}

3304

/*

3304

/*

3305

* don't let the period tick interfere with the hrtick preemption

3305

* don't let the period tick interfere with the hrtick preemption

3306

*/

3306

*/

3307

if (!sched_feat(DOUBLE_TICK) &&

3307

if (!sched_feat(DOUBLE_TICK) &&

3308

hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))

3308

hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))

3309

return;

3309

return;

3310

#endif

3310

#endif

3311

3312

if (cfs_rq->nr_running > 1)

3312

if (cfs_rq->nr_running > 1)

3313

check_preempt_tick(cfs_rq, curr);

3313

check_preempt_tick(cfs_rq, curr);

3314

}

3314

}

3315

3316

3317

/**************************************************
 * CFS bandwidth control machinery
 */

3320

3321

#ifdef CONFIG_CFS_BANDWIDTH

3321

#ifdef CONFIG_CFS_BANDWIDTH

3322

3323

#ifdef HAVE_JUMP_LABEL

3323

#ifdef HAVE_JUMP_LABEL

3324

static struct static_key __cfs_bandwidth_used;

3324

static struct static_key __cfs_bandwidth_used;

3325

3326

static inline bool cfs_bandwidth_used(void)

3326

static inline bool cfs_bandwidth_used(void)

3327

{

3327

{

3328

return static_key_false(&__cfs_bandwidth_used);

3328

return static_key_false(&__cfs_bandwidth_used);

3329

}

3329

}

3330

3331

void cfs_bandwidth_usage_inc(void)

3331

void cfs_bandwidth_usage_inc(void)

3332

{

3332

{

3333

static_key_slow_inc(&__cfs_bandwidth_used);

3333

static_key_slow_inc(&__cfs_bandwidth_used);

3334

}

3334

}

3335

3336

void cfs_bandwidth_usage_dec(void)

3336

void cfs_bandwidth_usage_dec(void)

3337

{

3337

{

3338

static_key_slow_dec(&__cfs_bandwidth_used);

3338

static_key_slow_dec(&__cfs_bandwidth_used);

3339

}

3339

}

3340

#else /* HAVE_JUMP_LABEL */

3340

#else /* HAVE_JUMP_LABEL */

3341

static bool cfs_bandwidth_used(void)

3341

static bool cfs_bandwidth_used(void)

3342

{

3342

{

3343

return true;

3343

return true;

3344

}

3344

}

3345

3346

void cfs_bandwidth_usage_inc(void) {}

3346

void cfs_bandwidth_usage_inc(void) {}

3347

void cfs_bandwidth_usage_dec(void) {}

3347

void cfs_bandwidth_usage_dec(void) {}

3348

#endif /* HAVE_JUMP_LABEL */

3348

#endif /* HAVE_JUMP_LABEL */

3349

3350

/*

3350

/*

3351

* default period for cfs group bandwidth.

3351

* default period for cfs group bandwidth.

3352

* default: 0.1s, units: nanoseconds

3352

* default: 0.1s, units: nanoseconds

3353

*/

3353

*/

3354

static inline u64 default_cfs_period(void)

3354

static inline u64 default_cfs_period(void)

3355

{

3355

{

3356

return 100000000ULL;

3356

return 100000000ULL;

3357

}

3357

}

3358

3359

static inline u64 sched_cfs_bandwidth_slice(void)

3359

static inline u64 sched_cfs_bandwidth_slice(void)

3360

{

3360

{

3361

return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;

3361

return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;

3362

}

3362

}

3363

3364

/*

3364

/*

3365

* Replenish runtime according to assigned quota and update expiration time.

3365

* Replenish runtime according to assigned quota and update expiration time.

3366

* We use sched_clock_cpu directly instead of rq->clock to avoid adding

3366

* We use sched_clock_cpu directly instead of rq->clock to avoid adding

3367

* additional synchronization around rq->lock.

3367

* additional synchronization around rq->lock.

3368

*

3368

*

3369

* requires cfs_b->lock

3369

* requires cfs_b->lock

3370

*/

3370

*/

3371

void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)

3371

void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)

3372

{

3372

{

3373

u64 now;

3373

u64 now;

3374

3375

if (cfs_b->quota == RUNTIME_INF)

3375

if (cfs_b->quota == RUNTIME_INF)

3376

return;

3376

return;

3377

3378

now = sched_clock_cpu(smp_processor_id());

3378

now = sched_clock_cpu(smp_processor_id());

3379

cfs_b->runtime = cfs_b->quota;

3379

cfs_b->runtime = cfs_b->quota;

3380

cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);

3380

cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);

3381

}

3381

}

3382

3383

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

3383

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

3384

{

3384

{

3385

return &tg->cfs_bandwidth;

3385

return &tg->cfs_bandwidth;

3386

}

3386

}

3387

3388

/* rq->task_clock normalized against any time this cfs_rq has spent throttled */

3388

/* rq->task_clock normalized against any time this cfs_rq has spent throttled */

3389

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)

3389

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)

3390

{

3390

{

3391

if (unlikely(cfs_rq->throttle_count))

3391

if (unlikely(cfs_rq->throttle_count))

3392

return cfs_rq->throttled_clock_task;

3392

return cfs_rq->throttled_clock_task;

3393

3394

return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;

3394

return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;

3395

}

3395

}

3396

3397

/* returns 0 on failure to allocate runtime */

3397

/* returns 0 on failure to allocate runtime */

3398

static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3398

static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3399

{

3399

{

3400

struct task_group *tg = cfs_rq->tg;

3400

struct task_group *tg = cfs_rq->tg;

3401

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);

3401

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);

3402

u64 amount = 0, min_amount, expires;

3402

u64 amount = 0, min_amount, expires;

3403

3404

/* note: this is a positive sum as runtime_remaining <= 0 */

3404

/* note: this is a positive sum as runtime_remaining <= 0 */

3405

min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

3405

min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

3406

3407

raw_spin_lock(&cfs_b->lock);

3407

raw_spin_lock(&cfs_b->lock);

3408

if (cfs_b->quota == RUNTIME_INF)

3408

if (cfs_b->quota == RUNTIME_INF)

3409

amount = min_amount;

3409

amount = min_amount;

3410

else {

3410

else {

3411

/*

3411

/*

3412

* If the bandwidth pool has become inactive, then at least one

3412

* If the bandwidth pool has become inactive, then at least one

3413

* period must have elapsed since the last consumption.

3413

* period must have elapsed since the last consumption.

3414

* Refresh the global state and ensure bandwidth timer becomes

3414

* Refresh the global state and ensure bandwidth timer becomes

3415

* active.

3415

* active.

3416

*/

3416

*/

3417

if (!cfs_b->timer_active) {

3417

if (!cfs_b->timer_active) {

3418

__refill_cfs_bandwidth_runtime(cfs_b);

3418

__refill_cfs_bandwidth_runtime(cfs_b);

3419

__start_cfs_bandwidth(cfs_b, false);

3419

__start_cfs_bandwidth(cfs_b, false);

3420

}

3420

}

3421

3422

if (cfs_b->runtime > 0) {

3422

if (cfs_b->runtime > 0) {

3423

amount = min(cfs_b->runtime, min_amount);

3423

amount = min(cfs_b->runtime, min_amount);

3424

cfs_b->runtime -= amount;

3424

cfs_b->runtime -= amount;

3425

cfs_b->idle = 0;

3425

cfs_b->idle = 0;

3426

}

3426

}

3427

}

3427

}

3428

expires = cfs_b->runtime_expires;

3428

expires = cfs_b->runtime_expires;

3429

raw_spin_unlock(&cfs_b->lock);

3429

raw_spin_unlock(&cfs_b->lock);

3430

3431

cfs_rq->runtime_remaining += amount;

3431

cfs_rq->runtime_remaining += amount;

3432

/*

3432

/*

3433

* we may have advanced our local expiration to account for allowed

3433

* we may have advanced our local expiration to account for allowed

3434

* spread between our sched_clock and the one on which runtime was

3434

* spread between our sched_clock and the one on which runtime was

3435

* issued.

3435

* issued.

3436

*/

3436

*/

3437

if ((s64)(expires - cfs_rq->runtime_expires) > 0)

3437

if ((s64)(expires - cfs_rq->runtime_expires) > 0)

3438

cfs_rq->runtime_expires = expires;

3438

cfs_rq->runtime_expires = expires;

3439

3440

return cfs_rq->runtime_remaining > 0;

3440

return cfs_rq->runtime_remaining > 0;

3441

}

3441

}

3442

3443

/*

3443

/*

3444

* Note: This depends on the synchronization provided by sched_clock and the

3444

* Note: This depends on the synchronization provided by sched_clock and the

3445

* fact that rq->clock snapshots this value.

3445

* fact that rq->clock snapshots this value.

3446

*/

3446

*/

3447

static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3447

static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3448

{

3448

{

3449

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3449

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3450

3451

/* if the deadline is ahead of our clock, nothing to do */

3451

/* if the deadline is ahead of our clock, nothing to do */

3452

if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))

3452

if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))

3453

return;

3453

return;

3454

3455

if (cfs_rq->runtime_remaining < 0)

3455

if (cfs_rq->runtime_remaining < 0)

3456

return;

3456

return;

3457

3458

/*

3458

/*

3459

* If the local deadline has passed we have to consider the

3459

* If the local deadline has passed we have to consider the

3460

* possibility that our sched_clock is 'fast' and the global deadline

3460

* possibility that our sched_clock is 'fast' and the global deadline

3461

* has not truly expired.

3461

* has not truly expired.

3462

*

3462

*

3463

* Fortunately we can check determine whether this the case by checking

3463

* Fortunately we can check determine whether this the case by checking

3464

* whether the global deadline has advanced. It is valid to compare

3464

* whether the global deadline has advanced. It is valid to compare

3465

* cfs_b->runtime_expires without any locks since we only care about

3465

* cfs_b->runtime_expires without any locks since we only care about

3466

* exact equality, so a partial write will still work.

3466

* exact equality, so a partial write will still work.

3467

*/

3467

*/

3468

3469

if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {

3469

if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {

3470

/* extend local deadline, drift is bounded above by 2 ticks */

3470

/* extend local deadline, drift is bounded above by 2 ticks */

3471

cfs_rq->runtime_expires += TICK_NSEC;

3471

cfs_rq->runtime_expires += TICK_NSEC;

3472

} else {

3472

} else {

3473

/* global deadline is ahead, expiration has passed */

3473

/* global deadline is ahead, expiration has passed */

3474

cfs_rq->runtime_remaining = 0;

3474

cfs_rq->runtime_remaining = 0;

3475

}

3475

}

3476

}

3476

}

3477

3478

/*
 * Charge @delta_exec of execution time against @cfs_rq's local runtime.
 *
 * After docking the time and applying expiration, a non-positive balance
 * triggers an attempt to obtain more runtime; if that fails and the
 * cfs_rq has a running entity, a reschedule is requested.
 */
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;
	expire_cfs_rq_runtime(cfs_rq);

	if (likely(cfs_rq->runtime_remaining > 0))
		return;

	/*
	 * if we're unable to extend our runtime we resched so that the active
	 * hierarchy can be throttled
	 */
	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
		resched_curr(rq_of(cfs_rq));
}

3494

3495

static __always_inline

3495

static __always_inline

3496

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)

3496

void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)

3497

{

3497

{

3498

if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)

3498

if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)

3499

return;

3499

return;

3500

3501

__account_cfs_rq_runtime(cfs_rq, delta_exec);

3501

__account_cfs_rq_runtime(cfs_rq, delta_exec);

3502

}

3502

}

3503

3504

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)

3504

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)

3505

{

3505

{

3506

return cfs_bandwidth_used() && cfs_rq->throttled;

3506

return cfs_bandwidth_used() && cfs_rq->throttled;

3507

}

3507

}

3508

3509

/* check whether cfs_rq, or any parent, is throttled */

3509

/* check whether cfs_rq, or any parent, is throttled */

3510

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)

3510

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)

3511

{

3511

{

3512

return cfs_bandwidth_used() && cfs_rq->throttle_count;

3512

return cfs_bandwidth_used() && cfs_rq->throttle_count;

3513

}

3513

}

3514

3515

/*

3515

/*

3516

* Ensure that neither of the group entities corresponding to src_cpu or

3516

* Ensure that neither of the group entities corresponding to src_cpu or

3517

* dest_cpu are members of a throttled hierarchy when performing group

3517

* dest_cpu are members of a throttled hierarchy when performing group

3518

* load-balance operations.

3518

* load-balance operations.

3519

*/

3519

*/

3520

static inline int throttled_lb_pair(struct task_group *tg,

3520

static inline int throttled_lb_pair(struct task_group *tg,

3521

int src_cpu, int dest_cpu)

3521

int src_cpu, int dest_cpu)

3522

{

3522

{

3523

struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

3523

struct cfs_rq *src_cfs_rq, *dest_cfs_rq;

3524

3525

src_cfs_rq = tg->cfs_rq[src_cpu];

3525

src_cfs_rq = tg->cfs_rq[src_cpu];

3526

dest_cfs_rq = tg->cfs_rq[dest_cpu];

3526

dest_cfs_rq = tg->cfs_rq[dest_cpu];

3527

3528

return throttled_hierarchy(src_cfs_rq) ||

3528

return throttled_hierarchy(src_cfs_rq) ||

3529

throttled_hierarchy(dest_cfs_rq);

3529

throttled_hierarchy(dest_cfs_rq);

3530

}

3530

}

3531

3532

/* updated child weight may affect parent so we have to do this bottom up */

3532

/* updated child weight may affect parent so we have to do this bottom up */

3533

static int tg_unthrottle_up(struct task_group *tg, void *data)

3533

static int tg_unthrottle_up(struct task_group *tg, void *data)

3534

{

3534

{

3535

struct rq *rq = data;

3535

struct rq *rq = data;

3536

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3536

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3537

3538

cfs_rq->throttle_count--;

3538

cfs_rq->throttle_count--;

3539

#ifdef CONFIG_SMP

3539

#ifdef CONFIG_SMP

3540

if (!cfs_rq->throttle_count) {

3540

if (!cfs_rq->throttle_count) {

3541

/* adjust cfs_rq_clock_task() */

3541

/* adjust cfs_rq_clock_task() */

3542

cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -

3542

cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -

3543

cfs_rq->throttled_clock_task;

3543

cfs_rq->throttled_clock_task;

3544

}

3544

}

3545

#endif

3545

#endif

3546

3547

return 0;

3547

return 0;

3548

}

3548

}

3549

3550

static int tg_throttle_down(struct task_group *tg, void *data)

3550

static int tg_throttle_down(struct task_group *tg, void *data)

3551

{

3551

{

3552

struct rq *rq = data;

3552

struct rq *rq = data;

3553

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3553

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

3554

3555

/* group is entering throttled state, stop time */

3555

/* group is entering throttled state, stop time */

3556

if (!cfs_rq->throttle_count)

3556

if (!cfs_rq->throttle_count)

3557

cfs_rq->throttled_clock_task = rq_clock_task(rq);

3557

cfs_rq->throttled_clock_task = rq_clock_task(rq);

3558

cfs_rq->throttle_count++;

3558

cfs_rq->throttle_count++;

3559

3560

return 0;

3560

return 0;

3561

}

3561

}

3562

3563

static void throttle_cfs_rq(struct cfs_rq *cfs_rq)

3563

static void throttle_cfs_rq(struct cfs_rq *cfs_rq)

3564

{

3564

{

3565

struct rq *rq = rq_of(cfs_rq);

3565

struct rq *rq = rq_of(cfs_rq);

3566

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3566

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3567

struct sched_entity *se;

3567

struct sched_entity *se;

3568

long task_delta, dequeue = 1;

3568

long task_delta, dequeue = 1;

3569

3570

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

3570

se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

3571

3572

/* freeze hierarchy runnable averages while throttled */

3572

/* freeze hierarchy runnable averages while throttled */

3573

rcu_read_lock();

3573

rcu_read_lock();

3574

walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);

3574

walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);

3575

rcu_read_unlock();

3575

rcu_read_unlock();

3576

3577

task_delta = cfs_rq->h_nr_running;

3577

task_delta = cfs_rq->h_nr_running;

3578

for_each_sched_entity(se) {

3578

for_each_sched_entity(se) {

3579

struct cfs_rq *qcfs_rq = cfs_rq_of(se);

3579

struct cfs_rq *qcfs_rq = cfs_rq_of(se);

3580

/* throttled entity or throttle-on-deactivate */

3580

/* throttled entity or throttle-on-deactivate */

3581

if (!se->on_rq)

3581

if (!se->on_rq)

3582

break;

3582

break;

3583

3584

if (dequeue)

3584

if (dequeue)

3585

dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);

3585

dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);

3586

qcfs_rq->h_nr_running -= task_delta;

3586

qcfs_rq->h_nr_running -= task_delta;

3587

3588

if (qcfs_rq->load.weight)

3588

if (qcfs_rq->load.weight)

3589

dequeue = 0;

3589

dequeue = 0;

3590

}

3590

}

3591

3592

if (!se)

3592

if (!se)

3593

sub_nr_running(rq, task_delta);

3593

sub_nr_running(rq, task_delta);

3594

3595

cfs_rq->throttled = 1;

3595

cfs_rq->throttled = 1;

3596

cfs_rq->throttled_clock = rq_clock(rq);

3596

cfs_rq->throttled_clock = rq_clock(rq);

3597

raw_spin_lock(&cfs_b->lock);

3597

raw_spin_lock(&cfs_b->lock);

3598

/*

3598

/*

3599

* Add to the _head_ of the list, so that an already-started

3599

* Add to the _head_ of the list, so that an already-started

3600

* distribute_cfs_runtime will not see us

3600

* distribute_cfs_runtime will not see us

3601

*/

3601

*/

3602

list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);

3602

list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);

3603

if (!cfs_b->timer_active)

3603

if (!cfs_b->timer_active)

3604

__start_cfs_bandwidth(cfs_b, false);

3604

__start_cfs_bandwidth(cfs_b, false);

3605

raw_spin_unlock(&cfs_b->lock);

3605

raw_spin_unlock(&cfs_b->lock);

3606

}

3606

}

3607

3608

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

3608

void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)

3609

{

3609

{

3610

struct rq *rq = rq_of(cfs_rq);

3610

struct rq *rq = rq_of(cfs_rq);

3611

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3611

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3612

struct sched_entity *se;

3612

struct sched_entity *se;

3613

int enqueue = 1;

3613

int enqueue = 1;

3614

long task_delta;

3614

long task_delta;

3615

3616

se = cfs_rq->tg->se[cpu_of(rq)];

3616

se = cfs_rq->tg->se[cpu_of(rq)];

3617

3618

cfs_rq->throttled = 0;

3618

cfs_rq->throttled = 0;

3619

3620

update_rq_clock(rq);

3620

update_rq_clock(rq);

3621

3622

raw_spin_lock(&cfs_b->lock);

3622

raw_spin_lock(&cfs_b->lock);

3623

cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;

3623

cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;

3624

list_del_rcu(&cfs_rq->throttled_list);

3624

list_del_rcu(&cfs_rq->throttled_list);

3625

raw_spin_unlock(&cfs_b->lock);

3625

raw_spin_unlock(&cfs_b->lock);

3626

3627

/* update hierarchical throttle state */

3627

/* update hierarchical throttle state */

3628

walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

3628

walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

3629

3630

if (!cfs_rq->load.weight)

3630

if (!cfs_rq->load.weight)

3631

return;

3631

return;

3632

3633

task_delta = cfs_rq->h_nr_running;

3633

task_delta = cfs_rq->h_nr_running;

3634

for_each_sched_entity(se) {

3634

for_each_sched_entity(se) {

3635

if (se->on_rq)

3635

if (se->on_rq)

3636

enqueue = 0;

3636

enqueue = 0;

3637

3638

cfs_rq = cfs_rq_of(se);

3638

cfs_rq = cfs_rq_of(se);

3639

if (enqueue)

3639

if (enqueue)

3640

enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);

3640

enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);

3641

cfs_rq->h_nr_running += task_delta;

3641

cfs_rq->h_nr_running += task_delta;

3642

3643

if (cfs_rq_throttled(cfs_rq))

3643

if (cfs_rq_throttled(cfs_rq))

3644

break;

3644

break;

3645

}

3645

}

3646

3647

if (!se)

3647

if (!se)

3648

add_nr_running(rq, task_delta);

3648

add_nr_running(rq, task_delta);

3649

3650

/* determine whether we need to wake up potentially idle cpu */

3650

/* determine whether we need to wake up potentially idle cpu */

3651

if (rq->curr == rq->idle && rq->cfs.nr_running)

3651

if (rq->curr == rq->idle && rq->cfs.nr_running)

3652

resched_curr(rq);

3652

resched_curr(rq);

3653

}

3653

}

3654

3655

/*
 * Hand out up to @remaining runtime to the cfs_rqs on cfs_b's throttled
 * list. Each throttled cfs_rq is topped up to a runtime_remaining of 1 (or
 * as far as @remaining allows), stamped with @expires, and unthrottled if
 * that leaves it with positive runtime.
 *
 * Takes each cfs_rq's rq->lock in turn; the list is walked under
 * rcu_read_lock().
 *
 * Returns the amount of runtime actually distributed.
 */
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
		u64 remaining, u64 expires)
{
	struct cfs_rq *cfs_rq;
	u64 runtime;
	u64 starting_runtime = remaining;

	rcu_read_lock();
	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		struct rq *rq = rq_of(cfs_rq);

		raw_spin_lock(&rq->lock);
		if (!cfs_rq_throttled(cfs_rq))
			goto next;

		/* enough to lift a non-positive balance to exactly 1 */
		runtime = -cfs_rq->runtime_remaining + 1;
		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;

		cfs_rq->runtime_remaining += runtime;
		cfs_rq->runtime_expires = expires;

		/* we check whether we're throttled above */
		if (cfs_rq->runtime_remaining > 0)
			unthrottle_cfs_rq(cfs_rq);

next:
		/* every path through the loop body must drop rq->lock here */
		raw_spin_unlock(&rq->lock);

		if (!remaining)
			break;
	}
	rcu_read_unlock();

	return starting_runtime - remaining;
}

3693

3694

/*

3694

/*

3695

* Responsible for refilling a task_group's bandwidth and unthrottling its

3695

* Responsible for refilling a task_group's bandwidth and unthrottling its

3696

* cfs_rqs as appropriate. If there has been no activity within the last

3696

* cfs_rqs as appropriate. If there has been no activity within the last

3697

* period the timer is deactivated until scheduling resumes; cfs_b->idle is

3697

* period the timer is deactivated until scheduling resumes; cfs_b->idle is

3698

* used to track this state.

3698

* used to track this state.

3699

*/

3699

*/

3700

static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)

3700

static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)

3701

{

3701

{

3702

u64 runtime, runtime_expires;

3702

u64 runtime, runtime_expires;

3703

int throttled;

3703

int throttled;

3704

3705

/* no need to continue the timer with no bandwidth constraint */

3705

/* no need to continue the timer with no bandwidth constraint */

3706

if (cfs_b->quota == RUNTIME_INF)

3706

if (cfs_b->quota == RUNTIME_INF)

3707

goto out_deactivate;

3707

goto out_deactivate;

3708

3709

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3709

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3710

cfs_b->nr_periods += overrun;

3710

cfs_b->nr_periods += overrun;

3711

3712

/*

3712

/*

3713

* idle depends on !throttled (for the case of a large deficit), and if

3713

* idle depends on !throttled (for the case of a large deficit), and if

3714

* we're going inactive then everything else can be deferred

3714

* we're going inactive then everything else can be deferred

3715

*/

3715

*/

3716

if (cfs_b->idle && !throttled)

3716

if (cfs_b->idle && !throttled)

3717

goto out_deactivate;

3717

goto out_deactivate;

3718

3719

/*

3719

/*

3720

* if we have relooped after returning idle once, we need to update our

3720

* if we have relooped after returning idle once, we need to update our

3721

* status as actually running, so that other cpus doing

3721

* status as actually running, so that other cpus doing

3722

* __start_cfs_bandwidth will stop trying to cancel us.

3722

* __start_cfs_bandwidth will stop trying to cancel us.

3723

*/

3723

*/

3724

cfs_b->timer_active = 1;

3724

cfs_b->timer_active = 1;

3725

3726

__refill_cfs_bandwidth_runtime(cfs_b);

3726

__refill_cfs_bandwidth_runtime(cfs_b);

3727

3728

if (!throttled) {

3728

if (!throttled) {

3729

/* mark as potentially idle for the upcoming period */

3729

/* mark as potentially idle for the upcoming period */

3730

cfs_b->idle = 1;

3730

cfs_b->idle = 1;

3731

return 0;

3731

return 0;

3732

}

3732

}

3733

3734

/* account preceding periods in which throttling occurred */

3734

/* account preceding periods in which throttling occurred */

3735

cfs_b->nr_throttled += overrun;

3735

cfs_b->nr_throttled += overrun;

3736

3737

runtime_expires = cfs_b->runtime_expires;

3737

runtime_expires = cfs_b->runtime_expires;

3738

3739

/*

3739

/*

3740

* This check is repeated as we are holding onto the new bandwidth while

3740

* This check is repeated as we are holding onto the new bandwidth while

3741

* we unthrottle. This can potentially race with an unthrottled group

3741

* we unthrottle. This can potentially race with an unthrottled group

3742

* trying to acquire new bandwidth from the global pool. This can result

3742

* trying to acquire new bandwidth from the global pool. This can result

3743

* in us over-using our runtime if it is all used during this loop, but

3743

* in us over-using our runtime if it is all used during this loop, but

3744

* only by limited amounts in that extreme case.

3744

* only by limited amounts in that extreme case.

3745

*/

3745

*/

3746

while (throttled && cfs_b->runtime > 0) {

3746

while (throttled && cfs_b->runtime > 0) {

3747

runtime = cfs_b->runtime;

3747

runtime = cfs_b->runtime;

3748

raw_spin_unlock(&cfs_b->lock);

3748

raw_spin_unlock(&cfs_b->lock);

3749

/* we can't nest cfs_b->lock while distributing bandwidth */

3749

/* we can't nest cfs_b->lock while distributing bandwidth */

3750

runtime = distribute_cfs_runtime(cfs_b, runtime,

3750

runtime = distribute_cfs_runtime(cfs_b, runtime,

3751

runtime_expires);

3751

runtime_expires);

3752

raw_spin_lock(&cfs_b->lock);

3752

raw_spin_lock(&cfs_b->lock);

3753

3754

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3754

throttled = !list_empty(&cfs_b->throttled_cfs_rq);

3755

3756

cfs_b->runtime -= min(runtime, cfs_b->runtime);

3756

cfs_b->runtime -= min(runtime, cfs_b->runtime);

3757

}

3757

}

3758

3759

/*

3759

/*

3760

* While we are ensured activity in the period following an

3760

* While we are ensured activity in the period following an

3761

* unthrottle, this also covers the case in which the new bandwidth is

3761

* unthrottle, this also covers the case in which the new bandwidth is

3762

* insufficient to cover the existing bandwidth deficit. (Forcing the

3762

* insufficient to cover the existing bandwidth deficit. (Forcing the

3763

* timer to remain active while there are any throttled entities.)

3763

* timer to remain active while there are any throttled entities.)

3764

*/

3764

*/

3765

cfs_b->idle = 0;

3765

cfs_b->idle = 0;

3766

3767

return 0;

3767

return 0;

3768

3769

out_deactivate:

3769

out_deactivate:

3770

cfs_b->timer_active = 0;

3770

cfs_b->timer_active = 0;

3771

return 1;

3771

return 1;

3772

}

3772

}

3773

3774

/* a cfs_rq won't donate quota below this amount */

3774

/* a cfs_rq won't donate quota below this amount */

3775

static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;

3775

static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;

3776

/* minimum remaining period time to redistribute slack quota */

3776

/* minimum remaining period time to redistribute slack quota */

3777

static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;

3777

static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;

3778

/* how long we wait to gather additional slack before distributing */

3778

/* how long we wait to gather additional slack before distributing */

3779

static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

3779

static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

3780

3781

/*

3781

/*

3782

* Are we near the end of the current quota period?

3782

* Are we near the end of the current quota period?

3783

*

3783

*

3784

* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the

3784

* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the

3785

* hrtimer base being cleared by __hrtimer_start_range_ns. In the case of

3785

* hrtimer base being cleared by __hrtimer_start_range_ns. In the case of

3786

* migrate_hrtimers, base is never cleared, so we are fine.

3786

* migrate_hrtimers, base is never cleared, so we are fine.

3787

*/

3787

*/

3788

static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)

3788

static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)

3789

{

3789

{

3790

struct hrtimer *refresh_timer = &cfs_b->period_timer;

3790

struct hrtimer *refresh_timer = &cfs_b->period_timer;

3791

u64 remaining;

3791

u64 remaining;

3792

3793

/* if the call-back is running a quota refresh is already occurring */

3793

/* if the call-back is running a quota refresh is already occurring */

3794

if (hrtimer_callback_running(refresh_timer))

3794

if (hrtimer_callback_running(refresh_timer))

3795

return 1;

3795

return 1;

3796

3797

/* is a quota refresh about to occur? */

3797

/* is a quota refresh about to occur? */

3798

remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));

3798

remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));

3799

if (remaining < min_expire)

3799

if (remaining < min_expire)

3800

return 1;

3800

return 1;

3801

3802

return 0;

3802

return 0;

3803

}

3803

}

3804

3805

static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)

3805

static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)

3806

{

3806

{

3807

u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

3807

u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;

3808

3809

/* if there's a quota refresh soon don't bother with slack */

3809

/* if there's a quota refresh soon don't bother with slack */

3810

if (runtime_refresh_within(cfs_b, min_left))

3810

if (runtime_refresh_within(cfs_b, min_left))

3811

return;

3811

return;

3812

3813

start_bandwidth_timer(&cfs_b->slack_timer,

3813

start_bandwidth_timer(&cfs_b->slack_timer,

3814

ns_to_ktime(cfs_bandwidth_slack_period));

3814

ns_to_ktime(cfs_bandwidth_slack_period));

3815

}

3815

}

3816

3817

/* we know any runtime found here is valid as update_curr() precedes return */

3817

/* we know any runtime found here is valid as update_curr() precedes return */

3818

static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3818

static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3819

{

3819

{

3820

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3820

struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

3821

s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

3821

s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

3822

3823

if (slack_runtime <= 0)

3823

if (slack_runtime <= 0)

3824

return;

3824

return;

3825

3826

raw_spin_lock(&cfs_b->lock);

3826

raw_spin_lock(&cfs_b->lock);

3827

if (cfs_b->quota != RUNTIME_INF &&

3827

if (cfs_b->quota != RUNTIME_INF &&

3828

cfs_rq->runtime_expires == cfs_b->runtime_expires) {

3828

cfs_rq->runtime_expires == cfs_b->runtime_expires) {

3829

cfs_b->runtime += slack_runtime;

3829

cfs_b->runtime += slack_runtime;

3830

3831

/* we are under rq->lock, defer unthrottling using a timer */

3831

/* we are under rq->lock, defer unthrottling using a timer */

3832

if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&

3832

if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&

3833

!list_empty(&cfs_b->throttled_cfs_rq))

3833

!list_empty(&cfs_b->throttled_cfs_rq))

3834

start_cfs_slack_bandwidth(cfs_b);

3834

start_cfs_slack_bandwidth(cfs_b);

3835

}

3835

}

3836

raw_spin_unlock(&cfs_b->lock);

3836

raw_spin_unlock(&cfs_b->lock);

3837

3838

/* even if it's not valid for return we don't want to try again */

3838

/* even if it's not valid for return we don't want to try again */

3839

cfs_rq->runtime_remaining -= slack_runtime;

3839

cfs_rq->runtime_remaining -= slack_runtime;

3840

}

3840

}

3841

3842

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3842

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3843

{

3843

{

3844

if (!cfs_bandwidth_used())

3844

if (!cfs_bandwidth_used())

3845

return;

3845

return;

3846

3847

if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)

3847

if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)

3848

return;

3848

return;

3849

3850

__return_cfs_rq_runtime(cfs_rq);

3850

__return_cfs_rq_runtime(cfs_rq);

3851

}

3851

}

3852

3853

/*

3853

/*

3854

* This is done with a timer (instead of inline with bandwidth return) since

3854

* This is done with a timer (instead of inline with bandwidth return) since

3855

* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.

3855

* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.

3856

*/

3856

*/

3857

static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)

3857

static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)

3858

{

3858

{

3859

u64 runtime = 0, slice = sched_cfs_bandwidth_slice();

3859

u64 runtime = 0, slice = sched_cfs_bandwidth_slice();

3860

u64 expires;

3860

u64 expires;

3861

3862

/* confirm we're still not at a refresh boundary */

3862

/* confirm we're still not at a refresh boundary */

3863

raw_spin_lock(&cfs_b->lock);

3863

raw_spin_lock(&cfs_b->lock);

3864

if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {

3864

if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {

3865

raw_spin_unlock(&cfs_b->lock);

3865

raw_spin_unlock(&cfs_b->lock);

3866

return;

3866

return;

3867

}

3867

}

3868

3869

if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)

3869

if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)

3870

runtime = cfs_b->runtime;

3870

runtime = cfs_b->runtime;

3871

3872

expires = cfs_b->runtime_expires;

3872

expires = cfs_b->runtime_expires;

3873

raw_spin_unlock(&cfs_b->lock);

3873

raw_spin_unlock(&cfs_b->lock);

3874

3875

if (!runtime)

3875

if (!runtime)

3876

return;

3876

return;

3877

3878

runtime = distribute_cfs_runtime(cfs_b, runtime, expires);

3878

runtime = distribute_cfs_runtime(cfs_b, runtime, expires);

3879

3880

raw_spin_lock(&cfs_b->lock);

3880

raw_spin_lock(&cfs_b->lock);

3881

if (expires == cfs_b->runtime_expires)

3881

if (expires == cfs_b->runtime_expires)

3882

cfs_b->runtime -= min(runtime, cfs_b->runtime);

3882

cfs_b->runtime -= min(runtime, cfs_b->runtime);

3883

raw_spin_unlock(&cfs_b->lock);

3883

raw_spin_unlock(&cfs_b->lock);

3884

}

3884

}

3885

3886

/*

3886

/*

3887

* When a group wakes up we want to make sure that its quota is not already

3887

* When a group wakes up we want to make sure that its quota is not already

3888

* expired/exceeded, otherwise it may be allowed to steal additional ticks of

3888

* expired/exceeded, otherwise it may be allowed to steal additional ticks of

3889

* runtime as update_curr() throttling can not not trigger until it's on-rq.

3889

* runtime as update_curr() throttling can not not trigger until it's on-rq.

3890

*/

3890

*/

3891

static void check_enqueue_throttle(struct cfs_rq *cfs_rq)

3891

static void check_enqueue_throttle(struct cfs_rq *cfs_rq)

3892

{

3892

{

3893

if (!cfs_bandwidth_used())

3893

if (!cfs_bandwidth_used())

3894

return;

3894

return;

3895

3896

/* an active group must be handled by the update_curr()->put() path */

3896

/* an active group must be handled by the update_curr()->put() path */

3897

if (!cfs_rq->runtime_enabled || cfs_rq->curr)

3897

if (!cfs_rq->runtime_enabled || cfs_rq->curr)

3898

return;

3898

return;

3899

3900

/* ensure the group is not already throttled */

3900

/* ensure the group is not already throttled */

3901

if (cfs_rq_throttled(cfs_rq))

3901

if (cfs_rq_throttled(cfs_rq))

3902

return;

3902

return;

3903

3904

/* update runtime allocation */

3904

/* update runtime allocation */

3905

account_cfs_rq_runtime(cfs_rq, 0);

3905

account_cfs_rq_runtime(cfs_rq, 0);

3906

if (cfs_rq->runtime_remaining <= 0)

3906

if (cfs_rq->runtime_remaining <= 0)

3907

throttle_cfs_rq(cfs_rq);

3907

throttle_cfs_rq(cfs_rq);

3908

}

3908

}

3909

3910

/* conditionally throttle active cfs_rq's from put_prev_entity() */

3910

/* conditionally throttle active cfs_rq's from put_prev_entity() */

3911

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3911

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3912

{

3912

{

3913

if (!cfs_bandwidth_used())

3913

if (!cfs_bandwidth_used())

3914

return false;

3914

return false;

3915

3916

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))

3916

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))

3917

return false;

3917

return false;

3918

3919

/*

3919

/*

3920

* it's possible for a throttled entity to be forced into a running

3920

* it's possible for a throttled entity to be forced into a running

3921

* state (e.g. set_curr_task), in this case we're finished.

3921

* state (e.g. set_curr_task), in this case we're finished.

3922

*/

3922

*/

3923

if (cfs_rq_throttled(cfs_rq))

3923

if (cfs_rq_throttled(cfs_rq))

3924

return true;

3924

return true;

3925

3926

throttle_cfs_rq(cfs_rq);

3926

throttle_cfs_rq(cfs_rq);

3927

return true;

3927

return true;

3928

}

3928

}

3929

3930

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)

3930

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)

3931

{

3931

{

3932

struct cfs_bandwidth *cfs_b =

3932

struct cfs_bandwidth *cfs_b =

3933

container_of(timer, struct cfs_bandwidth, slack_timer);

3933

container_of(timer, struct cfs_bandwidth, slack_timer);

3934

do_sched_cfs_slack_timer(cfs_b);

3934

do_sched_cfs_slack_timer(cfs_b);

3935

3936

return HRTIMER_NORESTART;

3936

return HRTIMER_NORESTART;

3937

}

3937

}

3938

3939

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)

3939

static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)

3940

{

3940

{

3941

struct cfs_bandwidth *cfs_b =

3941

struct cfs_bandwidth *cfs_b =

3942

container_of(timer, struct cfs_bandwidth, period_timer);

3942

container_of(timer, struct cfs_bandwidth, period_timer);

3943

ktime_t now;

3943

ktime_t now;

3944

int overrun;

3944

int overrun;

3945

int idle = 0;

3945

int idle = 0;

3946

3947

raw_spin_lock(&cfs_b->lock);

3947

raw_spin_lock(&cfs_b->lock);

3948

for (;;) {

3948

for (;;) {

3949

now = hrtimer_cb_get_time(timer);

3949

now = hrtimer_cb_get_time(timer);

3950

overrun = hrtimer_forward(timer, now, cfs_b->period);

3950

overrun = hrtimer_forward(timer, now, cfs_b->period);

3951

3952

if (!overrun)

3952

if (!overrun)

3953

break;

3953

break;

3954

3955

idle = do_sched_cfs_period_timer(cfs_b, overrun);

3955

idle = do_sched_cfs_period_timer(cfs_b, overrun);

3956

}

3956

}

3957

raw_spin_unlock(&cfs_b->lock);

3957

raw_spin_unlock(&cfs_b->lock);

3958

3959

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

3959

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

3960

}

3960

}

3961

3962

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3962

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

3963

{

3963

{

3964

raw_spin_lock_init(&cfs_b->lock);

3964

raw_spin_lock_init(&cfs_b->lock);

3965

cfs_b->runtime = 0;

3965

cfs_b->runtime = 0;

3966

cfs_b->quota = RUNTIME_INF;

3966

cfs_b->quota = RUNTIME_INF;

3967

cfs_b->period = ns_to_ktime(default_cfs_period());

3967

cfs_b->period = ns_to_ktime(default_cfs_period());

3968

3969

INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);

3969

INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);

3970

hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3970

hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3971

cfs_b->period_timer.function = sched_cfs_period_timer;

3971

cfs_b->period_timer.function = sched_cfs_period_timer;

3972

hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3972

hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

3973

cfs_b->slack_timer.function = sched_cfs_slack_timer;

3973

cfs_b->slack_timer.function = sched_cfs_slack_timer;

3974

}

3974

}

3975

3976

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3976

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)

3977

{

3977

{

3978

cfs_rq->runtime_enabled = 0;

3978

cfs_rq->runtime_enabled = 0;

3979

INIT_LIST_HEAD(&cfs_rq->throttled_list);

3979

INIT_LIST_HEAD(&cfs_rq->throttled_list);

3980

}

3980

}

3981

3982

/* requires cfs_b->lock, may release to reprogram timer */

3982

/* requires cfs_b->lock, may release to reprogram timer */

3983

void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)

3983

void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)

3984

{

3984

{

3985

/*

3985

/*

3986

* The timer may be active because we're trying to set a new bandwidth

3986

* The timer may be active because we're trying to set a new bandwidth

3987

* period or because we're racing with the tear-down path

3987

* period or because we're racing with the tear-down path

3988

* (timer_active==0 becomes visible before the hrtimer call-back

3988

* (timer_active==0 becomes visible before the hrtimer call-back

3989

* terminates). In either case we ensure that it's re-programmed

3989

* terminates). In either case we ensure that it's re-programmed

3990

*/

3990

*/

3991

while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&

3991

while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&

3992

hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {

3992

hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {

3993

/* bounce the lock to allow do_sched_cfs_period_timer to run */

3993

/* bounce the lock to allow do_sched_cfs_period_timer to run */

3994

raw_spin_unlock(&cfs_b->lock);

3994

raw_spin_unlock(&cfs_b->lock);

3995

cpu_relax();

3995

cpu_relax();

3996

raw_spin_lock(&cfs_b->lock);

3996

raw_spin_lock(&cfs_b->lock);

3997

/* if someone else restarted the timer then we're done */

3997

/* if someone else restarted the timer then we're done */

3998

if (!force && cfs_b->timer_active)

3998

if (!force && cfs_b->timer_active)

3999

return;

3999

return;

4000

}

4000

}

4001

4002

cfs_b->timer_active = 1;

4002

cfs_b->timer_active = 1;

4003

start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);

4003

start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);

4004

}

4004

}

4005

4006

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

4006

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

4007

{

4007

{

4008

hrtimer_cancel(&cfs_b->period_timer);

4008

hrtimer_cancel(&cfs_b->period_timer);

4009

hrtimer_cancel(&cfs_b->slack_timer);

4009

hrtimer_cancel(&cfs_b->slack_timer);

4010

}

4010

}

4011

4012

/*
 * Refresh ->runtime_enabled on every leaf cfs_rq of @rq from the quota of
 * the owning task group: enabled iff the quota is not RUNTIME_INF.
 * The group's bandwidth lock is held around each update.
 */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
	struct cfs_rq *leaf;

	for_each_leaf_cfs_rq(rq, leaf) {
		struct cfs_bandwidth *bw = &leaf->tg->cfs_bandwidth;

		raw_spin_lock(&bw->lock);
		leaf->runtime_enabled = (bw->quota != RUNTIME_INF);
		raw_spin_unlock(&bw->lock);
	}
}

4024

4025

/*
 * For every leaf cfs_rq of @rq that has runtime accounting enabled:
 * hand it a token amount of runtime, switch accounting off, and
 * unthrottle it if it is currently throttled.
 */
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *leaf;

	for_each_leaf_cfs_rq(rq, leaf) {
		if (!leaf->runtime_enabled)
			continue;

		/*
		 * clock_task does not advance here, so any valid (non-zero)
		 * amount of quota is sufficient.
		 */
		leaf->runtime_remaining = 1;

		/*
		 * An offline rq stays schedulable until the cpu is fully
		 * disabled in take_cpu_down(); turn accounting off so no
		 * new cfs throttling can happen in the meantime.
		 */
		leaf->runtime_enabled = 0;

		if (cfs_rq_throttled(leaf))
			unthrottle_cfs_rq(leaf);
	}
}

4048

4049

#else /* CONFIG_CFS_BANDWIDTH */

4049

#else /* CONFIG_CFS_BANDWIDTH */

4050

/*
 * !CONFIG_CFS_BANDWIDTH variant: the task clock of a cfs_rq is simply the
 * task clock of the runqueue it belongs to.
 */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
	return rq_clock_task(rq_of(cfs_rq));
}

4054

4055

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}

4055

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}

4056

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }

4056

static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }

4057

static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}

4057

static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}

4058

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

4058

static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

4059

4060

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)

4060

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)

4061

{

4061

{

4062

return 0;

4062

return 0;

4063

}

4063

}

4064

4065

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)

4065

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)

4066

{

4066

{

4067

return 0;

4067

return 0;

4068

}

4068

}

4069

4070

static inline int throttled_lb_pair(struct task_group *tg,

4070

static inline int throttled_lb_pair(struct task_group *tg,

4071

int src_cpu, int dest_cpu)

4071

int src_cpu, int dest_cpu)

4072

{

4072

{

4073

return 0;

4073

return 0;

4074

}

4074

}

4075

4076

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

4076

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

4077

4078

#ifdef CONFIG_FAIR_GROUP_SCHED

4078

#ifdef CONFIG_FAIR_GROUP_SCHED

4079

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

4079

static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

4080

#endif

4080

#endif

4081

4082

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

4082

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)

4083

{

4083

{

4084

return NULL;

4084

return NULL;

4085

}

4085

}

4086

static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

4086

static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

4087

static inline void update_runtime_enabled(struct rq *rq) {}

4087

static inline void update_runtime_enabled(struct rq *rq) {}

4088

static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

4088

static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

4089

4090

#endif /* CONFIG_CFS_BANDWIDTH */

4090

#endif /* CONFIG_CFS_BANDWIDTH */

4091

4092

/**************************************************

4092

/**************************************************

4093

* CFS operations on tasks:

4093

* CFS operations on tasks:

4094

*/

4094

*/

4095

4096

#ifdef CONFIG_SCHED_HRTICK

4096

#ifdef CONFIG_SCHED_HRTICK

4097

static void hrtick_start_fair(struct rq *rq, struct task_struct *p)

4097

static void hrtick_start_fair(struct rq *rq, struct task_struct *p)

4098

{

4098

{

4099

struct sched_entity *se = &p->se;

4099

struct sched_entity *se = &p->se;

4100

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4100

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4101

4102

WARN_ON(task_rq(p) != rq);

4102

WARN_ON(task_rq(p) != rq);

4103

4104

if (cfs_rq->nr_running > 1) {

4104

if (cfs_rq->nr_running > 1) {

4105

u64 slice = sched_slice(cfs_rq, se);

4105

u64 slice = sched_slice(cfs_rq, se);

4106

u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;

4106

u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;

4107

s64 delta = slice - ran;

4107

s64 delta = slice - ran;

4108

4109

if (delta < 0) {

4109

if (delta < 0) {

4110

if (rq->curr == p)

4110

if (rq->curr == p)

4111

resched_curr(rq);

4111

resched_curr(rq);

4112

return;

4112

return;

4113

}

4113

}

4114

hrtick_start(rq, delta);

4114

hrtick_start(rq, delta);

4115

}

4115

}

4116

}

4116

}

4117

4118

/*

4118

/*

4119

* called from enqueue/dequeue and updates the hrtick when the

4119

* called from enqueue/dequeue and updates the hrtick when the

4120

* current task is from our class and nr_running is low enough

4120

* current task is from our class and nr_running is low enough

4121

* to matter.

4121

* to matter.

4122

*/

4122

*/

4123

static void hrtick_update(struct rq *rq)

4123

static void hrtick_update(struct rq *rq)

4124

{

4124

{

4125

struct task_struct *curr = rq->curr;

4125

struct task_struct *curr = rq->curr;

4126

4127

if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)

4127

if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)

4128

return;

4128

return;

4129

4130

if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)

4130

if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)

4131

hrtick_start_fair(rq, curr);

4131

hrtick_start_fair(rq, curr);

4132

}

4132

}

4133

#else /* !CONFIG_SCHED_HRTICK */

4133

#else /* !CONFIG_SCHED_HRTICK */

4134

/* !CONFIG_SCHED_HRTICK: both hrtick hooks compile to nothing. */
static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) {}

static inline void hrtick_update(struct rq *rq) {}

4142

#endif

4142

#endif

4143

4144

/*

4144

/*

4145

* The enqueue_task method is called before nr_running is

4145

* The enqueue_task method is called before nr_running is

4146

* increased. Here we update the fair scheduling stats and

4146

* increased. Here we update the fair scheduling stats and

4147

* then put the task into the rbtree:

4147

* then put the task into the rbtree:

4148

*/

4148

*/

4149

static void

4149

static void

4150

enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

4150

enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

4151

{

4151

{

4152

struct cfs_rq *cfs_rq;

4152

struct cfs_rq *cfs_rq;

4153

struct sched_entity *se = &p->se;

4153

struct sched_entity *se = &p->se;

4154

4155

for_each_sched_entity(se) {

4155

for_each_sched_entity(se) {

4156

if (se->on_rq)

4156

if (se->on_rq)

4157

break;

4157

break;

4158

cfs_rq = cfs_rq_of(se);

4158

cfs_rq = cfs_rq_of(se);

4159

enqueue_entity(cfs_rq, se, flags);

4159

enqueue_entity(cfs_rq, se, flags);

4160

4161

/*

4161

/*

4162

* end evaluation on encountering a throttled cfs_rq

4162

* end evaluation on encountering a throttled cfs_rq

4163

*

4163

*

4164

* note: in the case of encountering a throttled cfs_rq we will

4164

* note: in the case of encountering a throttled cfs_rq we will

4165

* post the final h_nr_running increment below.

4165

* post the final h_nr_running increment below.

4166

*/

4166

*/

4167

if (cfs_rq_throttled(cfs_rq))

4167

if (cfs_rq_throttled(cfs_rq))

4168

break;

4168

break;

4169

cfs_rq->h_nr_running++;

4169

cfs_rq->h_nr_running++;

4170

4171

flags = ENQUEUE_WAKEUP;

4171

flags = ENQUEUE_WAKEUP;

4172

}

4172

}

4173

4174

for_each_sched_entity(se) {

4174

for_each_sched_entity(se) {

4175

cfs_rq = cfs_rq_of(se);

4175

cfs_rq = cfs_rq_of(se);

4176

cfs_rq->h_nr_running++;

4176

cfs_rq->h_nr_running++;

4177

4178

if (cfs_rq_throttled(cfs_rq))

4178

if (cfs_rq_throttled(cfs_rq))

4179

break;

4179

break;

4180

4181

update_cfs_shares(cfs_rq);

4181

update_cfs_shares(cfs_rq);

4182

update_entity_load_avg(se, 1);

4182

update_entity_load_avg(se, 1);

4183

}

4183

}

4184

4185

if (!se) {

4185

if (!se) {

4186

update_rq_runnable_avg(rq, rq->nr_running);

4186

update_rq_runnable_avg(rq, rq->nr_running);

4187

add_nr_running(rq, 1);

4187

add_nr_running(rq, 1);

4188

}

4188

}

4189

hrtick_update(rq);

4189

hrtick_update(rq);

4190

}

4190

}

4191

4192

static void set_next_buddy(struct sched_entity *se);

4192

static void set_next_buddy(struct sched_entity *se);

4193

4194

/*

4194

/*

4195

* The dequeue_task method is called before nr_running is

4195

* The dequeue_task method is called before nr_running is

4196

* decreased. We remove the task from the rbtree and

4196

* decreased. We remove the task from the rbtree and

4197

* update the fair scheduling stats:

4197

* update the fair scheduling stats:

4198

*/

4198

*/

4199

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

4199

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)

4200

{

4200

{

4201

struct cfs_rq *cfs_rq;

4201

struct cfs_rq *cfs_rq;

4202

struct sched_entity *se = &p->se;

4202

struct sched_entity *se = &p->se;

4203

int task_sleep = flags & DEQUEUE_SLEEP;

4203

int task_sleep = flags & DEQUEUE_SLEEP;

4204

4205

for_each_sched_entity(se) {

4205

for_each_sched_entity(se) {

4206

cfs_rq = cfs_rq_of(se);

4206

cfs_rq = cfs_rq_of(se);

4207

dequeue_entity(cfs_rq, se, flags);

4207

dequeue_entity(cfs_rq, se, flags);

4208

4209

/*

4209

/*

4210

* end evaluation on encountering a throttled cfs_rq

4210

* end evaluation on encountering a throttled cfs_rq

4211

*

4211

*

4212

* note: in the case of encountering a throttled cfs_rq we will

4212

* note: in the case of encountering a throttled cfs_rq we will

4213

* post the final h_nr_running decrement below.

4213

* post the final h_nr_running decrement below.

4214

*/

4214

*/

4215

if (cfs_rq_throttled(cfs_rq))

4215

if (cfs_rq_throttled(cfs_rq))

4216

break;

4216

break;

4217

cfs_rq->h_nr_running--;

4217

cfs_rq->h_nr_running--;

4218

4219

/* Don't dequeue parent if it has other entities besides us */

4219

/* Don't dequeue parent if it has other entities besides us */

4220

if (cfs_rq->load.weight) {

4220

if (cfs_rq->load.weight) {

4221

/*

4221

/*

4222

* Bias pick_next to pick a task from this cfs_rq, as

4222

* Bias pick_next to pick a task from this cfs_rq, as

4223

* p is sleeping when it is within its sched_slice.

4223

* p is sleeping when it is within its sched_slice.

4224

*/

4224

*/

4225

if (task_sleep && parent_entity(se))

4225

if (task_sleep && parent_entity(se))

4226

set_next_buddy(parent_entity(se));

4226

set_next_buddy(parent_entity(se));

4227

4228

/* avoid re-evaluating load for this entity */

4228

/* avoid re-evaluating load for this entity */

4229

se = parent_entity(se);

4229

se = parent_entity(se);

4230

break;

4230

break;

4231

}

4231

}

4232

flags |= DEQUEUE_SLEEP;

4232

flags |= DEQUEUE_SLEEP;

4233

}

4233

}

4234

4235

for_each_sched_entity(se) {

4235

for_each_sched_entity(se) {

4236

cfs_rq = cfs_rq_of(se);

4236

cfs_rq = cfs_rq_of(se);

4237

cfs_rq->h_nr_running--;

4237

cfs_rq->h_nr_running--;

4238

4239

if (cfs_rq_throttled(cfs_rq))

4239

if (cfs_rq_throttled(cfs_rq))

4240

break;

4240

break;

4241

4242

update_cfs_shares(cfs_rq);

4242

update_cfs_shares(cfs_rq);

4243

update_entity_load_avg(se, 1);

4243

update_entity_load_avg(se, 1);

4244

}

4244

}

4245

4246

if (!se) {

4246

if (!se) {

4247

sub_nr_running(rq, 1);

4247

sub_nr_running(rq, 1);

4248

update_rq_runnable_avg(rq, 1);

4248

update_rq_runnable_avg(rq, 1);

4249

}

4249

}

4250

hrtick_update(rq);

4250

hrtick_update(rq);

4251

}

4251

}

4252

4253

#ifdef CONFIG_SMP

4253

#ifdef CONFIG_SMP

4254

/* Used instead of source_load when we know the type == 0 */

4254

/* Used instead of source_load when we know the type == 0 */

4255

static unsigned long weighted_cpuload(const int cpu)

4255

static unsigned long weighted_cpuload(const int cpu)

4256

{

4256

{

4257

return cpu_rq(cpu)->cfs.runnable_load_avg;

4257

return cpu_rq(cpu)->cfs.runnable_load_avg;

4258

}

4258

}

4259

4260

/*

4260

/*

4261

* Return a low guess at the load of a migration-source cpu weighted

4261

* Return a low guess at the load of a migration-source cpu weighted

4262

* according to the scheduling class and "nice" value.

4262

* according to the scheduling class and "nice" value.

4263

*

4263

*

4264

* We want to under-estimate the load of migration sources, to

4264

* We want to under-estimate the load of migration sources, to

4265

* balance conservatively.

4265

* balance conservatively.

4266

*/

4266

*/

4267

static unsigned long source_load(int cpu, int type)

4267

static unsigned long source_load(int cpu, int type)

4268

{

4268

{

4269

struct rq *rq = cpu_rq(cpu);

4269

struct rq *rq = cpu_rq(cpu);

4270

unsigned long total = weighted_cpuload(cpu);

4270

unsigned long total = weighted_cpuload(cpu);

4271

4272

if (type == 0 || !sched_feat(LB_BIAS))

4272

if (type == 0 || !sched_feat(LB_BIAS))

4273

return total;

4273

return total;

4274

4275

return min(rq->cpu_load[type-1], total);

4275

return min(rq->cpu_load[type-1], total);

4276

}

4276

}

4277

4278

/*

4278

/*

4279

* Return a high guess at the load of a migration-target cpu weighted

4279

* Return a high guess at the load of a migration-target cpu weighted

4280

* according to the scheduling class and "nice" value.

4280

* according to the scheduling class and "nice" value.

4281

*/

4281

*/

4282

static unsigned long target_load(int cpu, int type)

4282

static unsigned long target_load(int cpu, int type)

4283

{

4283

{

4284

struct rq *rq = cpu_rq(cpu);

4284

struct rq *rq = cpu_rq(cpu);

4285

unsigned long total = weighted_cpuload(cpu);

4285

unsigned long total = weighted_cpuload(cpu);

4286

4287

if (type == 0 || !sched_feat(LB_BIAS))

4287

if (type == 0 || !sched_feat(LB_BIAS))

4288

return total;

4288

return total;

4289

4290

return max(rq->cpu_load[type-1], total);

4290

return max(rq->cpu_load[type-1], total);

4291

}

4291

}

4292

4293

static unsigned long capacity_of(int cpu)

4293

static unsigned long capacity_of(int cpu)

4294

{

4294

{

4295

return cpu_rq(cpu)->cpu_capacity;

4295

return cpu_rq(cpu)->cpu_capacity;

4296

}

4296

}

4297

4298

static unsigned long cpu_avg_load_per_task(int cpu)

4298

static unsigned long cpu_avg_load_per_task(int cpu)

4299

{

4299

{

4300

struct rq *rq = cpu_rq(cpu);

4300

struct rq *rq = cpu_rq(cpu);

4301

unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);

4301

unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);

4302

unsigned long load_avg = rq->cfs.runnable_load_avg;

4302

unsigned long load_avg = rq->cfs.runnable_load_avg;

4303

4304

if (nr_running)

4304

if (nr_running)

4305

return load_avg / nr_running;

4305

return load_avg / nr_running;

4306

4307

return 0;

4307

return 0;

4308

}

4308

}

4309

4310

static void record_wakee(struct task_struct *p)

4310

static void record_wakee(struct task_struct *p)

4311

{

4311

{

4312

/*

4312

/*

4313

* Rough decay (wiping) for cost saving, don't worry

4313

* Rough decay (wiping) for cost saving, don't worry

4314

* about the boundary, really active task won't care

4314

* about the boundary, really active task won't care

4315

* about the loss.

4315

* about the loss.

4316

*/

4316

*/

4317

if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {

4317

if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {

4318

current->wakee_flips >>= 1;

4318

current->wakee_flips >>= 1;

4319

current->wakee_flip_decay_ts = jiffies;

4319

current->wakee_flip_decay_ts = jiffies;

4320

}

4320

}

4321

4322

if (current->last_wakee != p) {

4322

if (current->last_wakee != p) {

4323

current->last_wakee = p;

4323

current->last_wakee = p;

4324

current->wakee_flips++;

4324

current->wakee_flips++;

4325

}

4325

}

4326

}

4326

}

4327

4328

static void task_waking_fair(struct task_struct *p)

4328

static void task_waking_fair(struct task_struct *p)

4329

{

4329

{

4330

struct sched_entity *se = &p->se;

4330

struct sched_entity *se = &p->se;

4331

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4331

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4332

u64 min_vruntime;

4332

u64 min_vruntime;

4333

4334

#ifndef CONFIG_64BIT

4334

#ifndef CONFIG_64BIT

4335

u64 min_vruntime_copy;

4335

u64 min_vruntime_copy;

4336

4337

do {

4337

do {

4338

min_vruntime_copy = cfs_rq->min_vruntime_copy;

4338

min_vruntime_copy = cfs_rq->min_vruntime_copy;

4339

smp_rmb();

4339

smp_rmb();

4340

min_vruntime = cfs_rq->min_vruntime;

4340

min_vruntime = cfs_rq->min_vruntime;

4341

} while (min_vruntime != min_vruntime_copy);

4341

} while (min_vruntime != min_vruntime_copy);

4342

#else

4342

#else

4343

min_vruntime = cfs_rq->min_vruntime;

4343

min_vruntime = cfs_rq->min_vruntime;

4344

#endif

4344

#endif

4345

4346

se->vruntime -= min_vruntime;

4346

se->vruntime -= min_vruntime;

4347

record_wakee(p);

4347

record_wakee(p);

4348

}

4348

}

4349

4350

#ifdef CONFIG_FAIR_GROUP_SCHED

4350

#ifdef CONFIG_FAIR_GROUP_SCHED

4351

/*

4351

/*

4352

* effective_load() calculates the load change as seen from the root_task_group

4352

* effective_load() calculates the load change as seen from the root_task_group

4353

*

4353

*

4354

* Adding load to a group doesn't make a group heavier, but can cause movement

4354

* Adding load to a group doesn't make a group heavier, but can cause movement

4355

* of group shares between cpus. Assuming the shares were perfectly aligned one

4355

* of group shares between cpus. Assuming the shares were perfectly aligned one

4356

* can calculate the shift in shares.

4356

* can calculate the shift in shares.

4357

*

4357

*

4358

* Calculate the effective load difference if @wl is added (subtracted) to @tg

4358

* Calculate the effective load difference if @wl is added (subtracted) to @tg

4359

* on this @cpu and results in a total addition (subtraction) of @wg to the

4359

* on this @cpu and results in a total addition (subtraction) of @wg to the

4360

* total group weight.

4360

* total group weight.

4361

*

4361

*

4362

* Given a runqueue weight distribution (rw_i) we can compute a shares

4362

* Given a runqueue weight distribution (rw_i) we can compute a shares

4363

* distribution (s_i) using:

4363

* distribution (s_i) using:

4364

*

4364

*

4365

* s_i = rw_i / \Sum rw_j (1)

4365

* s_i = rw_i / \Sum rw_j (1)

4366

*

4366

*

4367

* Suppose we have 4 CPUs and our @tg is a direct child of the root group and

4367

* Suppose we have 4 CPUs and our @tg is a direct child of the root group and

4368

* has 7 equal weight tasks, distributed as below (rw_i), with the resulting

4368

* has 7 equal weight tasks, distributed as below (rw_i), with the resulting

4369

* shares distribution (s_i):

4369

* shares distribution (s_i):

4370

*

4370

*

4371

* rw_i = { 2, 4, 1, 0 }

4371

* rw_i = { 2, 4, 1, 0 }

4372

* s_i = { 2/7, 4/7, 1/7, 0 }

4372

* s_i = { 2/7, 4/7, 1/7, 0 }

4373

*

4373

*

4374

* As per wake_affine() we're interested in the load of two CPUs (the CPU the

4374

* As per wake_affine() we're interested in the load of two CPUs (the CPU the

4375

* task used to run on and the CPU the waker is running on), we need to

4375

* task used to run on and the CPU the waker is running on), we need to

4376

* compute the effect of waking a task on either CPU and, in case of a sync

4376

* compute the effect of waking a task on either CPU and, in case of a sync

4377

* wakeup, compute the effect of the current task going to sleep.

4377

* wakeup, compute the effect of the current task going to sleep.

4378

*

4378

*

4379

* So for a change of @wl to the local @cpu with an overall group weight change

4379

* So for a change of @wl to the local @cpu with an overall group weight change

4380

* of @wl we can compute the new shares distribution (s'_i) using:

4380

* of @wl we can compute the new shares distribution (s'_i) using:

4381

*

4381

*

4382

* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)

4382

* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)

4383

*

4383

*

4384

* Suppose we're interested in CPUs 0 and 1, and want to compute the load

4384

* Suppose we're interested in CPUs 0 and 1, and want to compute the load

4385

* differences in waking a task to CPU 0. The additional task changes the

4385

* differences in waking a task to CPU 0. The additional task changes the

4386

* weight and shares distributions like:

4386

* weight and shares distributions like:

4387

*

4387

*

4388

* rw'_i = { 3, 4, 1, 0 }

4388

* rw'_i = { 3, 4, 1, 0 }

4389

* s'_i = { 3/8, 4/8, 1/8, 0 }

4389

* s'_i = { 3/8, 4/8, 1/8, 0 }

4390

*

4390

*

4391

* We can then compute the difference in effective weight by using:

4391

* We can then compute the difference in effective weight by using:

4392

*

4392

*

4393

* dw_i = S * (s'_i - s_i) (3)

4393

* dw_i = S * (s'_i - s_i) (3)

4394

*

4394

*

4395

* Where 'S' is the group weight as seen by its parent.

4395

* Where 'S' is the group weight as seen by its parent.

4396

*

4396

*

4397

* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)

4397

* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)

4398

* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -

4398

* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -

4399

* 4/7) times the weight of the group.

4399

* 4/7) times the weight of the group.

4400

*/

4400

*/

4401

static long effective_load(struct task_group *tg, int cpu, long wl, long wg)

4401

static long effective_load(struct task_group *tg, int cpu, long wl, long wg)

4402

{

4402

{

4403

struct sched_entity *se = tg->se[cpu];

4403

struct sched_entity *se = tg->se[cpu];

4404

4405

if (!tg->parent) /* the trivial, non-cgroup case */

4405

if (!tg->parent) /* the trivial, non-cgroup case */

4406

return wl;

4406

return wl;

4407

4408

for_each_sched_entity(se) {

4408

for_each_sched_entity(se) {

4409

long w, W;

4409

long w, W;

4410

4411

tg = se->my_q->tg;

4411

tg = se->my_q->tg;

4412

4413

/*

4413

/*

4414

* W = @wg + \Sum rw_j

4414

* W = @wg + \Sum rw_j

4415

*/

4415

*/

4416

W = wg + calc_tg_weight(tg, se->my_q);

4416

W = wg + calc_tg_weight(tg, se->my_q);

4417

4418

/*

4418

/*

4419

* w = rw_i + @wl

4419

* w = rw_i + @wl

4420

*/

4420

*/

4421

w = se->my_q->load.weight + wl;

4421

w = se->my_q->load.weight + wl;

4422

4423

/*

4423

/*

4424

* wl = S * s'_i; see (2)

4424

* wl = S * s'_i; see (2)

4425

*/

4425

*/

4426

if (W > 0 && w < W)

4426

if (W > 0 && w < W)

4427

wl = (w * tg->shares) / W;

4427

wl = (w * (long)tg->shares) / W;

4428

else

4428

else

4429

wl = tg->shares;

4429

wl = tg->shares;

4430

4431

/*

4431

/*

4432

* Per the above, wl is the new se->load.weight value; since

4432

* Per the above, wl is the new se->load.weight value; since

4433

* those are clipped to [MIN_SHARES, ...) do so now. See

4433

* those are clipped to [MIN_SHARES, ...) do so now. See

4434

* calc_cfs_shares().

4434

* calc_cfs_shares().

4435

*/

4435

*/

4436

if (wl < MIN_SHARES)

4436

if (wl < MIN_SHARES)

4437

wl = MIN_SHARES;

4437

wl = MIN_SHARES;

4438

4439

/*

4439

/*

4440

* wl = dw_i = S * (s'_i - s_i); see (3)

4440

* wl = dw_i = S * (s'_i - s_i); see (3)

4441

*/

4441

*/

4442

wl -= se->load.weight;

4442

wl -= se->load.weight;

4443

4444

/*

4444

/*

4445

* Recursively apply this logic to all parent groups to compute

4445

* Recursively apply this logic to all parent groups to compute

4446

* the final effective load change on the root group. Since

4446

* the final effective load change on the root group. Since

4447

* only the @tg group gets extra weight, all parent groups can

4447

* only the @tg group gets extra weight, all parent groups can

4448

* only redistribute existing shares. @wl is the shift in shares

4448

* only redistribute existing shares. @wl is the shift in shares

4449

* resulting from this level per the above.

4449

* resulting from this level per the above.

4450

*/

4450

*/

4451

wg = 0;

4451

wg = 0;

4452

}

4452

}

4453

4454

return wl;

4454

return wl;

4455

}

4455

}

4456

#else

4456

#else

4457

4458

/*
 * !CONFIG_FAIR_GROUP_SCHED: there is no group hierarchy to propagate
 * through, so the effective load change is the raw change @wl.
 * @tg, @cpu and @wg are ignored.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	long delta = wl;

	return delta;
}

4462

4463

#endif

4463

#endif

4464

4465

static int wake_wide(struct task_struct *p)

4465

static int wake_wide(struct task_struct *p)

4466

{

4466

{

4467

int factor = this_cpu_read(sd_llc_size);

4467

int factor = this_cpu_read(sd_llc_size);

4468

4469

/*

4469

/*

4470

* Yeah, it's the switching-frequency, could means many wakee or

4470

* Yeah, it's the switching-frequency, could means many wakee or

4471

* rapidly switch, use factor here will just help to automatically

4471

* rapidly switch, use factor here will just help to automatically

4472

* adjust the loose-degree, so bigger node will lead to more pull.

4472

* adjust the loose-degree, so bigger node will lead to more pull.

4473

*/

4473

*/

4474

if (p->wakee_flips > factor) {

4474

if (p->wakee_flips > factor) {

4475

/*

4475

/*

4476

* wakee is somewhat hot, it needs certain amount of cpu

4476

* wakee is somewhat hot, it needs certain amount of cpu

4477

* resource, so if waker is far more hot, prefer to leave

4477

* resource, so if waker is far more hot, prefer to leave

4478

* it alone.

4478

* it alone.

4479

*/

4479

*/

4480

if (current->wakee_flips > (factor * p->wakee_flips))

4480

if (current->wakee_flips > (factor * p->wakee_flips))

4481

return 1;

4481

return 1;

4482

}

4482

}

4483

4484

return 0;

4484

return 0;

4485

}

4485

}

4486

4487

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)

4487

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)

4488

{

4488

{

4489

s64 this_load, load;

4489

s64 this_load, load;

4490

s64 this_eff_load, prev_eff_load;

4490

s64 this_eff_load, prev_eff_load;

4491

int idx, this_cpu, prev_cpu;

4491

int idx, this_cpu, prev_cpu;

4492

struct task_group *tg;

4492

struct task_group *tg;

4493

unsigned long weight;

4493

unsigned long weight;

4494

int balanced;

4494

int balanced;

4495

4496

/*

4496

/*

4497

* If we wake multiple tasks be careful to not bounce

4497

* If we wake multiple tasks be careful to not bounce

4498

* ourselves around too much.

4498

* ourselves around too much.

4499

*/

4499

*/

4500

if (wake_wide(p))

4500

if (wake_wide(p))

4501

return 0;

4501

return 0;

4502

4503

idx = sd->wake_idx;

4503

idx = sd->wake_idx;

4504

this_cpu = smp_processor_id();

4504

this_cpu = smp_processor_id();

4505

prev_cpu = task_cpu(p);

4505

prev_cpu = task_cpu(p);

4506

load = source_load(prev_cpu, idx);

4506

load = source_load(prev_cpu, idx);

4507

this_load = target_load(this_cpu, idx);

4507

this_load = target_load(this_cpu, idx);

4508

4509

/*

4509

/*

4510

* If sync wakeup then subtract the (maximum possible)

4510

* If sync wakeup then subtract the (maximum possible)

4511

* effect of the currently running task from the load

4511

* effect of the currently running task from the load

4512

* of the current CPU:

4512

* of the current CPU:

4513

*/

4513

*/

4514

if (sync) {

4514

if (sync) {

4515

tg = task_group(current);

4515

tg = task_group(current);

4516

weight = current->se.load.weight;

4516

weight = current->se.load.weight;

4517

4518

this_load += effective_load(tg, this_cpu, -weight, -weight);

4518

this_load += effective_load(tg, this_cpu, -weight, -weight);

4519

load += effective_load(tg, prev_cpu, 0, -weight);

4519

load += effective_load(tg, prev_cpu, 0, -weight);

4520

}

4520

}

4521

4522

tg = task_group(p);

4522

tg = task_group(p);

4523

weight = p->se.load.weight;

4523

weight = p->se.load.weight;

4524

4525

/*

4525

/*

4526

* In low-load situations, where prev_cpu is idle and this_cpu is idle

4526

* In low-load situations, where prev_cpu is idle and this_cpu is idle

4527

* due to the sync cause above having dropped this_load to 0, we'll

4527

* due to the sync cause above having dropped this_load to 0, we'll

4528

* always have an imbalance, but there's really nothing you can do

4528

* always have an imbalance, but there's really nothing you can do

4529

* about that, so that's good too.

4529

* about that, so that's good too.

4530

*

4530

*

4531

* Otherwise check if either cpus are near enough in load to allow this

4531

* Otherwise check if either cpus are near enough in load to allow this

4532

* task to be woken on this_cpu.

4532

* task to be woken on this_cpu.

4533

*/

4533

*/

4534

this_eff_load = 100;

4534

this_eff_load = 100;

4535

this_eff_load *= capacity_of(prev_cpu);

4535

this_eff_load *= capacity_of(prev_cpu);

4536

4537

prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;

4537

prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;

4538

prev_eff_load *= capacity_of(this_cpu);

4538

prev_eff_load *= capacity_of(this_cpu);

4539

4540

if (this_load > 0) {

4540

if (this_load > 0) {

4541

this_eff_load *= this_load +

4541

this_eff_load *= this_load +

4542

effective_load(tg, this_cpu, weight, weight);

4542

effective_load(tg, this_cpu, weight, weight);

4543

4544

prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

4544

prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

4545

}

4545

}

4546

4547

balanced = this_eff_load <= prev_eff_load;

4547

balanced = this_eff_load <= prev_eff_load;

4548

4549

schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

4549

schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

4550

4551

if (!balanced)

4551

if (!balanced)

4552

return 0;

4552

return 0;

4553

4554

schedstat_inc(sd, ttwu_move_affine);

4554

schedstat_inc(sd, ttwu_move_affine);

4555

schedstat_inc(p, se.statistics.nr_wakeups_affine);

4555

schedstat_inc(p, se.statistics.nr_wakeups_affine);

4556

4557

return 1;

4557

return 1;

4558

}

4558

}

4559

4560

/*

4560

/*

4561

* find_idlest_group finds and returns the least busy CPU group within the

4561

* find_idlest_group finds and returns the least busy CPU group within the

4562

* domain.

4562

* domain.

4563

*/

4563

*/

4564

static struct sched_group *

4564

static struct sched_group *

4565

find_idlest_group(struct sched_domain *sd, struct task_struct *p,

4565

find_idlest_group(struct sched_domain *sd, struct task_struct *p,

4566

int this_cpu, int sd_flag)

4566

int this_cpu, int sd_flag)

4567

{

4567

{

4568

struct sched_group *idlest = NULL, *group = sd->groups;

4568

struct sched_group *idlest = NULL, *group = sd->groups;

4569

unsigned long min_load = ULONG_MAX, this_load = 0;

4569

unsigned long min_load = ULONG_MAX, this_load = 0;

4570

int load_idx = sd->forkexec_idx;

4570

int load_idx = sd->forkexec_idx;

4571

int imbalance = 100 + (sd->imbalance_pct-100)/2;

4571

int imbalance = 100 + (sd->imbalance_pct-100)/2;

4572

4573

if (sd_flag & SD_BALANCE_WAKE)

4573

if (sd_flag & SD_BALANCE_WAKE)

4574

load_idx = sd->wake_idx;

4574

load_idx = sd->wake_idx;

4575

4576

do {

4576

do {

4577

unsigned long load, avg_load;

4577

unsigned long load, avg_load;

4578

int local_group;

4578

int local_group;

4579

int i;

4579

int i;

4580

4581

/* Skip over this group if it has no CPUs allowed */

4581

/* Skip over this group if it has no CPUs allowed */

4582

if (!cpumask_intersects(sched_group_cpus(group),

4582

if (!cpumask_intersects(sched_group_cpus(group),

4583

tsk_cpus_allowed(p)))

4583

tsk_cpus_allowed(p)))

4584

continue;

4584

continue;

4585

4586

local_group = cpumask_test_cpu(this_cpu,

4586

local_group = cpumask_test_cpu(this_cpu,

4587

sched_group_cpus(group));

4587

sched_group_cpus(group));

4588

4589

/* Tally up the load of all CPUs in the group */

4589

/* Tally up the load of all CPUs in the group */

4590

avg_load = 0;

4590

avg_load = 0;

4591

4592

for_each_cpu(i, sched_group_cpus(group)) {

4592

for_each_cpu(i, sched_group_cpus(group)) {

4593

/* Bias balancing toward cpus of our domain */

4593

/* Bias balancing toward cpus of our domain */

4594

if (local_group)

4594

if (local_group)

4595

load = source_load(i, load_idx);

4595

load = source_load(i, load_idx);

4596

else

4596

else

4597

load = target_load(i, load_idx);

4597

load = target_load(i, load_idx);

4598

4599

avg_load += load;

4599

avg_load += load;

4600

}

4600

}

4601

4602

/* Adjust by relative CPU capacity of the group */

4602

/* Adjust by relative CPU capacity of the group */

4603

avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;

4603

avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;

4604

4605

if (local_group) {

4605

if (local_group) {

4606

this_load = avg_load;

4606

this_load = avg_load;

4607

} else if (avg_load < min_load) {

4607

} else if (avg_load < min_load) {

4608

min_load = avg_load;

4608

min_load = avg_load;

4609

idlest = group;

4609

idlest = group;

4610

}

4610

}

4611

} while (group = group->next, group != sd->groups);

4611

} while (group = group->next, group != sd->groups);

4612

4613

if (!idlest || 100*this_load < imbalance*min_load)

4613

if (!idlest || 100*this_load < imbalance*min_load)

4614

return NULL;

4614

return NULL;

4615

return idlest;

4615

return idlest;

4616

}

4616

}

4617

4618

/*

4618

/*

4619

* find_idlest_cpu - find the idlest cpu among the cpus in group.

4619

* find_idlest_cpu - find the idlest cpu among the cpus in group.

4620

*/

4620

*/

4621

static int

4621

static int

4622

find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)

4622

find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)

4623

{

4623

{

4624

unsigned long load, min_load = ULONG_MAX;

4624

unsigned long load, min_load = ULONG_MAX;

4625

unsigned int min_exit_latency = UINT_MAX;

4625

unsigned int min_exit_latency = UINT_MAX;

4626

u64 latest_idle_timestamp = 0;

4626

u64 latest_idle_timestamp = 0;

4627

int least_loaded_cpu = this_cpu;

4627

int least_loaded_cpu = this_cpu;

4628

int shallowest_idle_cpu = -1;

4628

int shallowest_idle_cpu = -1;

4629

int i;

4629

int i;

4630

4631

/* Traverse only the allowed CPUs */

4631

/* Traverse only the allowed CPUs */

4632

for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {

4632

for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {

4633

if (idle_cpu(i)) {

4633

if (idle_cpu(i)) {

4634

struct rq *rq = cpu_rq(i);

4634

struct rq *rq = cpu_rq(i);

4635

struct cpuidle_state *idle = idle_get_state(rq);

4635

struct cpuidle_state *idle = idle_get_state(rq);

4636

if (idle && idle->exit_latency < min_exit_latency) {

4636

if (idle && idle->exit_latency < min_exit_latency) {

4637

/*

4637

/*

4638

* We give priority to a CPU whose idle state

4638

* We give priority to a CPU whose idle state

4639

* has the smallest exit latency irrespective

4639

* has the smallest exit latency irrespective

4640

* of any idle timestamp.

4640

* of any idle timestamp.

4641

*/

4641

*/

4642

min_exit_latency = idle->exit_latency;

4642

min_exit_latency = idle->exit_latency;

4643

latest_idle_timestamp = rq->idle_stamp;

4643

latest_idle_timestamp = rq->idle_stamp;

4644

shallowest_idle_cpu = i;

4644

shallowest_idle_cpu = i;

4645

} else if ((!idle || idle->exit_latency == min_exit_latency) &&

4645

} else if ((!idle || idle->exit_latency == min_exit_latency) &&

4646

rq->idle_stamp > latest_idle_timestamp) {

4646

rq->idle_stamp > latest_idle_timestamp) {

4647

/*

4647

/*

4648

* If equal or no active idle state, then

4648

* If equal or no active idle state, then

4649

* the most recently idled CPU might have

4649

* the most recently idled CPU might have

4650

* a warmer cache.

4650

* a warmer cache.

4651

*/

4651

*/

4652

latest_idle_timestamp = rq->idle_stamp;

4652

latest_idle_timestamp = rq->idle_stamp;

4653

shallowest_idle_cpu = i;

4653

shallowest_idle_cpu = i;

4654

}

4654

}

4655

} else if (shallowest_idle_cpu == -1) {

4655

} else if (shallowest_idle_cpu == -1) {

4656

load = weighted_cpuload(i);

4656

load = weighted_cpuload(i);

4657

if (load < min_load || (load == min_load && i == this_cpu)) {

4657

if (load < min_load || (load == min_load && i == this_cpu)) {

4658

min_load = load;

4658

min_load = load;

4659

least_loaded_cpu = i;

4659

least_loaded_cpu = i;

4660

}

4660

}

4661

}

4661

}

4662

}

4662

}

4663

4664

return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;

4664

return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;

4665

}

4665

}

4666

4667

/*

4667

/*

4668

* Try and locate an idle CPU in the sched_domain.

4668

* Try and locate an idle CPU in the sched_domain.

4669

*/

4669

*/

4670

static int select_idle_sibling(struct task_struct *p, int target)

4670

static int select_idle_sibling(struct task_struct *p, int target)

4671

{

4671

{

4672

struct sched_domain *sd;

4672

struct sched_domain *sd;

4673

struct sched_group *sg;

4673

struct sched_group *sg;

4674

int i = task_cpu(p);

4674

int i = task_cpu(p);

4675

4676

if (idle_cpu(target))

4676

if (idle_cpu(target))

4677

return target;

4677

return target;

4678

4679

/*

4679

/*

4680

* If the prevous cpu is cache affine and idle, don't be stupid.

4680

* If the prevous cpu is cache affine and idle, don't be stupid.

4681

*/

4681

*/

4682

if (i != target && cpus_share_cache(i, target) && idle_cpu(i))

4682

if (i != target && cpus_share_cache(i, target) && idle_cpu(i))

4683

return i;

4683

return i;

4684

4685

/*

4685

/*

4686

* Otherwise, iterate the domains and find an elegible idle cpu.

4686

* Otherwise, iterate the domains and find an elegible idle cpu.

4687

*/

4687

*/

4688

sd = rcu_dereference(per_cpu(sd_llc, target));

4688

sd = rcu_dereference(per_cpu(sd_llc, target));

4689

for_each_lower_domain(sd) {

4689

for_each_lower_domain(sd) {

4690

sg = sd->groups;

4690

sg = sd->groups;

4691

do {

4691

do {

4692

if (!cpumask_intersects(sched_group_cpus(sg),

4692

if (!cpumask_intersects(sched_group_cpus(sg),

4693

tsk_cpus_allowed(p)))

4693

tsk_cpus_allowed(p)))

4694

goto next;

4694

goto next;

4695

4696

for_each_cpu(i, sched_group_cpus(sg)) {

4696

for_each_cpu(i, sched_group_cpus(sg)) {

4697

if (i == target || !idle_cpu(i))

4697

if (i == target || !idle_cpu(i))

4698

goto next;

4698

goto next;

4699

}

4699

}

4700

4701

target = cpumask_first_and(sched_group_cpus(sg),

4701

target = cpumask_first_and(sched_group_cpus(sg),

4702

tsk_cpus_allowed(p));

4702

tsk_cpus_allowed(p));

4703

goto done;

4703

goto done;

4704

sg = sg->next;

4705

sg = sg->next;

4706

} while (sg != sd->groups);

4706

} while (sg != sd->groups);

4707

}

4707

}

4708

done:

4708

done:

4709

return target;

4709

return target;

4710

}

4710

}

4711

4712

/*

4712

/*

4713

* select_task_rq_fair: Select target runqueue for the waking task in domains

4713

* select_task_rq_fair: Select target runqueue for the waking task in domains

4714

* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,

4714

* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,

4715

* SD_BALANCE_FORK, or SD_BALANCE_EXEC.

4715

* SD_BALANCE_FORK, or SD_BALANCE_EXEC.

4716

*

4716

*

4717

* Balances load by selecting the idlest cpu in the idlest group, or under

4717

* Balances load by selecting the idlest cpu in the idlest group, or under

4718

* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.

4718

* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.

4719

*

4719

*

4720

* Returns the target cpu number.

4720

* Returns the target cpu number.

4721

*

4721

*

4722

* preempt must be disabled.

4722

* preempt must be disabled.

4723

*/

4723

*/

4724

static int

4724

static int

4725

select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)

4725

select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)

4726

{

4726

{

4727

struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;

4727

struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;

4728

int cpu = smp_processor_id();

4728

int cpu = smp_processor_id();

4729

int new_cpu = cpu;

4729

int new_cpu = cpu;

4730

int want_affine = 0;

4730

int want_affine = 0;

4731

int sync = wake_flags & WF_SYNC;

4731

int sync = wake_flags & WF_SYNC;

4732

4733

if (sd_flag & SD_BALANCE_WAKE)

4733

if (sd_flag & SD_BALANCE_WAKE)

4734

want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

4734

want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

4735

4736

rcu_read_lock();

4736

rcu_read_lock();

4737

for_each_domain(cpu, tmp) {

4737

for_each_domain(cpu, tmp) {

4738

if (!(tmp->flags & SD_LOAD_BALANCE))

4738

if (!(tmp->flags & SD_LOAD_BALANCE))

4739

continue;

4739

continue;

4740

4741

/*

4741

/*

4742

* If both cpu and prev_cpu are part of this domain,

4742

* If both cpu and prev_cpu are part of this domain,

4743

* cpu is a valid SD_WAKE_AFFINE target.

4743

* cpu is a valid SD_WAKE_AFFINE target.

4744

*/

4744

*/

4745

if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&

4745

if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&

4746

cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {

4746

cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {

4747

affine_sd = tmp;

4747

affine_sd = tmp;

4748

break;

4748

break;

4749

}

4749

}

4750

4751

if (tmp->flags & sd_flag)

4751

if (tmp->flags & sd_flag)

4752

sd = tmp;

4752

sd = tmp;

4753

}

4753

}

4754

4755

if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))

4755

if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))

4756

prev_cpu = cpu;

4756

prev_cpu = cpu;

4757

4758

if (sd_flag & SD_BALANCE_WAKE) {

4758

if (sd_flag & SD_BALANCE_WAKE) {

4759

new_cpu = select_idle_sibling(p, prev_cpu);

4759

new_cpu = select_idle_sibling(p, prev_cpu);

4760

goto unlock;

4760

goto unlock;

4761

}

4761

}

4762

4763

while (sd) {

4763

while (sd) {

4764

struct sched_group *group;

4764

struct sched_group *group;

4765

int weight;

4765

int weight;

4766

4767

if (!(sd->flags & sd_flag)) {

4767

if (!(sd->flags & sd_flag)) {

4768

sd = sd->child;

4768

sd = sd->child;

4769

continue;

4769

continue;

4770

}

4770

}

4771

4772

group = find_idlest_group(sd, p, cpu, sd_flag);

4772

group = find_idlest_group(sd, p, cpu, sd_flag);

4773

if (!group) {

4773

if (!group) {

4774

sd = sd->child;

4774

sd = sd->child;

4775

continue;

4775

continue;

4776

}

4776

}

4777

4778

new_cpu = find_idlest_cpu(group, p, cpu);

4778

new_cpu = find_idlest_cpu(group, p, cpu);

4779

if (new_cpu == -1 || new_cpu == cpu) {

4779

if (new_cpu == -1 || new_cpu == cpu) {

4780

/* Now try balancing at a lower domain level of cpu */

4780

/* Now try balancing at a lower domain level of cpu */

4781

sd = sd->child;

4781

sd = sd->child;

4782

continue;

4782

continue;

4783

}

4783

}

4784

4785

/* Now try balancing at a lower domain level of new_cpu */

4785

/* Now try balancing at a lower domain level of new_cpu */

4786

cpu = new_cpu;

4786

cpu = new_cpu;

4787

weight = sd->span_weight;

4787

weight = sd->span_weight;

4788

sd = NULL;

4788

sd = NULL;

4789

for_each_domain(cpu, tmp) {

4789

for_each_domain(cpu, tmp) {

4790

if (weight <= tmp->span_weight)

4790

if (weight <= tmp->span_weight)

4791

break;

4791

break;

4792

if (tmp->flags & sd_flag)

4792

if (tmp->flags & sd_flag)

4793

sd = tmp;

4793

sd = tmp;

4794

}

4794

}

4795

/* while loop will break here if sd == NULL */

4795

/* while loop will break here if sd == NULL */

4796

}

4796

}

4797

unlock:

4797

unlock:

4798

rcu_read_unlock();

4798

rcu_read_unlock();

4799

4800

return new_cpu;

4800

return new_cpu;

4801

}

4801

}

4802

4803

/*

4803

/*

4804

* Called immediately before a task is migrated to a new cpu; task_cpu(p) and

4804

* Called immediately before a task is migrated to a new cpu; task_cpu(p) and

4805

* cfs_rq_of(p) references at time of call are still valid and identify the

4805

* cfs_rq_of(p) references at time of call are still valid and identify the

4806

* previous cpu. However, the caller only guarantees p->pi_lock is held; no

4806

* previous cpu. However, the caller only guarantees p->pi_lock is held; no

4807

* other assumptions, including the state of rq->lock, should be made.

4807

* other assumptions, including the state of rq->lock, should be made.

4808

*/

4808

*/

4809

static void

4809

static void

4810

migrate_task_rq_fair(struct task_struct *p, int next_cpu)

4810

migrate_task_rq_fair(struct task_struct *p, int next_cpu)

4811

{

4811

{

4812

struct sched_entity *se = &p->se;

4812

struct sched_entity *se = &p->se;

4813

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4813

struct cfs_rq *cfs_rq = cfs_rq_of(se);

4814

4815

/*

4815

/*

4816

* Load tracking: accumulate removed load so that it can be processed

4816

* Load tracking: accumulate removed load so that it can be processed

4817

* when we next update owning cfs_rq under rq->lock. Tasks contribute

4817

* when we next update owning cfs_rq under rq->lock. Tasks contribute

4818

* to blocked load iff they have a positive decay-count. It can never

4818

* to blocked load iff they have a positive decay-count. It can never

4819

* be negative here since on-rq tasks have decay-count == 0.

4819

* be negative here since on-rq tasks have decay-count == 0.

4820

*/

4820

*/

4821

if (se->avg.decay_count) {

4821

if (se->avg.decay_count) {

4822

se->avg.decay_count = -__synchronize_entity_decay(se);

4822

se->avg.decay_count = -__synchronize_entity_decay(se);

4823

atomic_long_add(se->avg.load_avg_contrib,

4823

atomic_long_add(se->avg.load_avg_contrib,

4824

&cfs_rq->removed_load);

4824

&cfs_rq->removed_load);

4825

}

4825

}

4826

4827

/* We have migrated, no longer consider this task hot */

4827

/* We have migrated, no longer consider this task hot */

4828

se->exec_start = 0;

4828

se->exec_start = 0;

4829

}

4829

}

4830

#endif /* CONFIG_SMP */

4830

#endif /* CONFIG_SMP */

4831

4832

static unsigned long

4832

static unsigned long

4833

wakeup_gran(struct sched_entity *curr, struct sched_entity *se)

4833

wakeup_gran(struct sched_entity *curr, struct sched_entity *se)

4834

{

4834

{

4835

unsigned long gran = sysctl_sched_wakeup_granularity;

4835

unsigned long gran = sysctl_sched_wakeup_granularity;

4836

4837

/*

4837

/*

4838

* Since its curr running now, convert the gran from real-time

4838

* Since its curr running now, convert the gran from real-time

4839

* to virtual-time in his units.

4839

* to virtual-time in his units.

4840

*

4840

*

4841

* By using 'se' instead of 'curr' we penalize light tasks, so

4841

* By using 'se' instead of 'curr' we penalize light tasks, so

4842

* they get preempted easier. That is, if 'se' < 'curr' then

4842

* they get preempted easier. That is, if 'se' < 'curr' then

4843

* the resulting gran will be larger, therefore penalizing the

4843

* the resulting gran will be larger, therefore penalizing the

4844

* lighter, if otoh 'se' > 'curr' then the resulting gran will

4844

* lighter, if otoh 'se' > 'curr' then the resulting gran will

4845

* be smaller, again penalizing the lighter task.

4845

* be smaller, again penalizing the lighter task.

4846

*

4846

*

4847

* This is especially important for buddies when the leftmost

4847

* This is especially important for buddies when the leftmost

4848

* task is higher priority than the buddy.

4848

* task is higher priority than the buddy.

4849

*/

4849

*/

4850

return calc_delta_fair(gran, se);

4850

return calc_delta_fair(gran, se);

4851

}

4851

}

4852

4853

/*

4853

/*

4854

* Should 'se' preempt 'curr'.

4854

* Should 'se' preempt 'curr'.

4855

*

4855

*

4856

* |s1

4856

* |s1

4857

* |s2

4857

* |s2

4858

* |s3

4858

* |s3

4859

* g

4859

* g

4860

* |<--->|c

4860

* |<--->|c

4861

*

4861

*

4862

* w(c, s1) = -1

4862

* w(c, s1) = -1

4863

* w(c, s2) = 0

4863

* w(c, s2) = 0

4864

* w(c, s3) = 1

4864

* w(c, s3) = 1

4865

*

4865

*

4866

*/

4866

*/

4867

static int

4867

static int

4868

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)

4868

wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)

4869

{

4869

{

4870

s64 gran, vdiff = curr->vruntime - se->vruntime;

4870

s64 gran, vdiff = curr->vruntime - se->vruntime;

4871

4872

if (vdiff <= 0)

4872

if (vdiff <= 0)

4873

return -1;

4873

return -1;

4874

4875

gran = wakeup_gran(curr, se);

4875

gran = wakeup_gran(curr, se);

4876

if (vdiff > gran)

4876

if (vdiff > gran)

4877

return 1;

4877

return 1;

4878

4879

return 0;

4879

return 0;

4880

}

4880

}

4881

4882

static void set_last_buddy(struct sched_entity *se)

4882

static void set_last_buddy(struct sched_entity *se)

4883

{

4883

{

4884

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4884

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4885

return;

4885

return;

4886

4887

for_each_sched_entity(se)

4887

for_each_sched_entity(se)

4888

cfs_rq_of(se)->last = se;

4888

cfs_rq_of(se)->last = se;

4889

}

4889

}

4890

4891

static void set_next_buddy(struct sched_entity *se)

4891

static void set_next_buddy(struct sched_entity *se)

4892

{

4892

{

4893

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4893

if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))

4894

return;

4894

return;

4895

4896

for_each_sched_entity(se)

4896

for_each_sched_entity(se)

4897

cfs_rq_of(se)->next = se;

4897

cfs_rq_of(se)->next = se;

4898

}

4898

}

4899

4900

static void set_skip_buddy(struct sched_entity *se)

4900

static void set_skip_buddy(struct sched_entity *se)

4901

{

4901

{

4902

for_each_sched_entity(se)

4902

for_each_sched_entity(se)

4903

cfs_rq_of(se)->skip = se;

4903

cfs_rq_of(se)->skip = se;

4904

}

4904

}

4905

4906

/*

4906

/*

4907

* Preempt the current task with a newly woken task if needed:

4907

* Preempt the current task with a newly woken task if needed:

4908

*/

4908

*/

4909

static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

4909

static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

4910

{

4910

{

4911

struct task_struct *curr = rq->curr;

4911

struct task_struct *curr = rq->curr;

4912

struct sched_entity *se = &curr->se, *pse = &p->se;

4912

struct sched_entity *se = &curr->se, *pse = &p->se;

4913

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

4913

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

4914

int scale = cfs_rq->nr_running >= sched_nr_latency;

4914

int scale = cfs_rq->nr_running >= sched_nr_latency;

4915

int next_buddy_marked = 0;

4915

int next_buddy_marked = 0;

4916

4917

if (unlikely(se == pse))

4917

if (unlikely(se == pse))

4918

return;

4918

return;

4919

4920

/*

4920

/*

4921

* This is possible from callers such as attach_tasks(), in which we

4921

* This is possible from callers such as attach_tasks(), in which we

4922

* unconditionally check_prempt_curr() after an enqueue (which may have

4922

* unconditionally check_prempt_curr() after an enqueue (which may have

4923

* lead to a throttle). This both saves work and prevents false

4923

* lead to a throttle). This both saves work and prevents false

4924

* next-buddy nomination below.

4924

* next-buddy nomination below.

4925

*/

4925

*/

4926

if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))

4926

if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))

4927

return;

4927

return;

4928

4929

if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {

4929

if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {

4930

set_next_buddy(pse);

4930

set_next_buddy(pse);

4931

next_buddy_marked = 1;

4931

next_buddy_marked = 1;

4932

}

4932

}

4933

4934

/*

4934

/*

4935

* We can come here with TIF_NEED_RESCHED already set from new task

4935

* We can come here with TIF_NEED_RESCHED already set from new task

4936

* wake up path.

4936

* wake up path.

4937

*

4937

*

4938

* Note: this also catches the edge-case of curr being in a throttled

4938

* Note: this also catches the edge-case of curr being in a throttled

4939

* group (e.g. via set_curr_task), since update_curr() (in the

4939

* group (e.g. via set_curr_task), since update_curr() (in the

4940

* enqueue of curr) will have resulted in resched being set. This

4940

* enqueue of curr) will have resulted in resched being set. This

4941

* prevents us from potentially nominating it as a false LAST_BUDDY

4941

* prevents us from potentially nominating it as a false LAST_BUDDY

4942

* below.

4942

* below.

4943

*/

4943

*/

4944

if (test_tsk_need_resched(curr))

4944

if (test_tsk_need_resched(curr))

4945

return;

4945

return;

4946

4947

/* Idle tasks are by definition preempted by non-idle tasks. */

4947

/* Idle tasks are by definition preempted by non-idle tasks. */

4948

if (unlikely(curr->policy == SCHED_IDLE) &&

4948

if (unlikely(curr->policy == SCHED_IDLE) &&

4949

likely(p->policy != SCHED_IDLE))

4949

likely(p->policy != SCHED_IDLE))

4950

goto preempt;

4950

goto preempt;

4951

4952

/*

4952

/*

4953

* Batch and idle tasks do not preempt non-idle tasks (their preemption

4953

* Batch and idle tasks do not preempt non-idle tasks (their preemption

4954

* is driven by the tick):

4954

* is driven by the tick):

4955

*/

4955

*/

4956

if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))

4956

if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))

4957

return;

4957

return;

4958

4959

find_matching_se(&se, &pse);

4959

find_matching_se(&se, &pse);

4960

update_curr(cfs_rq_of(se));

4960

update_curr(cfs_rq_of(se));

4961

BUG_ON(!pse);

4961

BUG_ON(!pse);

4962

if (wakeup_preempt_entity(se, pse) == 1) {

4962

if (wakeup_preempt_entity(se, pse) == 1) {

4963

/*

4963

/*

4964

* Bias pick_next to pick the sched entity that is

4964

* Bias pick_next to pick the sched entity that is

4965

* triggering this preemption.

4965

* triggering this preemption.

4966

*/

4966

*/

4967

if (!next_buddy_marked)

4967

if (!next_buddy_marked)

4968

set_next_buddy(pse);

4968

set_next_buddy(pse);

4969

goto preempt;

4969

goto preempt;

4970

}

4970

}

4971

4972

return;

4972

return;

4973

4974

preempt:

4974

preempt:

4975

resched_curr(rq);

4975

resched_curr(rq);

4976

/*

4976

/*

4977

* Only set the backward buddy when the current task is still

4977

* Only set the backward buddy when the current task is still

4978

* on the rq. This can happen when a wakeup gets interleaved

4978

* on the rq. This can happen when a wakeup gets interleaved

4979

* with schedule on the ->pre_schedule() or idle_balance()

4979

* with schedule on the ->pre_schedule() or idle_balance()

4980

* point, either of which can * drop the rq lock.

4980

* point, either of which can * drop the rq lock.

4981

*

4981

*

4982

* Also, during early boot the idle thread is in the fair class,

4982

* Also, during early boot the idle thread is in the fair class,

4983

* for obvious reasons its a bad idea to schedule back to it.

4983

* for obvious reasons its a bad idea to schedule back to it.

4984

*/

4984

*/

4985

if (unlikely(!se->on_rq || curr == rq->idle))

4985

if (unlikely(!se->on_rq || curr == rq->idle))

4986

return;

4986

return;

4987

4988

if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))

4988

if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))

4989

set_last_buddy(se);

4989

set_last_buddy(se);

4990

}

4990

}

4991

4992

static struct task_struct *

4992

static struct task_struct *

4993

pick_next_task_fair(struct rq *rq, struct task_struct *prev)

4993

pick_next_task_fair(struct rq *rq, struct task_struct *prev)

4994

{

4994

{

4995

struct cfs_rq *cfs_rq = &rq->cfs;

4995

struct cfs_rq *cfs_rq = &rq->cfs;

4996

struct sched_entity *se;

4996

struct sched_entity *se;

4997

struct task_struct *p;

4997

struct task_struct *p;

4998

int new_tasks;

4998

int new_tasks;

4999

5000

again:

5000

again:

5001

#ifdef CONFIG_FAIR_GROUP_SCHED

5001

#ifdef CONFIG_FAIR_GROUP_SCHED

5002

if (!cfs_rq->nr_running)

5002

if (!cfs_rq->nr_running)

5003

goto idle;

5003

goto idle;

5004

5005

if (prev->sched_class != &fair_sched_class)

5005

if (prev->sched_class != &fair_sched_class)

5006

goto simple;

5006

goto simple;

5007

5008

/*

5008

/*

5009

* Because of the set_next_buddy() in dequeue_task_fair() it is rather

5009

* Because of the set_next_buddy() in dequeue_task_fair() it is rather

5010

* likely that a next task is from the same cgroup as the current.

5010

* likely that a next task is from the same cgroup as the current.

5011

*

5011

*

5012

* Therefore attempt to avoid putting and setting the entire cgroup

5012

* Therefore attempt to avoid putting and setting the entire cgroup

5013

* hierarchy, only change the part that actually changes.

5013

* hierarchy, only change the part that actually changes.

5014

*/

5014

*/

5015

5016

do {

5016

do {

5017

struct sched_entity *curr = cfs_rq->curr;

5017

struct sched_entity *curr = cfs_rq->curr;

5018

5019

/*

5019

/*

5020

* Since we got here without doing put_prev_entity() we also

5020

* Since we got here without doing put_prev_entity() we also

5021

* have to consider cfs_rq->curr. If it is still a runnable

5021

* have to consider cfs_rq->curr. If it is still a runnable

5022

* entity, update_curr() will update its vruntime, otherwise

5022

* entity, update_curr() will update its vruntime, otherwise

5023

* forget we've ever seen it.

5023

* forget we've ever seen it.

5024

*/

5024

*/

5025

if (curr && curr->on_rq)

5025

if (curr && curr->on_rq)

5026

update_curr(cfs_rq);

5026

update_curr(cfs_rq);

5027

else

5027

else

5028

curr = NULL;

5028

curr = NULL;

5029

5030

/*

5030

/*

5031

* This call to check_cfs_rq_runtime() will do the throttle and

5031

* This call to check_cfs_rq_runtime() will do the throttle and

5032

* dequeue its entity in the parent(s). Therefore the 'simple'

5032

* dequeue its entity in the parent(s). Therefore the 'simple'

5033

* nr_running test will indeed be correct.

5033

* nr_running test will indeed be correct.

5034

*/

5034

*/

5035

if (unlikely(check_cfs_rq_runtime(cfs_rq)))

5035

if (unlikely(check_cfs_rq_runtime(cfs_rq)))

5036

goto simple;

5036

goto simple;

5037

5038

se = pick_next_entity(cfs_rq, curr);

5038

se = pick_next_entity(cfs_rq, curr);

5039

cfs_rq = group_cfs_rq(se);

5039

cfs_rq = group_cfs_rq(se);

5040

} while (cfs_rq);

5040

} while (cfs_rq);

5041

5042

p = task_of(se);

5042

p = task_of(se);

5043

5044

/*

5044

/*

5045

* Since we haven't yet done put_prev_entity and if the selected task

5045

* Since we haven't yet done put_prev_entity and if the selected task

5046

* is a different task than we started out with, try and touch the

5046

* is a different task than we started out with, try and touch the

5047

* least amount of cfs_rqs.

5047

* least amount of cfs_rqs.

5048

*/

5048

*/

5049

if (prev != p) {

5049

if (prev != p) {

5050

struct sched_entity *pse = &prev->se;

5050

struct sched_entity *pse = &prev->se;

5051

5052

while (!(cfs_rq = is_same_group(se, pse))) {

5052

while (!(cfs_rq = is_same_group(se, pse))) {

5053

int se_depth = se->depth;

5053

int se_depth = se->depth;

5054

int pse_depth = pse->depth;

5054

int pse_depth = pse->depth;

5055

5056

if (se_depth <= pse_depth) {

5056

if (se_depth <= pse_depth) {

5057

put_prev_entity(cfs_rq_of(pse), pse);

5057

put_prev_entity(cfs_rq_of(pse), pse);

5058

pse = parent_entity(pse);

5058

pse = parent_entity(pse);

5059

}

5059

}

5060

if (se_depth >= pse_depth) {

5060

if (se_depth >= pse_depth) {

5061

set_next_entity(cfs_rq_of(se), se);

5061

set_next_entity(cfs_rq_of(se), se);

5062

se = parent_entity(se);

5062

se = parent_entity(se);

5063

}

5063

}

5064

}

5064

}

5065

5066

put_prev_entity(cfs_rq, pse);

5066

put_prev_entity(cfs_rq, pse);

5067

set_next_entity(cfs_rq, se);

5067

set_next_entity(cfs_rq, se);

5068

}

5068

}

5069

5070

if (hrtick_enabled(rq))

5070

if (hrtick_enabled(rq))

5071

hrtick_start_fair(rq, p);

5071

hrtick_start_fair(rq, p);

5072

5073

return p;

5073

return p;

5074

simple:

5074

simple:

5075

cfs_rq = &rq->cfs;

5075

cfs_rq = &rq->cfs;

5076

#endif

5076

#endif

5077

5078

if (!cfs_rq->nr_running)

5078

if (!cfs_rq->nr_running)

5079

goto idle;

5079

goto idle;

5080

5081

put_prev_task(rq, prev);

5081

put_prev_task(rq, prev);

5082

5083

do {

5083

do {

5084

se = pick_next_entity(cfs_rq, NULL);

5084

se = pick_next_entity(cfs_rq, NULL);

5085

set_next_entity(cfs_rq, se);

5085

set_next_entity(cfs_rq, se);

5086

cfs_rq = group_cfs_rq(se);

5086

cfs_rq = group_cfs_rq(se);

5087

} while (cfs_rq);

5087

} while (cfs_rq);

5088

5089

p = task_of(se);

5089

p = task_of(se);

5090

5091

if (hrtick_enabled(rq))

5091

if (hrtick_enabled(rq))

5092

hrtick_start_fair(rq, p);

5092

hrtick_start_fair(rq, p);

5093

5094

return p;

5094

return p;

5095

5096

idle:

5096

idle:

5097

new_tasks = idle_balance(rq);

5097

new_tasks = idle_balance(rq);

5098

/*

5098

/*

5099

* Because idle_balance() releases (and re-acquires) rq->lock, it is

5099

* Because idle_balance() releases (and re-acquires) rq->lock, it is

5100

* possible for any higher priority task to appear. In that case we

5100

* possible for any higher priority task to appear. In that case we

5101

* must re-start the pick_next_entity() loop.

5101

* must re-start the pick_next_entity() loop.

5102

*/

5102

*/

5103

if (new_tasks < 0)

5103

if (new_tasks < 0)

5104

return RETRY_TASK;

5104

return RETRY_TASK;

5105

5106

if (new_tasks > 0)

5106

if (new_tasks > 0)

5107

goto again;

5107

goto again;

5108

5109

return NULL;

5109

return NULL;

5110

}

5110

}

5111

5112

/*

5112

/*

5113

* Account for a descheduled task:

5113

* Account for a descheduled task:

5114

*/

5114

*/

5115

static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)

5115

static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)

5116

{

5116

{

5117

struct sched_entity *se = &prev->se;

5117

struct sched_entity *se = &prev->se;

5118

struct cfs_rq *cfs_rq;

5118

struct cfs_rq *cfs_rq;

5119

5120

for_each_sched_entity(se) {

5120

for_each_sched_entity(se) {

5121

cfs_rq = cfs_rq_of(se);

5121

cfs_rq = cfs_rq_of(se);

5122

put_prev_entity(cfs_rq, se);

5122

put_prev_entity(cfs_rq, se);

5123

}

5123

}

5124

}

5124

}

5125

5126

/*

5126

/*

5127

* sched_yield() is very simple

5127

* sched_yield() is very simple

5128

*

5128

*

5129

* The magic of dealing with the ->skip buddy is in pick_next_entity.

5129

* The magic of dealing with the ->skip buddy is in pick_next_entity.

5130

*/

5130

*/

5131

static void yield_task_fair(struct rq *rq)

5131

static void yield_task_fair(struct rq *rq)

5132

{

5132

{

5133

struct task_struct *curr = rq->curr;

5133

struct task_struct *curr = rq->curr;

5134

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

5134

struct cfs_rq *cfs_rq = task_cfs_rq(curr);

5135

struct sched_entity *se = &curr->se;

5135

struct sched_entity *se = &curr->se;

5136

5137

/*

5137

/*

5138

* Are we the only task in the tree?

5138

* Are we the only task in the tree?

5139

*/

5139

*/

5140

if (unlikely(rq->nr_running == 1))

5140

if (unlikely(rq->nr_running == 1))

5141

return;

5141

return;

5142

5143

clear_buddies(cfs_rq, se);

5143

clear_buddies(cfs_rq, se);

5144

5145

if (curr->policy != SCHED_BATCH) {

5145

if (curr->policy != SCHED_BATCH) {

5146

update_rq_clock(rq);

5146

update_rq_clock(rq);

5147

/*

5147

/*

5148

* Update run-time statistics of the 'current'.

5148

* Update run-time statistics of the 'current'.

5149

*/

5149

*/

5150

update_curr(cfs_rq);

5150

update_curr(cfs_rq);

5151

/*

5151

/*

5152

* Tell update_rq_clock() that we've just updated,

5152

* Tell update_rq_clock() that we've just updated,

5153

* so we don't do microscopic update in schedule()

5153

* so we don't do microscopic update in schedule()

5154

* and double the fastpath cost.

5154

* and double the fastpath cost.

5155

*/

5155

*/

5156

rq->skip_clock_update = 1;

5156

rq->skip_clock_update = 1;

5157

}

5157

}

5158

5159

set_skip_buddy(se);

5159

set_skip_buddy(se);

5160

}

5160

}

5161

5162

/*
 * Yield the CPU of @rq in favour of task @p.
 *
 * Returns false without touching anything when @p's entity is not queued
 * or its cfs_rq sits in a throttled hierarchy; otherwise marks @p's entity
 * as the next buddy, performs a regular yield on @rq and returns true.
 * @preempt is not used by this function.
 */
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
{
	struct sched_entity *target = &p->se;

	if (!target->on_rq)
		return false;

	/* throttled hierarchies are not runnable */
	if (throttled_hierarchy(cfs_rq_of(target)))
		return false;

	/* Tell the scheduler that we'd really like the target to run next. */
	set_next_buddy(target);

	yield_task_fair(rq);

	return true;
}

5177

5178

#ifdef CONFIG_SMP

5178

#ifdef CONFIG_SMP

5179

/**************************************************

5179

/**************************************************

5180

* Fair scheduling class load-balancing methods.

5180

* Fair scheduling class load-balancing methods.

5181

*

5181

*

5182

* BASICS

5182

* BASICS

5183

*

5183

*

5184

* The purpose of load-balancing is to achieve the same basic fairness the

5184

* The purpose of load-balancing is to achieve the same basic fairness the

5185

* per-cpu scheduler provides, namely provide a proportional amount of compute

5185

* per-cpu scheduler provides, namely provide a proportional amount of compute

5186

* time to each task. This is expressed in the following equation:

5186

* time to each task. This is expressed in the following equation:

5187

*

5187

*

5188

*   W_i,n/C_i == W_j,n/C_j for all i,j                               (1)

5189

*

5189

*

5190

* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight

5190

* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight

5191

* W_i,0 is defined as:

5191

* W_i,0 is defined as:

5192

*

5192

*

5193

* W_i,0 = \Sum_j w_i,j (2)

5193

* W_i,0 = \Sum_j w_i,j (2)

5194

*

5194

*

5195

* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight

5195

* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight

5196

* is derived from the nice value as per prio_to_weight[].

5196

* is derived from the nice value as per prio_to_weight[].

5197

*

5197

*

5198

* The weight average is an exponential decay average of the instantaneous

5198

* The weight average is an exponential decay average of the instantaneous

5199

* weight:

5199

* weight:

5200

*

5200

*

5201

* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)

5201

* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)

5202

*

5202

*

5203

* C_i is the compute capacity of cpu i, typically it is the

5203

* C_i is the compute capacity of cpu i, typically it is the

5204

* fraction of 'recent' time available for SCHED_OTHER task execution. But it

5204

* fraction of 'recent' time available for SCHED_OTHER task execution. But it

5205

* can also include other factors [XXX].

5205

* can also include other factors [XXX].

5206

*

5206

*

5207

* To achieve this balance we define a measure of imbalance which follows

5207

* To achieve this balance we define a measure of imbalance which follows

5208

* directly from (1):

5208

* directly from (1):

5209

*

5209

*

5210

* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)

5210

* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)

5211

*

5211

*

5212

* We then move tasks around to minimize the imbalance. In the continuous

5213

* function space it is obvious this converges, in the discrete case we get

5213

* function space it is obvious this converges, in the discrete case we get

5214

* a few fun cases generally called infeasible weight scenarios.

5214

* a few fun cases generally called infeasible weight scenarios.

5215

*

5215

*

5216

* [XXX expand on:

5216

* [XXX expand on:

5217

* - infeasible weights;

5217

* - infeasible weights;

5218

* - local vs global optima in the discrete case. ]

5218

* - local vs global optima in the discrete case. ]

5219

*

5219

*

5220

*

5220

*

5221

* SCHED DOMAINS

5221

* SCHED DOMAINS

5222

*

5222

*

5223

* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)

5223

* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)

5224

* for all i,j solution, we create a tree of cpus that follows the hardware

5224

* for all i,j solution, we create a tree of cpus that follows the hardware

5225

* topology where each level pairs two lower groups (or better). This results

5225

* topology where each level pairs two lower groups (or better). This results

5226

* in O(log n) layers. Furthermore we reduce the number of cpus going up the

5226

* in O(log n) layers. Furthermore we reduce the number of cpus going up the

5227

* tree to only the first of the previous level and we decrease the frequency

5227

* tree to only the first of the previous level and we decrease the frequency

5228

* of load-balance at each level inv. proportional to the number of cpus in

5228

* of load-balance at each level inv. proportional to the number of cpus in

5229

* the groups.

5229

* the groups.

5230

*

5230

*

5231

* This yields:

5231

* This yields:

5232

*

5232

*

5233

*     log_2 n     1     n
*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
*     i = 0      2^i   2^i
*                               `- size of each group
*         |         |     `- number of cpus doing load-balance
*         |         `- freq
*         `- sum over all levels

5240

*

5240

*

5241

* Coupled with a limit on how many tasks we can migrate every balance pass,

5241

* Coupled with a limit on how many tasks we can migrate every balance pass,

5242

* this makes (5) the runtime complexity of the balancer.

5242

* this makes (5) the runtime complexity of the balancer.

5243

*

5243

*

5244

* An important property here is that each CPU is still (indirectly) connected

5244

* An important property here is that each CPU is still (indirectly) connected

5245

* to every other cpu in at most O(log n) steps:

5245

* to every other cpu in at most O(log n) steps:

5246

*

5246

*

5247

* The adjacency matrix of the resulting graph is given by:

5247

* The adjacency matrix of the resulting graph is given by:

5248

*

5248

*

5249

*                      log_2 n
*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
*                      k = 0

5252

*

5252

*

5253

* And you'll find that:

5253

* And you'll find that:

5254

*

5254

*

5255

* A^(log_2 n)_i,j != 0 for all i,j (7)

5255

* A^(log_2 n)_i,j != 0 for all i,j (7)

5256

*

5256

*

5257

* Showing there's indeed a path between every cpu in at most O(log n) steps.

5257

* Showing there's indeed a path between every cpu in at most O(log n) steps.

5258

* The task movement gives a factor of O(m), giving a convergence complexity

5258

* The task movement gives a factor of O(m), giving a convergence complexity

5259

* of:

5259

* of:

5260

*

5260

*

5261

* O(nm log n), n := nr_cpus, m := nr_tasks (8)

5261

* O(nm log n), n := nr_cpus, m := nr_tasks (8)

5262

*

5262

*

5263

*

5263

*

5264

* WORK CONSERVING

5264

* WORK CONSERVING

5265

*

5265

*

5266

* In order to avoid CPUs going idle while there's still work to do, new idle

5266

* In order to avoid CPUs going idle while there's still work to do, new idle

5267

* balancing is more aggressive and has the newly idle cpu iterate up the domain

5267

* balancing is more aggressive and has the newly idle cpu iterate up the domain

5268

* tree itself instead of relying on other CPUs to bring it work.

5268

* tree itself instead of relying on other CPUs to bring it work.

5269

*

5269

*

5270

* This adds some complexity to both (5) and (8) but it reduces the total idle

5270

* This adds some complexity to both (5) and (8) but it reduces the total idle

5271

* time.

5271

* time.

5272

*

5272

*

5273

* [XXX more?]

5273

* [XXX more?]

5274

*

5274

*

5275

*

5275

*

5276

* CGROUPS

5276

* CGROUPS

5277

*

5277

*

5278

* Cgroups make a horror show out of (2), instead of a simple sum we get:

5278

* Cgroups make a horror show out of (2), instead of a simple sum we get:

5279

*

5279

*

5280

*                                s_k,i
*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
*                                 S_k

5283

*

5283

*

5284

* Where

5284

* Where

5285

*

5285

*

5286

* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)

5286

* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)

5287

*

5287

*

5288

* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.

5288

* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.

5289

*

5289

*

5290

* The big problem is S_k, it's a global sum needed to compute a local (W_i)

5291

* property.

5291

* property.

5292

*

5292

*

5293

* [XXX write more on how we solve this.. _after_ merging pjt's patches that

5293

* [XXX write more on how we solve this.. _after_ merging pjt's patches that

5294

* rewrite all of this once again.]

5294

* rewrite all of this once again.]

5295

*/

5295

*/

5296

5297

/*
 * Initialised to a tenth of HZ, i.e. 100ms worth of jiffies.
 * NOTE(review): presumably the upper bound for the periodic balance
 * interval -- the users are outside this part of the file; confirm.
 */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;

/* NOTE(review): classification stored in lb_env::fbq_type; semantics not visible here. */
enum fbq_type { regular, remote, all };

/*
 * Bits kept in lb_env::flags.
 *
 * LBF_ALL_PINNED  - cleared by can_migrate_task() as soon as one task that
 *                   is allowed on dst_cpu has been seen.
 * LBF_NEED_BREAK  - set by detach_tasks() when it stops early to take a
 *                   breather after loop_break iterations.
 * LBF_DST_PINNED  - set by can_migrate_task() when a task cannot run on
 *                   dst_cpu but an alternative cpu was stored in new_dst_cpu.
 * LBF_SOME_PINNED - set by can_migrate_task() when a task's affinity
 *                   excludes dst_cpu.
 */
#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08

5305

5306

struct lb_env {

5306

struct lb_env {

5307

struct sched_domain *sd;

5307

struct sched_domain *sd;

5308

5309

struct rq *src_rq;

5309

struct rq *src_rq;

5310

int src_cpu;

5310

int src_cpu;

5311

5312

int dst_cpu;

5312

int dst_cpu;

5313

struct rq *dst_rq;

5313

struct rq *dst_rq;

5314

5315

struct cpumask *dst_grpmask;

5315

struct cpumask *dst_grpmask;

5316

int new_dst_cpu;

5316

int new_dst_cpu;

5317

enum cpu_idle_type idle;

5317

enum cpu_idle_type idle;

5318

long imbalance;

5318

long imbalance;

5319

/* The set of CPUs under consideration for load-balancing */

5319

/* The set of CPUs under consideration for load-balancing */

5320

struct cpumask *cpus;

5320

struct cpumask *cpus;

5321

5322

unsigned int flags;

5322

unsigned int flags;

5323

5324

unsigned int loop;

5324

unsigned int loop;

5325

unsigned int loop_break;

5325

unsigned int loop_break;

5326

unsigned int loop_max;

5326

unsigned int loop_max;

5327

5328

enum fbq_type fbq_type;

5328

enum fbq_type fbq_type;

5329

struct list_head tasks;

5329

struct list_head tasks;

5330

};

5330

};

5331

5332

/*

5332

/*

5333

* Is this task likely cache-hot:

5333

* Is this task likely cache-hot:

5334

*/

5334

*/

5335

static int task_hot(struct task_struct *p, struct lb_env *env)

5335

static int task_hot(struct task_struct *p, struct lb_env *env)

5336

{

5336

{

5337

s64 delta;

5337

s64 delta;

5338

5339

lockdep_assert_held(&env->src_rq->lock);

5339

lockdep_assert_held(&env->src_rq->lock);

5340

5341

if (p->sched_class != &fair_sched_class)

5341

if (p->sched_class != &fair_sched_class)

5342

return 0;

5342

return 0;

5343

5344

if (unlikely(p->policy == SCHED_IDLE))

5344

if (unlikely(p->policy == SCHED_IDLE))

5345

return 0;

5345

return 0;

5346

5347

/*

5347

/*

5348

* Buddy candidates are cache hot:

5348

* Buddy candidates are cache hot:

5349

*/

5349

*/

5350

if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&

5350

if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&

5351

(&p->se == cfs_rq_of(&p->se)->next ||

5351

(&p->se == cfs_rq_of(&p->se)->next ||

5352

&p->se == cfs_rq_of(&p->se)->last))

5352

&p->se == cfs_rq_of(&p->se)->last))

5353

return 1;

5353

return 1;

5354

5355

if (sysctl_sched_migration_cost == -1)

5355

if (sysctl_sched_migration_cost == -1)

5356

return 1;

5356

return 1;

5357

if (sysctl_sched_migration_cost == 0)

5357

if (sysctl_sched_migration_cost == 0)

5358

return 0;

5358

return 0;

5359

5360

delta = rq_clock_task(env->src_rq) - p->se.exec_start;

5360

delta = rq_clock_task(env->src_rq) - p->se.exec_start;

5361

5362

return delta < (s64)sysctl_sched_migration_cost;

5362

return delta < (s64)sysctl_sched_migration_cost;

5363

}

5363

}

5364

5365

#ifdef CONFIG_NUMA_BALANCING

5365

#ifdef CONFIG_NUMA_BALANCING

5366

/* Returns true if the destination node has incurred more faults */

5366

/* Returns true if the destination node has incurred more faults */

5367

static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)

5367

static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)

5368

{

5368

{

5369

struct numa_group *numa_group = rcu_dereference(p->numa_group);

5369

struct numa_group *numa_group = rcu_dereference(p->numa_group);

5370

int src_nid, dst_nid;

5370

int src_nid, dst_nid;

5371

5372

if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||

5372

if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||

5373

!(env->sd->flags & SD_NUMA)) {

5373

!(env->sd->flags & SD_NUMA)) {

5374

return false;

5374

return false;

5375

}

5375

}

5376

5377

src_nid = cpu_to_node(env->src_cpu);

5377

src_nid = cpu_to_node(env->src_cpu);

5378

dst_nid = cpu_to_node(env->dst_cpu);

5378

dst_nid = cpu_to_node(env->dst_cpu);

5379

5380

if (src_nid == dst_nid)

5380

if (src_nid == dst_nid)

5381

return false;

5381

return false;

5382

5383

if (numa_group) {

5383

if (numa_group) {

5384

/* Task is already in the group's interleave set. */

5384

/* Task is already in the group's interleave set. */

5385

if (node_isset(src_nid, numa_group->active_nodes))

5385

if (node_isset(src_nid, numa_group->active_nodes))

5386

return false;

5386

return false;

5387

5388

/* Task is moving into the group's interleave set. */

5388

/* Task is moving into the group's interleave set. */

5389

if (node_isset(dst_nid, numa_group->active_nodes))

5389

if (node_isset(dst_nid, numa_group->active_nodes))

5390

return true;

5390

return true;

5391

5392

return group_faults(p, dst_nid) > group_faults(p, src_nid);

5392

return group_faults(p, dst_nid) > group_faults(p, src_nid);

5393

}

5393

}

5394

5395

/* Encourage migration to the preferred node. */

5395

/* Encourage migration to the preferred node. */

5396

if (dst_nid == p->numa_preferred_nid)

5396

if (dst_nid == p->numa_preferred_nid)

5397

return true;

5397

return true;

5398

5399

return task_faults(p, dst_nid) > task_faults(p, src_nid);

5399

return task_faults(p, dst_nid) > task_faults(p, src_nid);

5400

}

5400

}

5401

5402

5403

static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)

5403

static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)

5404

{

5404

{

5405

struct numa_group *numa_group = rcu_dereference(p->numa_group);

5405

struct numa_group *numa_group = rcu_dereference(p->numa_group);

5406

int src_nid, dst_nid;

5406

int src_nid, dst_nid;

5407

5408

if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))

5408

if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))

5409

return false;

5409

return false;

5410

5411

if (!p->numa_faults || !(env->sd->flags & SD_NUMA))

5411

if (!p->numa_faults || !(env->sd->flags & SD_NUMA))

5412

return false;

5412

return false;

5413

5414

src_nid = cpu_to_node(env->src_cpu);

5414

src_nid = cpu_to_node(env->src_cpu);

5415

dst_nid = cpu_to_node(env->dst_cpu);

5415

dst_nid = cpu_to_node(env->dst_cpu);

5416

5417

if (src_nid == dst_nid)

5417

if (src_nid == dst_nid)

5418

return false;

5418

return false;

5419

5420

if (numa_group) {

5420

if (numa_group) {

5421

/* Task is moving within/into the group's interleave set. */

5421

/* Task is moving within/into the group's interleave set. */

5422

if (node_isset(dst_nid, numa_group->active_nodes))

5422

if (node_isset(dst_nid, numa_group->active_nodes))

5423

return false;

5423

return false;

5424

5425

/* Task is moving out of the group's interleave set. */

5425

/* Task is moving out of the group's interleave set. */

5426

if (node_isset(src_nid, numa_group->active_nodes))

5426

if (node_isset(src_nid, numa_group->active_nodes))

5427

return true;

5427

return true;

5428

5429

return group_faults(p, dst_nid) < group_faults(p, src_nid);

5429

return group_faults(p, dst_nid) < group_faults(p, src_nid);

5430

}

5430

}

5431

5432

/* Migrating away from the preferred node is always bad. */

5432

/* Migrating away from the preferred node is always bad. */

5433

if (src_nid == p->numa_preferred_nid)

5433

if (src_nid == p->numa_preferred_nid)

5434

return true;

5434

return true;

5435

5436

return task_faults(p, dst_nid) < task_faults(p, src_nid);

5436

return task_faults(p, dst_nid) < task_faults(p, src_nid);

5437

}

5437

}

5438

5439

#else

5439

#else

5440

static inline bool migrate_improves_locality(struct task_struct *p,

5440

static inline bool migrate_improves_locality(struct task_struct *p,

5441

struct lb_env *env)

5441

struct lb_env *env)

5442

{

5442

{

5443

return false;

5443

return false;

5444

}

5444

}

5445

5446

static inline bool migrate_degrades_locality(struct task_struct *p,

5446

static inline bool migrate_degrades_locality(struct task_struct *p,

5447

struct lb_env *env)

5447

struct lb_env *env)

5448

{

5448

{

5449

return false;

5449

return false;

5450

}

5450

}

5451

#endif

5451

#endif

5452

5453

/*

5453

/*

5454

* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?

5454

* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?

5455

*/

5455

*/

5456

static

5456

static

5457

int can_migrate_task(struct task_struct *p, struct lb_env *env)

5457

int can_migrate_task(struct task_struct *p, struct lb_env *env)

5458

{

5458

{

5459

int tsk_cache_hot = 0;

5459

int tsk_cache_hot = 0;

5460

5461

lockdep_assert_held(&env->src_rq->lock);

5461

lockdep_assert_held(&env->src_rq->lock);

5462

5463

/*

5463

/*

5464

* We do not migrate tasks that are:

5464

* We do not migrate tasks that are:

5465

* 1) throttled_lb_pair, or

5465

* 1) throttled_lb_pair, or

5466

* 2) cannot be migrated to this CPU due to cpus_allowed, or

5466

* 2) cannot be migrated to this CPU due to cpus_allowed, or

5467

* 3) running (obviously), or

5467

* 3) running (obviously), or

5468

* 4) are cache-hot on their current CPU.

5468

* 4) are cache-hot on their current CPU.

5469

*/

5469

*/

5470

if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))

5470

if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))

5471

return 0;

5471

return 0;

5472

5473

if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {

5473

if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {

5474

int cpu;

5474

int cpu;

5475

5476

schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

5476

schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

5477

5478

env->flags |= LBF_SOME_PINNED;

5478

env->flags |= LBF_SOME_PINNED;

5479

5480

/*

5480

/*

5481

* Remember if this task can be migrated to any other cpu in

5481

* Remember if this task can be migrated to any other cpu in

5482

* our sched_group. We may want to revisit it if we couldn't

5482

* our sched_group. We may want to revisit it if we couldn't

5483

* meet load balance goals by pulling other tasks on src_cpu.

5483

* meet load balance goals by pulling other tasks on src_cpu.

5484

*

5484

*

5485

* Also avoid computing new_dst_cpu if we have already computed

5485

* Also avoid computing new_dst_cpu if we have already computed

5486

* one in current iteration.

5486

* one in current iteration.

5487

*/

5487

*/

5488

if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))

5488

if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))

5489

return 0;

5489

return 0;

5490

5491

/* Prevent to re-select dst_cpu via env's cpus */

5491

/* Prevent to re-select dst_cpu via env's cpus */

5492

for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {

5492

for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {

5493

if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {

5493

if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {

5494

env->flags |= LBF_DST_PINNED;

5494

env->flags |= LBF_DST_PINNED;

5495

env->new_dst_cpu = cpu;

5495

env->new_dst_cpu = cpu;

5496

break;

5496

break;

5497

}

5497

}

5498

}

5498

}

5499

5500

return 0;

5500

return 0;

5501

}

5501

}

5502

5503

/* Record that we found atleast one task that could run on dst_cpu */

5503

/* Record that we found atleast one task that could run on dst_cpu */

5504

env->flags &= ~LBF_ALL_PINNED;

5504

env->flags &= ~LBF_ALL_PINNED;

5505

5506

if (task_running(env->src_rq, p)) {

5506

if (task_running(env->src_rq, p)) {

5507

schedstat_inc(p, se.statistics.nr_failed_migrations_running);

5507

schedstat_inc(p, se.statistics.nr_failed_migrations_running);

5508

return 0;

5508

return 0;

5509

}

5509

}

5510

5511

/*

5511

/*

5512

* Aggressive migration if:

5512

* Aggressive migration if:

5513

* 1) destination numa is preferred

5513

* 1) destination numa is preferred

5514

* 2) task is cache cold, or

5514

* 2) task is cache cold, or

5515

* 3) too many balance attempts have failed.

5515

* 3) too many balance attempts have failed.

5516

*/

5516

*/

5517

tsk_cache_hot = task_hot(p, env);

5517

tsk_cache_hot = task_hot(p, env);

5518

if (!tsk_cache_hot)

5518

if (!tsk_cache_hot)

5519

tsk_cache_hot = migrate_degrades_locality(p, env);

5519

tsk_cache_hot = migrate_degrades_locality(p, env);

5520

5521

if (migrate_improves_locality(p, env) || !tsk_cache_hot ||

5521

if (migrate_improves_locality(p, env) || !tsk_cache_hot ||

5522

env->sd->nr_balance_failed > env->sd->cache_nice_tries) {

5522

env->sd->nr_balance_failed > env->sd->cache_nice_tries) {

5523

if (tsk_cache_hot) {

5523

if (tsk_cache_hot) {

5524

schedstat_inc(env->sd, lb_hot_gained[env->idle]);

5524

schedstat_inc(env->sd, lb_hot_gained[env->idle]);

5525

schedstat_inc(p, se.statistics.nr_forced_migrations);

5525

schedstat_inc(p, se.statistics.nr_forced_migrations);

5526

}

5526

}

5527

return 1;

5527

return 1;

5528

}

5528

}

5529

5530

schedstat_inc(p, se.statistics.nr_failed_migrations_hot);

5530

schedstat_inc(p, se.statistics.nr_failed_migrations_hot);

5531

return 0;

5531

return 0;

5532

}

5532

}

5533

5534

/*

5534

/*

5535

* detach_task() -- detach the task for the migration specified in env

5535

* detach_task() -- detach the task for the migration specified in env

5536

*/

5536

*/

5537

static void detach_task(struct task_struct *p, struct lb_env *env)

5537

static void detach_task(struct task_struct *p, struct lb_env *env)

5538

{

5538

{

5539

lockdep_assert_held(&env->src_rq->lock);

5539

lockdep_assert_held(&env->src_rq->lock);

5540

5541

deactivate_task(env->src_rq, p, 0);

5541

deactivate_task(env->src_rq, p, 0);

5542

p->on_rq = TASK_ON_RQ_MIGRATING;

5542

p->on_rq = TASK_ON_RQ_MIGRATING;

5543

set_task_cpu(p, env->dst_cpu);

5543

set_task_cpu(p, env->dst_cpu);

5544

}

5544

}

5545

5546

/*

5546

/*

5547

* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as

5547

* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as

5548

* part of active balancing operations within "domain".

5548

* part of active balancing operations within "domain".

5549

*

5549

*

5550

* Returns a task if successful and NULL otherwise.

5550

* Returns a task if successful and NULL otherwise.

5551

*/

5551

*/

5552

static struct task_struct *detach_one_task(struct lb_env *env)

5552

static struct task_struct *detach_one_task(struct lb_env *env)

5553

{

5553

{

5554

struct task_struct *p, *n;

5554

struct task_struct *p, *n;

5555

5556

lockdep_assert_held(&env->src_rq->lock);

5556

lockdep_assert_held(&env->src_rq->lock);

5557

5558

list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {

5558

list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {

5559

if (!can_migrate_task(p, env))

5559

if (!can_migrate_task(p, env))

5560

continue;

5560

continue;

5561

5562

detach_task(p, env);

5562

detach_task(p, env);

5563

5564

/*

5564

/*

5565

* Right now, this is only the second place where

5565

* Right now, this is only the second place where

5566

* lb_gained[env->idle] is updated (other is detach_tasks)

5566

* lb_gained[env->idle] is updated (other is detach_tasks)

5567

* so we can safely collect stats here rather than

5567

* so we can safely collect stats here rather than

5568

* inside detach_tasks().

5568

* inside detach_tasks().

5569

*/

5569

*/

5570

schedstat_inc(env->sd, lb_gained[env->idle]);

5570

schedstat_inc(env->sd, lb_gained[env->idle]);

5571

return p;

5571

return p;

5572

}

5572

}

5573

return NULL;

5573

return NULL;

5574

}

5574

}

5575

5576

/* Step by which detach_tasks() advances env->loop_break at each breather. */
static const unsigned int sched_nr_migrate_break = 32;

5577

5578

/*

5578

/*

5579

* detach_tasks() -- tries to detach up to imbalance weighted load from

5579

* detach_tasks() -- tries to detach up to imbalance weighted load from

5580

* busiest_rq, as part of a balancing operation within domain "sd".

5580

* busiest_rq, as part of a balancing operation within domain "sd".

5581

*

5581

*

5582

* Returns number of detached tasks if successful and 0 otherwise.

5582

* Returns number of detached tasks if successful and 0 otherwise.

5583

*/

5583

*/

5584

static int detach_tasks(struct lb_env *env)

5584

static int detach_tasks(struct lb_env *env)

5585

{

5585

{

5586

struct list_head *tasks = &env->src_rq->cfs_tasks;

5586

struct list_head *tasks = &env->src_rq->cfs_tasks;

5587

struct task_struct *p;

5587

struct task_struct *p;

5588

unsigned long load;

5588

unsigned long load;

5589

int detached = 0;

5589

int detached = 0;

5590

5591

lockdep_assert_held(&env->src_rq->lock);

5591

lockdep_assert_held(&env->src_rq->lock);

5592

5593

if (env->imbalance <= 0)

5593

if (env->imbalance <= 0)

5594

return 0;

5594

return 0;

5595

5596

while (!list_empty(tasks)) {

5596

while (!list_empty(tasks)) {

5597

p = list_first_entry(tasks, struct task_struct, se.group_node);

5597

p = list_first_entry(tasks, struct task_struct, se.group_node);

5598

5599

env->loop++;

5599

env->loop++;

5600

/* We've more or less seen every task there is, call it quits */

5600

/* We've more or less seen every task there is, call it quits */

5601

if (env->loop > env->loop_max)

5601

if (env->loop > env->loop_max)

5602

break;

5602

break;

5603

5604

/* take a breather every nr_migrate tasks */

5604

/* take a breather every nr_migrate tasks */

5605

if (env->loop > env->loop_break) {

5605

if (env->loop > env->loop_break) {

5606

env->loop_break += sched_nr_migrate_break;

5606

env->loop_break += sched_nr_migrate_break;

5607

env->flags |= LBF_NEED_BREAK;

5607

env->flags |= LBF_NEED_BREAK;

5608

break;

5608

break;

5609

}

5609

}

5610

5611

if (!can_migrate_task(p, env))

5611

if (!can_migrate_task(p, env))

5612

goto next;

5612

goto next;

5613

5614

load = task_h_load(p);

5614

load = task_h_load(p);

5615

5616

if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

5616

if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)

5617

goto next;

5617

goto next;

5618

5619

if ((load / 2) > env->imbalance)

5619

if ((load / 2) > env->imbalance)

5620

goto next;

5620

goto next;

5621

5622

detach_task(p, env);

5622

detach_task(p, env);

5623

list_add(&p->se.group_node, &env->tasks);

5623

list_add(&p->se.group_node, &env->tasks);

5624

5625

detached++;

5625

detached++;

5626

env->imbalance -= load;

5626

env->imbalance -= load;

5627

5628

#ifdef CONFIG_PREEMPT

5628

#ifdef CONFIG_PREEMPT

5629

/*

5629

/*

5630

* NEWIDLE balancing is a source of latency, so preemptible

5630

* NEWIDLE balancing is a source of latency, so preemptible

5631

* kernels will stop after the first task is detached to minimize

5631

* kernels will stop after the first task is detached to minimize

5632

* the critical section.

5632

* the critical section.

5633

*/

5633

*/

5634

if (env->idle == CPU_NEWLY_IDLE)

5634

if (env->idle == CPU_NEWLY_IDLE)

5635

break;

5635

break;

5636

#endif

5636

#endif

5637

5638

/*

5638

/*

5639

* We only want to steal up to the prescribed amount of

5639

* We only want to steal up to the prescribed amount of

5640

* weighted load.

5640

* weighted load.

5641

*/

5641

*/

5642

if (env->imbalance <= 0)

5642

if (env->imbalance <= 0)

5643

break;

5643

break;

5644

5645

continue;

5645

continue;

5646

list_move_tail(&p->se.group_node, tasks);

5647

list_move_tail(&p->se.group_node, tasks);

5648

}

5648

}

5649

5650

/*

5650

/*

5651

* Right now, this is one of only two places we collect this stat

5651

* Right now, this is one of only two places we collect this stat

5652

* so we can safely collect detach_one_task() stats here rather

5652

* so we can safely collect detach_one_task() stats here rather

5653

* than inside detach_one_task().

5653

* than inside detach_one_task().

5654

*/

5654

*/

5655

schedstat_add(env->sd, lb_gained[env->idle], detached);

5655

schedstat_add(env->sd, lb_gained[env->idle], detached);

5656

5657

return detached;

5657

return detached;

5658

}

5658

}

5659

5660

/*

5660

/*

5661

* attach_task() -- attach the task detached by detach_task() to its new rq.

5661

* attach_task() -- attach the task detached by detach_task() to its new rq.

5662

*/

5662

*/

5663

static void attach_task(struct rq *rq, struct task_struct *p)

5663

static void attach_task(struct rq *rq, struct task_struct *p)

5664

{

5664

{

5665

lockdep_assert_held(&rq->lock);

5665

lockdep_assert_held(&rq->lock);

5666

5667

BUG_ON(task_rq(p) != rq);

5667

BUG_ON(task_rq(p) != rq);

5668

p->on_rq = TASK_ON_RQ_QUEUED;

5668

p->on_rq = TASK_ON_RQ_QUEUED;

5669

activate_task(rq, p, 0);

5669

activate_task(rq, p, 0);

5670

check_preempt_curr(rq, p, 0);

5670

check_preempt_curr(rq, p, 0);

5671

}

5671

}

5672

5673

/*

5673

/*

5674

* attach_one_task() -- attaches the task returned from detach_one_task() to

5674

* attach_one_task() -- attaches the task returned from detach_one_task() to

5675

* its new rq.

5675

* its new rq.

5676

*/

5676

*/

5677

static void attach_one_task(struct rq *rq, struct task_struct *p)

5677

static void attach_one_task(struct rq *rq, struct task_struct *p)

5678

{

5678

{

5679

raw_spin_lock(&rq->lock);

5679

raw_spin_lock(&rq->lock);

5680

attach_task(rq, p);

5680

attach_task(rq, p);

5681

raw_spin_unlock(&rq->lock);

5681

raw_spin_unlock(&rq->lock);

5682

}

5682

}

5683

5684

/*

5684

/*

5685

* attach_tasks() -- attaches all tasks detached by detach_tasks() to their

5685

* attach_tasks() -- attaches all tasks detached by detach_tasks() to their

5686

* new rq.

5686

* new rq.

5687

*/

5687

*/

5688

static void attach_tasks(struct lb_env *env)

5688

static void attach_tasks(struct lb_env *env)

5689

{

5689

{

5690

struct list_head *tasks = &env->tasks;

5690

struct list_head *tasks = &env->tasks;

5691

struct task_struct *p;

5691

struct task_struct *p;

5692

5693

raw_spin_lock(&env->dst_rq->lock);

5693

raw_spin_lock(&env->dst_rq->lock);

5694

5695

while (!list_empty(tasks)) {

5695

while (!list_empty(tasks)) {

5696

p = list_first_entry(tasks, struct task_struct, se.group_node);

5696

p = list_first_entry(tasks, struct task_struct, se.group_node);

5697

list_del_init(&p->se.group_node);

5697

list_del_init(&p->se.group_node);

5698

5699

attach_task(env->dst_rq, p);

5699

attach_task(env->dst_rq, p);

5700

}

5700

}

5701

5702

raw_spin_unlock(&env->dst_rq->lock);

5702

raw_spin_unlock(&env->dst_rq->lock);

5703

}

5703

}

5704

5705

#ifdef CONFIG_FAIR_GROUP_SCHED

5705

#ifdef CONFIG_FAIR_GROUP_SCHED

5706

/*

5706

/*

5707

* update tg->load_weight by folding this cpu's load_avg

5707

* update tg->load_weight by folding this cpu's load_avg

5708

*/

5708

*/

5709

static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)

5709

static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)

5710

{

5710

{

5711

struct sched_entity *se = tg->se[cpu];

5711

struct sched_entity *se = tg->se[cpu];

5712

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

5712

struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];

5713

5714

/* throttled entities do not contribute to load */

5714

/* throttled entities do not contribute to load */

5715

if (throttled_hierarchy(cfs_rq))

5715

if (throttled_hierarchy(cfs_rq))

5716

return;

5716

return;

5717

5718

update_cfs_rq_blocked_load(cfs_rq, 1);

5718

update_cfs_rq_blocked_load(cfs_rq, 1);

5719

5720

if (se) {

5720

if (se) {

5721

update_entity_load_avg(se, 1);

5721

update_entity_load_avg(se, 1);

5722

/*

5722

/*

5723

* We pivot on our runnable average having decayed to zero for

5723

* We pivot on our runnable average having decayed to zero for

5724

* list removal. This generally implies that all our children

5724

* list removal. This generally implies that all our children

5725

* have also been removed (modulo rounding error or bandwidth

5725

* have also been removed (modulo rounding error or bandwidth

5726

* control); however, such cases are rare and we can fix these

5726

* control); however, such cases are rare and we can fix these

5727

* at enqueue.

5727

* at enqueue.

5728

*

5728

*

5729

* TODO: fix up out-of-order children on enqueue.

5729

* TODO: fix up out-of-order children on enqueue.

5730

*/

5730

*/

5731

if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)

5731

if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)

5732

list_del_leaf_cfs_rq(cfs_rq);

5732

list_del_leaf_cfs_rq(cfs_rq);

5733

} else {

5733

} else {

5734

struct rq *rq = rq_of(cfs_rq);

5734

struct rq *rq = rq_of(cfs_rq);

5735

update_rq_runnable_avg(rq, rq->nr_running);

5735

update_rq_runnable_avg(rq, rq->nr_running);

5736

}

5736

}

5737

}

5737

}

5738

5739

static void update_blocked_averages(int cpu)

5739

static void update_blocked_averages(int cpu)

5740

{

5740

{

5741

struct rq *rq = cpu_rq(cpu);

5741

struct rq *rq = cpu_rq(cpu);

5742

struct cfs_rq *cfs_rq;

5742

struct cfs_rq *cfs_rq;

5743

unsigned long flags;

5743

unsigned long flags;

5744

5745

raw_spin_lock_irqsave(&rq->lock, flags);

5745

raw_spin_lock_irqsave(&rq->lock, flags);

5746

update_rq_clock(rq);

5746

update_rq_clock(rq);

5747

/*

5747

/*

5748

* Iterates the task_group tree in a bottom up fashion, see

5748

* Iterates the task_group tree in a bottom up fashion, see

5749

* list_add_leaf_cfs_rq() for details.

5749

* list_add_leaf_cfs_rq() for details.

5750

*/

5750

*/

5751

for_each_leaf_cfs_rq(rq, cfs_rq) {

5751

for_each_leaf_cfs_rq(rq, cfs_rq) {

5752

/*

5752

/*

5753

* Note: We may want to consider periodically releasing

5753

* Note: We may want to consider periodically releasing

5754

* rq->lock about these updates so that creating many task

5754

* rq->lock about these updates so that creating many task

5755

* groups does not result in continually extending hold time.

5755

* groups does not result in continually extending hold time.

5756

*/

5756

*/

5757

__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);

5757

__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);

5758

}

5758

}

5759

5760

raw_spin_unlock_irqrestore(&rq->lock, flags);

5760

raw_spin_unlock_irqrestore(&rq->lock, flags);

5761

}

5761

}

5762

5763

/*

5763

/*

5764

* Compute the hierarchical load factor for cfs_rq and all its ascendants.

5764

* Compute the hierarchical load factor for cfs_rq and all its ascendants.

5765

* This needs to be done in a top-down fashion because the load of a child

5765

* This needs to be done in a top-down fashion because the load of a child

5766

* group is a fraction of its parents load.

5766

* group is a fraction of its parents load.

5767

*/

5767

*/

5768

static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)

5768

static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)

5769

{

5769

{

5770

struct rq *rq = rq_of(cfs_rq);

5770

struct rq *rq = rq_of(cfs_rq);

5771

struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

5771

struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];

5772

unsigned long now = jiffies;

5772

unsigned long now = jiffies;

5773

unsigned long load;

5773

unsigned long load;

5774

5775

if (cfs_rq->last_h_load_update == now)

5775

if (cfs_rq->last_h_load_update == now)

5776

return;

5776

return;

5777

5778

cfs_rq->h_load_next = NULL;

5778

cfs_rq->h_load_next = NULL;

5779

for_each_sched_entity(se) {

5779

for_each_sched_entity(se) {

5780

cfs_rq = cfs_rq_of(se);

5780

cfs_rq = cfs_rq_of(se);

5781

cfs_rq->h_load_next = se;

5781

cfs_rq->h_load_next = se;

5782

if (cfs_rq->last_h_load_update == now)

5782

if (cfs_rq->last_h_load_update == now)

5783

break;

5783

break;

5784

}

5784

}

5785

5786

if (!se) {

5786

if (!se) {

5787

cfs_rq->h_load = cfs_rq->runnable_load_avg;

5787

cfs_rq->h_load = cfs_rq->runnable_load_avg;

5788

cfs_rq->last_h_load_update = now;

5788

cfs_rq->last_h_load_update = now;

5789

}

5789

}

5790

5791

while ((se = cfs_rq->h_load_next) != NULL) {

5791

while ((se = cfs_rq->h_load_next) != NULL) {

5792

load = cfs_rq->h_load;

5792

load = cfs_rq->h_load;

5793

load = div64_ul(load * se->avg.load_avg_contrib,

5793

load = div64_ul(load * se->avg.load_avg_contrib,

5794

cfs_rq->runnable_load_avg + 1);

5794

cfs_rq->runnable_load_avg + 1);

5795

cfs_rq = group_cfs_rq(se);

5795

cfs_rq = group_cfs_rq(se);

5796

cfs_rq->h_load = load;

5796

cfs_rq->h_load = load;

5797

cfs_rq->last_h_load_update = now;

5797

cfs_rq->last_h_load_update = now;

5798

}

5798

}

5799

}

5799

}

5800

5801

static unsigned long task_h_load(struct task_struct *p)

5801

static unsigned long task_h_load(struct task_struct *p)

5802

{

5802

{

5803

struct cfs_rq *cfs_rq = task_cfs_rq(p);

5803

struct cfs_rq *cfs_rq = task_cfs_rq(p);

5804

5805

update_cfs_rq_h_load(cfs_rq);

5805

update_cfs_rq_h_load(cfs_rq);

5806

return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,

5806

return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,

5807

cfs_rq->runnable_load_avg + 1);

5807

cfs_rq->runnable_load_avg + 1);

5808

}

5808

}

5809

#else

5809

#else

5810

/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void update_blocked_averages(int cpu)
{
}

5813

5814

static unsigned long task_h_load(struct task_struct *p)

5814

static unsigned long task_h_load(struct task_struct *p)

5815

{

5815

{

5816

return p->se.avg.load_avg_contrib;

5816

return p->se.avg.load_avg_contrib;

5817

}

5817

}

5818

#endif

5818

#endif

5819

5820

/********** Helpers for find_busiest_group ************************/

5820

/********** Helpers for find_busiest_group ************************/

5821

5822

/*
 * Classification of a sched_group as returned by group_classify().
 * Values are listed in ascending numeric order.
 */
enum group_type {
	group_other		= 0,
	group_imbalanced	= 1,
	group_overloaded	= 2,
};

5827

5828

/*

5828

/*

5829

* sg_lb_stats - stats of a sched_group required for load_balancing

5829

* sg_lb_stats - stats of a sched_group required for load_balancing

5830

*/

5830

*/

5831

struct sg_lb_stats {

5831

struct sg_lb_stats {

5832

unsigned long avg_load; /*Avg load across the CPUs of the group */

5832

unsigned long avg_load; /*Avg load across the CPUs of the group */

5833

unsigned long group_load; /* Total load over the CPUs of the group */

5833

unsigned long group_load; /* Total load over the CPUs of the group */

5834

unsigned long sum_weighted_load; /* Weighted load of group's tasks */

5834

unsigned long sum_weighted_load; /* Weighted load of group's tasks */

5835

unsigned long load_per_task;

5835

unsigned long load_per_task;

5836

unsigned long group_capacity;

5836

unsigned long group_capacity;

5837

unsigned int sum_nr_running; /* Nr tasks running in the group */

5837

unsigned int sum_nr_running; /* Nr tasks running in the group */

5838

unsigned int group_capacity_factor;

5838

unsigned int group_capacity_factor;

5839

unsigned int idle_cpus;

5839

unsigned int idle_cpus;

5840

unsigned int group_weight;

5840

unsigned int group_weight;

5841

enum group_type group_type;

5841

enum group_type group_type;

5842

int group_has_free_capacity;

5842

int group_has_free_capacity;

5843

#ifdef CONFIG_NUMA_BALANCING

5843

#ifdef CONFIG_NUMA_BALANCING

5844

unsigned int nr_numa_running;

5844

unsigned int nr_numa_running;

5845

unsigned int nr_preferred_running;

5845

unsigned int nr_preferred_running;

5846

#endif

5846

#endif

5847

};

5847

};

5848

5849

/*

5849

/*

5850

* sd_lb_stats - Structure to store the statistics of a sched_domain

5850

* sd_lb_stats - Structure to store the statistics of a sched_domain

5851

* during load balancing.

5851

* during load balancing.

5852

*/

5852

*/

5853

struct sd_lb_stats {

5853

struct sd_lb_stats {

5854

struct sched_group *busiest; /* Busiest group in this sd */

5854

struct sched_group *busiest; /* Busiest group in this sd */

5855

struct sched_group *local; /* Local group in this sd */

5855

struct sched_group *local; /* Local group in this sd */

5856

unsigned long total_load; /* Total load of all groups in sd */

5856

unsigned long total_load; /* Total load of all groups in sd */

5857

unsigned long total_capacity; /* Total capacity of all groups in sd */

5857

unsigned long total_capacity; /* Total capacity of all groups in sd */

5858

unsigned long avg_load; /* Average load across all groups in sd */

5858

unsigned long avg_load; /* Average load across all groups in sd */

5859

5860

struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */

5860

struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */

5861

struct sg_lb_stats local_stat; /* Statistics of the local group */

5861

struct sg_lb_stats local_stat; /* Statistics of the local group */

5862

};

5862

};

5863

5864

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)

5864

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)

5865

{

5865

{

5866

/*

5866

/*

5867

* Skimp on the clearing to avoid duplicate work. We can avoid clearing

5867

* Skimp on the clearing to avoid duplicate work. We can avoid clearing

5868

* local_stat because update_sg_lb_stats() does a full clear/assignment.

5868

* local_stat because update_sg_lb_stats() does a full clear/assignment.

5869

* We must however clear busiest_stat::avg_load because

5869

* We must however clear busiest_stat::avg_load because

5870

* update_sd_pick_busiest() reads this before assignment.

5870

* update_sd_pick_busiest() reads this before assignment.

5871

*/

5871

*/

5872

*sds = (struct sd_lb_stats){

5872

*sds = (struct sd_lb_stats){

5873

.busiest = NULL,

5873

.busiest = NULL,

5874

.local = NULL,

5874

.local = NULL,

5875

.total_load = 0UL,

5875

.total_load = 0UL,

5876

.total_capacity = 0UL,

5876

.total_capacity = 0UL,

5877

.busiest_stat = {

5877

.busiest_stat = {

5878

.avg_load = 0UL,

5878

.avg_load = 0UL,

5879

.sum_nr_running = 0,

5879

.sum_nr_running = 0,

5880

.group_type = group_other,

5880

.group_type = group_other,

5881

},

5881

},

5882

};

5882

};

5883

}

5883

}

5884

5885

/**

5885

/**

5886

* get_sd_load_idx - Obtain the load index for a given sched domain.

5886

* get_sd_load_idx - Obtain the load index for a given sched domain.

5887

* @sd: The sched_domain whose load_idx is to be obtained.

5887

* @sd: The sched_domain whose load_idx is to be obtained.

5888

* @idle: The idle status of the CPU for whose sd load_idx is obtained.

5888

* @idle: The idle status of the CPU for whose sd load_idx is obtained.

5889

*

5889

*

5890

* Return: The load index.

5890

* Return: The load index.

5891

*/

5891

*/

5892

static inline int get_sd_load_idx(struct sched_domain *sd,

5892

static inline int get_sd_load_idx(struct sched_domain *sd,

5893

enum cpu_idle_type idle)

5893

enum cpu_idle_type idle)

5894

{

5894

{

5895

int load_idx;

5895

int load_idx;

5896

5897

switch (idle) {

5897

switch (idle) {

5898

case CPU_NOT_IDLE:

5898

case CPU_NOT_IDLE:

5899

load_idx = sd->busy_idx;

5899

load_idx = sd->busy_idx;

5900

break;

5900

break;

5901

5902

case CPU_NEWLY_IDLE:

5902

case CPU_NEWLY_IDLE:

5903

load_idx = sd->newidle_idx;

5903

load_idx = sd->newidle_idx;

5904

break;

5904

break;

5905

default:

5905

default:

5906

load_idx = sd->idle_idx;

5906

load_idx = sd->idle_idx;

5907

break;

5907

break;

5908

}

5908

}

5909

5910

return load_idx;

5910

return load_idx;

5911

}

5911

}

5912

5913

static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)

5913

static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)

5914

{

5914

{

5915

return SCHED_CAPACITY_SCALE;

5915

return SCHED_CAPACITY_SCALE;

5916

}

5916

}

5917

5918

/*
 * Weak definition that an architecture may override; this fallback simply
 * returns default_scale_capacity().
 */
unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
	return default_scale_capacity(sd, cpu);
}

5922

5923

static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)

5923

static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)

5924

{

5924

{

5925

if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))

5925

if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))

5926

return sd->smt_gain / sd->span_weight;

5926

return sd->smt_gain / sd->span_weight;

5927

5928

return SCHED_CAPACITY_SCALE;

5928

return SCHED_CAPACITY_SCALE;

5929

}

5929

}

5930

5931

/*
 * Weak definition that an architecture may override; this fallback simply
 * returns default_scale_cpu_capacity().
 */
unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
	return default_scale_cpu_capacity(sd, cpu);
}

5935

5936

static unsigned long scale_rt_capacity(int cpu)

5936

static unsigned long scale_rt_capacity(int cpu)

5937

{

5937

{

5938

struct rq *rq = cpu_rq(cpu);

5938

struct rq *rq = cpu_rq(cpu);

5939

u64 total, available, age_stamp, avg;

5939

u64 total, available, age_stamp, avg;

5940

s64 delta;

5940

s64 delta;

5941

5942

/*

5942

/*

5943

* Since we're reading these variables without serialization make sure

5943

* Since we're reading these variables without serialization make sure

5944

* we read them once before doing sanity checks on them.

5944

* we read them once before doing sanity checks on them.

5945

*/

5945

*/

5946

age_stamp = ACCESS_ONCE(rq->age_stamp);

5946

age_stamp = ACCESS_ONCE(rq->age_stamp);

5947

avg = ACCESS_ONCE(rq->rt_avg);

5947

avg = ACCESS_ONCE(rq->rt_avg);

5948

5949

delta = rq_clock(rq) - age_stamp;

5949

delta = rq_clock(rq) - age_stamp;

5950

if (unlikely(delta < 0))

5950

if (unlikely(delta < 0))

5951

delta = 0;

5951

delta = 0;

5952

5953

total = sched_avg_period() + delta;

5953

total = sched_avg_period() + delta;

5954

5955

if (unlikely(total < avg)) {

5955

if (unlikely(total < avg)) {

5956

/* Ensures that capacity won't end up being negative */

5956

/* Ensures that capacity won't end up being negative */

5957

available = 0;

5957

available = 0;

5958

} else {

5958

} else {

5959

available = total - avg;

5959

available = total - avg;

5960

}

5960

}

5961

5962

if (unlikely((s64)total < SCHED_CAPACITY_SCALE))

5962

if (unlikely((s64)total < SCHED_CAPACITY_SCALE))

5963

total = SCHED_CAPACITY_SCALE;

5963

total = SCHED_CAPACITY_SCALE;

5964

5965

total >>= SCHED_CAPACITY_SHIFT;

5965

total >>= SCHED_CAPACITY_SHIFT;

5966

5967

return div_u64(available, total);

5967

return div_u64(available, total);

5968

}

5968

}

5969

5970

static void update_cpu_capacity(struct sched_domain *sd, int cpu)

5970

static void update_cpu_capacity(struct sched_domain *sd, int cpu)

5971

{

5971

{

5972

unsigned long capacity = SCHED_CAPACITY_SCALE;

5972

unsigned long capacity = SCHED_CAPACITY_SCALE;

5973

struct sched_group *sdg = sd->groups;

5973

struct sched_group *sdg = sd->groups;

5974

5975

if (sched_feat(ARCH_CAPACITY))

5975

if (sched_feat(ARCH_CAPACITY))

5976

capacity *= arch_scale_cpu_capacity(sd, cpu);

5976

capacity *= arch_scale_cpu_capacity(sd, cpu);

5977

else

5977

else

5978

capacity *= default_scale_cpu_capacity(sd, cpu);

5978

capacity *= default_scale_cpu_capacity(sd, cpu);

5979

5980

capacity >>= SCHED_CAPACITY_SHIFT;

5980

capacity >>= SCHED_CAPACITY_SHIFT;

5981

5982

sdg->sgc->capacity_orig = capacity;

5982

sdg->sgc->capacity_orig = capacity;

5983

5984

if (sched_feat(ARCH_CAPACITY))

5984

if (sched_feat(ARCH_CAPACITY))

5985

capacity *= arch_scale_freq_capacity(sd, cpu);

5985

capacity *= arch_scale_freq_capacity(sd, cpu);

5986

else

5986

else

5987

capacity *= default_scale_capacity(sd, cpu);

5987

capacity *= default_scale_capacity(sd, cpu);

5988

5989

capacity >>= SCHED_CAPACITY_SHIFT;

5989

capacity >>= SCHED_CAPACITY_SHIFT;

5990

5991

capacity *= scale_rt_capacity(cpu);

5991

capacity *= scale_rt_capacity(cpu);

5992

capacity >>= SCHED_CAPACITY_SHIFT;

5992

capacity >>= SCHED_CAPACITY_SHIFT;

5993

5994

if (!capacity)

5994

if (!capacity)

5995

capacity = 1;

5995

capacity = 1;

5996

5997

cpu_rq(cpu)->cpu_capacity = capacity;

5997

cpu_rq(cpu)->cpu_capacity = capacity;

5998

sdg->sgc->capacity = capacity;

5998

sdg->sgc->capacity = capacity;

5999

}

5999

}

6000

6001

void update_group_capacity(struct sched_domain *sd, int cpu)

6001

void update_group_capacity(struct sched_domain *sd, int cpu)

6002

{

6002

{

6003

struct sched_domain *child = sd->child;

6003

struct sched_domain *child = sd->child;

6004

struct sched_group *group, *sdg = sd->groups;

6004

struct sched_group *group, *sdg = sd->groups;

6005

unsigned long capacity, capacity_orig;

6005

unsigned long capacity, capacity_orig;

6006

unsigned long interval;

6006

unsigned long interval;

6007

6008

interval = msecs_to_jiffies(sd->balance_interval);

6008

interval = msecs_to_jiffies(sd->balance_interval);

6009

interval = clamp(interval, 1UL, max_load_balance_interval);

6009

interval = clamp(interval, 1UL, max_load_balance_interval);

6010

sdg->sgc->next_update = jiffies + interval;

6010

sdg->sgc->next_update = jiffies + interval;

6011

6012

if (!child) {

6012

if (!child) {

6013

update_cpu_capacity(sd, cpu);

6013

update_cpu_capacity(sd, cpu);

6014

return;

6014

return;

6015

}

6015

}

6016

6017

capacity_orig = capacity = 0;

6017

capacity_orig = capacity = 0;

6018

6019

if (child->flags & SD_OVERLAP) {

6019

if (child->flags & SD_OVERLAP) {

6020

/*

6020

/*

6021

* SD_OVERLAP domains cannot assume that child groups

6021

* SD_OVERLAP domains cannot assume that child groups

6022

* span the current group.

6022

* span the current group.

6023

*/

6023

*/

6024

6025

for_each_cpu(cpu, sched_group_cpus(sdg)) {

6025

for_each_cpu(cpu, sched_group_cpus(sdg)) {

6026

struct sched_group_capacity *sgc;

6026

struct sched_group_capacity *sgc;

6027

struct rq *rq = cpu_rq(cpu);

6027

struct rq *rq = cpu_rq(cpu);

6028

6029

/*

6029

/*

6030

* build_sched_domains() -> init_sched_groups_capacity()

6030

* build_sched_domains() -> init_sched_groups_capacity()

6031

* gets here before we've attached the domains to the

6031

* gets here before we've attached the domains to the

6032

* runqueues.

6032

* runqueues.

6033

*

6033

*

6034

* Use capacity_of(), which is set irrespective of domains

6034

* Use capacity_of(), which is set irrespective of domains

6035

* in update_cpu_capacity().

6035

* in update_cpu_capacity().

6036

*

6036

*

6037

* This avoids capacity/capacity_orig from being 0 and

6037

* This avoids capacity/capacity_orig from being 0 and

6038

* causing divide-by-zero issues on boot.

6038

* causing divide-by-zero issues on boot.

6039

*

6039

*

6040

* Runtime updates will correct capacity_orig.

6040

* Runtime updates will correct capacity_orig.

6041

*/

6041

*/

6042

if (unlikely(!rq->sd)) {

6042

if (unlikely(!rq->sd)) {

6043

capacity_orig += capacity_of(cpu);

6043

capacity_orig += capacity_of(cpu);

6044

capacity += capacity_of(cpu);

6044

capacity += capacity_of(cpu);

6045

continue;

6045

continue;

6046

}

6046

}

6047

6048

sgc = rq->sd->groups->sgc;

6048

sgc = rq->sd->groups->sgc;

6049

capacity_orig += sgc->capacity_orig;

6049

capacity_orig += sgc->capacity_orig;

6050

capacity += sgc->capacity;

6050

capacity += sgc->capacity;

6051

}

6051

}

6052

} else {

6052

} else {

6053

/*

6053

/*

6054

* !SD_OVERLAP domains can assume that child groups

6054

* !SD_OVERLAP domains can assume that child groups

6055

* span the current group.

6055

* span the current group.

6056

*/

6056

*/

6057

6058

group = child->groups;

6058

group = child->groups;

6059

do {

6059

do {

6060

capacity_orig += group->sgc->capacity_orig;

6060

capacity_orig += group->sgc->capacity_orig;

6061

capacity += group->sgc->capacity;

6061

capacity += group->sgc->capacity;

6062

group = group->next;

6062

group = group->next;

6063

} while (group != child->groups);

6063

} while (group != child->groups);

6064

}

6064

}

6065

6066

sdg->sgc->capacity_orig = capacity_orig;

6066

sdg->sgc->capacity_orig = capacity_orig;

6067

sdg->sgc->capacity = capacity;

6067

sdg->sgc->capacity = capacity;

6068

}

6068

}

6069

6070

/*

6070

/*

6071

* Try and fix up capacity for tiny siblings, this is needed when

6071

* Try and fix up capacity for tiny siblings, this is needed when

6072

* things like SD_ASYM_PACKING need f_b_g to select another sibling

6072

* things like SD_ASYM_PACKING need f_b_g to select another sibling

6073

* which on its own isn't powerful enough.

6073

* which on its own isn't powerful enough.

6074

*

6074

*

6075

* See update_sd_pick_busiest() and check_asym_packing().

6075

* See update_sd_pick_busiest() and check_asym_packing().

6076

*/

6076

*/

6077

static inline int

6077

static inline int

6078

fix_small_capacity(struct sched_domain *sd, struct sched_group *group)

6078

fix_small_capacity(struct sched_domain *sd, struct sched_group *group)

6079

{

6079

{

6080

/*

6080

/*

6081

* Only siblings can have significantly less than SCHED_CAPACITY_SCALE

6081

* Only siblings can have significantly less than SCHED_CAPACITY_SCALE

6082

*/

6082

*/

6083

if (!(sd->flags & SD_SHARE_CPUCAPACITY))

6083

if (!(sd->flags & SD_SHARE_CPUCAPACITY))

6084

return 0;

6084

return 0;

6085

6086

/*

6086

/*

6087

* If ~90% of the cpu_capacity is still there, we're good.

6087

* If ~90% of the cpu_capacity is still there, we're good.

6088

*/

6088

*/

6089

if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)

6089

if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)

6090

return 1;

6090

return 1;

6091

6092

return 0;

6092

return 0;

6093

}

6093

}

6094

6095

/*

6095

/*

6096

* Group imbalance indicates (and tries to solve) the problem where balancing

6096

* Group imbalance indicates (and tries to solve) the problem where balancing

6097

* groups is inadequate due to tsk_cpus_allowed() constraints.

6097

* groups is inadequate due to tsk_cpus_allowed() constraints.

6098

*

6098

*

6099

* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a

6099

* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a

6100

* cpumask covering 1 cpu of the first group and 3 cpus of the second group.

6100

* cpumask covering 1 cpu of the first group and 3 cpus of the second group.

6101

* Something like:

6101

* Something like:

6102

*

6102

*

6103

* { 0 1 2 3 } { 4 5 6 7 }

6103

* { 0 1 2 3 } { 4 5 6 7 }

6104

* * * * *

6104

* * * * *

6105

*

6105

*

6106

* If we were to balance group-wise we'd place two tasks in the first group and

6106

* If we were to balance group-wise we'd place two tasks in the first group and

6107

* two tasks in the second group. Clearly this is undesired as it will overload

6107

* two tasks in the second group. Clearly this is undesired as it will overload

6108

* cpu 3 and leave one of the cpus in the second group unused.

6108

* cpu 3 and leave one of the cpus in the second group unused.

6109

*

6109

*

6110

* The current solution to this issue is detecting the skew in the first group

6110

* The current solution to this issue is detecting the skew in the first group

6111

* by noticing the lower domain failed to reach balance and had difficulty

6111

* by noticing the lower domain failed to reach balance and had difficulty

6112

* moving tasks due to affinity constraints.

6112

* moving tasks due to affinity constraints.

6113

*

6113

*

6114

* When this is so detected; this group becomes a candidate for busiest; see

6114

* When this is so detected; this group becomes a candidate for busiest; see

6115

* update_sd_pick_busiest(). And calculate_imbalance() and

6115

* update_sd_pick_busiest(). And calculate_imbalance() and

6116

* find_busiest_group() avoid some of the usual balance conditions to allow it

6116

* find_busiest_group() avoid some of the usual balance conditions to allow it

6117

* to create an effective group imbalance.

6117

* to create an effective group imbalance.

6118

*

6118

*

6119

* This is a somewhat tricky proposition since the next run might not find the

6119

* This is a somewhat tricky proposition since the next run might not find the

6120

* group imbalance and decide the groups need to be balanced again. A most

6120

* group imbalance and decide the groups need to be balanced again. A most

6121

* subtle and fragile situation.

6121

* subtle and fragile situation.

6122

*/

6122

*/

6123

6124

static inline int sg_imbalanced(struct sched_group *group)

6124

static inline int sg_imbalanced(struct sched_group *group)

6125

{

6125

{

6126

return group->sgc->imbalance;

6126

return group->sgc->imbalance;

6127

}

6127

}

6128

6129

/*

6129

/*

6130

* Compute the group capacity factor.

6130

* Compute the group capacity factor.

6131

*

6131

*

6132

* Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by

6132

* Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by

6133

* first dividing out the smt factor and computing the actual number of cores

6133

* first dividing out the smt factor and computing the actual number of cores

6134

* and limit unit capacity with that.

6134

* and limit unit capacity with that.

6135

*/

6135

*/

6136

static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)

6136

static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)

6137

{

6137

{

6138

unsigned int capacity_factor, smt, cpus;

6138

unsigned int capacity_factor, smt, cpus;

6139

unsigned int capacity, capacity_orig;

6139

unsigned int capacity, capacity_orig;

6140

6141

capacity = group->sgc->capacity;

6141

capacity = group->sgc->capacity;

6142

capacity_orig = group->sgc->capacity_orig;

6142

capacity_orig = group->sgc->capacity_orig;

6143

cpus = group->group_weight;

6143

cpus = group->group_weight;

6144

6145

/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */

6145

/* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */

6146

smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);

6146

smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);

6147

capacity_factor = cpus / smt; /* cores */

6147

capacity_factor = cpus / smt; /* cores */

6148

6149

capacity_factor = min_t(unsigned,

6149

capacity_factor = min_t(unsigned,

6150

capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));

6150

capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));

6151

if (!capacity_factor)

6151

if (!capacity_factor)

6152

capacity_factor = fix_small_capacity(env->sd, group);

6152

capacity_factor = fix_small_capacity(env->sd, group);

6153

6154

return capacity_factor;

6154

return capacity_factor;

6155

}

6155

}

6156

6157

static enum group_type

6157

static enum group_type

6158

group_classify(struct sched_group *group, struct sg_lb_stats *sgs)

6158

group_classify(struct sched_group *group, struct sg_lb_stats *sgs)

6159

{

6159

{

6160

if (sgs->sum_nr_running > sgs->group_capacity_factor)

6160

if (sgs->sum_nr_running > sgs->group_capacity_factor)

6161

return group_overloaded;

6161

return group_overloaded;

6162

6163

if (sg_imbalanced(group))

6163

if (sg_imbalanced(group))

6164

return group_imbalanced;

6164

return group_imbalanced;

6165

6166

return group_other;

6166

return group_other;

6167

}

6167

}

6168

6169

/**

6169

/**

6170

* update_sg_lb_stats - Update sched_group's statistics for load balancing.

6170

* update_sg_lb_stats - Update sched_group's statistics for load balancing.

6171

* @env: The load balancing environment.

6171

* @env: The load balancing environment.

6172

* @group: sched_group whose statistics are to be updated.

6172

* @group: sched_group whose statistics are to be updated.

6173

* @load_idx: Load index of sched_domain of this_cpu for load calc.

6173

* @load_idx: Load index of sched_domain of this_cpu for load calc.

6174

* @local_group: Does group contain this_cpu.

6174

* @local_group: Does group contain this_cpu.

6175

* @sgs: variable to hold the statistics for this group.

6175

* @sgs: variable to hold the statistics for this group.

6176

* @overload: Indicate more than one runnable task for any CPU.

6176

* @overload: Indicate more than one runnable task for any CPU.

6177

*/

6177

*/

6178

static inline void update_sg_lb_stats(struct lb_env *env,

6178

static inline void update_sg_lb_stats(struct lb_env *env,

6179

struct sched_group *group, int load_idx,

6179

struct sched_group *group, int load_idx,

6180

int local_group, struct sg_lb_stats *sgs,

6180

int local_group, struct sg_lb_stats *sgs,

6181

bool *overload)

6181

bool *overload)

6182

{

6182

{

6183

unsigned long load;

6183

unsigned long load;

6184

int i;

6184

int i;

6185

6186

memset(sgs, 0, sizeof(*sgs));

6186

memset(sgs, 0, sizeof(*sgs));

6187

6188

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

6188

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

6189

struct rq *rq = cpu_rq(i);

6189

struct rq *rq = cpu_rq(i);

6190

6191

/* Bias balancing toward cpus of our domain */

6191

/* Bias balancing toward cpus of our domain */

6192

if (local_group)

6192

if (local_group)

6193

load = target_load(i, load_idx);

6193

load = target_load(i, load_idx);

6194

else

6194

else

6195

load = source_load(i, load_idx);

6195

load = source_load(i, load_idx);

6196

6197

sgs->group_load += load;

6197

sgs->group_load += load;

6198

sgs->sum_nr_running += rq->cfs.h_nr_running;

6198

sgs->sum_nr_running += rq->cfs.h_nr_running;

6199

6200

if (rq->nr_running > 1)

6200

if (rq->nr_running > 1)

6201

*overload = true;

6201

*overload = true;

6202

6203

#ifdef CONFIG_NUMA_BALANCING

6203

#ifdef CONFIG_NUMA_BALANCING

6204

sgs->nr_numa_running += rq->nr_numa_running;

6204

sgs->nr_numa_running += rq->nr_numa_running;

6205

sgs->nr_preferred_running += rq->nr_preferred_running;

6205

sgs->nr_preferred_running += rq->nr_preferred_running;

6206

#endif

6206

#endif

6207

sgs->sum_weighted_load += weighted_cpuload(i);

6207

sgs->sum_weighted_load += weighted_cpuload(i);

6208

if (idle_cpu(i))

6208

if (idle_cpu(i))

6209

sgs->idle_cpus++;

6209

sgs->idle_cpus++;

6210

}

6210

}

6211

6212

/* Adjust by relative CPU capacity of the group */

6212

/* Adjust by relative CPU capacity of the group */

6213

sgs->group_capacity = group->sgc->capacity;

6213

sgs->group_capacity = group->sgc->capacity;

6214

sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

6214

sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

6215

6216

if (sgs->sum_nr_running)

6216

if (sgs->sum_nr_running)

6217

sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

6217

sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

6218

6219

sgs->group_weight = group->group_weight;

6219

sgs->group_weight = group->group_weight;

6220

sgs->group_capacity_factor = sg_capacity_factor(env, group);

6220

sgs->group_capacity_factor = sg_capacity_factor(env, group);

6221

sgs->group_type = group_classify(group, sgs);

6221

sgs->group_type = group_classify(group, sgs);

6222

6223

if (sgs->group_capacity_factor > sgs->sum_nr_running)

6223

if (sgs->group_capacity_factor > sgs->sum_nr_running)

6224

sgs->group_has_free_capacity = 1;

6224

sgs->group_has_free_capacity = 1;

6225

}

6225

}

6226

6227

/**

6227

/**

6228

* update_sd_pick_busiest - return 1 on busiest group

6228

* update_sd_pick_busiest - return 1 on busiest group

6229

* @env: The load balancing environment.

6229

* @env: The load balancing environment.

6230

* @sds: sched_domain statistics

6230

* @sds: sched_domain statistics

6231

* @sg: sched_group candidate to be checked for being the busiest

6231

* @sg: sched_group candidate to be checked for being the busiest

6232

* @sgs: sched_group statistics

6232

* @sgs: sched_group statistics

6233

*

6233

*

6234

* Determine if @sg is a busier group than the previously selected

6234

* Determine if @sg is a busier group than the previously selected

6235

* busiest group.

6235

* busiest group.

6236

*

6236

*

6237

* Return: %true if @sg is a busier group than the previously selected

6237

* Return: %true if @sg is a busier group than the previously selected

6238

* busiest group. %false otherwise.

6238

* busiest group. %false otherwise.

6239

*/

6239

*/

6240

static bool update_sd_pick_busiest(struct lb_env *env,

6240

static bool update_sd_pick_busiest(struct lb_env *env,

6241

struct sd_lb_stats *sds,

6241

struct sd_lb_stats *sds,

6242

struct sched_group *sg,

6242

struct sched_group *sg,

6243

struct sg_lb_stats *sgs)

6243

struct sg_lb_stats *sgs)

6244

{

6244

{

6245

struct sg_lb_stats *busiest = &sds->busiest_stat;

6245

struct sg_lb_stats *busiest = &sds->busiest_stat;

6246

6247

if (sgs->group_type > busiest->group_type)

6247

if (sgs->group_type > busiest->group_type)

6248

return true;

6248

return true;

6249

6250

if (sgs->group_type < busiest->group_type)

6250

if (sgs->group_type < busiest->group_type)

6251

return false;

6251

return false;

6252

6253

if (sgs->avg_load <= busiest->avg_load)

6253

if (sgs->avg_load <= busiest->avg_load)

6254

return false;

6254

return false;

6255

6256

/* This is the busiest node in its class. */

6256

/* This is the busiest node in its class. */

6257

if (!(env->sd->flags & SD_ASYM_PACKING))

6257

if (!(env->sd->flags & SD_ASYM_PACKING))

6258

return true;

6258

return true;

6259

6260

/*

6260

/*

6261

* ASYM_PACKING needs to move all the work to the lowest

6261

* ASYM_PACKING needs to move all the work to the lowest

6262

* numbered CPUs in the group, therefore mark all groups

6262

* numbered CPUs in the group, therefore mark all groups

6263

* higher than ourself as busy.

6263

* higher than ourself as busy.

6264

*/

6264

*/

6265

if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {

6265

if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {

6266

if (!sds->busiest)

6266

if (!sds->busiest)

6267

return true;

6267

return true;

6268

6269

if (group_first_cpu(sds->busiest) > group_first_cpu(sg))

6269

if (group_first_cpu(sds->busiest) > group_first_cpu(sg))

6270

return true;

6270

return true;

6271

}

6271

}

6272

6273

return false;

6273

return false;

6274

}

6274

}

6275

6276

#ifdef CONFIG_NUMA_BALANCING

6276

#ifdef CONFIG_NUMA_BALANCING

6277

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

6277

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

6278

{

6278

{

6279

if (sgs->sum_nr_running > sgs->nr_numa_running)

6279

if (sgs->sum_nr_running > sgs->nr_numa_running)

6280

return regular;

6280

return regular;

6281

if (sgs->sum_nr_running > sgs->nr_preferred_running)

6281

if (sgs->sum_nr_running > sgs->nr_preferred_running)

6282

return remote;

6282

return remote;

6283

return all;

6283

return all;

6284

}

6284

}

6285

6286

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

6286

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

6287

{

6287

{

6288

if (rq->nr_running > rq->nr_numa_running)

6288

if (rq->nr_running > rq->nr_numa_running)

6289

return regular;

6289

return regular;

6290

if (rq->nr_running > rq->nr_preferred_running)

6290

if (rq->nr_running > rq->nr_preferred_running)

6291

return remote;

6291

return remote;

6292

return all;

6292

return all;

6293

}

6293

}

6294

#else

6294

#else

6295

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

6295

static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)

6296

{

6296

{

6297

return all;

6297

return all;

6298

}

6298

}

6299

6300

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

6300

static inline enum fbq_type fbq_classify_rq(struct rq *rq)

6301

{

6301

{

6302

return regular;

6302

return regular;

6303

}

6303

}

6304

#endif /* CONFIG_NUMA_BALANCING */

6304

#endif /* CONFIG_NUMA_BALANCING */

6305

6306

/**

6306

/**

6307

* update_sd_lb_stats - Update sched_domain's statistics for load balancing.

6307

* update_sd_lb_stats - Update sched_domain's statistics for load balancing.

6308

* @env: The load balancing environment.

6308

* @env: The load balancing environment.

6309

* @sds: variable to hold the statistics for this sched_domain.

6309

* @sds: variable to hold the statistics for this sched_domain.

6310

*/

6310

*/

6311

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)

6311

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)

6312

{

6312

{

6313

struct sched_domain *child = env->sd->child;

6313

struct sched_domain *child = env->sd->child;

6314

struct sched_group *sg = env->sd->groups;

6314

struct sched_group *sg = env->sd->groups;

6315

struct sg_lb_stats tmp_sgs;

6315

struct sg_lb_stats tmp_sgs;

6316

int load_idx, prefer_sibling = 0;

6316

int load_idx, prefer_sibling = 0;

6317

bool overload = false;

6317

bool overload = false;

6318

6319

if (child && child->flags & SD_PREFER_SIBLING)

6319

if (child && child->flags & SD_PREFER_SIBLING)

6320

prefer_sibling = 1;

6320

prefer_sibling = 1;

6321

6322

load_idx = get_sd_load_idx(env->sd, env->idle);

6322

load_idx = get_sd_load_idx(env->sd, env->idle);

6323

6324

do {

6324

do {

6325

struct sg_lb_stats *sgs = &tmp_sgs;

6325

struct sg_lb_stats *sgs = &tmp_sgs;

6326

int local_group;

6326

int local_group;

6327

6328

local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));

6328

local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));

6329

if (local_group) {

6329

if (local_group) {

6330

sds->local = sg;

6330

sds->local = sg;

6331

sgs = &sds->local_stat;

6331

sgs = &sds->local_stat;

6332

6333

if (env->idle != CPU_NEWLY_IDLE ||

6333

if (env->idle != CPU_NEWLY_IDLE ||

6334

time_after_eq(jiffies, sg->sgc->next_update))

6334

time_after_eq(jiffies, sg->sgc->next_update))

6335

update_group_capacity(env->sd, env->dst_cpu);

6335

update_group_capacity(env->sd, env->dst_cpu);

6336

}

6336

}

6337

6338

update_sg_lb_stats(env, sg, load_idx, local_group, sgs,

6338

update_sg_lb_stats(env, sg, load_idx, local_group, sgs,

6339

&overload);

6339

&overload);

6340

6341

if (local_group)

6341

if (local_group)

6342

goto next_group;

6342

goto next_group;

6343

6344

/*

6344

/*

6345

* In case the child domain prefers tasks go to siblings

6345

* In case the child domain prefers tasks go to siblings

6346

* first, lower the sg capacity factor to one so that we'll try

6346

* first, lower the sg capacity factor to one so that we'll try

6347

* and move all the excess tasks away. We lower the capacity

6347

* and move all the excess tasks away. We lower the capacity

6348

* of a group only if the local group has the capacity to fit

6348

* of a group only if the local group has the capacity to fit

6349

* these excess tasks, i.e. nr_running < group_capacity_factor. The

6349

* these excess tasks, i.e. nr_running < group_capacity_factor. The

6350

* extra check prevents the case where you always pull from the

6350

* extra check prevents the case where you always pull from the

6351

* heaviest group when it is already under-utilized (possible

6351

* heaviest group when it is already under-utilized (possible

6352

* with a large weight task outweighs the tasks on the system).

6352

* with a large weight task outweighs the tasks on the system).

6353

*/

6353

*/

6354

if (prefer_sibling && sds->local &&

6354

if (prefer_sibling && sds->local &&

6355

sds->local_stat.group_has_free_capacity) {

6355

sds->local_stat.group_has_free_capacity) {

6356

sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);

6356

sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);

6357

sgs->group_type = group_classify(sg, sgs);

6357

sgs->group_type = group_classify(sg, sgs);

6358

}

6358

}

6359

6360

if (update_sd_pick_busiest(env, sds, sg, sgs)) {

6360

if (update_sd_pick_busiest(env, sds, sg, sgs)) {

6361

sds->busiest = sg;

6361

sds->busiest = sg;

6362

sds->busiest_stat = *sgs;

6362

sds->busiest_stat = *sgs;

6363

}

6363

}

6364

6365

next_group:

6365

next_group:

6366

/* Now, start updating sd_lb_stats */

6366

/* Now, start updating sd_lb_stats */

6367

sds->total_load += sgs->group_load;

6367

sds->total_load += sgs->group_load;

6368

sds->total_capacity += sgs->group_capacity;

6368

sds->total_capacity += sgs->group_capacity;

6369

6370

sg = sg->next;

6370

sg = sg->next;

6371

} while (sg != env->sd->groups);

6371

} while (sg != env->sd->groups);

6372

6373

if (env->sd->flags & SD_NUMA)

6373

if (env->sd->flags & SD_NUMA)

6374

env->fbq_type = fbq_classify_group(&sds->busiest_stat);

6374

env->fbq_type = fbq_classify_group(&sds->busiest_stat);

6375

6376

if (!env->sd->parent) {

6376

if (!env->sd->parent) {

6377

/* update overload indicator if we are at root domain */

6377

/* update overload indicator if we are at root domain */

6378

if (env->dst_rq->rd->overload != overload)

6378

if (env->dst_rq->rd->overload != overload)

6379

env->dst_rq->rd->overload = overload;

6379

env->dst_rq->rd->overload = overload;

6380

}

6380

}

6381

6382

}

6382

}

6383

6384

/**

6384

/**

6385

* check_asym_packing - Check to see if the group is packed into the

6385

* check_asym_packing - Check to see if the group is packed into the

6386

* sched doman.

6386

* sched doman.

6387

*

6387

*

6388

* This is primarily intended to used at the sibling level. Some

6388

* This is primarily intended to used at the sibling level. Some

6389

* cores like POWER7 prefer to use lower numbered SMT threads. In the

6389

* cores like POWER7 prefer to use lower numbered SMT threads. In the

6390

* case of POWER7, it can move to lower SMT modes only when higher

6390

* case of POWER7, it can move to lower SMT modes only when higher

6391

* threads are idle. When in lower SMT modes, the threads will

6391

* threads are idle. When in lower SMT modes, the threads will

6392

* perform better since they share less core resources. Hence when we

6392

* perform better since they share less core resources. Hence when we

6393

* have idle threads, we want them to be the higher ones.

6393

* have idle threads, we want them to be the higher ones.

6394

*

6394

*

6395

* This packing function is run on idle threads. It checks to see if

6395

* This packing function is run on idle threads. It checks to see if

6396

* the busiest CPU in this domain (core in the P7 case) has a higher

6396

* the busiest CPU in this domain (core in the P7 case) has a higher

6397

* CPU number than the packing function is being run on. Here we are

6397

* CPU number than the packing function is being run on. Here we are

6398

* assuming lower CPU number will be equivalent to lower a SMT thread

6398

* assuming lower CPU number will be equivalent to lower a SMT thread

6399

* number.

6399

* number.

6400

*

6400

*

6401

* Return: 1 when packing is required and a task should be moved to

6401

* Return: 1 when packing is required and a task should be moved to

6402

* this CPU. The amount of the imbalance is returned in *imbalance.

6402

* this CPU. The amount of the imbalance is returned in *imbalance.

6403

*

6403

*

6404

* @env: The load balancing environment.

6404

* @env: The load balancing environment.

6405

* @sds: Statistics of the sched_domain which is to be packed

6405

* @sds: Statistics of the sched_domain which is to be packed

6406

*/

6406

*/

6407

static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)

6407

static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)

6408

{

6408

{

6409

int busiest_cpu;

6409

int busiest_cpu;

6410

6411

if (!(env->sd->flags & SD_ASYM_PACKING))

6411

if (!(env->sd->flags & SD_ASYM_PACKING))

6412

return 0;

6412

return 0;

6413

6414

if (!sds->busiest)

6414

if (!sds->busiest)

6415

return 0;

6415

return 0;

6416

6417

busiest_cpu = group_first_cpu(sds->busiest);

6417

busiest_cpu = group_first_cpu(sds->busiest);

6418

if (env->dst_cpu > busiest_cpu)

6418

if (env->dst_cpu > busiest_cpu)

6419

return 0;

6419

return 0;

6420

6421

env->imbalance = DIV_ROUND_CLOSEST(

6421

env->imbalance = DIV_ROUND_CLOSEST(

6422

sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,

6422

sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,

6423

SCHED_CAPACITY_SCALE);

6423

SCHED_CAPACITY_SCALE);

6424

6425

return 1;

6425

return 1;

6426

}

6426

}

6427

6428

/**

6428

/**

6429

* fix_small_imbalance - Calculate the minor imbalance that exists

6429

* fix_small_imbalance - Calculate the minor imbalance that exists

6430

* amongst the groups of a sched_domain, during

6430

* amongst the groups of a sched_domain, during

6431

* load balancing.

6431

* load balancing.

6432

* @env: The load balancing environment.

6432

* @env: The load balancing environment.

6433

* @sds: Statistics of the sched_domain whose imbalance is to be calculated.

6433

* @sds: Statistics of the sched_domain whose imbalance is to be calculated.

6434

*/

6434

*/

6435

static inline

6435

static inline

6436

void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6436

void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6437

{

6437

{

6438

unsigned long tmp, capa_now = 0, capa_move = 0;

6438

unsigned long tmp, capa_now = 0, capa_move = 0;

6439

unsigned int imbn = 2;

6439

unsigned int imbn = 2;

6440

unsigned long scaled_busy_load_per_task;

6440

unsigned long scaled_busy_load_per_task;

6441

struct sg_lb_stats *local, *busiest;

6441

struct sg_lb_stats *local, *busiest;

6442

6443

local = &sds->local_stat;

6443

local = &sds->local_stat;

6444

busiest = &sds->busiest_stat;

6444

busiest = &sds->busiest_stat;

6445

6446

if (!local->sum_nr_running)

6446

if (!local->sum_nr_running)

6447

local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);

6447

local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);

6448

else if (busiest->load_per_task > local->load_per_task)

6448

else if (busiest->load_per_task > local->load_per_task)

6449

imbn = 1;

6449

imbn = 1;

6450

6451

scaled_busy_load_per_task =

6451

scaled_busy_load_per_task =

6452

(busiest->load_per_task * SCHED_CAPACITY_SCALE) /

6452

(busiest->load_per_task * SCHED_CAPACITY_SCALE) /

6453

busiest->group_capacity;

6453

busiest->group_capacity;

6454

6455

if (busiest->avg_load + scaled_busy_load_per_task >=

6455

if (busiest->avg_load + scaled_busy_load_per_task >=

6456

local->avg_load + (scaled_busy_load_per_task * imbn)) {

6456

local->avg_load + (scaled_busy_load_per_task * imbn)) {

6457

env->imbalance = busiest->load_per_task;

6457

env->imbalance = busiest->load_per_task;

6458

return;

6458

return;

6459

}

6459

}

6460

6461

/*

6461

/*

6462

* OK, we don't have enough imbalance to justify moving tasks,

6462

* OK, we don't have enough imbalance to justify moving tasks,

6463

* however we may be able to increase total CPU capacity used by

6463

* however we may be able to increase total CPU capacity used by

6464

* moving them.

6464

* moving them.

6465

*/

6465

*/

6466

6467

capa_now += busiest->group_capacity *

6467

capa_now += busiest->group_capacity *

6468

min(busiest->load_per_task, busiest->avg_load);

6468

min(busiest->load_per_task, busiest->avg_load);

6469

capa_now += local->group_capacity *

6469

capa_now += local->group_capacity *

6470

min(local->load_per_task, local->avg_load);

6470

min(local->load_per_task, local->avg_load);

6471

capa_now /= SCHED_CAPACITY_SCALE;

6471

capa_now /= SCHED_CAPACITY_SCALE;

6472

6473

/* Amount of load we'd subtract */

6473

/* Amount of load we'd subtract */

6474

if (busiest->avg_load > scaled_busy_load_per_task) {

6474

if (busiest->avg_load > scaled_busy_load_per_task) {

6475

capa_move += busiest->group_capacity *

6475

capa_move += busiest->group_capacity *

6476

min(busiest->load_per_task,

6476

min(busiest->load_per_task,

6477

busiest->avg_load - scaled_busy_load_per_task);

6477

busiest->avg_load - scaled_busy_load_per_task);

6478

}

6478

}

6479

6480

/* Amount of load we'd add */

6480

/* Amount of load we'd add */

6481

if (busiest->avg_load * busiest->group_capacity <

6481

if (busiest->avg_load * busiest->group_capacity <

6482

busiest->load_per_task * SCHED_CAPACITY_SCALE) {

6482

busiest->load_per_task * SCHED_CAPACITY_SCALE) {

6483

tmp = (busiest->avg_load * busiest->group_capacity) /

6483

tmp = (busiest->avg_load * busiest->group_capacity) /

6484

local->group_capacity;

6484

local->group_capacity;

6485

} else {

6485

} else {

6486

tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /

6486

tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /

6487

local->group_capacity;

6487

local->group_capacity;

6488

}

6488

}

6489

capa_move += local->group_capacity *

6489

capa_move += local->group_capacity *

6490

min(local->load_per_task, local->avg_load + tmp);

6490

min(local->load_per_task, local->avg_load + tmp);

6491

capa_move /= SCHED_CAPACITY_SCALE;

6491

capa_move /= SCHED_CAPACITY_SCALE;

6492

6493

/* Move if we gain throughput */

6493

/* Move if we gain throughput */

6494

if (capa_move > capa_now)

6494

if (capa_move > capa_now)

6495

env->imbalance = busiest->load_per_task;

6495

env->imbalance = busiest->load_per_task;

6496

}

6496

}

6497

6498

/**

6498

/**

6499

* calculate_imbalance - Calculate the amount of imbalance present within the

6499

* calculate_imbalance - Calculate the amount of imbalance present within the

6500

* groups of a given sched_domain during load balance.

6500

* groups of a given sched_domain during load balance.

6501

* @env: load balance environment

6501

* @env: load balance environment

6502

* @sds: statistics of the sched_domain whose imbalance is to be calculated.

6502

* @sds: statistics of the sched_domain whose imbalance is to be calculated.

6503

*/

6503

*/

6504

static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6504

static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)

6505

{

6505

{

6506

unsigned long max_pull, load_above_capacity = ~0UL;

6506

unsigned long max_pull, load_above_capacity = ~0UL;

6507

struct sg_lb_stats *local, *busiest;

6507

struct sg_lb_stats *local, *busiest;

6508

6509

local = &sds->local_stat;

6509

local = &sds->local_stat;

6510

busiest = &sds->busiest_stat;

6510

busiest = &sds->busiest_stat;

6511

6512

if (busiest->group_type == group_imbalanced) {

6512

if (busiest->group_type == group_imbalanced) {

6513

/*

6513

/*

6514

* In the group_imb case we cannot rely on group-wide averages

6514

* In the group_imb case we cannot rely on group-wide averages

6515

* to ensure cpu-load equilibrium, look at wider averages. XXX

6515

* to ensure cpu-load equilibrium, look at wider averages. XXX

6516

*/

6516

*/

6517

busiest->load_per_task =

6517

busiest->load_per_task =

6518

min(busiest->load_per_task, sds->avg_load);

6518

min(busiest->load_per_task, sds->avg_load);

6519

}

6519

}

6520

6521

/*

6521

/*

6522

* In the presence of smp nice balancing, certain scenarios can have

6522

* In the presence of smp nice balancing, certain scenarios can have

6523

* max load less than avg load(as we skip the groups at or below

6523

* max load less than avg load(as we skip the groups at or below

6524

* its cpu_capacity, while calculating max_load..)

6524

* its cpu_capacity, while calculating max_load..)

6525

*/

6525

*/

6526

if (busiest->avg_load <= sds->avg_load ||

6526

if (busiest->avg_load <= sds->avg_load ||

6527

local->avg_load >= sds->avg_load) {

6527

local->avg_load >= sds->avg_load) {

6528

env->imbalance = 0;

6528

env->imbalance = 0;

6529

return fix_small_imbalance(env, sds);

6529

return fix_small_imbalance(env, sds);

6530

}

6530

}

6531

6532

/*

6532

/*

6533

* If there aren't any idle cpus, avoid creating some.

6533

* If there aren't any idle cpus, avoid creating some.

6534

*/

6534

*/

6535

if (busiest->group_type == group_overloaded &&

6535

if (busiest->group_type == group_overloaded &&

6536

local->group_type == group_overloaded) {

6536

local->group_type == group_overloaded) {

6537

load_above_capacity =

6537

load_above_capacity =

6538

(busiest->sum_nr_running - busiest->group_capacity_factor);

6538

(busiest->sum_nr_running - busiest->group_capacity_factor);

6539

6540

load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);

6540

load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);

6541

load_above_capacity /= busiest->group_capacity;

6541

load_above_capacity /= busiest->group_capacity;

6542

}

6542

}

6543

6544

/*

6544

/*

6545

* We're trying to get all the cpus to the average_load, so we don't

6545

* We're trying to get all the cpus to the average_load, so we don't

6546

* want to push ourselves above the average load, nor do we wish to

6546

* want to push ourselves above the average load, nor do we wish to

6547

* reduce the max loaded cpu below the average load. At the same time,

6547

* reduce the max loaded cpu below the average load. At the same time,

6548

* we also don't want to reduce the group load below the group capacity

6548

* we also don't want to reduce the group load below the group capacity

6549

* (so that we can implement power-savings policies etc). Thus we look

6549

* (so that we can implement power-savings policies etc). Thus we look

6550

* for the minimum possible imbalance.

6550

* for the minimum possible imbalance.

6551

*/

6551

*/

6552

max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

6552

max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

6553

6554

/* How much load to actually move to equalise the imbalance */

6554

/* How much load to actually move to equalise the imbalance */

6555

env->imbalance = min(

6555

env->imbalance = min(

6556

max_pull * busiest->group_capacity,

6556

max_pull * busiest->group_capacity,

6557

(sds->avg_load - local->avg_load) * local->group_capacity

6557

(sds->avg_load - local->avg_load) * local->group_capacity

6558

) / SCHED_CAPACITY_SCALE;

6558

) / SCHED_CAPACITY_SCALE;

6559

6560

/*

6560

/*

6561

* if *imbalance is less than the average load per runnable task

6561

* if *imbalance is less than the average load per runnable task

6562

* there is no guarantee that any tasks will be moved so we'll have

6562

* there is no guarantee that any tasks will be moved so we'll have

6563

* a think about bumping its value to force at least one task to be

6563

* a think about bumping its value to force at least one task to be

6564

* moved

6564

* moved

6565

*/

6565

*/

6566

if (env->imbalance < busiest->load_per_task)

6566

if (env->imbalance < busiest->load_per_task)

6567

return fix_small_imbalance(env, sds);

6567

return fix_small_imbalance(env, sds);

6568

}

6568

}

6569

6570

/******* find_busiest_group() helpers end here *********************/

6570

/******* find_busiest_group() helpers end here *********************/

6571

6572

/**

6572

/**

6573

* find_busiest_group - Returns the busiest group within the sched_domain

6573

* find_busiest_group - Returns the busiest group within the sched_domain

6574

* if there is an imbalance. If there isn't an imbalance, and

6574

* if there is an imbalance. If there isn't an imbalance, and

6575

* the user has opted for power-savings, it returns a group whose

6575

* the user has opted for power-savings, it returns a group whose

6576

* CPUs can be put to idle by rebalancing those tasks elsewhere, if

6576

* CPUs can be put to idle by rebalancing those tasks elsewhere, if

6577

* such a group exists.

6577

* such a group exists.

6578

*

6578

*

6579

* Also calculates the amount of weighted load which should be moved

6579

* Also calculates the amount of weighted load which should be moved

6580

* to restore balance.

6580

* to restore balance.

6581

*

6581

*

6582

* @env: The load balancing environment.

6582

* @env: The load balancing environment.

6583

*

6583

*

6584

* Return: - The busiest group if imbalance exists.

6584

* Return: - The busiest group if imbalance exists.

6585

* - If no imbalance and user has opted for power-savings balance,

6585

* - If no imbalance and user has opted for power-savings balance,

6586

* return the least loaded group whose CPUs can be

6586

* return the least loaded group whose CPUs can be

6587

* put to idle by rebalancing its tasks onto our group.

6587

* put to idle by rebalancing its tasks onto our group.

6588

*/

6588

*/

6589

static struct sched_group *find_busiest_group(struct lb_env *env)

6589

static struct sched_group *find_busiest_group(struct lb_env *env)

6590

{

6590

{

6591

struct sg_lb_stats *local, *busiest;

6591

struct sg_lb_stats *local, *busiest;

6592

struct sd_lb_stats sds;

6592

struct sd_lb_stats sds;

6593

6594

init_sd_lb_stats(&sds);

6594

init_sd_lb_stats(&sds);

6595

6596

/*

6596

/*

6597

* Compute the various statistics relavent for load balancing at

6597

* Compute the various statistics relavent for load balancing at

6598

* this level.

6598

* this level.

6599

*/

6599

*/

6600

update_sd_lb_stats(env, &sds);

6600

update_sd_lb_stats(env, &sds);

6601

local = &sds.local_stat;

6601

local = &sds.local_stat;

6602

busiest = &sds.busiest_stat;

6602

busiest = &sds.busiest_stat;

6603

6604

if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&

6604

if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&

6605

check_asym_packing(env, &sds))

6605

check_asym_packing(env, &sds))

6606

return sds.busiest;

6606

return sds.busiest;

6607

6608

/* There is no busy sibling group to pull tasks from */

6608

/* There is no busy sibling group to pull tasks from */

6609

if (!sds.busiest || busiest->sum_nr_running == 0)

6609

if (!sds.busiest || busiest->sum_nr_running == 0)

6610

goto out_balanced;

6610

goto out_balanced;

6611

6612

sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)

6612

sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)

6613

/ sds.total_capacity;

6613

/ sds.total_capacity;

6614

6615

/*

6615

/*

6616

* If the busiest group is imbalanced the below checks don't

6616

* If the busiest group is imbalanced the below checks don't

6617

* work because they assume all things are equal, which typically

6617

* work because they assume all things are equal, which typically

6618

* isn't true due to cpus_allowed constraints and the like.

6618

* isn't true due to cpus_allowed constraints and the like.

6619

*/

6619

*/

6620

if (busiest->group_type == group_imbalanced)

6620

if (busiest->group_type == group_imbalanced)

6621

goto force_balance;

6621

goto force_balance;

6622

6623

/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */

6623

/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */

6624

if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&

6624

if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&

6625

!busiest->group_has_free_capacity)

6625

!busiest->group_has_free_capacity)

6626

goto force_balance;

6626

goto force_balance;

6627

6628

/*

6628

/*

6629

* If the local group is busier than the selected busiest group

6629

* If the local group is busier than the selected busiest group

6630

* don't try and pull any tasks.

6630

* don't try and pull any tasks.

6631

*/

6631

*/

6632

if (local->avg_load >= busiest->avg_load)

6632

if (local->avg_load >= busiest->avg_load)

6633

goto out_balanced;

6633

goto out_balanced;

6634

6635

/*

6635

/*

6636

* Don't pull any tasks if this group is already above the domain

6636

* Don't pull any tasks if this group is already above the domain

6637

* average load.

6637

* average load.

6638

*/

6638

*/

6639

if (local->avg_load >= sds.avg_load)

6639

if (local->avg_load >= sds.avg_load)

6640

goto out_balanced;

6640

goto out_balanced;

6641

6642

if (env->idle == CPU_IDLE) {

6642

if (env->idle == CPU_IDLE) {

6643

/*

6643

/*

6644

* This cpu is idle. If the busiest group is not overloaded

6644

* This cpu is idle. If the busiest group is not overloaded

6645

* and there is no imbalance between this and busiest group

6645

* and there is no imbalance between this and busiest group

6646

* wrt idle cpus, it is balanced. The imbalance becomes

6646

* wrt idle cpus, it is balanced. The imbalance becomes

6647

* significant if the diff is greater than 1 otherwise we

6647

* significant if the diff is greater than 1 otherwise we

6648

* might end up to just move the imbalance on another group

6648

* might end up to just move the imbalance on another group

6649

*/

6649

*/

6650

if ((busiest->group_type != group_overloaded) &&

6650

if ((busiest->group_type != group_overloaded) &&

6651

(local->idle_cpus <= (busiest->idle_cpus + 1)))

6651

(local->idle_cpus <= (busiest->idle_cpus + 1)))

6652

goto out_balanced;

6652

goto out_balanced;

6653

} else {

6653

} else {

6654

/*

6654

/*

6655

* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use

6655

* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use

6656

* imbalance_pct to be conservative.

6656

* imbalance_pct to be conservative.

6657

*/

6657

*/

6658

if (100 * busiest->avg_load <=

6658

if (100 * busiest->avg_load <=

6659

env->sd->imbalance_pct * local->avg_load)

6659

env->sd->imbalance_pct * local->avg_load)

6660

goto out_balanced;

6660

goto out_balanced;

6661

}

6661

}

6662

6663

force_balance:

6663

force_balance:

6664

/* Looks like there is an imbalance. Compute it */

6664

/* Looks like there is an imbalance. Compute it */

6665

calculate_imbalance(env, &sds);

6665

calculate_imbalance(env, &sds);

6666

return sds.busiest;

6666

return sds.busiest;

6667

6668

out_balanced:

6668

out_balanced:

6669

env->imbalance = 0;

6669

env->imbalance = 0;

6670

return NULL;

6670

return NULL;

6671

}

6671

}

6672

6673

/*

6673

/*

6674

* find_busiest_queue - find the busiest runqueue among the cpus in group.

6674

* find_busiest_queue - find the busiest runqueue among the cpus in group.

6675

*/

6675

*/

6676

static struct rq *find_busiest_queue(struct lb_env *env,

6676

static struct rq *find_busiest_queue(struct lb_env *env,

6677

struct sched_group *group)

6677

struct sched_group *group)

6678

{

6678

{

6679

struct rq *busiest = NULL, *rq;

6679

struct rq *busiest = NULL, *rq;

6680

unsigned long busiest_load = 0, busiest_capacity = 1;

6680

unsigned long busiest_load = 0, busiest_capacity = 1;

6681

int i;

6681

int i;

6682

6683

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

6683

for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {

6684

unsigned long capacity, capacity_factor, wl;

6684

unsigned long capacity, capacity_factor, wl;

6685

enum fbq_type rt;

6685

enum fbq_type rt;

6686

6687

rq = cpu_rq(i);

6687

rq = cpu_rq(i);

6688

rt = fbq_classify_rq(rq);

6688

rt = fbq_classify_rq(rq);

6689

6690

/*

6690

/*

6691

* We classify groups/runqueues into three groups:

6691

* We classify groups/runqueues into three groups:

6692

* - regular: there are !numa tasks

6692

* - regular: there are !numa tasks

6693

* - remote: there are numa tasks that run on the 'wrong' node

6693

* - remote: there are numa tasks that run on the 'wrong' node

6694

* - all: there is no distinction

6694

* - all: there is no distinction

6695

*

6695

*

6696

* In order to avoid migrating ideally placed numa tasks,

6696

* In order to avoid migrating ideally placed numa tasks,

6697

* ignore those when there's better options.

6697

* ignore those when there's better options.

6698

*

6698

*

6699

* If we ignore the actual busiest queue to migrate another

6699

* If we ignore the actual busiest queue to migrate another

6700

* task, the next balance pass can still reduce the busiest

6700

* task, the next balance pass can still reduce the busiest

6701

* queue by moving tasks around inside the node.

6701

* queue by moving tasks around inside the node.

6702

*

6702

*

6703

* If we cannot move enough load due to this classification

6703

* If we cannot move enough load due to this classification

6704

* the next pass will adjust the group classification and

6704

* the next pass will adjust the group classification and

6705

* allow migration of more tasks.

6705

* allow migration of more tasks.

6706

*

6706

*

6707

* Both cases only affect the total convergence complexity.

6707

* Both cases only affect the total convergence complexity.

6708

*/

6708

*/

6709

if (rt > env->fbq_type)

6709

if (rt > env->fbq_type)

6710

continue;

6710

continue;

6711

6712

capacity = capacity_of(i);

6712

capacity = capacity_of(i);

6713

capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);

6713

capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);

6714

if (!capacity_factor)

6714

if (!capacity_factor)

6715

capacity_factor = fix_small_capacity(env->sd, group);

6715

capacity_factor = fix_small_capacity(env->sd, group);

6716

6717

wl = weighted_cpuload(i);

6717

wl = weighted_cpuload(i);

6718

6719

/*

6719

/*

6720

* When comparing with imbalance, use weighted_cpuload()

6720

* When comparing with imbalance, use weighted_cpuload()

6721

* which is not scaled with the cpu capacity.

6721

* which is not scaled with the cpu capacity.

6722

*/

6722

*/

6723

if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)

6723

if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)

6724

continue;

6724

continue;

6725

6726

/*

6726

/*

6727

* For the load comparisons with the other cpu's, consider

6727

* For the load comparisons with the other cpu's, consider

6728

* the weighted_cpuload() scaled with the cpu capacity, so

6728

* the weighted_cpuload() scaled with the cpu capacity, so

6729

* that the load can be moved away from the cpu that is

6729

* that the load can be moved away from the cpu that is

6730

* potentially running at a lower capacity.

6730

* potentially running at a lower capacity.

6731

*

6731

*

6732

* Thus we're looking for max(wl_i / capacity_i), crosswise

6732

* Thus we're looking for max(wl_i / capacity_i), crosswise

6733

* multiplication to rid ourselves of the division works out

6733

* multiplication to rid ourselves of the division works out

6734

* to: wl_i * capacity_j > wl_j * capacity_i; where j is

6734

* to: wl_i * capacity_j > wl_j * capacity_i; where j is

6735

* our previous maximum.

6735

* our previous maximum.

6736

*/

6736

*/

6737

if (wl * busiest_capacity > busiest_load * capacity) {

6737

if (wl * busiest_capacity > busiest_load * capacity) {

6738

busiest_load = wl;

6738

busiest_load = wl;

6739

busiest_capacity = capacity;

6739

busiest_capacity = capacity;

6740

busiest = rq;

6740

busiest = rq;

6741

}

6741

}

6742

}

6742

}

6743

6744

return busiest;

6744

return busiest;

6745

}

6745

}

6746

6747

/*
 * Upper bound on the balance-interval backoff applied when pinned tasks are
 * encountered: load_balance() keeps doubling sd->balance_interval while
 * LBF_ALL_PINNED is set and the interval is still below this value. The
 * exact number is fairly arbitrary; it only needs to be large enough.
 */
#define MAX_PINNED_INTERVAL 512

/*
 * Per-CPU working cpumask for load balancing. load_balance() fetches this
 * CPU's instance and uses it as the set of candidate cpus (env.cpus).
 */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);

6755

6756

static int need_active_balance(struct lb_env *env)

6756

static int need_active_balance(struct lb_env *env)

6757

{

6757

{

6758

struct sched_domain *sd = env->sd;

6758

struct sched_domain *sd = env->sd;

6759

6760

if (env->idle == CPU_NEWLY_IDLE) {

6760

if (env->idle == CPU_NEWLY_IDLE) {

6761

6762

/*

6762

/*

6763

* ASYM_PACKING needs to force migrate tasks from busy but

6763

* ASYM_PACKING needs to force migrate tasks from busy but

6764

* higher numbered CPUs in order to pack all tasks in the

6764

* higher numbered CPUs in order to pack all tasks in the

6765

* lowest numbered CPUs.

6765

* lowest numbered CPUs.

6766

*/

6766

*/

6767

if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)

6767

if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)

6768

return 1;

6768

return 1;

6769

}

6769

}

6770

6771

return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);

6771

return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);

6772

}

6772

}

6773

6774

/*
 * Forward declaration: load_balance() hands this callback to
 * stop_one_cpu_nowait() when it kicks active balancing.
 */
static int active_load_balance_cpu_stop(void *data);

6775

6776

static int should_we_balance(struct lb_env *env)

6776

static int should_we_balance(struct lb_env *env)

6777

{

6777

{

6778

struct sched_group *sg = env->sd->groups;

6778

struct sched_group *sg = env->sd->groups;

6779

struct cpumask *sg_cpus, *sg_mask;

6779

struct cpumask *sg_cpus, *sg_mask;

6780

int cpu, balance_cpu = -1;

6780

int cpu, balance_cpu = -1;

6781

6782

/*

6782

/*

6783

* In the newly idle case, we will allow all the cpu's

6783

* In the newly idle case, we will allow all the cpu's

6784

* to do the newly idle load balance.

6784

* to do the newly idle load balance.

6785

*/

6785

*/

6786

if (env->idle == CPU_NEWLY_IDLE)

6786

if (env->idle == CPU_NEWLY_IDLE)

6787

return 1;

6787

return 1;

6788

6789

sg_cpus = sched_group_cpus(sg);

6789

sg_cpus = sched_group_cpus(sg);

6790

sg_mask = sched_group_mask(sg);

6790

sg_mask = sched_group_mask(sg);

6791

/* Try to find first idle cpu */

6791

/* Try to find first idle cpu */

6792

for_each_cpu_and(cpu, sg_cpus, env->cpus) {

6792

for_each_cpu_and(cpu, sg_cpus, env->cpus) {

6793

if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))

6793

if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))

6794

continue;

6794

continue;

6795

6796

balance_cpu = cpu;

6796

balance_cpu = cpu;

6797

break;

6797

break;

6798

}

6798

}

6799

6800

if (balance_cpu == -1)

6800

if (balance_cpu == -1)

6801

balance_cpu = group_balance_cpu(sg);

6801

balance_cpu = group_balance_cpu(sg);

6802

6803

/*

6803

/*

6804

* First idle cpu or the first cpu(busiest) in this sched group

6804

* First idle cpu or the first cpu(busiest) in this sched group

6805

* is eligible for doing load balancing at this and above domains.

6805

* is eligible for doing load balancing at this and above domains.

6806

*/

6806

*/

6807

return balance_cpu == env->dst_cpu;

6807

return balance_cpu == env->dst_cpu;

6808

}

6808

}

6809

6810

/*

6810

/*

6811

* Check this_cpu to ensure it is balanced within domain. Attempt to move

6811

* Check this_cpu to ensure it is balanced within domain. Attempt to move

6812

* tasks if there is an imbalance.

6812

* tasks if there is an imbalance.

6813

*/

6813

*/

6814

static int load_balance(int this_cpu, struct rq *this_rq,

6814

static int load_balance(int this_cpu, struct rq *this_rq,

6815

struct sched_domain *sd, enum cpu_idle_type idle,

6815

struct sched_domain *sd, enum cpu_idle_type idle,

6816

int *continue_balancing)

6816

int *continue_balancing)

6817

{

6817

{

6818

int ld_moved, cur_ld_moved, active_balance = 0;

6818

int ld_moved, cur_ld_moved, active_balance = 0;

6819

struct sched_domain *sd_parent = sd->parent;

6819

struct sched_domain *sd_parent = sd->parent;

6820

struct sched_group *group;

6820

struct sched_group *group;

6821

struct rq *busiest;

6821

struct rq *busiest;

6822

unsigned long flags;

6822

unsigned long flags;

6823

struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

6823

struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

6824

6825

struct lb_env env = {

6825

struct lb_env env = {

6826

.sd = sd,

6826

.sd = sd,

6827

.dst_cpu = this_cpu,

6827

.dst_cpu = this_cpu,

6828

.dst_rq = this_rq,

6828

.dst_rq = this_rq,

6829

.dst_grpmask = sched_group_cpus(sd->groups),

6829

.dst_grpmask = sched_group_cpus(sd->groups),

6830

.idle = idle,

6830

.idle = idle,

6831

.loop_break = sched_nr_migrate_break,

6831

.loop_break = sched_nr_migrate_break,

6832

.cpus = cpus,

6832

.cpus = cpus,

6833

.fbq_type = all,

6833

.fbq_type = all,

6834

.tasks = LIST_HEAD_INIT(env.tasks),

6834

.tasks = LIST_HEAD_INIT(env.tasks),

6835

};

6835

};

6836

6837

/*

6837

/*

6838

* For NEWLY_IDLE load_balancing, we don't need to consider

6838

* For NEWLY_IDLE load_balancing, we don't need to consider

6839

* other cpus in our group

6839

* other cpus in our group

6840

*/

6840

*/

6841

if (idle == CPU_NEWLY_IDLE)

6841

if (idle == CPU_NEWLY_IDLE)

6842

env.dst_grpmask = NULL;

6842

env.dst_grpmask = NULL;

6843

6844

cpumask_copy(cpus, cpu_active_mask);

6844

cpumask_copy(cpus, cpu_active_mask);

6845

6846

schedstat_inc(sd, lb_count[idle]);

6846

schedstat_inc(sd, lb_count[idle]);

6847

6848

redo:

6848

redo:

6849

if (!should_we_balance(&env)) {

6849

if (!should_we_balance(&env)) {

6850

*continue_balancing = 0;

6850

*continue_balancing = 0;

6851

goto out_balanced;

6851

goto out_balanced;

6852

}

6852

}

6853

6854

group = find_busiest_group(&env);

6854

group = find_busiest_group(&env);

6855

if (!group) {

6855

if (!group) {

6856

schedstat_inc(sd, lb_nobusyg[idle]);

6856

schedstat_inc(sd, lb_nobusyg[idle]);

6857

goto out_balanced;

6857

goto out_balanced;

6858

}

6858

}

6859

6860

busiest = find_busiest_queue(&env, group);

6860

busiest = find_busiest_queue(&env, group);

6861

if (!busiest) {

6861

if (!busiest) {

6862

schedstat_inc(sd, lb_nobusyq[idle]);

6862

schedstat_inc(sd, lb_nobusyq[idle]);

6863

goto out_balanced;

6863

goto out_balanced;

6864

}

6864

}

6865

6866

BUG_ON(busiest == env.dst_rq);

6866

BUG_ON(busiest == env.dst_rq);

6867

6868

schedstat_add(sd, lb_imbalance[idle], env.imbalance);

6868

schedstat_add(sd, lb_imbalance[idle], env.imbalance);

6869

6870

ld_moved = 0;

6870

ld_moved = 0;

6871

if (busiest->nr_running > 1) {

6871

if (busiest->nr_running > 1) {

6872

/*

6872

/*

6873

* Attempt to move tasks. If find_busiest_group has found

6873

* Attempt to move tasks. If find_busiest_group has found

6874

* an imbalance but busiest->nr_running <= 1, the group is

6874

* an imbalance but busiest->nr_running <= 1, the group is

6875

* still unbalanced. ld_moved simply stays zero, so it is

6875

* still unbalanced. ld_moved simply stays zero, so it is

6876

* correctly treated as an imbalance.

6876

* correctly treated as an imbalance.

6877

*/

6877

*/

6878

env.flags |= LBF_ALL_PINNED;

6878

env.flags |= LBF_ALL_PINNED;

6879

env.src_cpu = busiest->cpu;

6879

env.src_cpu = busiest->cpu;

6880

env.src_rq = busiest;

6880

env.src_rq = busiest;

6881

env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

6881

env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

6882

6883

more_balance:

6883

more_balance:

6884

raw_spin_lock_irqsave(&busiest->lock, flags);

6884

raw_spin_lock_irqsave(&busiest->lock, flags);

6885

6886

/*

6886

/*

6887

* cur_ld_moved - load moved in current iteration

6887

* cur_ld_moved - load moved in current iteration

6888

* ld_moved - cumulative load moved across iterations

6888

* ld_moved - cumulative load moved across iterations

6889

*/

6889

*/

6890

cur_ld_moved = detach_tasks(&env);

6890

cur_ld_moved = detach_tasks(&env);

6891

6892

/*

6892

/*

6893

* We've detached some tasks from busiest_rq. Every

6893

* We've detached some tasks from busiest_rq. Every

6894

* task is masked "TASK_ON_RQ_MIGRATING", so we can safely

6894

* task is masked "TASK_ON_RQ_MIGRATING", so we can safely

6895

* unlock busiest->lock, and we are able to be sure

6895

* unlock busiest->lock, and we are able to be sure

6896

* that nobody can manipulate the tasks in parallel.

6896

* that nobody can manipulate the tasks in parallel.

6897

* See task_rq_lock() family for the details.

6897

* See task_rq_lock() family for the details.

6898

*/

6898

*/

6899

6900

raw_spin_unlock(&busiest->lock);

6900

raw_spin_unlock(&busiest->lock);

6901

6902

if (cur_ld_moved) {

6902

if (cur_ld_moved) {

6903

attach_tasks(&env);

6903

attach_tasks(&env);

6904

ld_moved += cur_ld_moved;

6904

ld_moved += cur_ld_moved;

6905

}

6905

}

6906

6907

local_irq_restore(flags);

6907

local_irq_restore(flags);

6908

6909

if (env.flags & LBF_NEED_BREAK) {

6909

if (env.flags & LBF_NEED_BREAK) {

6910

env.flags &= ~LBF_NEED_BREAK;

6910

env.flags &= ~LBF_NEED_BREAK;

6911

goto more_balance;

6911

goto more_balance;

6912

}

6912

}

6913

6914

/*

6914

/*

6915

* Revisit (affine) tasks on src_cpu that couldn't be moved to

6915

* Revisit (affine) tasks on src_cpu that couldn't be moved to

6916

* us and move them to an alternate dst_cpu in our sched_group

6916

* us and move them to an alternate dst_cpu in our sched_group

6917

* where they can run. The upper limit on how many times we

6917

* where they can run. The upper limit on how many times we

6918

* iterate on same src_cpu is dependent on number of cpus in our

6918

* iterate on same src_cpu is dependent on number of cpus in our

6919

* sched_group.

6919

* sched_group.

6920

*

6920

*

6921

* This changes load balance semantics a bit on who can move

6921

* This changes load balance semantics a bit on who can move

6922

* load to a given_cpu. In addition to the given_cpu itself

6922

* load to a given_cpu. In addition to the given_cpu itself

6923

* (or a ilb_cpu acting on its behalf where given_cpu is

6923

* (or a ilb_cpu acting on its behalf where given_cpu is

6924

* nohz-idle), we now have balance_cpu in a position to move

6924

* nohz-idle), we now have balance_cpu in a position to move

6925

* load to given_cpu. In rare situations, this may cause

6925

* load to given_cpu. In rare situations, this may cause

6926

* conflicts (balance_cpu and given_cpu/ilb_cpu deciding

6926

* conflicts (balance_cpu and given_cpu/ilb_cpu deciding

6927

* _independently_ and at _same_ time to move some load to

6927

* _independently_ and at _same_ time to move some load to

6928

* given_cpu) causing exceess load to be moved to given_cpu.

6928

* given_cpu) causing exceess load to be moved to given_cpu.

6929

* This however should not happen so much in practice and

6929

* This however should not happen so much in practice and

6930

* moreover subsequent load balance cycles should correct the

6930

* moreover subsequent load balance cycles should correct the

6931

* excess load moved.

6931

* excess load moved.

6932

*/

6932

*/

6933

if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

6933

if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

6934

6935

/* Prevent to re-select dst_cpu via env's cpus */

6935

/* Prevent to re-select dst_cpu via env's cpus */

6936

cpumask_clear_cpu(env.dst_cpu, env.cpus);

6936

cpumask_clear_cpu(env.dst_cpu, env.cpus);

6937

6938

env.dst_rq = cpu_rq(env.new_dst_cpu);

6938

env.dst_rq = cpu_rq(env.new_dst_cpu);

6939

env.dst_cpu = env.new_dst_cpu;

6939

env.dst_cpu = env.new_dst_cpu;

6940

env.flags &= ~LBF_DST_PINNED;

6940

env.flags &= ~LBF_DST_PINNED;

6941

env.loop = 0;

6941

env.loop = 0;

6942

env.loop_break = sched_nr_migrate_break;

6942

env.loop_break = sched_nr_migrate_break;

6943

6944

/*

6944

/*

6945

* Go back to "more_balance" rather than "redo" since we

6945

* Go back to "more_balance" rather than "redo" since we

6946

* need to continue with same src_cpu.

6946

* need to continue with same src_cpu.

6947

*/

6947

*/

6948

goto more_balance;

6948

goto more_balance;

6949

}

6949

}

6950

6951

/*

6951

/*

6952

* We failed to reach balance because of affinity.

6952

* We failed to reach balance because of affinity.

6953

*/

6953

*/

6954

if (sd_parent) {

6954

if (sd_parent) {

6955

int *group_imbalance = &sd_parent->groups->sgc->imbalance;

6955

int *group_imbalance = &sd_parent->groups->sgc->imbalance;

6956

6957

if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)

6957

if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)

6958

*group_imbalance = 1;

6958

*group_imbalance = 1;

6959

}

6959

}

6960

6961

/* All tasks on this runqueue were pinned by CPU affinity */

6961

/* All tasks on this runqueue were pinned by CPU affinity */

6962

if (unlikely(env.flags & LBF_ALL_PINNED)) {

6962

if (unlikely(env.flags & LBF_ALL_PINNED)) {

6963

cpumask_clear_cpu(cpu_of(busiest), cpus);

6963

cpumask_clear_cpu(cpu_of(busiest), cpus);

6964

if (!cpumask_empty(cpus)) {

6964

if (!cpumask_empty(cpus)) {

6965

env.loop = 0;

6965

env.loop = 0;

6966

env.loop_break = sched_nr_migrate_break;

6966

env.loop_break = sched_nr_migrate_break;

6967

goto redo;

6967

goto redo;

6968

}

6968

}

6969

goto out_all_pinned;

6969

goto out_all_pinned;

6970

}

6970

}

6971

}

6971

}

6972

6973

if (!ld_moved) {

6973

if (!ld_moved) {

6974

schedstat_inc(sd, lb_failed[idle]);

6974

schedstat_inc(sd, lb_failed[idle]);

6975

/*

6975

/*

6976

* Increment the failure counter only on periodic balance.

6976

* Increment the failure counter only on periodic balance.

6977

* We do not want newidle balance, which can be very

6977

* We do not want newidle balance, which can be very

6978

* frequent, pollute the failure counter causing

6978

* frequent, pollute the failure counter causing

6979

* excessive cache_hot migrations and active balances.

6979

* excessive cache_hot migrations and active balances.

6980

*/

6980

*/

6981

if (idle != CPU_NEWLY_IDLE)

6981

if (idle != CPU_NEWLY_IDLE)

6982

sd->nr_balance_failed++;

6982

sd->nr_balance_failed++;

6983

6984

if (need_active_balance(&env)) {

6984

if (need_active_balance(&env)) {

6985

raw_spin_lock_irqsave(&busiest->lock, flags);

6985

raw_spin_lock_irqsave(&busiest->lock, flags);

6986

6987

/* don't kick the active_load_balance_cpu_stop,

6987

/* don't kick the active_load_balance_cpu_stop,

6988

* if the curr task on busiest cpu can't be

6988

* if the curr task on busiest cpu can't be

6989

* moved to this_cpu

6989

* moved to this_cpu

6990

*/

6990

*/

6991

if (!cpumask_test_cpu(this_cpu,

6991

if (!cpumask_test_cpu(this_cpu,

6992

tsk_cpus_allowed(busiest->curr))) {

6992

tsk_cpus_allowed(busiest->curr))) {

6993

raw_spin_unlock_irqrestore(&busiest->lock,

6993

raw_spin_unlock_irqrestore(&busiest->lock,

6994

flags);

6994

flags);

6995

env.flags |= LBF_ALL_PINNED;

6995

env.flags |= LBF_ALL_PINNED;

6996

goto out_one_pinned;

6996

goto out_one_pinned;

6997

}

6997

}

6998

6999

/*

6999

/*

7000

* ->active_balance synchronizes accesses to

7000

* ->active_balance synchronizes accesses to

7001

* ->active_balance_work. Once set, it's cleared

7001

* ->active_balance_work. Once set, it's cleared

7002

* only after active load balance is finished.

7002

* only after active load balance is finished.

7003

*/

7003

*/

7004

if (!busiest->active_balance) {

7004

if (!busiest->active_balance) {

7005

busiest->active_balance = 1;

7005

busiest->active_balance = 1;

7006

busiest->push_cpu = this_cpu;

7006

busiest->push_cpu = this_cpu;

7007

active_balance = 1;

7007

active_balance = 1;

7008

}

7008

}

7009

raw_spin_unlock_irqrestore(&busiest->lock, flags);

7009

raw_spin_unlock_irqrestore(&busiest->lock, flags);

7010

7011

if (active_balance) {

7011

if (active_balance) {

7012

stop_one_cpu_nowait(cpu_of(busiest),

7012

stop_one_cpu_nowait(cpu_of(busiest),

7013

active_load_balance_cpu_stop, busiest,

7013

active_load_balance_cpu_stop, busiest,

7014

&busiest->active_balance_work);

7014

&busiest->active_balance_work);

7015

}

7015

}

7016

7017

/*

7017

/*

7018

* We've kicked active balancing, reset the failure

7018

* We've kicked active balancing, reset the failure

7019

* counter.

7019

* counter.

7020

*/

7020

*/

7021

sd->nr_balance_failed = sd->cache_nice_tries+1;

7021

sd->nr_balance_failed = sd->cache_nice_tries+1;

7022

}

7022

}

7023

} else

7023

} else

7024

sd->nr_balance_failed = 0;

7024

sd->nr_balance_failed = 0;

7025

7026

if (likely(!active_balance)) {

7026

if (likely(!active_balance)) {

7027

/* We were unbalanced, so reset the balancing interval */

7027

/* We were unbalanced, so reset the balancing interval */

7028

sd->balance_interval = sd->min_interval;

7028

sd->balance_interval = sd->min_interval;

7029

} else {

7029

} else {

7030

/*

7030

/*

7031

* If we've begun active balancing, start to back off. This

7031

* If we've begun active balancing, start to back off. This

7032

* case may not be covered by the all_pinned logic if there

7032

* case may not be covered by the all_pinned logic if there

7033

* is only 1 task on the busy runqueue (because we don't call

7033

* is only 1 task on the busy runqueue (because we don't call

7034

* detach_tasks).

7034

* detach_tasks).

7035

*/

7035

*/

7036

if (sd->balance_interval < sd->max_interval)

7036

if (sd->balance_interval < sd->max_interval)

7037

sd->balance_interval *= 2;

7037

sd->balance_interval *= 2;

7038

}

7038

}

7039

7040

goto out;

7040

goto out;

7041

7042

out_balanced:

7042

out_balanced:

7043

/*

7043

/*

7044

* We reach balance although we may have faced some affinity

7044

* We reach balance although we may have faced some affinity

7045

* constraints. Clear the imbalance flag if it was set.

7045

* constraints. Clear the imbalance flag if it was set.

7046

*/

7046

*/

7047

if (sd_parent) {

7047

if (sd_parent) {

7048

int *group_imbalance = &sd_parent->groups->sgc->imbalance;

7048

int *group_imbalance = &sd_parent->groups->sgc->imbalance;

7049

7050

if (*group_imbalance)

7050

if (*group_imbalance)

7051

*group_imbalance = 0;

7051

*group_imbalance = 0;

7052

}

7052

}

7053

7054

out_all_pinned:

7054

out_all_pinned:

7055

/*

7055

/*

7056

* We reach balance because all tasks are pinned at this level so

7056

* We reach balance because all tasks are pinned at this level so

7057

* we can't migrate them. Let the imbalance flag set so parent level

7057

* we can't migrate them. Let the imbalance flag set so parent level

7058

* can try to migrate them.

7058

* can try to migrate them.

7059

*/

7059

*/

7060

schedstat_inc(sd, lb_balanced[idle]);

7060

schedstat_inc(sd, lb_balanced[idle]);

7061

7062

sd->nr_balance_failed = 0;

7062

sd->nr_balance_failed = 0;

7063

7064

out_one_pinned:

7064

out_one_pinned:

7065

/* tune up the balancing interval */

7065

/* tune up the balancing interval */

7066

if (((env.flags & LBF_ALL_PINNED) &&

7066

if (((env.flags & LBF_ALL_PINNED) &&

7067

sd->balance_interval < MAX_PINNED_INTERVAL) ||

7067

sd->balance_interval < MAX_PINNED_INTERVAL) ||

7068

(sd->balance_interval < sd->max_interval))

7068

(sd->balance_interval < sd->max_interval))

7069

sd->balance_interval *= 2;

7069

sd->balance_interval *= 2;

7070

7071

ld_moved = 0;

7071

ld_moved = 0;

7072

out:

7072

out:

7073

return ld_moved;

7073

return ld_moved;

7074

}

7074

}

7075

7076

static inline unsigned long

7076

static inline unsigned long

7077

get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)

7077

get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)

7078

{

7078

{

7079

unsigned long interval = sd->balance_interval;

7079

unsigned long interval = sd->balance_interval;

7080

7081

if (cpu_busy)

7081

if (cpu_busy)

7082

interval *= sd->busy_factor;

7082

interval *= sd->busy_factor;

7083

7084

/* scale ms to jiffies */

7084

/* scale ms to jiffies */

7085

interval = msecs_to_jiffies(interval);

7085

interval = msecs_to_jiffies(interval);

7086

interval = clamp(interval, 1UL, max_load_balance_interval);

7086

interval = clamp(interval, 1UL, max_load_balance_interval);

7087

7088

return interval;

7088

return interval;

7089

}

7089

}

7090

7091

static inline void

7091

static inline void

7092

update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)

7092

update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)

7093

{

7093

{

7094

unsigned long interval, next;

7094

unsigned long interval, next;

7095

7096

interval = get_sd_balance_interval(sd, cpu_busy);

7096

interval = get_sd_balance_interval(sd, cpu_busy);

7097

next = sd->last_balance + interval;

7097

next = sd->last_balance + interval;

7098

7099

if (time_after(*next_balance, next))

7099

if (time_after(*next_balance, next))

7100

*next_balance = next;

7100

*next_balance = next;

7101

}

7101

}

7102

7103

/*

7103

/*

7104

* idle_balance is called by schedule() if this_cpu is about to become

7104

* idle_balance is called by schedule() if this_cpu is about to become

7105

* idle. Attempts to pull tasks from other CPUs.

7105

* idle. Attempts to pull tasks from other CPUs.

7106

*/

7106

*/

7107

static int idle_balance(struct rq *this_rq)

7107

static int idle_balance(struct rq *this_rq)

7108

{

7108

{

7109

unsigned long next_balance = jiffies + HZ;

7109

unsigned long next_balance = jiffies + HZ;

7110

int this_cpu = this_rq->cpu;

7110

int this_cpu = this_rq->cpu;

7111

struct sched_domain *sd;

7111

struct sched_domain *sd;

7112

int pulled_task = 0;

7112

int pulled_task = 0;

7113

u64 curr_cost = 0;

7113

u64 curr_cost = 0;

7114

7115

idle_enter_fair(this_rq);

7115

idle_enter_fair(this_rq);

7116

7117

/*

7117

/*

7118

* We must set idle_stamp _before_ calling idle_balance(), such that we

7118

* We must set idle_stamp _before_ calling idle_balance(), such that we

7119

* measure the duration of idle_balance() as idle time.

7119

* measure the duration of idle_balance() as idle time.

7120

*/

7120

*/

7121

this_rq->idle_stamp = rq_clock(this_rq);

7121

this_rq->idle_stamp = rq_clock(this_rq);

7122

7123

if (this_rq->avg_idle < sysctl_sched_migration_cost ||

7123

if (this_rq->avg_idle < sysctl_sched_migration_cost ||

7124

!this_rq->rd->overload) {

7124

!this_rq->rd->overload) {

7125

rcu_read_lock();

7125

rcu_read_lock();

7126

sd = rcu_dereference_check_sched_domain(this_rq->sd);

7126

sd = rcu_dereference_check_sched_domain(this_rq->sd);

7127

if (sd)

7127

if (sd)

7128

update_next_balance(sd, 0, &next_balance);

7128

update_next_balance(sd, 0, &next_balance);

7129

rcu_read_unlock();

7129

rcu_read_unlock();

7130

7131

goto out;

7131

goto out;

7132

}

7132

}

7133

7134

/*

7134

/*

7135

* Drop the rq->lock, but keep IRQ/preempt disabled.

7135

* Drop the rq->lock, but keep IRQ/preempt disabled.

7136

*/

7136

*/

7137

raw_spin_unlock(&this_rq->lock);

7137

raw_spin_unlock(&this_rq->lock);

7138

7139

update_blocked_averages(this_cpu);

7139

update_blocked_averages(this_cpu);

7140

rcu_read_lock();

7140

rcu_read_lock();

7141

for_each_domain(this_cpu, sd) {

7141

for_each_domain(this_cpu, sd) {

7142

int continue_balancing = 1;

7142

int continue_balancing = 1;

7143

u64 t0, domain_cost;

7143

u64 t0, domain_cost;

7144

7145

if (!(sd->flags & SD_LOAD_BALANCE))

7145

if (!(sd->flags & SD_LOAD_BALANCE))

7146

continue;

7146

continue;

7147

7148

if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {

7148

if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {

7149

update_next_balance(sd, 0, &next_balance);

7149

update_next_balance(sd, 0, &next_balance);

7150

break;

7150

break;

7151

}

7151

}

7152

7153

if (sd->flags & SD_BALANCE_NEWIDLE) {

7153

if (sd->flags & SD_BALANCE_NEWIDLE) {

7154

t0 = sched_clock_cpu(this_cpu);

7154

t0 = sched_clock_cpu(this_cpu);

7155

7156

pulled_task = load_balance(this_cpu, this_rq,

7156

pulled_task = load_balance(this_cpu, this_rq,

7157

sd, CPU_NEWLY_IDLE,

7157

sd, CPU_NEWLY_IDLE,

7158

&continue_balancing);

7158

&continue_balancing);

7159

7160

domain_cost = sched_clock_cpu(this_cpu) - t0;

7160

domain_cost = sched_clock_cpu(this_cpu) - t0;

7161

if (domain_cost > sd->max_newidle_lb_cost)

7161

if (domain_cost > sd->max_newidle_lb_cost)

7162

sd->max_newidle_lb_cost = domain_cost;

7162

sd->max_newidle_lb_cost = domain_cost;

7163

7164

curr_cost += domain_cost;

7164

curr_cost += domain_cost;

7165

}

7165

}

7166

7167

update_next_balance(sd, 0, &next_balance);

7167

update_next_balance(sd, 0, &next_balance);

7168

7169

/*

7169

/*

7170

* Stop searching for tasks to pull if there are

7170

* Stop searching for tasks to pull if there are

7171

* now runnable tasks on this rq.

7171

* now runnable tasks on this rq.

7172

*/

7172

*/

7173

if (pulled_task || this_rq->nr_running > 0)

7173

if (pulled_task || this_rq->nr_running > 0)

7174

break;

7174

break;

7175

}

7175

}

7176

rcu_read_unlock();

7176

rcu_read_unlock();

7177

7178

raw_spin_lock(&this_rq->lock);

7178

raw_spin_lock(&this_rq->lock);

7179

7180

if (curr_cost > this_rq->max_idle_balance_cost)

7180

if (curr_cost > this_rq->max_idle_balance_cost)

7181

this_rq->max_idle_balance_cost = curr_cost;

7181

this_rq->max_idle_balance_cost = curr_cost;

7182

7183

/*

7183

/*

7184

* While browsing the domains, we released the rq lock, a task could

7184

* While browsing the domains, we released the rq lock, a task could

7185

* have been enqueued in the meantime. Since we're not going idle,

7185

* have been enqueued in the meantime. Since we're not going idle,

7186

* pretend we pulled a task.

7186

* pretend we pulled a task.

7187

*/

7187

*/

7188

if (this_rq->cfs.h_nr_running && !pulled_task)

7188

if (this_rq->cfs.h_nr_running && !pulled_task)

7189

pulled_task = 1;

7189

pulled_task = 1;

7190

7191

out:

7191

out:

7192

/* Move the next balance forward */

7192

/* Move the next balance forward */

7193

if (time_after(this_rq->next_balance, next_balance))

7193

if (time_after(this_rq->next_balance, next_balance))

7194

this_rq->next_balance = next_balance;

7194

this_rq->next_balance = next_balance;

7195

7196

/* Is there a task of a high priority class? */

7196

/* Is there a task of a high priority class? */

7197

if (this_rq->nr_running != this_rq->cfs.h_nr_running)

7197

if (this_rq->nr_running != this_rq->cfs.h_nr_running)

7198

pulled_task = -1;

7198

pulled_task = -1;

7199

7200

if (pulled_task) {

7200

if (pulled_task) {

7201

idle_exit_fair(this_rq);

7201

idle_exit_fair(this_rq);

7202

this_rq->idle_stamp = 0;

7202

this_rq->idle_stamp = 0;

7203

}

7203

}

7204

7205

return pulled_task;

7205

return pulled_task;

7206

}

7206

}

7207

7208

/*

7208

/*

7209

* active_load_balance_cpu_stop is run by cpu stopper. It pushes

7209

* active_load_balance_cpu_stop is run by cpu stopper. It pushes

7210

* running tasks off the busiest CPU onto idle CPUs. It requires at

7210

* running tasks off the busiest CPU onto idle CPUs. It requires at

7211

* least 1 task to be running on each physical CPU where possible, and

7211

* least 1 task to be running on each physical CPU where possible, and

7212

* avoids physical / logical imbalances.

7212

* avoids physical / logical imbalances.

7213

*/

7213

*/

7214

static int active_load_balance_cpu_stop(void *data)

7214

static int active_load_balance_cpu_stop(void *data)

7215

{

7215

{

7216

struct rq *busiest_rq = data;

7216

struct rq *busiest_rq = data;

7217

int busiest_cpu = cpu_of(busiest_rq);

7217

int busiest_cpu = cpu_of(busiest_rq);

7218

int target_cpu = busiest_rq->push_cpu;

7218

int target_cpu = busiest_rq->push_cpu;

7219

struct rq *target_rq = cpu_rq(target_cpu);

7219

struct rq *target_rq = cpu_rq(target_cpu);

7220

struct sched_domain *sd;

7220

struct sched_domain *sd;

7221

struct task_struct *p = NULL;

7221

struct task_struct *p = NULL;

7222

7223

raw_spin_lock_irq(&busiest_rq->lock);

7223

raw_spin_lock_irq(&busiest_rq->lock);

7224

7225

/* make sure the requested cpu hasn't gone down in the meantime */

7225

/* make sure the requested cpu hasn't gone down in the meantime */

7226

if (unlikely(busiest_cpu != smp_processor_id() ||

7226

if (unlikely(busiest_cpu != smp_processor_id() ||

7227

!busiest_rq->active_balance))

7227

!busiest_rq->active_balance))

7228

goto out_unlock;

7228

goto out_unlock;

7229

7230

/* Is there any task to move? */

7230

/* Is there any task to move? */

7231

if (busiest_rq->nr_running <= 1)

7231

if (busiest_rq->nr_running <= 1)

7232

goto out_unlock;

7232

goto out_unlock;

7233

7234

/*

7234

/*

7235

* This condition is "impossible", if it occurs

7235

* This condition is "impossible", if it occurs

7236

* we need to fix it. Originally reported by

7236

* we need to fix it. Originally reported by

7237

* Bjorn Helgaas on a 128-cpu setup.

7237

* Bjorn Helgaas on a 128-cpu setup.

7238

*/

7238

*/

7239

BUG_ON(busiest_rq == target_rq);

7239

BUG_ON(busiest_rq == target_rq);

7240

7241

/* Search for an sd spanning us and the target CPU. */

7241

/* Search for an sd spanning us and the target CPU. */

7242

rcu_read_lock();

7242

rcu_read_lock();

7243

for_each_domain(target_cpu, sd) {

7243

for_each_domain(target_cpu, sd) {

7244

if ((sd->flags & SD_LOAD_BALANCE) &&

7244

if ((sd->flags & SD_LOAD_BALANCE) &&

7245

cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))

7245

cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))

7246

break;

7246

break;

7247

}

7247

}

7248

7249

if (likely(sd)) {

7249

if (likely(sd)) {

7250

struct lb_env env = {

7250

struct lb_env env = {

7251

.sd = sd,

7251

.sd = sd,

7252

.dst_cpu = target_cpu,

7252

.dst_cpu = target_cpu,

7253

.dst_rq = target_rq,

7253

.dst_rq = target_rq,

7254

.src_cpu = busiest_rq->cpu,

7254

.src_cpu = busiest_rq->cpu,

7255

.src_rq = busiest_rq,

7255

.src_rq = busiest_rq,

7256

.idle = CPU_IDLE,

7256

.idle = CPU_IDLE,

7257

};

7257

};

7258

7259

schedstat_inc(sd, alb_count);

7259

schedstat_inc(sd, alb_count);

7260

7261

p = detach_one_task(&env);

7261

p = detach_one_task(&env);

7262

if (p)

7262

if (p)

7263

schedstat_inc(sd, alb_pushed);

7263

schedstat_inc(sd, alb_pushed);

7264

else

7264

else

7265

schedstat_inc(sd, alb_failed);

7265

schedstat_inc(sd, alb_failed);

7266

}

7266

}

7267

rcu_read_unlock();

7267

rcu_read_unlock();

7268

out_unlock:

7268

out_unlock:

7269

busiest_rq->active_balance = 0;

7269

busiest_rq->active_balance = 0;

7270

raw_spin_unlock(&busiest_rq->lock);

7270

raw_spin_unlock(&busiest_rq->lock);

7271

7272

if (p)

7272

if (p)

7273

attach_one_task(target_rq, p);

7273

attach_one_task(target_rq, p);

7274

7275

local_irq_enable();

7275

local_irq_enable();

7276

7277

return 0;

7277

return 0;

7278

}

7278

}

7279

7280

static inline int on_null_domain(struct rq *rq)

7280

static inline int on_null_domain(struct rq *rq)

7281

{

7281

{

7282

return unlikely(!rcu_dereference_sched(rq->sd));

7282

return unlikely(!rcu_dereference_sched(rq->sd));

7283

}

7283

}

7284

7285

#ifdef CONFIG_NO_HZ_COMMON

7285

#ifdef CONFIG_NO_HZ_COMMON

7286

/*

7286

/*

7287

* idle load balancing details

7287

* idle load balancing details

7288

* - When one of the busy CPUs notice that there may be an idle rebalancing

7288

* - When one of the busy CPUs notice that there may be an idle rebalancing

7289

* needed, they will kick the idle load balancer, which then does idle

7289

* needed, they will kick the idle load balancer, which then does idle

7290

* load balancing for all the idle CPUs.

7290

* load balancing for all the idle CPUs.

7291

*/

7291

*/

7292

static struct {

7292

static struct {

7293

cpumask_var_t idle_cpus_mask;

7293

cpumask_var_t idle_cpus_mask;

7294

atomic_t nr_cpus;

7294

atomic_t nr_cpus;

7295

unsigned long next_balance; /* in jiffy units */

7295

unsigned long next_balance; /* in jiffy units */

7296

} nohz ____cacheline_aligned;

7296

} nohz ____cacheline_aligned;

7297

7298

static inline int find_new_ilb(void)

7298

static inline int find_new_ilb(void)

7299

{

7299

{

7300

int ilb = cpumask_first(nohz.idle_cpus_mask);

7300

int ilb = cpumask_first(nohz.idle_cpus_mask);

7301

7302

if (ilb < nr_cpu_ids && idle_cpu(ilb))

7302

if (ilb < nr_cpu_ids && idle_cpu(ilb))

7303

return ilb;

7303

return ilb;

7304

7305

return nr_cpu_ids;

7305

return nr_cpu_ids;

7306

}

7306

}

7307

7308

/*

7308

/*

7309

* Kick a CPU to do the nohz balancing, if it is time for it. We pick the

7309

* Kick a CPU to do the nohz balancing, if it is time for it. We pick the

7310

* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle

7310

* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle

7311

* CPU (if there is one).

7311

* CPU (if there is one).

7312

*/

7312

*/

7313

static void nohz_balancer_kick(void)

7313

static void nohz_balancer_kick(void)

7314

{

7314

{

7315

int ilb_cpu;

7315

int ilb_cpu;

7316

7317

nohz.next_balance++;

7317

nohz.next_balance++;

7318

7319

ilb_cpu = find_new_ilb();

7319

ilb_cpu = find_new_ilb();

7320

7321

if (ilb_cpu >= nr_cpu_ids)

7321

if (ilb_cpu >= nr_cpu_ids)

7322

return;

7322

return;

7323

7324

if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))

7324

if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))

7325

return;

7325

return;

7326

/*

7326

/*

7327

* Use smp_send_reschedule() instead of resched_cpu().

7327

* Use smp_send_reschedule() instead of resched_cpu().

7328

* This way we generate a sched IPI on the target cpu which

7328

* This way we generate a sched IPI on the target cpu which

7329

* is idle. And the softirq performing nohz idle load balance

7329

* is idle. And the softirq performing nohz idle load balance

7330

* will be run before returning from the IPI.

7330

* will be run before returning from the IPI.

7331

*/

7331

*/

7332

smp_send_reschedule(ilb_cpu);

7332

smp_send_reschedule(ilb_cpu);

7333

return;

7333

return;

7334

}

7334

}

7335

7336

static inline void nohz_balance_exit_idle(int cpu)

7336

static inline void nohz_balance_exit_idle(int cpu)

7337

{

7337

{

7338

if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {

7338

if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {

7339

/*

7339

/*

7340

* Completely isolated CPUs don't ever set, so we must test.

7340

* Completely isolated CPUs don't ever set, so we must test.

7341

*/

7341

*/

7342

if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {

7342

if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {

7343

cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

7343

cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

7344

atomic_dec(&nohz.nr_cpus);

7344

atomic_dec(&nohz.nr_cpus);

7345

}

7345

}

7346

clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

7346

clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

7347

}

7347

}

7348

}

7348

}

7349

7350

static inline void set_cpu_sd_state_busy(void)

7350

static inline void set_cpu_sd_state_busy(void)

7351

{

7351

{

7352

struct sched_domain *sd;

7352

struct sched_domain *sd;

7353

int cpu = smp_processor_id();

7353

int cpu = smp_processor_id();

7354

7355

rcu_read_lock();

7355

rcu_read_lock();

7356

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7356

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7357

7358

if (!sd || !sd->nohz_idle)

7358

if (!sd || !sd->nohz_idle)

7359

goto unlock;

7359

goto unlock;

7360

sd->nohz_idle = 0;

7360

sd->nohz_idle = 0;

7361

7362

atomic_inc(&sd->groups->sgc->nr_busy_cpus);

7362

atomic_inc(&sd->groups->sgc->nr_busy_cpus);

7363

unlock:

7363

unlock:

7364

rcu_read_unlock();

7364

rcu_read_unlock();

7365

}

7365

}

7366

7367

void set_cpu_sd_state_idle(void)

7367

void set_cpu_sd_state_idle(void)

7368

{

7368

{

7369

struct sched_domain *sd;

7369

struct sched_domain *sd;

7370

int cpu = smp_processor_id();

7370

int cpu = smp_processor_id();

7371

7372

rcu_read_lock();

7372

rcu_read_lock();

7373

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7373

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7374

7375

if (!sd || sd->nohz_idle)

7375

if (!sd || sd->nohz_idle)

7376

goto unlock;

7376

goto unlock;

7377

sd->nohz_idle = 1;

7377

sd->nohz_idle = 1;

7378

7379

atomic_dec(&sd->groups->sgc->nr_busy_cpus);

7379

atomic_dec(&sd->groups->sgc->nr_busy_cpus);

7380

unlock:

7380

unlock:

7381

rcu_read_unlock();

7381

rcu_read_unlock();

7382

}

7382

}

7383

7384

/*

7384

/*

7385

* This routine will record that the cpu is going idle with tick stopped.

7385

* This routine will record that the cpu is going idle with tick stopped.

7386

* This info will be used in performing idle load balancing in the future.

7386

* This info will be used in performing idle load balancing in the future.

7387

*/

7387

*/

7388

void nohz_balance_enter_idle(int cpu)

7388

void nohz_balance_enter_idle(int cpu)

7389

{

7389

{

7390

/*

7390

/*

7391

* If this cpu is going down, then nothing needs to be done.

7391

* If this cpu is going down, then nothing needs to be done.

7392

*/

7392

*/

7393

if (!cpu_active(cpu))

7393

if (!cpu_active(cpu))

7394

return;

7394

return;

7395

7396

if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))

7396

if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))

7397

return;

7397

return;

7398

7399

/*

7399

/*

7400

* If we're a completely isolated CPU, we don't play.

7400

* If we're a completely isolated CPU, we don't play.

7401

*/

7401

*/

7402

if (on_null_domain(cpu_rq(cpu)))

7402

if (on_null_domain(cpu_rq(cpu)))

7403

return;

7403

return;

7404

7405

cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

7405

cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

7406

atomic_inc(&nohz.nr_cpus);

7406

atomic_inc(&nohz.nr_cpus);

7407

set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

7407

set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));

7408

}

7408

}

7409

7410

static int sched_ilb_notifier(struct notifier_block *nfb,

7410

static int sched_ilb_notifier(struct notifier_block *nfb,

7411

unsigned long action, void *hcpu)

7411

unsigned long action, void *hcpu)

7412

{

7412

{

7413

switch (action & ~CPU_TASKS_FROZEN) {

7413

switch (action & ~CPU_TASKS_FROZEN) {

7414

case CPU_DYING:

7414

case CPU_DYING:

7415

nohz_balance_exit_idle(smp_processor_id());

7415

nohz_balance_exit_idle(smp_processor_id());

7416

return NOTIFY_OK;

7416

return NOTIFY_OK;

7417

default:

7417

default:

7418

return NOTIFY_DONE;

7418

return NOTIFY_DONE;

7419

}

7419

}

7420

}

7420

}

7421

#endif

7421

#endif

7422

7423

/* Taken with spin_trylock() by rebalance_domains() for SD_SERIALIZE domains. */
static DEFINE_SPINLOCK(balancing);

7423

static DEFINE_SPINLOCK(balancing);

7424

7425

/*

7425

/*

7426

* Scale the max load_balance interval with the number of CPUs in the system.

7426

* Scale the max load_balance interval with the number of CPUs in the system.

7427

* This trades load-balance latency on larger machines for less cross talk.

7427

* This trades load-balance latency on larger machines for less cross talk.

7428

*/

7428

*/

7429

void update_max_interval(void)

7429

void update_max_interval(void)

7430

{

7430

{

7431

max_load_balance_interval = HZ*num_online_cpus()/10;

7431

max_load_balance_interval = HZ*num_online_cpus()/10;

7432

}

7432

}

7433

7434

/*

7434

/*

7435

* It checks each scheduling domain to see if it is due to be balanced,

7435

* It checks each scheduling domain to see if it is due to be balanced,

7436

* and initiates a balancing operation if so.

7436

* and initiates a balancing operation if so.

7437

*

7437

*

7438

* Balancing parameters are set up in init_sched_domains.

7438

* Balancing parameters are set up in init_sched_domains.

7439

*/

7439

*/

7440

static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

7440

static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)

7441

{

7441

{

7442

int continue_balancing = 1;

7442

int continue_balancing = 1;

7443

int cpu = rq->cpu;

7443

int cpu = rq->cpu;

7444

unsigned long interval;

7444

unsigned long interval;

7445

struct sched_domain *sd;

7445

struct sched_domain *sd;

7446

/* Earliest time when we have to do rebalance again */

7446

/* Earliest time when we have to do rebalance again */

7447

unsigned long next_balance = jiffies + 60*HZ;

7447

unsigned long next_balance = jiffies + 60*HZ;

7448

int update_next_balance = 0;

7448

int update_next_balance = 0;

7449

int need_serialize, need_decay = 0;

7449

int need_serialize, need_decay = 0;

7450

u64 max_cost = 0;

7450

u64 max_cost = 0;

7451

7452

update_blocked_averages(cpu);

7452

update_blocked_averages(cpu);

7453

7454

rcu_read_lock();

7454

rcu_read_lock();

7455

for_each_domain(cpu, sd) {

7455

for_each_domain(cpu, sd) {

7456

/*

7456

/*

7457

* Decay the newidle max times here because this is a regular

7457

* Decay the newidle max times here because this is a regular

7458

* visit to all the domains. Decay ~1% per second.

7458

* visit to all the domains. Decay ~1% per second.

7459

*/

7459

*/

7460

if (time_after(jiffies, sd->next_decay_max_lb_cost)) {

7460

if (time_after(jiffies, sd->next_decay_max_lb_cost)) {

7461

sd->max_newidle_lb_cost =

7461

sd->max_newidle_lb_cost =

7462

(sd->max_newidle_lb_cost * 253) / 256;

7462

(sd->max_newidle_lb_cost * 253) / 256;

7463

sd->next_decay_max_lb_cost = jiffies + HZ;

7463

sd->next_decay_max_lb_cost = jiffies + HZ;

7464

need_decay = 1;

7464

need_decay = 1;

7465

}

7465

}

7466

max_cost += sd->max_newidle_lb_cost;

7466

max_cost += sd->max_newidle_lb_cost;

7467

7468

if (!(sd->flags & SD_LOAD_BALANCE))

7468

if (!(sd->flags & SD_LOAD_BALANCE))

7469

continue;

7469

continue;

7470

7471

/*

7471

/*

7472

* Stop the load balance at this level. There is another

7472

* Stop the load balance at this level. There is another

7473

* CPU in our sched group which is doing load balancing more

7473

* CPU in our sched group which is doing load balancing more

7474

* actively.

7474

* actively.

7475

*/

7475

*/

7476

if (!continue_balancing) {

7476

if (!continue_balancing) {

7477

if (need_decay)

7477

if (need_decay)

7478

continue;

7478

continue;

7479

break;

7479

break;

7480

}

7480

}

7481

7482

interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

7482

interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

7483

7484

need_serialize = sd->flags & SD_SERIALIZE;

7484

need_serialize = sd->flags & SD_SERIALIZE;

7485

if (need_serialize) {

7485

if (need_serialize) {

7486

if (!spin_trylock(&balancing))

7486

if (!spin_trylock(&balancing))

7487

goto out;

7487

goto out;

7488

}

7488

}

7489

7490

if (time_after_eq(jiffies, sd->last_balance + interval)) {

7490

if (time_after_eq(jiffies, sd->last_balance + interval)) {

7491

if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {

7491

if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {

7492

/*

7492

/*

7493

* The LBF_DST_PINNED logic could have changed

7493

* The LBF_DST_PINNED logic could have changed

7494

* env->dst_cpu, so we can't know our idle

7494

* env->dst_cpu, so we can't know our idle

7495

* state even if we migrated tasks. Update it.

7495

* state even if we migrated tasks. Update it.

7496

*/

7496

*/

7497

idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;

7497

idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;

7498

}

7498

}

7499

sd->last_balance = jiffies;

7499

sd->last_balance = jiffies;

7500

interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

7500

interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

7501

}

7501

}

7502

if (need_serialize)

7502

if (need_serialize)

7503

spin_unlock(&balancing);

7503

spin_unlock(&balancing);

7504

out:

7504

out:

7505

if (time_after(next_balance, sd->last_balance + interval)) {

7505

if (time_after(next_balance, sd->last_balance + interval)) {

7506

next_balance = sd->last_balance + interval;

7506

next_balance = sd->last_balance + interval;

7507

update_next_balance = 1;

7507

update_next_balance = 1;

7508

}

7508

}

7509

}

7509

}

7510

if (need_decay) {

7510

if (need_decay) {

7511

/*

7511

/*

7512

* Ensure the rq-wide value also decays but keep it at a

7512

* Ensure the rq-wide value also decays but keep it at a

7513

* reasonable floor to avoid funnies with rq->avg_idle.

7513

* reasonable floor to avoid funnies with rq->avg_idle.

7514

*/

7514

*/

7515

rq->max_idle_balance_cost =

7515

rq->max_idle_balance_cost =

7516

max((u64)sysctl_sched_migration_cost, max_cost);

7516

max((u64)sysctl_sched_migration_cost, max_cost);

7517

}

7517

}

7518

rcu_read_unlock();

7518

rcu_read_unlock();

7519

7520

/*

7520

/*

7521

* next_balance will be updated only when there is a need.

7521

* next_balance will be updated only when there is a need.

7522

* When the cpu is attached to null domain for ex, it will not be

7522

* When the cpu is attached to null domain for ex, it will not be

7523

* updated.

7523

* updated.

7524

*/

7524

*/

7525

if (likely(update_next_balance))

7525

if (likely(update_next_balance))

7526

rq->next_balance = next_balance;

7526

rq->next_balance = next_balance;

7527

}

7527

}

7528

7529

#ifdef CONFIG_NO_HZ_COMMON

7529

#ifdef CONFIG_NO_HZ_COMMON

7530

/*

7530

/*

7531

* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the

7531

* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the

7532

* rebalancing for all the cpus for whom scheduler ticks are stopped.

7532

* rebalancing for all the cpus for whom scheduler ticks are stopped.

7533

*/

7533

*/

7534

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

7534

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)

7535

{

7535

{

7536

int this_cpu = this_rq->cpu;

7536

int this_cpu = this_rq->cpu;

7537

struct rq *rq;

7537

struct rq *rq;

7538

int balance_cpu;

7538

int balance_cpu;

7539

7540

if (idle != CPU_IDLE ||

7540

if (idle != CPU_IDLE ||

7541

!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))

7541

!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))

7542

goto end;

7542

goto end;

7543

7544

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {

7544

for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {

7545

if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))

7545

if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))

7546

continue;

7546

continue;

7547

7548

/*

7548

/*

7549

* If this cpu gets work to do, stop the load balancing

7549

* If this cpu gets work to do, stop the load balancing

7550

* work being done for other cpus. Next load

7550

* work being done for other cpus. Next load

7551

* balancing owner will pick it up.

7551

* balancing owner will pick it up.

7552

*/

7552

*/

7553

if (need_resched())

7553

if (need_resched())

7554

break;

7554

break;

7555

7556

rq = cpu_rq(balance_cpu);

7556

rq = cpu_rq(balance_cpu);

7557

7558

/*

7558

/*

7559

* If time for next balance is due,

7559

* If time for next balance is due,

7560

* do the balance.

7560

* do the balance.

7561

*/

7561

*/

7562

if (time_after_eq(jiffies, rq->next_balance)) {

7562

if (time_after_eq(jiffies, rq->next_balance)) {

7563

raw_spin_lock_irq(&rq->lock);

7563

raw_spin_lock_irq(&rq->lock);

7564

update_rq_clock(rq);

7564

update_rq_clock(rq);

7565

update_idle_cpu_load(rq);

7565

update_idle_cpu_load(rq);

7566

raw_spin_unlock_irq(&rq->lock);

7566

raw_spin_unlock_irq(&rq->lock);

7567

rebalance_domains(rq, CPU_IDLE);

7567

rebalance_domains(rq, CPU_IDLE);

7568

}

7568

}

7569

7570

if (time_after(this_rq->next_balance, rq->next_balance))

7570

if (time_after(this_rq->next_balance, rq->next_balance))

7571

this_rq->next_balance = rq->next_balance;

7571

this_rq->next_balance = rq->next_balance;

7572

}

7572

}

7573

nohz.next_balance = this_rq->next_balance;

7573

nohz.next_balance = this_rq->next_balance;

7574

end:

7574

end:

7575

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));

7575

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));

7576

}

7576

}

7577

7578

/*

7578

/*

7579

* Current heuristic for kicking the idle load balancer in the presence

7579

* Current heuristic for kicking the idle load balancer in the presence

7580

* of an idle cpu is the system.

7580

* of an idle cpu is the system.

7581

* - This rq has more than one task.

7581

* - This rq has more than one task.

7582

* - At any scheduler domain level, this cpu's scheduler group has multiple

7582

* - At any scheduler domain level, this cpu's scheduler group has multiple

7583

* busy cpu's exceeding the group's capacity.

7583

* busy cpu's exceeding the group's capacity.

7584

* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler

7584

* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler

7585

* domain span are idle.

7585

* domain span are idle.

7586

*/

7586

*/

7587

static inline int nohz_kick_needed(struct rq *rq)

7587

static inline int nohz_kick_needed(struct rq *rq)

7588

{

7588

{

7589

unsigned long now = jiffies;

7589

unsigned long now = jiffies;

7590

struct sched_domain *sd;

7590

struct sched_domain *sd;

7591

struct sched_group_capacity *sgc;

7591

struct sched_group_capacity *sgc;

7592

int nr_busy, cpu = rq->cpu;

7592

int nr_busy, cpu = rq->cpu;

7593

7594

if (unlikely(rq->idle_balance))

7594

if (unlikely(rq->idle_balance))

7595

return 0;

7595

return 0;

7596

7597

/*

7597

/*

7598

* We may be recently in ticked or tickless idle mode. At the first

7598

* We may be recently in ticked or tickless idle mode. At the first

7599

* busy tick after returning from idle, we will update the busy stats.

7599

* busy tick after returning from idle, we will update the busy stats.

7600

*/

7600

*/

7601

set_cpu_sd_state_busy();

7601

set_cpu_sd_state_busy();

7602

nohz_balance_exit_idle(cpu);

7602

nohz_balance_exit_idle(cpu);

7603

7604

/*

7604

/*

7605

* None are in tickless mode and hence no need for NOHZ idle load

7605

* None are in tickless mode and hence no need for NOHZ idle load

7606

* balancing.

7606

* balancing.

7607

*/

7607

*/

7608

if (likely(!atomic_read(&nohz.nr_cpus)))

7608

if (likely(!atomic_read(&nohz.nr_cpus)))

7609

return 0;

7609

return 0;

7610

7611

if (time_before(now, nohz.next_balance))

7611

if (time_before(now, nohz.next_balance))

7612

return 0;

7612

return 0;

7613

7614

if (rq->nr_running >= 2)

7614

if (rq->nr_running >= 2)

7615

goto need_kick;

7615

goto need_kick;

7616

7617

rcu_read_lock();

7617

rcu_read_lock();

7618

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7618

sd = rcu_dereference(per_cpu(sd_busy, cpu));

7619

7620

if (sd) {

7620

if (sd) {

7621

sgc = sd->groups->sgc;

7621

sgc = sd->groups->sgc;

7622

nr_busy = atomic_read(&sgc->nr_busy_cpus);

7622

nr_busy = atomic_read(&sgc->nr_busy_cpus);

7623

7624

if (nr_busy > 1)

7624

if (nr_busy > 1)

7625

goto need_kick_unlock;

7625

goto need_kick_unlock;

7626

}

7626

}

7627

7628

sd = rcu_dereference(per_cpu(sd_asym, cpu));

7628

sd = rcu_dereference(per_cpu(sd_asym, cpu));

7629

7630

if (sd && (cpumask_first_and(nohz.idle_cpus_mask,

7630

if (sd && (cpumask_first_and(nohz.idle_cpus_mask,

7631

sched_domain_span(sd)) < cpu))

7631

sched_domain_span(sd)) < cpu))

7632

goto need_kick_unlock;

7632

goto need_kick_unlock;

7633

7634

rcu_read_unlock();

7634

rcu_read_unlock();

7635

return 0;

7635

return 0;

7636

7637

need_kick_unlock:

7637

need_kick_unlock:

7638

rcu_read_unlock();

7638

rcu_read_unlock();

7639

need_kick:

7639

need_kick:

7640

return 1;

7640

return 1;

7641

}

7641

}

7642

#else

7642

#else

7643

/* Stub used when CONFIG_NO_HZ_COMMON is not set: does nothing. */
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }

7643

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }

7644

#endif

7644

#endif

7645

7646

/*

7646

/*

7647

* run_rebalance_domains is triggered when needed from the scheduler tick.

7647

* run_rebalance_domains is triggered when needed from the scheduler tick.

7648

* Also triggered for nohz idle balancing (with nohz_balancing_kick set).

7648

* Also triggered for nohz idle balancing (with nohz_balancing_kick set).

7649

*/

7649

*/

7650

static void run_rebalance_domains(struct softirq_action *h)

7650

static void run_rebalance_domains(struct softirq_action *h)

7651

{

7651

{

7652

struct rq *this_rq = this_rq();

7652

struct rq *this_rq = this_rq();

7653

enum cpu_idle_type idle = this_rq->idle_balance ?

7653

enum cpu_idle_type idle = this_rq->idle_balance ?

7654

CPU_IDLE : CPU_NOT_IDLE;

7654

CPU_IDLE : CPU_NOT_IDLE;

7655

7656

rebalance_domains(this_rq, idle);

7656

rebalance_domains(this_rq, idle);

7657

7658

/*

7658

/*

7659

* If this cpu has a pending nohz_balance_kick, then do the

7659

* If this cpu has a pending nohz_balance_kick, then do the

7660

* balancing on behalf of the other idle cpus whose ticks are

7660

* balancing on behalf of the other idle cpus whose ticks are

7661

* stopped.

7661

* stopped.

7662

*/

7662

*/

7663

nohz_idle_balance(this_rq, idle);

7663

nohz_idle_balance(this_rq, idle);

7664

}

7664

}

7665

7666

/*

7666

/*

7667

* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.

7667

* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.

7668

*/

7668

*/

7669

void trigger_load_balance(struct rq *rq)

7669

void trigger_load_balance(struct rq *rq)

7670

{

7670

{

7671

/* Don't need to rebalance while attached to NULL domain */

7671

/* Don't need to rebalance while attached to NULL domain */

7672

if (unlikely(on_null_domain(rq)))

7672

if (unlikely(on_null_domain(rq)))

7673

return;

7673

return;

7674

7675

if (time_after_eq(jiffies, rq->next_balance))

7675

if (time_after_eq(jiffies, rq->next_balance))

7676

raise_softirq(SCHED_SOFTIRQ);

7676

raise_softirq(SCHED_SOFTIRQ);

7677

#ifdef CONFIG_NO_HZ_COMMON

7677

#ifdef CONFIG_NO_HZ_COMMON

7678

if (nohz_kick_needed(rq))

7678

if (nohz_kick_needed(rq))

7679

nohz_balancer_kick();

7679

nohz_balancer_kick();

7680

#endif

7680

#endif

7681

}

7681

}

7682

7683

/*
 * sched_class::rq_online hook for the fair class: refresh the sysctl-derived
 * tunables via update_sysctl() and then call update_runtime_enabled() for
 * this runqueue.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();

	update_runtime_enabled(rq);
}

7689

7690

/*
 * sched_class::rq_offline hook for the fair class: refresh the sysctl-derived
 * tunables via update_sysctl() and unthrottle this runqueue's cfs_rqs.
 */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Throttled groups must stay reachable by pick_next_task. */
	unthrottle_offline_cfs_rqs(rq);
}

7697

7698

#endif /* CONFIG_SMP */

7698

#endif /* CONFIG_SMP */

7699

7700

/*

7700

/*

7701

* scheduler tick hitting a task of our scheduling class:

7701

* scheduler tick hitting a task of our scheduling class:

7702

*/

7702

*/

7703

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

7703

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)

7704

{

7704

{

7705

struct cfs_rq *cfs_rq;

7705

struct cfs_rq *cfs_rq;

7706

struct sched_entity *se = &curr->se;

7706

struct sched_entity *se = &curr->se;

7707

7708

for_each_sched_entity(se) {

7708

for_each_sched_entity(se) {

7709

cfs_rq = cfs_rq_of(se);

7709

cfs_rq = cfs_rq_of(se);

7710

entity_tick(cfs_rq, se, queued);

7710

entity_tick(cfs_rq, se, queued);

7711

}

7711

}

7712

7713

if (numabalancing_enabled)

7713

if (numabalancing_enabled)

7714

task_tick_numa(rq, curr);

7714

task_tick_numa(rq, curr);

7715

7716

update_rq_runnable_avg(rq, 1);

7716

update_rq_runnable_avg(rq, 1);

7717

}

7717

}

7718

7719

/*

7719

/*

7720

* called on fork with the child task as argument from the parent's context

7720

* called on fork with the child task as argument from the parent's context

7721

* - child not yet on the tasklist

7721

* - child not yet on the tasklist

7722

* - preemption disabled

7722

* - preemption disabled

7723

*/

7723

*/

7724

static void task_fork_fair(struct task_struct *p)

7724

static void task_fork_fair(struct task_struct *p)

7725

{

7725

{

7726

struct cfs_rq *cfs_rq;

7726

struct cfs_rq *cfs_rq;

7727

struct sched_entity *se = &p->se, *curr;

7727

struct sched_entity *se = &p->se, *curr;

7728

int this_cpu = smp_processor_id();

7728

int this_cpu = smp_processor_id();

7729

struct rq *rq = this_rq();

7729

struct rq *rq = this_rq();

7730

unsigned long flags;

7730

unsigned long flags;

7731

7732

raw_spin_lock_irqsave(&rq->lock, flags);

7732

raw_spin_lock_irqsave(&rq->lock, flags);

7733

7734

update_rq_clock(rq);

7734

update_rq_clock(rq);

7735

7736

cfs_rq = task_cfs_rq(current);

7736

cfs_rq = task_cfs_rq(current);

7737

curr = cfs_rq->curr;

7737

curr = cfs_rq->curr;

7738

7739

/*

7739

/*

7740

* Not only the cpu but also the task_group of the parent might have

7740

* Not only the cpu but also the task_group of the parent might have

7741

* been changed after parent->se.parent,cfs_rq were copied to

7741

* been changed after parent->se.parent,cfs_rq were copied to

7742

* child->se.parent,cfs_rq. So call __set_task_cpu() to make those

7742

* child->se.parent,cfs_rq. So call __set_task_cpu() to make those

7743

* of child point to valid ones.

7743

* of child point to valid ones.

7744

*/

7744

*/

7745

rcu_read_lock();

7745

rcu_read_lock();

7746

__set_task_cpu(p, this_cpu);

7746

__set_task_cpu(p, this_cpu);

7747

rcu_read_unlock();

7747

rcu_read_unlock();

7748

7749

update_curr(cfs_rq);

7749

update_curr(cfs_rq);

7750

7751

if (curr)

7751

if (curr)

7752

se->vruntime = curr->vruntime;

7752

se->vruntime = curr->vruntime;

7753

place_entity(cfs_rq, se, 1);

7753

place_entity(cfs_rq, se, 1);

7754

7755

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

7755

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {

7756

/*

7756

/*

7757

* Upon rescheduling, sched_class::put_prev_task() will place

7757

* Upon rescheduling, sched_class::put_prev_task() will place

7758

* 'current' within the tree based on its new key value.

7758

* 'current' within the tree based on its new key value.

7759

*/

7759

*/

7760

swap(curr->vruntime, se->vruntime);

7760

swap(curr->vruntime, se->vruntime);

7761

resched_curr(rq);

7761

resched_curr(rq);

7762

}

7762

}

7763

7764

se->vruntime -= cfs_rq->min_vruntime;

7764

se->vruntime -= cfs_rq->min_vruntime;

7765

7766

raw_spin_unlock_irqrestore(&rq->lock, flags);

7766

raw_spin_unlock_irqrestore(&rq->lock, flags);

7767

}

7767

}

7768

7769

/*

7769

/*

7770

* Priority of the task has changed. Check to see if we preempt

7770

* Priority of the task has changed. Check to see if we preempt

7771

* the current task.

7771

* the current task.

7772

*/

7772

*/

7773

static void

7773

static void

7774

prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)

7774

prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)

7775

{

7775

{

7776

if (!task_on_rq_queued(p))

7776

if (!task_on_rq_queued(p))

7777

return;

7777

return;

7778

7779

/*

7779

/*

7780

* Reschedule if we are currently running on this runqueue and

7780

* Reschedule if we are currently running on this runqueue and

7781

* our priority decreased, or if we are not currently running on

7781

* our priority decreased, or if we are not currently running on

7782

* this runqueue and our priority is higher than the current's

7782

* this runqueue and our priority is higher than the current's

7783

*/

7783

*/

7784

if (rq->curr == p) {

7784

if (rq->curr == p) {

7785

if (p->prio > oldprio)

7785

if (p->prio > oldprio)

7786

resched_curr(rq);

7786

resched_curr(rq);

7787

} else

7787

} else

7788

check_preempt_curr(rq, p, 0);

7788

check_preempt_curr(rq, p, 0);

7789

}

7789

}

7790

7791

static void switched_from_fair(struct rq *rq, struct task_struct *p)

7791

static void switched_from_fair(struct rq *rq, struct task_struct *p)

7792

{

7792

{

7793

struct sched_entity *se = &p->se;

7793

struct sched_entity *se = &p->se;

7794

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7794

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7795

7796

/*

7796

/*

7797

* Ensure the task's vruntime is normalized, so that when it's

7797

* Ensure the task's vruntime is normalized, so that when it's

7798

* switched back to the fair class the enqueue_entity(.flags=0) will

7798

* switched back to the fair class the enqueue_entity(.flags=0) will

7799

* do the right thing.

7799

* do the right thing.

7800

*

7800

*

7801

* If it's queued, then the dequeue_entity(.flags=0) will already

7801

* If it's queued, then the dequeue_entity(.flags=0) will already

7802

* have normalized the vruntime, if it's !queued, then only when

7802

* have normalized the vruntime, if it's !queued, then only when

7803

* the task is sleeping will it still have non-normalized vruntime.

7803

* the task is sleeping will it still have non-normalized vruntime.

7804

*/

7804

*/

7805

if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {

7805

if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {

7806

/*

7806

/*

7807

* Fix up our vruntime so that the current sleep doesn't

7807

* Fix up our vruntime so that the current sleep doesn't

7808

* cause 'unlimited' sleep bonus.

7808

* cause 'unlimited' sleep bonus.

7809

*/

7809

*/

7810

place_entity(cfs_rq, se, 0);

7810

place_entity(cfs_rq, se, 0);

7811

se->vruntime -= cfs_rq->min_vruntime;

7811

se->vruntime -= cfs_rq->min_vruntime;

7812

}

7812

}

7813

7814

#ifdef CONFIG_SMP

7814

#ifdef CONFIG_SMP

7815

/*

7815

/*

7816

* Remove our load from contribution when we leave sched_fair

7816

* Remove our load from contribution when we leave sched_fair

7817

* and ensure we don't carry in an old decay_count if we

7817

* and ensure we don't carry in an old decay_count if we

7818

* switch back.

7818

* switch back.

7819

*/

7819

*/

7820

if (se->avg.decay_count) {

7820

if (se->avg.decay_count) {

7821

__synchronize_entity_decay(se);

7821

__synchronize_entity_decay(se);

7822

subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);

7822

subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);

7823

}

7823

}

7824

#endif

7824

#endif

7825

}

7825

}

7826

7827

/*

7827

/*

7828

* We switched to the sched_fair class.

7828

* We switched to the sched_fair class.

7829

*/

7829

*/

7830

static void switched_to_fair(struct rq *rq, struct task_struct *p)

7830

static void switched_to_fair(struct rq *rq, struct task_struct *p)

7831

{

7831

{

7832

#ifdef CONFIG_FAIR_GROUP_SCHED

7832

#ifdef CONFIG_FAIR_GROUP_SCHED

7833

struct sched_entity *se = &p->se;

7833

struct sched_entity *se = &p->se;

7834

/*

7834

/*

7835

* Since the real-depth could have been changed (only FAIR

7835

* Since the real-depth could have been changed (only FAIR

7836

* class maintain depth value), reset depth properly.

7836

* class maintain depth value), reset depth properly.

7837

*/

7837

*/

7838

se->depth = se->parent ? se->parent->depth + 1 : 0;

7838

se->depth = se->parent ? se->parent->depth + 1 : 0;

7839

#endif

7839

#endif

7840

if (!task_on_rq_queued(p))

7840

if (!task_on_rq_queued(p))

7841

return;

7841

return;

7842

7843

/*

7843

/*

7844

* We were most likely switched from sched_rt, so

7844

* We were most likely switched from sched_rt, so

7845

* kick off the schedule if running, otherwise just see

7845

* kick off the schedule if running, otherwise just see

7846

* if we can still preempt the current task.

7846

* if we can still preempt the current task.

7847

*/

7847

*/

7848

if (rq->curr == p)

7848

if (rq->curr == p)

7849

resched_curr(rq);

7849

resched_curr(rq);

7850

else

7850

else

7851

check_preempt_curr(rq, p, 0);

7851

check_preempt_curr(rq, p, 0);

7852

}

7852

}

7853

7854

/* Account for a task changing its policy or group.

7854

/* Account for a task changing its policy or group.

7855

*

7855

*

7856

* This routine is mostly called to set cfs_rq->curr field when a task

7856

* This routine is mostly called to set cfs_rq->curr field when a task

7857

* migrates between groups/classes.

7857

* migrates between groups/classes.

7858

*/

7858

*/

7859

static void set_curr_task_fair(struct rq *rq)

7859

static void set_curr_task_fair(struct rq *rq)

7860

{

7860

{

7861

struct sched_entity *se = &rq->curr->se;

7861

struct sched_entity *se = &rq->curr->se;

7862

7863

for_each_sched_entity(se) {

7863

for_each_sched_entity(se) {

7864

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7864

struct cfs_rq *cfs_rq = cfs_rq_of(se);

7865

7866

set_next_entity(cfs_rq, se);

7866

set_next_entity(cfs_rq, se);

7867

/* ensure bandwidth has been allocated on our new cfs_rq */

7867

/* ensure bandwidth has been allocated on our new cfs_rq */

7868

account_cfs_rq_runtime(cfs_rq, 0);

7868

account_cfs_rq_runtime(cfs_rq, 0);

7869

}

7869

}

7870

}

7870

}

7871

7872

void init_cfs_rq(struct cfs_rq *cfs_rq)

7872

void init_cfs_rq(struct cfs_rq *cfs_rq)

7873

{

7873

{

7874

cfs_rq->tasks_timeline = RB_ROOT;

7874

cfs_rq->tasks_timeline = RB_ROOT;

7875

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

7875

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

7876

#ifndef CONFIG_64BIT

7876

#ifndef CONFIG_64BIT

7877

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

7877

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

7878

#endif

7878

#endif

7879

#ifdef CONFIG_SMP

7879

#ifdef CONFIG_SMP

7880

atomic64_set(&cfs_rq->decay_counter, 1);

7880

atomic64_set(&cfs_rq->decay_counter, 1);

7881

atomic_long_set(&cfs_rq->removed_load, 0);

7881

atomic_long_set(&cfs_rq->removed_load, 0);

7882

#endif

7882

#endif

7883

}

7883

}

7884

7885

#ifdef CONFIG_FAIR_GROUP_SCHED

7885

#ifdef CONFIG_FAIR_GROUP_SCHED

7886

static void task_move_group_fair(struct task_struct *p, int queued)

7886

static void task_move_group_fair(struct task_struct *p, int queued)

7887

{

7887

{

7888

struct sched_entity *se = &p->se;

7888

struct sched_entity *se = &p->se;

7889

struct cfs_rq *cfs_rq;

7889

struct cfs_rq *cfs_rq;

7890

7891

/*

7891

/*

7892

* If the task was not on the rq at the time of this cgroup movement

7892

* If the task was not on the rq at the time of this cgroup movement

7893

* it must have been asleep, sleeping tasks keep their ->vruntime

7893

* it must have been asleep, sleeping tasks keep their ->vruntime

7894

* absolute on their old rq until wakeup (needed for the fair sleeper

7894

* absolute on their old rq until wakeup (needed for the fair sleeper

7895

* bonus in place_entity()).

7895

* bonus in place_entity()).

7896

*

7896

*

7897

* If it was on the rq, we've just 'preempted' it, which does convert

7897

* If it was on the rq, we've just 'preempted' it, which does convert

7898

* ->vruntime to a relative base.

7898

* ->vruntime to a relative base.

7899

*

7899

*

7900

* Make sure both cases convert their relative position when migrating

7900

* Make sure both cases convert their relative position when migrating

7901

* to another cgroup's rq. This does somewhat interfere with the

7901

* to another cgroup's rq. This does somewhat interfere with the

7902

* fair sleeper stuff for the first placement, but who cares.

7902

* fair sleeper stuff for the first placement, but who cares.

7903

*/

7903

*/

7904

/*

7904

/*

7905

* When !queued, vruntime of the task has usually NOT been normalized.

7905

* When !queued, vruntime of the task has usually NOT been normalized.

7906

* But there are some cases where it has already been normalized:

7906

* But there are some cases where it has already been normalized:

7907

*

7907

*

7908

* - Moving a forked child which is waiting for being woken up by

7908

* - Moving a forked child which is waiting for being woken up by

7909

* wake_up_new_task().

7909

* wake_up_new_task().

7910

* - Moving a task which has been woken up by try_to_wake_up() and

7910

* - Moving a task which has been woken up by try_to_wake_up() and

7911

* waiting for actually being woken up by sched_ttwu_pending().

7911

* waiting for actually being woken up by sched_ttwu_pending().

7912

*

7912

*

7913

* To prevent boost or penalty in the new cfs_rq caused by delta

7913

* To prevent boost or penalty in the new cfs_rq caused by delta

7914

* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.

7914

* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.

7915

*/

7915

*/

7916

if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))

7916

if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))

7917

queued = 1;

7917

queued = 1;

7918

7919

if (!queued)

7919

if (!queued)

7920

se->vruntime -= cfs_rq_of(se)->min_vruntime;

7920

se->vruntime -= cfs_rq_of(se)->min_vruntime;

7921

set_task_rq(p, task_cpu(p));

7921

set_task_rq(p, task_cpu(p));

7922

se->depth = se->parent ? se->parent->depth + 1 : 0;

7922

se->depth = se->parent ? se->parent->depth + 1 : 0;

7923

if (!queued) {

7923

if (!queued) {

7924

cfs_rq = cfs_rq_of(se);

7924

cfs_rq = cfs_rq_of(se);

7925

se->vruntime += cfs_rq->min_vruntime;

7925

se->vruntime += cfs_rq->min_vruntime;

7926

#ifdef CONFIG_SMP

7926

#ifdef CONFIG_SMP

7927

/*

7927

/*

7928

* migrate_task_rq_fair() will have removed our previous

7928

* migrate_task_rq_fair() will have removed our previous

7929

* contribution, but we must synchronize for ongoing future

7929

* contribution, but we must synchronize for ongoing future

7930

* decay.

7930

* decay.

7931

*/

7931

*/

7932

se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);

7932

se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);

7933

cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;

7933

cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;

7934

#endif

7934

#endif

7935

}

7935

}

7936

}

7936

}

7937

7938

void free_fair_sched_group(struct task_group *tg)

7938

void free_fair_sched_group(struct task_group *tg)

7939

{

7939

{

7940

int i;

7940

int i;

7941

7942

destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

7942

destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

7943

7944

for_each_possible_cpu(i) {

7944

for_each_possible_cpu(i) {

7945

if (tg->cfs_rq)

7945

if (tg->cfs_rq)

7946

kfree(tg->cfs_rq[i]);

7946

kfree(tg->cfs_rq[i]);

7947

if (tg->se)

7947

if (tg->se)

7948

kfree(tg->se[i]);

7948

kfree(tg->se[i]);

7949

}

7949

}

7950

7951

kfree(tg->cfs_rq);

7951

kfree(tg->cfs_rq);

7952

kfree(tg->se);

7952

kfree(tg->se);

7953

}

7953

}

7954

7955

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

7955

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

7956

{

7956

{

7957

struct cfs_rq *cfs_rq;

7957

struct cfs_rq *cfs_rq;

7958

struct sched_entity *se;

7958

struct sched_entity *se;

7959

int i;

7959

int i;

7960

7961

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

7961

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

7962

if (!tg->cfs_rq)

7962

if (!tg->cfs_rq)

7963

goto err;

7963

goto err;

7964

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

7964

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

7965

if (!tg->se)

7965

if (!tg->se)

7966

goto err;

7966

goto err;

7967

7968

tg->shares = NICE_0_LOAD;

7968

tg->shares = NICE_0_LOAD;

7969

7970

init_cfs_bandwidth(tg_cfs_bandwidth(tg));

7970

init_cfs_bandwidth(tg_cfs_bandwidth(tg));

7971

7972

for_each_possible_cpu(i) {

7972

for_each_possible_cpu(i) {

7973

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

7973

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

7974

GFP_KERNEL, cpu_to_node(i));

7974

GFP_KERNEL, cpu_to_node(i));

7975

if (!cfs_rq)

7975

if (!cfs_rq)

7976

goto err;

7976

goto err;

7977

7978

se = kzalloc_node(sizeof(struct sched_entity),

7978

se = kzalloc_node(sizeof(struct sched_entity),

7979

GFP_KERNEL, cpu_to_node(i));

7979

GFP_KERNEL, cpu_to_node(i));

7980

if (!se)

7980

if (!se)

7981

goto err_free_rq;

7981

goto err_free_rq;

7982

7983

init_cfs_rq(cfs_rq);

7983

init_cfs_rq(cfs_rq);

7984

init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);

7984

init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);

7985

}

7985

}

7986

7987

return 1;

7987

return 1;

7988

7989

err_free_rq:

7989

err_free_rq:

7990

kfree(cfs_rq);

7990

kfree(cfs_rq);

7991

err:

7991

err:

7992

return 0;

7992

return 0;

7993

}

7993

}

7994

7995

void unregister_fair_sched_group(struct task_group *tg, int cpu)

7995

void unregister_fair_sched_group(struct task_group *tg, int cpu)

7996

{

7996

{

7997

struct rq *rq = cpu_rq(cpu);

7997

struct rq *rq = cpu_rq(cpu);

7998

unsigned long flags;

7998

unsigned long flags;

7999

8000

/*

8000

/*

8001

* Only empty task groups can be destroyed; so we can speculatively

8001

* Only empty task groups can be destroyed; so we can speculatively

8002

* check on_list without danger of it being re-added.

8002

* check on_list without danger of it being re-added.

8003

*/

8003

*/

8004

if (!tg->cfs_rq[cpu]->on_list)

8004

if (!tg->cfs_rq[cpu]->on_list)

8005

return;

8005

return;

8006

8007

raw_spin_lock_irqsave(&rq->lock, flags);

8007

raw_spin_lock_irqsave(&rq->lock, flags);

8008

list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);

8008

list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);

8009

raw_spin_unlock_irqrestore(&rq->lock, flags);

8009

raw_spin_unlock_irqrestore(&rq->lock, flags);

8010

}

8010

}

8011

8012

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

8012

void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

8013

struct sched_entity *se, int cpu,

8013

struct sched_entity *se, int cpu,

8014

struct sched_entity *parent)

8014

struct sched_entity *parent)

8015

{

8015

{

8016

struct rq *rq = cpu_rq(cpu);

8016

struct rq *rq = cpu_rq(cpu);

8017

8018

cfs_rq->tg = tg;

8018

cfs_rq->tg = tg;

8019

cfs_rq->rq = rq;

8019

cfs_rq->rq = rq;

8020

init_cfs_rq_runtime(cfs_rq);

8020

init_cfs_rq_runtime(cfs_rq);

8021

8022

tg->cfs_rq[cpu] = cfs_rq;

8022

tg->cfs_rq[cpu] = cfs_rq;

8023

tg->se[cpu] = se;

8023

tg->se[cpu] = se;

8024

8025

/* se could be NULL for root_task_group */

8025

/* se could be NULL for root_task_group */

8026

if (!se)

8026

if (!se)

8027

return;

8027

return;

8028

8029

if (!parent) {

8029

if (!parent) {

8030

se->cfs_rq = &rq->cfs;

8030

se->cfs_rq = &rq->cfs;

8031

se->depth = 0;

8031

se->depth = 0;

8032

} else {

8032

} else {

8033

se->cfs_rq = parent->my_q;

8033

se->cfs_rq = parent->my_q;

8034

se->depth = parent->depth + 1;

8034

se->depth = parent->depth + 1;

8035

}

8035

}

8036

8037

se->my_q = cfs_rq;

8037

se->my_q = cfs_rq;

8038

/* guarantee group entities always have weight */

8038

/* guarantee group entities always have weight */

8039

update_load_set(&se->load, NICE_0_LOAD);

8039

update_load_set(&se->load, NICE_0_LOAD);

8040

se->parent = parent;

8040

se->parent = parent;

8041

}

8041

}

8042

8043

static DEFINE_MUTEX(shares_mutex);

8043

static DEFINE_MUTEX(shares_mutex);

8044

8045

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

8045

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

8046

{

8046

{

8047

int i;

8047

int i;

8048

unsigned long flags;

8048

unsigned long flags;

8049

8050

/*

8050

/*

8051

* We can't change the weight of the root cgroup.

8051

* We can't change the weight of the root cgroup.

8052

*/

8052

*/

8053

if (!tg->se[0])

8053

if (!tg->se[0])

8054

return -EINVAL;

8054

return -EINVAL;

8055

8056

shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

8056

shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

8057

8058

mutex_lock(&shares_mutex);

8058

mutex_lock(&shares_mutex);

8059

if (tg->shares == shares)

8059

if (tg->shares == shares)

8060

goto done;

8060

goto done;

8061

8062

tg->shares = shares;

8062

tg->shares = shares;

8063

for_each_possible_cpu(i) {

8063

for_each_possible_cpu(i) {

8064

struct rq *rq = cpu_rq(i);

8064

struct rq *rq = cpu_rq(i);

8065

struct sched_entity *se;

8065

struct sched_entity *se;

8066

8067

se = tg->se[i];

8067

se = tg->se[i];

8068

/* Propagate contribution to hierarchy */

8068

/* Propagate contribution to hierarchy */

8069

raw_spin_lock_irqsave(&rq->lock, flags);

8069

raw_spin_lock_irqsave(&rq->lock, flags);

8070

8071

/* Possible calls to update_curr() need rq clock */

8071

/* Possible calls to update_curr() need rq clock */

8072

update_rq_clock(rq);

8072

update_rq_clock(rq);

8073

for_each_sched_entity(se)

8073

for_each_sched_entity(se)

8074

update_cfs_shares(group_cfs_rq(se));

8074

update_cfs_shares(group_cfs_rq(se));

8075

raw_spin_unlock_irqrestore(&rq->lock, flags);

8075

raw_spin_unlock_irqrestore(&rq->lock, flags);

8076

}

8076

}

8077

8078

done:

8078

done:

8079

mutex_unlock(&shares_mutex);

8079

mutex_unlock(&shares_mutex);

8080

return 0;

8080

return 0;

8081

}

8081

}

8082

#else /* CONFIG_FAIR_GROUP_SCHED */

8082

#else /* CONFIG_FAIR_GROUP_SCHED */

8083

8084

/* !CONFIG_FAIR_GROUP_SCHED: no per-group fair state exists, nothing to free. */
void free_fair_sched_group(struct task_group *tg)
{
}

8085

8086

/*
 * !CONFIG_FAIR_GROUP_SCHED: nothing to allocate. Always returns 1, the same
 * value the group-scheduling variant returns on success.
 */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

8090

8091

/* !CONFIG_FAIR_GROUP_SCHED: there is no leaf cfs_rq to unregister. */
void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}

8092

8093

#endif /* CONFIG_FAIR_GROUP_SCHED */

8093

#endif /* CONFIG_FAIR_GROUP_SCHED */

8094

8095

8096

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)

8096

static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)

8097

{

8097

{

8098

struct sched_entity *se = &task->se;

8098

struct sched_entity *se = &task->se;

8099

unsigned int rr_interval = 0;

8099

unsigned int rr_interval = 0;

8100

8101

/*

8101

/*

8102

* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise

8102

* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise

8103

* idle runqueue:

8103

* idle runqueue:

8104

*/

8104

*/

8105

if (rq->cfs.load.weight)

8105

if (rq->cfs.load.weight)

8106

rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

8106

rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));

8107

8108

return rr_interval;

8108

return rr_interval;

8109

}

8109

}

8110

8111

/*

8111

/*

8112

* All the scheduling class methods:

8112

* All the scheduling class methods:

8113

*/

8113

*/

8114

const struct sched_class fair_sched_class = {

8114

const struct sched_class fair_sched_class = {

8115

.next = &idle_sched_class,

8115

.next = &idle_sched_class,

8116

.enqueue_task = enqueue_task_fair,

8116

.enqueue_task = enqueue_task_fair,

8117

.dequeue_task = dequeue_task_fair,

8117

.dequeue_task = dequeue_task_fair,

8118

.yield_task = yield_task_fair,

8118

.yield_task = yield_task_fair,

8119

.yield_to_task = yield_to_task_fair,

8119

.yield_to_task = yield_to_task_fair,

8120

8121

.check_preempt_curr = check_preempt_wakeup,

8121

.check_preempt_curr = check_preempt_wakeup,

8122

8123

.pick_next_task = pick_next_task_fair,

8123

.pick_next_task = pick_next_task_fair,

8124

.put_prev_task = put_prev_task_fair,

8124

.put_prev_task = put_prev_task_fair,

8125

8126

#ifdef CONFIG_SMP

8126

#ifdef CONFIG_SMP

8127

.select_task_rq = select_task_rq_fair,

8127

.select_task_rq = select_task_rq_fair,

8128

.migrate_task_rq = migrate_task_rq_fair,

8128

.migrate_task_rq = migrate_task_rq_fair,

8129

8130

.rq_online = rq_online_fair,

8130

.rq_online = rq_online_fair,

8131

.rq_offline = rq_offline_fair,

8131

.rq_offline = rq_offline_fair,

8132

8133

.task_waking = task_waking_fair,

8133

.task_waking = task_waking_fair,

8134

#endif

8134

#endif

8135

8136

.set_curr_task = set_curr_task_fair,

8136

.set_curr_task = set_curr_task_fair,

8137

.task_tick = task_tick_fair,

8137

.task_tick = task_tick_fair,

8138

.task_fork = task_fork_fair,

8138

.task_fork = task_fork_fair,

8139

8140

.prio_changed = prio_changed_fair,

8140

.prio_changed = prio_changed_fair,

8141

.switched_from = switched_from_fair,

8141

.switched_from = switched_from_fair,

8142

.switched_to = switched_to_fair,

8142

.switched_to = switched_to_fair,

8143

8144

.get_rr_interval = get_rr_interval_fair,

8144

.get_rr_interval = get_rr_interval_fair,

8145

8146

.update_curr = update_curr_fair,

8146

.update_curr = update_curr_fair,

8147

8148

#ifdef CONFIG_FAIR_GROUP_SCHED

8148

#ifdef CONFIG_FAIR_GROUP_SCHED

8149

.task_move_group = task_move_group_fair,

8149

.task_move_group = task_move_group_fair,

8150

#endif

8150

#endif

8151

};

8151

};

8152

8153

#ifdef CONFIG_SCHED_DEBUG

8153

#ifdef CONFIG_SCHED_DEBUG

8154

void print_cfs_stats(struct seq_file *m, int cpu)

8154

void print_cfs_stats(struct seq_file *m, int cpu)

8155

{

8155

{

8156

struct cfs_rq *cfs_rq;

8156

struct cfs_rq *cfs_rq;

8157

8158

rcu_read_lock();

8158

rcu_read_lock();

8159

for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)

8159

for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)

8160

print_cfs_rq(m, cpu, cfs_rq);

8160

print_cfs_rq(m, cpu, cfs_rq);

8161

rcu_read_unlock();

8161

rcu_read_unlock();

8162

}

8162

}

8163

#endif

8163

#endif

8164

8165

/*
 * Boot-time setup of the fair class. On SMP it registers the SCHED_SOFTIRQ
 * handler; with CONFIG_NO_HZ_COMMON it also initialises the nohz balancing
 * state and the sched_ilb_notifier CPU notifier. Does nothing on !SMP.
 */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
	nohz.next_balance = jiffies;
	/* NOTE(review): the allocation result is not checked here. */
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}

8178

GITLAB

sched: Fix odd values in effective_load() calculations