Eric Lee / smarc-ti-linux-kernel

1

/*

1

/*

2

* kernel/sched/core.c

2

* kernel/sched/core.c

3

*

3

*

4

* Kernel scheduler and related syscalls

4

* Kernel scheduler and related syscalls

5

*

5

*

6

7

*

7

*

8

* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and

8

* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and

9

* make semaphores SMP safe

9

* make semaphores SMP safe

10

* 1998-11-19 Implemented schedule_timeout() and related stuff

10

* 1998-11-19 Implemented schedule_timeout() and related stuff

11

* by Andrea Arcangeli

11

* by Andrea Arcangeli

12

* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:

12

* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:

13

* hybrid priority-list and round-robin design with

13

* hybrid priority-list and round-robin design with

14

* an array-switch method of distributing timeslices

14

* an array-switch method of distributing timeslices

15

* and per-CPU runqueues. Cleanups and useful suggestions

15

* and per-CPU runqueues. Cleanups and useful suggestions

16

* by Davide Libenzi, preemptible kernel bits by Robert Love.

16

* by Davide Libenzi, preemptible kernel bits by Robert Love.

17

* 2003-09-03 Interactivity tuning by Con Kolivas.

17

* 2003-09-03 Interactivity tuning by Con Kolivas.

18

* 2004-04-02 Scheduler domains code by Nick Piggin

18

* 2004-04-02 Scheduler domains code by Nick Piggin

19

* 2007-04-15 Work begun on replacing all interactivity tuning with a

19

* 2007-04-15 Work begun on replacing all interactivity tuning with a

20

* fair scheduling design by Con Kolivas.

20

* fair scheduling design by Con Kolivas.

21

* 2007-05-05 Load balancing (smp-nice) and other improvements

21

* 2007-05-05 Load balancing (smp-nice) and other improvements

22

* by Peter Williams

22

* by Peter Williams

23

* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith

23

* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith

24

* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri

24

* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri

25

* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,

25

* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,

26

* Thomas Gleixner, Mike Kravetz

26

* Thomas Gleixner, Mike Kravetz

27

*/

27

*/

28

29

#include <linux/mm.h>

29

#include <linux/mm.h>

30

#include <linux/module.h>

30

#include <linux/module.h>

31

#include <linux/nmi.h>

31

#include <linux/nmi.h>

32

#include <linux/init.h>

32

#include <linux/init.h>

33

#include <linux/uaccess.h>

33

#include <linux/uaccess.h>

34

#include <linux/highmem.h>

34

#include <linux/highmem.h>

35

#include <asm/mmu_context.h>

35

#include <asm/mmu_context.h>

36

#include <linux/interrupt.h>

36

#include <linux/interrupt.h>

37

#include <linux/capability.h>

37

#include <linux/capability.h>

38

#include <linux/completion.h>

38

#include <linux/completion.h>

39

#include <linux/kernel_stat.h>

39

#include <linux/kernel_stat.h>

40

#include <linux/debug_locks.h>

40

#include <linux/debug_locks.h>

41

#include <linux/perf_event.h>

41

#include <linux/perf_event.h>

42

#include <linux/security.h>

42

#include <linux/security.h>

43

#include <linux/notifier.h>

43

#include <linux/notifier.h>

44

#include <linux/profile.h>

44

#include <linux/profile.h>

45

#include <linux/freezer.h>

45

#include <linux/freezer.h>

46

#include <linux/vmalloc.h>

46

#include <linux/vmalloc.h>

47

#include <linux/blkdev.h>

47

#include <linux/blkdev.h>

48

#include <linux/delay.h>

48

#include <linux/delay.h>

49

#include <linux/pid_namespace.h>

49

#include <linux/pid_namespace.h>

50

#include <linux/smp.h>

50

#include <linux/smp.h>

51

#include <linux/threads.h>

51

#include <linux/threads.h>

52

#include <linux/timer.h>

52

#include <linux/timer.h>

53

#include <linux/rcupdate.h>

53

#include <linux/rcupdate.h>

54

#include <linux/cpu.h>

54

#include <linux/cpu.h>

55

#include <linux/cpuset.h>

55

#include <linux/cpuset.h>

56

#include <linux/percpu.h>

56

#include <linux/percpu.h>

57

#include <linux/proc_fs.h>

57

#include <linux/proc_fs.h>

58

#include <linux/seq_file.h>

58

#include <linux/seq_file.h>

59

#include <linux/sysctl.h>

59

#include <linux/sysctl.h>

60

#include <linux/syscalls.h>

60

#include <linux/syscalls.h>

61

#include <linux/times.h>

61

#include <linux/times.h>

62

#include <linux/tsacct_kern.h>

62

#include <linux/tsacct_kern.h>

63

#include <linux/kprobes.h>

63

#include <linux/kprobes.h>

64

#include <linux/delayacct.h>

64

#include <linux/delayacct.h>

65

#include <linux/unistd.h>

65

#include <linux/unistd.h>

66

#include <linux/pagemap.h>

66

#include <linux/pagemap.h>

67

#include <linux/hrtimer.h>

67

#include <linux/hrtimer.h>

68

#include <linux/tick.h>

68

#include <linux/tick.h>

69

#include <linux/debugfs.h>

69

#include <linux/debugfs.h>

70

#include <linux/ctype.h>

70

#include <linux/ctype.h>

71

#include <linux/ftrace.h>

71

#include <linux/ftrace.h>

72

#include <linux/slab.h>

72

#include <linux/slab.h>

73

#include <linux/init_task.h>

73

#include <linux/init_task.h>

74

#include <linux/binfmts.h>

74

#include <linux/binfmts.h>

75

#include <linux/context_tracking.h>

75

#include <linux/context_tracking.h>

76

#include <linux/compiler.h>

76

#include <linux/compiler.h>

77

78

#include <asm/switch_to.h>

78

#include <asm/switch_to.h>

79

#include <asm/tlb.h>

79

#include <asm/tlb.h>

80

#include <asm/irq_regs.h>

80

#include <asm/irq_regs.h>

81

#include <asm/mutex.h>

81

#include <asm/mutex.h>

82

#ifdef CONFIG_PARAVIRT

82

#ifdef CONFIG_PARAVIRT

83

#include <asm/paravirt.h>

83

#include <asm/paravirt.h>

84

#endif

84

#endif

85

86

#include "sched.h"

86

#include "sched.h"

87

#include "../workqueue_internal.h"

87

#include "../workqueue_internal.h"

88

#include "../smpboot.h"

88

#include "../smpboot.h"

89

90

#define CREATE_TRACE_POINTS

90

#define CREATE_TRACE_POINTS

91

#include <trace/events/sched.h>

91

#include <trace/events/sched.h>

92

93

/*
 * start_bandwidth_timer - arm @period_timer unless it is already active.
 *
 * As long as the timer is not active, its expiry is forwarded by @period
 * relative to the timer's current clock reading and it is (re)started
 * pinned, in absolute mode, using the soft/hard expiry pair as the
 * allowed range.  The check is repeated after each start attempt.
 */
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	while (!hrtimer_active(period_timer)) {
		ktime_t now = hrtimer_cb_get_time(period_timer);
		ktime_t soft_exp, hard_exp;
		unsigned long range_ns;

		hrtimer_forward(period_timer, now, period);

		soft_exp = hrtimer_get_softexpires(period_timer);
		hard_exp = hrtimer_get_expires(period_timer);
		/* Slack window between the soft and the hard expiry. */
		range_ns = ktime_to_ns(ktime_sub(hard_exp, soft_exp));

		__hrtimer_start_range_ns(period_timer, soft_exp, range_ns,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

112

113

DEFINE_MUTEX(sched_domains_mutex);

113

DEFINE_MUTEX(sched_domains_mutex);

114

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

114

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

115

116

static void update_rq_clock_task(struct rq *rq, s64 delta);

116

static void update_rq_clock_task(struct rq *rq, s64 delta);

117

118

void update_rq_clock(struct rq *rq)

118

void update_rq_clock(struct rq *rq)

119

{

119

{

120

s64 delta;

120

s64 delta;

121

122

if (rq->skip_clock_update > 0)

122

if (rq->skip_clock_update > 0)

123

return;

123

return;

124

125

delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;

125

delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;

126

if (delta < 0)

126

if (delta < 0)

127

return;

127

return;

128

rq->clock += delta;

128

rq->clock += delta;

129

update_rq_clock_task(rq, delta);

129

update_rq_clock_task(rq, delta);

130

}

130

}

131

132

/*

132

/*

133

* Debugging: various feature bits

133

* Debugging: various feature bits

134

*/

134

*/

135

136

#define SCHED_FEAT(name, enabled) \

136

#define SCHED_FEAT(name, enabled) \

137

(1UL << __SCHED_FEAT_##name) * enabled |

137

(1UL << __SCHED_FEAT_##name) * enabled |

138

139

const_debug unsigned int sysctl_sched_features =

139

const_debug unsigned int sysctl_sched_features =

140

#include "features.h"

140

#include "features.h"

141

0;

141

0;

142

143

#undef SCHED_FEAT

143

#undef SCHED_FEAT

144

145

#ifdef CONFIG_SCHED_DEBUG

145

#ifdef CONFIG_SCHED_DEBUG

146

/*
 * Table of feature names: every SCHED_FEAT(name, enabled) entry pulled
 * in from "features.h" is stringified into one array element.  The
 * array is indexed by feature bit number in sched_feat_show() and
 * sched_feat_set().
 */
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

154

155

static int sched_feat_show(struct seq_file *m, void *v)

155

static int sched_feat_show(struct seq_file *m, void *v)

156

{

156

{

157

int i;

157

int i;

158

159

for (i = 0; i < __SCHED_FEAT_NR; i++) {

159

for (i = 0; i < __SCHED_FEAT_NR; i++) {

160

if (!(sysctl_sched_features & (1UL << i)))

160

if (!(sysctl_sched_features & (1UL << i)))

161

seq_puts(m, "NO_");

161

seq_puts(m, "NO_");

162

seq_printf(m, "%s ", sched_feat_names[i]);

162

seq_printf(m, "%s ", sched_feat_names[i]);

163

}

163

}

164

seq_puts(m, "\n");

164

seq_puts(m, "\n");

165

166

return 0;

166

return 0;

167

}

167

}

168

169

#ifdef HAVE_JUMP_LABEL

169

#ifdef HAVE_JUMP_LABEL

170

171

#define jump_label_key__true STATIC_KEY_INIT_TRUE

171

#define jump_label_key__true STATIC_KEY_INIT_TRUE

172

#define jump_label_key__false STATIC_KEY_INIT_FALSE

172

#define jump_label_key__false STATIC_KEY_INIT_FALSE

173

174

#define SCHED_FEAT(name, enabled) \

174

#define SCHED_FEAT(name, enabled) \

175

jump_label_key__##enabled ,

175

jump_label_key__##enabled ,

176

177

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {

177

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {

178

#include "features.h"

178

#include "features.h"

179

};

179

};

180

181

#undef SCHED_FEAT

181

#undef SCHED_FEAT

182

183

static void sched_feat_disable(int i)

183

static void sched_feat_disable(int i)

184

{

184

{

185

if (static_key_enabled(&sched_feat_keys[i]))

185

if (static_key_enabled(&sched_feat_keys[i]))

186

static_key_slow_dec(&sched_feat_keys[i]);

186

static_key_slow_dec(&sched_feat_keys[i]);

187

}

187

}

188

189

static void sched_feat_enable(int i)

189

static void sched_feat_enable(int i)

190

{

190

{

191

if (!static_key_enabled(&sched_feat_keys[i]))

191

if (!static_key_enabled(&sched_feat_keys[i]))

192

static_key_slow_inc(&sched_feat_keys[i]);

192

static_key_slow_inc(&sched_feat_keys[i]);

193

}

193

}

194

#else

194

#else

195

/*
 * Without jump labels there are no static keys to flip; the feature
 * mask in sysctl_sched_features is the only state, so these are no-ops.
 */
static void sched_feat_disable(int i)
{
}

static void sched_feat_enable(int i)
{
}

197

#endif /* HAVE_JUMP_LABEL */

197

#endif /* HAVE_JUMP_LABEL */

198

199

static int sched_feat_set(char *cmp)

199

static int sched_feat_set(char *cmp)

200

{

200

{

201

int i;

201

int i;

202

int neg = 0;

202

int neg = 0;

203

204

if (strncmp(cmp, "NO_", 3) == 0) {

204

if (strncmp(cmp, "NO_", 3) == 0) {

205

neg = 1;

205

neg = 1;

206

cmp += 3;

206

cmp += 3;

207

}

207

}

208

209

for (i = 0; i < __SCHED_FEAT_NR; i++) {

209

for (i = 0; i < __SCHED_FEAT_NR; i++) {

210

if (strcmp(cmp, sched_feat_names[i]) == 0) {

210

if (strcmp(cmp, sched_feat_names[i]) == 0) {

211

if (neg) {

211

if (neg) {

212

sysctl_sched_features &= ~(1UL << i);

212

sysctl_sched_features &= ~(1UL << i);

213

sched_feat_disable(i);

213

sched_feat_disable(i);

214

} else {

214

} else {

215

sysctl_sched_features |= (1UL << i);

215

sysctl_sched_features |= (1UL << i);

216

sched_feat_enable(i);

216

sched_feat_enable(i);

217

}

217

}

218

break;

218

break;

219

}

219

}

220

}

220

}

221

222

return i;

222

return i;

223

}

223

}

224

225

static ssize_t

225

static ssize_t

226

sched_feat_write(struct file *filp, const char __user *ubuf,

226

sched_feat_write(struct file *filp, const char __user *ubuf,

227

size_t cnt, loff_t *ppos)

227

size_t cnt, loff_t *ppos)

228

{

228

{

229

char buf[64];

229

char buf[64];

230

char *cmp;

230

char *cmp;

231

int i;

231

int i;

232

struct inode *inode;

232

struct inode *inode;

233

234

if (cnt > 63)

234

if (cnt > 63)

235

cnt = 63;

235

cnt = 63;

236

237

if (copy_from_user(&buf, ubuf, cnt))

237

if (copy_from_user(&buf, ubuf, cnt))

238

return -EFAULT;

238

return -EFAULT;

239

240

buf[cnt] = 0;

240

buf[cnt] = 0;

241

cmp = strstrip(buf);

241

cmp = strstrip(buf);

242

243

/* Ensure the static_key remains in a consistent state */

243

/* Ensure the static_key remains in a consistent state */

244

inode = file_inode(filp);

244

inode = file_inode(filp);

245

mutex_lock(&inode->i_mutex);

245

mutex_lock(&inode->i_mutex);

246

i = sched_feat_set(cmp);

246

i = sched_feat_set(cmp);

247

mutex_unlock(&inode->i_mutex);

247

mutex_unlock(&inode->i_mutex);

248

if (i == __SCHED_FEAT_NR)

248

if (i == __SCHED_FEAT_NR)

249

return -EINVAL;

249

return -EINVAL;

250

251

*ppos += cnt;

251

*ppos += cnt;

252

253

return cnt;

253

return cnt;

254

}

254

}

255

256

static int sched_feat_open(struct inode *inode, struct file *filp)

256

static int sched_feat_open(struct inode *inode, struct file *filp)

257

{

257

{

258

return single_open(filp, sched_feat_show, NULL);

258

return single_open(filp, sched_feat_show, NULL);

259

}

259

}

260

261

static const struct file_operations sched_feat_fops = {

261

static const struct file_operations sched_feat_fops = {

262

.open = sched_feat_open,

262

.open = sched_feat_open,

263

.write = sched_feat_write,

263

.write = sched_feat_write,

264

.read = seq_read,

264

.read = seq_read,

265

.llseek = seq_lseek,

265

.llseek = seq_lseek,

266

.release = single_release,

266

.release = single_release,

267

};

267

};

268

269

static __init int sched_init_debug(void)

269

static __init int sched_init_debug(void)

270

{

270

{

271

debugfs_create_file("sched_features", 0644, NULL, NULL,

271

debugfs_create_file("sched_features", 0644, NULL, NULL,

272

&sched_feat_fops);

272

&sched_feat_fops);

273

274

return 0;

274

return 0;

275

}

275

}

276

late_initcall(sched_init_debug);

276

late_initcall(sched_init_debug);

277

#endif /* CONFIG_SCHED_DEBUG */

277

#endif /* CONFIG_SCHED_DEBUG */

278

279

/*

279

/*

280

* Number of tasks to iterate in a single balance run.

280

* Number of tasks to iterate in a single balance run.

281

* Limited because this is done with IRQs disabled.

281

* Limited because this is done with IRQs disabled.

282

*/

282

*/

283

const_debug unsigned int sysctl_sched_nr_migrate = 32;

283

const_debug unsigned int sysctl_sched_nr_migrate = 32;

284

285

/*

285

/*

286

* period over which we average the RT time consumption, measured

286

* period over which we average the RT time consumption, measured

287

* in ms.

287

* in ms.

288

*

288

*

289

* default: 1s

289

* default: 1s

290

*/

290

*/

291

const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

291

const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

292

293

/*

293

/*

294

* period over which we measure -rt task cpu usage in us.

294

* period over which we measure -rt task cpu usage in us.

295

* default: 1s

295

* default: 1s

296

*/

296

*/

297

unsigned int sysctl_sched_rt_period = 1000000;

297

unsigned int sysctl_sched_rt_period = 1000000;

298

299

__read_mostly int scheduler_running;

299

__read_mostly int scheduler_running;

300

301

/*

301

/*

302

* part of the period that we allow rt tasks to run in us.

302

* part of the period that we allow rt tasks to run in us.

303

* default: 0.95s

303

* default: 0.95s

304

*/

304

*/

305

int sysctl_sched_rt_runtime = 950000;

305

int sysctl_sched_rt_runtime = 950000;

306

307

/*

307

/*

308

* __task_rq_lock - lock the rq @p resides on.

308

* __task_rq_lock - lock the rq @p resides on.

309

*/

309

*/

310

static inline struct rq *__task_rq_lock(struct task_struct *p)

310

static inline struct rq *__task_rq_lock(struct task_struct *p)

311

__acquires(rq->lock)

311

__acquires(rq->lock)

312

{

312

{

313

struct rq *rq;

313

struct rq *rq;

314

315

lockdep_assert_held(&p->pi_lock);

315

lockdep_assert_held(&p->pi_lock);

316

317

for (;;) {

317

for (;;) {

318

rq = task_rq(p);

318

rq = task_rq(p);

319

raw_spin_lock(&rq->lock);

319

raw_spin_lock(&rq->lock);

320

if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))

320

if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))

321

return rq;

321

return rq;

322

raw_spin_unlock(&rq->lock);

322

raw_spin_unlock(&rq->lock);

323

324

while (unlikely(task_on_rq_migrating(p)))

324

while (unlikely(task_on_rq_migrating(p)))

325

cpu_relax();

325

cpu_relax();

326

}

326

}

327

}

327

}

328

329

/*

329

/*

330

* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.

330

* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.

331

*/

331

*/

332

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)

332

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)

333

__acquires(p->pi_lock)

333

__acquires(p->pi_lock)

334

__acquires(rq->lock)

334

__acquires(rq->lock)

335

{

335

{

336

struct rq *rq;

336

struct rq *rq;

337

338

for (;;) {

338

for (;;) {

339

raw_spin_lock_irqsave(&p->pi_lock, *flags);

339

raw_spin_lock_irqsave(&p->pi_lock, *flags);

340

rq = task_rq(p);

340

rq = task_rq(p);

341

raw_spin_lock(&rq->lock);

341

raw_spin_lock(&rq->lock);

342

if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))

342

if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))

343

return rq;

343

return rq;

344

raw_spin_unlock(&rq->lock);

344

raw_spin_unlock(&rq->lock);

345

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

345

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

346

347

while (unlikely(task_on_rq_migrating(p)))

347

while (unlikely(task_on_rq_migrating(p)))

348

cpu_relax();

348

cpu_relax();

349

}

349

}

350

}

350

}

351

352

static void __task_rq_unlock(struct rq *rq)

352

static void __task_rq_unlock(struct rq *rq)

353

__releases(rq->lock)

353

__releases(rq->lock)

354

{

354

{

355

raw_spin_unlock(&rq->lock);

355

raw_spin_unlock(&rq->lock);

356

}

356

}

357

358

static inline void

358

static inline void

359

task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)

359

task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)

360

__releases(rq->lock)

360

__releases(rq->lock)

361

__releases(p->pi_lock)

361

__releases(p->pi_lock)

362

{

362

{

363

raw_spin_unlock(&rq->lock);

363

raw_spin_unlock(&rq->lock);

364

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

364

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

365

}

365

}

366

367

/*

367

/*

368

* this_rq_lock - lock this runqueue and disable interrupts.

368

* this_rq_lock - lock this runqueue and disable interrupts.

369

*/

369

*/

370

static struct rq *this_rq_lock(void)

370

static struct rq *this_rq_lock(void)

371

__acquires(rq->lock)

371

__acquires(rq->lock)

372

{

372

{

373

struct rq *rq;

373

struct rq *rq;

374

375

local_irq_disable();

375

local_irq_disable();

376

rq = this_rq();

376

rq = this_rq();

377

raw_spin_lock(&rq->lock);

377

raw_spin_lock(&rq->lock);

378

379

return rq;

379

return rq;

380

}

380

}

381

382

#ifdef CONFIG_SCHED_HRTICK

382

#ifdef CONFIG_SCHED_HRTICK

383

/*

383

/*

384

* Use HR-timers to deliver accurate preemption points.

384

* Use HR-timers to deliver accurate preemption points.

385

*/

385

*/

386

387

static void hrtick_clear(struct rq *rq)

387

static void hrtick_clear(struct rq *rq)

388

{

388

{

389

if (hrtimer_active(&rq->hrtick_timer))

389

if (hrtimer_active(&rq->hrtick_timer))

390

hrtimer_cancel(&rq->hrtick_timer);

390

hrtimer_cancel(&rq->hrtick_timer);

391

}

391

}

392

393

/*

393

/*

394

* High-resolution timer tick.

394

* High-resolution timer tick.

395

* Runs from hardirq context with interrupts disabled.

395

* Runs from hardirq context with interrupts disabled.

396

*/

396

*/

397

static enum hrtimer_restart hrtick(struct hrtimer *timer)

397

static enum hrtimer_restart hrtick(struct hrtimer *timer)

398

{

398

{

399

struct rq *rq = container_of(timer, struct rq, hrtick_timer);

399

struct rq *rq = container_of(timer, struct rq, hrtick_timer);

400

401

WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

401

WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

402

403

raw_spin_lock(&rq->lock);

403

raw_spin_lock(&rq->lock);

404

update_rq_clock(rq);

404

update_rq_clock(rq);

405

rq->curr->sched_class->task_tick(rq, rq->curr, 1);

405

rq->curr->sched_class->task_tick(rq, rq->curr, 1);

406

raw_spin_unlock(&rq->lock);

406

raw_spin_unlock(&rq->lock);

407

408

return HRTIMER_NORESTART;

408

return HRTIMER_NORESTART;

409

}

409

}

410

411

#ifdef CONFIG_SMP

411

#ifdef CONFIG_SMP

412

413

static int __hrtick_restart(struct rq *rq)

413

static int __hrtick_restart(struct rq *rq)

414

{

414

{

415

struct hrtimer *timer = &rq->hrtick_timer;

415

struct hrtimer *timer = &rq->hrtick_timer;

416

ktime_t time = hrtimer_get_softexpires(timer);

416

ktime_t time = hrtimer_get_softexpires(timer);

417

418

return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);

418

return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);

419

}

419

}

420

421

/*

421

/*

422

* called from hardirq (IPI) context

422

* called from hardirq (IPI) context

423

*/

423

*/

424

static void __hrtick_start(void *arg)

424

static void __hrtick_start(void *arg)

425

{

425

{

426

struct rq *rq = arg;

426

struct rq *rq = arg;

427

428

raw_spin_lock(&rq->lock);

428

raw_spin_lock(&rq->lock);

429

__hrtick_restart(rq);

429

__hrtick_restart(rq);

430

rq->hrtick_csd_pending = 0;

430

rq->hrtick_csd_pending = 0;

431

raw_spin_unlock(&rq->lock);

431

raw_spin_unlock(&rq->lock);

432

}

432

}

433

434

/*

434

/*

435

* Called to set the hrtick timer state.

435

* Called to set the hrtick timer state.

436

*

436

*

437

* called with rq->lock held and irqs disabled

437

* called with rq->lock held and irqs disabled

438

*/

438

*/

439

void hrtick_start(struct rq *rq, u64 delay)

439

void hrtick_start(struct rq *rq, u64 delay)

440

{

440

{

441

struct hrtimer *timer = &rq->hrtick_timer;

441

struct hrtimer *timer = &rq->hrtick_timer;

442

ktime_t time;

442

ktime_t time;

443

s64 delta;

443

s64 delta;

444

445

/*

445

/*

446

* Don't schedule slices shorter than 10000ns, that just

446

* Don't schedule slices shorter than 10000ns, that just

447

* doesn't make sense and can cause timer DoS.

447

* doesn't make sense and can cause timer DoS.

448

*/

448

*/

449

delta = max_t(s64, delay, 10000LL);

449

delta = max_t(s64, delay, 10000LL);

450

time = ktime_add_ns(timer->base->get_time(), delta);

450

time = ktime_add_ns(timer->base->get_time(), delta);

451

452

hrtimer_set_expires(timer, time);

452

hrtimer_set_expires(timer, time);

453

454

if (rq == this_rq()) {

454

if (rq == this_rq()) {

455

__hrtick_restart(rq);

455

__hrtick_restart(rq);

456

} else if (!rq->hrtick_csd_pending) {

456

} else if (!rq->hrtick_csd_pending) {

457

smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);

457

smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);

458

rq->hrtick_csd_pending = 1;

458

rq->hrtick_csd_pending = 1;

459

}

459

}

460

}

460

}

461

462

static int

462

static int

463

hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)

463

hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)

464

{

464

{

465

int cpu = (int)(long)hcpu;

465

int cpu = (int)(long)hcpu;

466

467

switch (action) {

467

switch (action) {

468

case CPU_UP_CANCELED:

468

case CPU_UP_CANCELED:

469

case CPU_UP_CANCELED_FROZEN:

469

case CPU_UP_CANCELED_FROZEN:

470

case CPU_DOWN_PREPARE:

470

case CPU_DOWN_PREPARE:

471

case CPU_DOWN_PREPARE_FROZEN:

471

case CPU_DOWN_PREPARE_FROZEN:

472

case CPU_DEAD:

472

case CPU_DEAD:

473

case CPU_DEAD_FROZEN:

473

case CPU_DEAD_FROZEN:

474

hrtick_clear(cpu_rq(cpu));

474

hrtick_clear(cpu_rq(cpu));

475

return NOTIFY_OK;

475

return NOTIFY_OK;

476

}

476

}

477

478

return NOTIFY_DONE;

478

return NOTIFY_DONE;

479

}

479

}

480

481

static __init void init_hrtick(void)

481

static __init void init_hrtick(void)

482

{

482

{

483

hotcpu_notifier(hotplug_hrtick, 0);

483

hotcpu_notifier(hotplug_hrtick, 0);

484

}

484

}

485

#else

485

#else

486

/*

486

/*

487

* Called to set the hrtick timer state.

487

* Called to set the hrtick timer state.

488

*

488

*

489

* called with rq->lock held and irqs disabled

489

* called with rq->lock held and irqs disabled

490

*/

490

*/

491

void hrtick_start(struct rq *rq, u64 delay)

491

void hrtick_start(struct rq *rq, u64 delay)

492

{

492

{

493

__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,

493

__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,

494

HRTIMER_MODE_REL_PINNED, 0);

494

HRTIMER_MODE_REL_PINNED, 0);

495

}

495

}

496

497

/* init_hrtick - nothing to set up in the !CONFIG_SMP build. */
static inline void init_hrtick(void) { }

500

#endif /* CONFIG_SMP */

500

#endif /* CONFIG_SMP */

501

502

static void init_rq_hrtick(struct rq *rq)

502

static void init_rq_hrtick(struct rq *rq)

503

{

503

{

504

#ifdef CONFIG_SMP

504

#ifdef CONFIG_SMP

505

rq->hrtick_csd_pending = 0;

505

rq->hrtick_csd_pending = 0;

506

507

rq->hrtick_csd.flags = 0;

507

rq->hrtick_csd.flags = 0;

508

rq->hrtick_csd.func = __hrtick_start;

508

rq->hrtick_csd.func = __hrtick_start;

509

rq->hrtick_csd.info = rq;

509

rq->hrtick_csd.info = rq;

510

#endif

510

#endif

511

512

hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

512

hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

513

rq->hrtick_timer.function = hrtick;

513

rq->hrtick_timer.function = hrtick;

514

}

514

}

515

#else /* CONFIG_SCHED_HRTICK */

515

#else /* CONFIG_SCHED_HRTICK */

516

/* No-op stand-ins used when CONFIG_SCHED_HRTICK is not set. */
static inline void hrtick_clear(struct rq *rq) { }

static inline void init_rq_hrtick(struct rq *rq) { }

static inline void init_hrtick(void) { }

527

#endif /* CONFIG_SCHED_HRTICK */

527

#endif /* CONFIG_SCHED_HRTICK */

528

529

/*
 * fetch_or - atomically OR @val into *@ptr and return the previous value.
 *
 * cmpxchg based; a macro so it works for different integer types.
 * Both arguments are captured in locals first, so @ptr and @val are
 * each evaluated exactly once even when the cmpxchg loop has to retry.
 * The loop retries with the value cmpxchg() observed until the exchange
 * succeeds.
 */
#define fetch_or(ptr, val)						\
({	typeof(ptr) __ptr = (ptr);					\
	typeof(*__ptr) __mask = (val);					\
	typeof(*__ptr) __old, __val = *__ptr;				\
	for (;;) {							\
		__old = cmpxchg(__ptr, __val, __val | __mask);		\
		if (__old == __val)					\
			break;						\
		__val = __old;						\
	}								\
	__old;								\
})

542

543

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)

543

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)

544

/*

544

/*

545

* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,

545

* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,

546

* this avoids any races wrt polling state changes and thereby avoids

546

* this avoids any races wrt polling state changes and thereby avoids

547

* spurious IPIs.

547

* spurious IPIs.

548

*/

548

*/

549

static bool set_nr_and_not_polling(struct task_struct *p)

549

static bool set_nr_and_not_polling(struct task_struct *p)

550

{

550

{

551

struct thread_info *ti = task_thread_info(p);

551

struct thread_info *ti = task_thread_info(p);

552

return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);

552

return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);

553

}

553

}

554

555

/*

555

/*

556

* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.

556

* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.

557

*

557

*

558

* If this returns true, then the idle task promises to call

558

* If this returns true, then the idle task promises to call

559

* sched_ttwu_pending() and reschedule soon.

559

* sched_ttwu_pending() and reschedule soon.

560

*/

560

*/

561

static bool set_nr_if_polling(struct task_struct *p)

561

static bool set_nr_if_polling(struct task_struct *p)

562

{

562

{

563

struct thread_info *ti = task_thread_info(p);

563

struct thread_info *ti = task_thread_info(p);

564

typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);

564

typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);

565

566

for (;;) {

566

for (;;) {

567

if (!(val & _TIF_POLLING_NRFLAG))

567

if (!(val & _TIF_POLLING_NRFLAG))

568

return false;

568

return false;

569

if (val & _TIF_NEED_RESCHED)

569

if (val & _TIF_NEED_RESCHED)

570

return true;

570

return true;

571

old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);

571

old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);

572

if (old == val)

572

if (old == val)

573

break;

573

break;

574

val = old;

574

val = old;

575

}

575

}

576

return true;

576

return true;

577

}

577

}

578

579

#else

579

#else

580

static bool set_nr_and_not_polling(struct task_struct *p)

580

static bool set_nr_and_not_polling(struct task_struct *p)

581

{

581

{

582

set_tsk_need_resched(p);

582

set_tsk_need_resched(p);

583

return true;

583

return true;

584

}

584

}

585

586

#ifdef CONFIG_SMP

586

#ifdef CONFIG_SMP

587

static bool set_nr_if_polling(struct task_struct *p)

587

static bool set_nr_if_polling(struct task_struct *p)

588

{

588

{

589

return false;

589

return false;

590

}

590

}

591

#endif

591

#endif

592

#endif

592

#endif

593

594

/*

594

/*

595

* resched_curr - mark rq's current task 'to be rescheduled now'.

595

* resched_curr - mark rq's current task 'to be rescheduled now'.

596

*

596

*

597

* On UP this means the setting of the need_resched flag, on SMP it

597

* On UP this means the setting of the need_resched flag, on SMP it

598

* might also involve a cross-CPU call to trigger the scheduler on

598

* might also involve a cross-CPU call to trigger the scheduler on

599

* the target CPU.

599

* the target CPU.

600

*/

600

*/

601

void resched_curr(struct rq *rq)

601

void resched_curr(struct rq *rq)

602

{

602

{

603

struct task_struct *curr = rq->curr;

603

struct task_struct *curr = rq->curr;

604

int cpu;

604

int cpu;

605

606

lockdep_assert_held(&rq->lock);

606

lockdep_assert_held(&rq->lock);

607

608

if (test_tsk_need_resched(curr))

608

if (test_tsk_need_resched(curr))

609

return;

609

return;

610

611

cpu = cpu_of(rq);

611

cpu = cpu_of(rq);

612

613

if (cpu == smp_processor_id()) {

613

if (cpu == smp_processor_id()) {

614

set_tsk_need_resched(curr);

614

set_tsk_need_resched(curr);

615

set_preempt_need_resched();

615

set_preempt_need_resched();

616

return;

616

return;

617

}

617

}

618

619

if (set_nr_and_not_polling(curr))

619

if (set_nr_and_not_polling(curr))

620

smp_send_reschedule(cpu);

620

smp_send_reschedule(cpu);

621

else

621

else

622

trace_sched_wake_idle_without_ipi(cpu);

622

trace_sched_wake_idle_without_ipi(cpu);

623

}

623

}

624

625

void resched_cpu(int cpu)

625

void resched_cpu(int cpu)

626

{

626

{

627

struct rq *rq = cpu_rq(cpu);

627

struct rq *rq = cpu_rq(cpu);

628

unsigned long flags;

628

unsigned long flags;

629

630

if (!raw_spin_trylock_irqsave(&rq->lock, flags))

630

if (!raw_spin_trylock_irqsave(&rq->lock, flags))

631

return;

631

return;

632

resched_curr(rq);

632

resched_curr(rq);

633

raw_spin_unlock_irqrestore(&rq->lock, flags);

633

raw_spin_unlock_irqrestore(&rq->lock, flags);

634

}

634

}

635

636

#ifdef CONFIG_SMP

636

#ifdef CONFIG_SMP

637

#ifdef CONFIG_NO_HZ_COMMON

637

#ifdef CONFIG_NO_HZ_COMMON

638

/*

638

/*

639

* In the semi idle case, use the nearest busy cpu for migrating timers

639

* In the semi idle case, use the nearest busy cpu for migrating timers

640

* from an idle cpu. This is good for power-savings.

640

* from an idle cpu. This is good for power-savings.

641

*

641

*

642

* We don't do similar optimization for completely idle system, as

642

* We don't do similar optimization for completely idle system, as

643

* selecting an idle cpu will add more delays to the timers than intended

643

* selecting an idle cpu will add more delays to the timers than intended

644

* (as that cpu's timer base may not be uptodate wrt jiffies etc).

644

* (as that cpu's timer base may not be uptodate wrt jiffies etc).

645

*/

645

*/

646

int get_nohz_timer_target(int pinned)

646

int get_nohz_timer_target(int pinned)

647

{

647

{

648

int cpu = smp_processor_id();

648

int cpu = smp_processor_id();

649

int i;

649

int i;

650

struct sched_domain *sd;

650

struct sched_domain *sd;

651

652

if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))

652

if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))

653

return cpu;

653

return cpu;

654

655

rcu_read_lock();

655

rcu_read_lock();

656

for_each_domain(cpu, sd) {

656

for_each_domain(cpu, sd) {

657

for_each_cpu(i, sched_domain_span(sd)) {

657

for_each_cpu(i, sched_domain_span(sd)) {

658

if (!idle_cpu(i)) {

658

if (!idle_cpu(i)) {

659

cpu = i;

659

cpu = i;

660

goto unlock;

660

goto unlock;

661

}

661

}

662

}

662

}

663

}

663

}

664

unlock:

664

unlock:

665

rcu_read_unlock();

665

rcu_read_unlock();

666

return cpu;

666

return cpu;

667

}

667

}

668

/*

668

/*

669

* When add_timer_on() enqueues a timer into the timer wheel of an

669

* When add_timer_on() enqueues a timer into the timer wheel of an

670

* idle CPU then this timer might expire before the next timer event

670

* idle CPU then this timer might expire before the next timer event

671

* which is scheduled to wake up that CPU. In case of a completely

671

* which is scheduled to wake up that CPU. In case of a completely

672

* idle system the next event might even be infinite time into the

672

* idle system the next event might even be infinite time into the

673

* future. wake_up_idle_cpu() ensures that the CPU is woken up and

673

* future. wake_up_idle_cpu() ensures that the CPU is woken up and

674

* leaves the inner idle loop so the newly added timer is taken into

674

* leaves the inner idle loop so the newly added timer is taken into

675

* account when the CPU goes back to idle and evaluates the timer

675

* account when the CPU goes back to idle and evaluates the timer

676

* wheel for the next timer event.

676

* wheel for the next timer event.

677

*/

677

*/

678

static void wake_up_idle_cpu(int cpu)

678

static void wake_up_idle_cpu(int cpu)

679

{

679

{

680

struct rq *rq = cpu_rq(cpu);

680

struct rq *rq = cpu_rq(cpu);

681

682

if (cpu == smp_processor_id())

682

if (cpu == smp_processor_id())

683

return;

683

return;

684

685

if (set_nr_and_not_polling(rq->idle))

685

if (set_nr_and_not_polling(rq->idle))

686

smp_send_reschedule(cpu);

686

smp_send_reschedule(cpu);

687

else

687

else

688

trace_sched_wake_idle_without_ipi(cpu);

688

trace_sched_wake_idle_without_ipi(cpu);

689

}

689

}

690

691

/*
 * Kick @cpu if it is a full-nohz CPU.
 *
 * Returns true whenever tick_nohz_full_cpu(@cpu) holds, whether or not a
 * kick was actually sent; returns false otherwise, in which case
 * wake_up_nohz_cpu() falls back to wake_up_idle_cpu().
 */
static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (tick_nohz_full_cpu(cpu)) {
		/*
		 * The kick is skipped only for the local CPU when
		 * tick_nohz_tick_stopped() is false.
		 */
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

708

709

/*
 * Wake up @cpu: try the full-nohz path first and, if it does not apply
 * (wake_up_full_nohz_cpu() returned false), use the idle-CPU wakeup.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

714

715

static inline bool got_nohz_idle_kick(void)

715

static inline bool got_nohz_idle_kick(void)

716

{

716

{

717

int cpu = smp_processor_id();

717

int cpu = smp_processor_id();

718

719

if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))

719

if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))

720

return false;

720

return false;

721

722

if (idle_cpu(cpu) && !need_resched())

722

if (idle_cpu(cpu) && !need_resched())

723

return true;

723

return true;

724

725

/*

725

/*

726

* We can't run Idle Load Balance on this CPU for this time so we

726

* We can't run Idle Load Balance on this CPU for this time so we

727

* cancel it and clear NOHZ_BALANCE_KICK

727

* cancel it and clear NOHZ_BALANCE_KICK

728

*/

728

*/

729

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));

729

clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));

730

return false;

730

return false;

731

}

731

}

732

733

#else /* CONFIG_NO_HZ_COMMON */

733

#else /* CONFIG_NO_HZ_COMMON */

734

735

/* !CONFIG_NO_HZ_COMMON stub: there is never a nohz idle kick to report. */
static inline bool got_nohz_idle_kick(void)
{
	return false;
}

739

740

#endif /* CONFIG_NO_HZ_COMMON */

740

#endif /* CONFIG_NO_HZ_COMMON */

741

742

#ifdef CONFIG_NO_HZ_FULL

742

#ifdef CONFIG_NO_HZ_FULL

743

bool sched_can_stop_tick(void)

743

bool sched_can_stop_tick(void)

744

{

744

{

745

/*

745

/*

746

* More than one running task need preemption.

746

* More than one running task need preemption.

747

* nr_running update is assumed to be visible

747

* nr_running update is assumed to be visible

748

* after IPI is sent from wakers.

748

* after IPI is sent from wakers.

749

*/

749

*/

750

if (this_rq()->nr_running > 1)

750

if (this_rq()->nr_running > 1)

751

return false;

751

return false;

752

753

return true;

753

return true;

754

}

754

}

755

#endif /* CONFIG_NO_HZ_FULL */

755

#endif /* CONFIG_NO_HZ_FULL */

756

757

void sched_avg_update(struct rq *rq)

757

void sched_avg_update(struct rq *rq)

758

{

758

{

759

s64 period = sched_avg_period();

759

s64 period = sched_avg_period();

760

761

while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {

761

while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {

762

/*

762

/*

763

* Inline assembly required to prevent the compiler

763

* Inline assembly required to prevent the compiler

764

* optimising this loop into a divmod call.

764

* optimising this loop into a divmod call.

765

* See __iter_div_u64_rem() for another example of this.

765

* See __iter_div_u64_rem() for another example of this.

766

*/

766

*/

767

asm("" : "+rm" (rq->age_stamp));

767

asm("" : "+rm" (rq->age_stamp));

768

rq->age_stamp += period;

768

rq->age_stamp += period;

769

rq->rt_avg /= 2;

769

rq->rt_avg /= 2;

770

}

770

}

771

}

771

}

772

773

#endif /* CONFIG_SMP */

773

#endif /* CONFIG_SMP */

774

775

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \

775

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \

776

(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))

776

(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))

777

/*

777

/*

778

* Iterate task_group tree rooted at *from, calling @down when first entering a

778

* Iterate task_group tree rooted at *from, calling @down when first entering a

779

* node and @up when leaving it for the final time.

779

* node and @up when leaving it for the final time.

780

*

780

*

781

* Caller must hold rcu_lock or sufficient equivalent.

781

* Caller must hold rcu_lock or sufficient equivalent.

782

*/

782

*/

783

int walk_tg_tree_from(struct task_group *from,

783

int walk_tg_tree_from(struct task_group *from,

784

tg_visitor down, tg_visitor up, void *data)

784

tg_visitor down, tg_visitor up, void *data)

785

{

785

{

786

struct task_group *parent, *child;

786

struct task_group *parent, *child;

787

int ret;

787

int ret;

788

789

parent = from;

789

parent = from;

790

791

down:

791

down:

792

ret = (*down)(parent, data);

792

ret = (*down)(parent, data);

793

if (ret)

793

if (ret)

794

goto out;

794

goto out;

795

list_for_each_entry_rcu(child, &parent->children, siblings) {

795

list_for_each_entry_rcu(child, &parent->children, siblings) {

796

parent = child;

796

parent = child;

797

goto down;

797

goto down;

798

799

up:

799

up:

800

continue;

800

continue;

801

}

801

}

802

ret = (*up)(parent, data);

802

ret = (*up)(parent, data);

803

if (ret || parent == from)

803

if (ret || parent == from)

804

goto out;

804

goto out;

805

806

child = parent;

806

child = parent;

807

parent = parent->parent;

807

parent = parent->parent;

808

if (parent)

808

if (parent)

809

goto up;

809

goto up;

810

out:

810

out:

811

return ret;

811

return ret;

812

}

812

}

813

814

/*
 * No-op task_group visitor: ignores both arguments and returns 0, so it
 * never stops a walk_tg_tree_from() traversal.
 */
int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}

818

#endif

818

#endif

819

820

static void set_load_weight(struct task_struct *p)

820

static void set_load_weight(struct task_struct *p)

821

{

821

{

822

int prio = p->static_prio - MAX_RT_PRIO;

822

int prio = p->static_prio - MAX_RT_PRIO;

823

struct load_weight *load = &p->se.load;

823

struct load_weight *load = &p->se.load;

824

825

/*

825

/*

826

* SCHED_IDLE tasks get minimal weight:

826

* SCHED_IDLE tasks get minimal weight:

827

*/

827

*/

828

if (p->policy == SCHED_IDLE) {

828

if (p->policy == SCHED_IDLE) {

829

load->weight = scale_load(WEIGHT_IDLEPRIO);

829

load->weight = scale_load(WEIGHT_IDLEPRIO);

830

load->inv_weight = WMULT_IDLEPRIO;

830

load->inv_weight = WMULT_IDLEPRIO;

831

return;

831

return;

832

}

832

}

833

834

load->weight = scale_load(prio_to_weight[prio]);

834

load->weight = scale_load(prio_to_weight[prio]);

835

load->inv_weight = prio_to_wmult[prio];

835

load->inv_weight = prio_to_wmult[prio];

836

}

836

}

837

838

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)

838

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)

839

{

839

{

840

update_rq_clock(rq);

840

update_rq_clock(rq);

841

sched_info_queued(rq, p);

841

sched_info_queued(rq, p);

842

p->sched_class->enqueue_task(rq, p, flags);

842

p->sched_class->enqueue_task(rq, p, flags);

843

}

843

}

844

845

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)

845

static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)

846

{

846

{

847

update_rq_clock(rq);

847

update_rq_clock(rq);

848

sched_info_dequeued(rq, p);

848

sched_info_dequeued(rq, p);

849

p->sched_class->dequeue_task(rq, p, flags);

849

p->sched_class->dequeue_task(rq, p, flags);

850

}

850

}

851

852

void activate_task(struct rq *rq, struct task_struct *p, int flags)

852

void activate_task(struct rq *rq, struct task_struct *p, int flags)

853

{

853

{

854

if (task_contributes_to_load(p))

854

if (task_contributes_to_load(p))

855

rq->nr_uninterruptible--;

855

rq->nr_uninterruptible--;

856

857

enqueue_task(rq, p, flags);

857

enqueue_task(rq, p, flags);

858

}

858

}

859

860

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)

860

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)

861

{

861

{

862

if (task_contributes_to_load(p))

862

if (task_contributes_to_load(p))

863

rq->nr_uninterruptible++;

863

rq->nr_uninterruptible++;

864

865

dequeue_task(rq, p, flags);

865

dequeue_task(rq, p, flags);

866

}

866

}

867

868

/*
 * Advance rq->clock_task by @delta, minus whatever part of @delta is
 * attributed to irq time (CONFIG_IRQ_TIME_ACCOUNTING) and to steal time
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING).  Each deduction is capped at the
 * remaining @delta, so clock_task never moves backwards.
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* Same capping as for irq_delta above. */
		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	/* Feed the deducted non-task time into the rt average. */
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

921

922

void sched_set_stop_task(int cpu, struct task_struct *stop)

922

void sched_set_stop_task(int cpu, struct task_struct *stop)

923

{

923

{

924

struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

924

struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

925

struct task_struct *old_stop = cpu_rq(cpu)->stop;

925

struct task_struct *old_stop = cpu_rq(cpu)->stop;

926

927

if (stop) {

927

if (stop) {

928

/*

928

/*

929

* Make it appear like a SCHED_FIFO task, its something

929

* Make it appear like a SCHED_FIFO task, its something

930

* userspace knows about and won't get confused about.

930

* userspace knows about and won't get confused about.

931

*

931

*

932

* Also, it will make PI more or less work without too

932

* Also, it will make PI more or less work without too

933

* much confusion -- but then, stop work should not

933

* much confusion -- but then, stop work should not

934

* rely on PI working anyway.

934

* rely on PI working anyway.

935

*/

935

*/

936

sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

936

sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

937

938

stop->sched_class = &stop_sched_class;

938

stop->sched_class = &stop_sched_class;

939

}

939

}

940

941

cpu_rq(cpu)->stop = stop;

941

cpu_rq(cpu)->stop = stop;

942

943

if (old_stop) {

943

if (old_stop) {

944

/*

944

/*

945

* Reset it back to a normal scheduling class so that

945

* Reset it back to a normal scheduling class so that

946

* it can die in pieces.

946

* it can die in pieces.

947

*/

947

*/

948

old_stop->sched_class = &rt_sched_class;

948

old_stop->sched_class = &rt_sched_class;

949

}

949

}

950

}

950

}

951

952

/*

952

/*

953

* __normal_prio - return the priority that is based on the static prio

953

* __normal_prio - return the priority that is based on the static prio

954

*/

954

*/

955

static inline int __normal_prio(struct task_struct *p)

955

static inline int __normal_prio(struct task_struct *p)

956

{

956

{

957

return p->static_prio;

957

return p->static_prio;

958

}

958

}

959

960

/*

960

/*

961

* Calculate the expected normal priority: i.e. priority

961

* Calculate the expected normal priority: i.e. priority

962

* without taking RT-inheritance into account. Might be

962

* without taking RT-inheritance into account. Might be

963

* boosted by interactivity modifiers. Changes upon fork,

963

* boosted by interactivity modifiers. Changes upon fork,

964

* setprio syscalls, and whenever the interactivity

964

* setprio syscalls, and whenever the interactivity

965

* estimator recalculates.

965

* estimator recalculates.

966

*/

966

*/

967

static inline int normal_prio(struct task_struct *p)

967

static inline int normal_prio(struct task_struct *p)

968

{

968

{

969

int prio;

969

int prio;

970

971

if (task_has_dl_policy(p))

971

if (task_has_dl_policy(p))

972

prio = MAX_DL_PRIO-1;

972

prio = MAX_DL_PRIO-1;

973

else if (task_has_rt_policy(p))

973

else if (task_has_rt_policy(p))

974

prio = MAX_RT_PRIO-1 - p->rt_priority;

974

prio = MAX_RT_PRIO-1 - p->rt_priority;

975

else

975

else

976

prio = __normal_prio(p);

976

prio = __normal_prio(p);

977

return prio;

977

return prio;

978

}

978

}

979

980

/*

980

/*

981

* Calculate the current priority, i.e. the priority

981

* Calculate the current priority, i.e. the priority

982

* taken into account by the scheduler. This value might

982

* taken into account by the scheduler. This value might

983

* be boosted by RT tasks, or might be boosted by

983

* be boosted by RT tasks, or might be boosted by

984

* interactivity modifiers. Will be RT if the task got

984

* interactivity modifiers. Will be RT if the task got

985

* RT-boosted. If not then it returns p->normal_prio.

985

* RT-boosted. If not then it returns p->normal_prio.

986

*/

986

*/

987

static int effective_prio(struct task_struct *p)

987

static int effective_prio(struct task_struct *p)

988

{

988

{

989

p->normal_prio = normal_prio(p);

989

p->normal_prio = normal_prio(p);

990

/*

990

/*

991

* If we are RT tasks or we were boosted to RT priority,

991

* If we are RT tasks or we were boosted to RT priority,

992

* keep the priority unchanged. Otherwise, update priority

992

* keep the priority unchanged. Otherwise, update priority

993

* to the normal priority:

993

* to the normal priority:

994

*/

994

*/

995

if (!rt_prio(p->prio))

995

if (!rt_prio(p->prio))

996

return p->normal_prio;

996

return p->normal_prio;

997

return p->prio;

997

return p->prio;

998

}

998

}

999

1000

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Compares @p against the current task of the CPU that task_cpu(@p) reports.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

1010

1011

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

1011

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

1012

const struct sched_class *prev_class,

1012

const struct sched_class *prev_class,

1013

int oldprio)

1013

int oldprio)

1014

{

1014

{

1015

if (prev_class != p->sched_class) {

1015

if (prev_class != p->sched_class) {

1016

if (prev_class->switched_from)

1016

if (prev_class->switched_from)

1017

prev_class->switched_from(rq, p);

1017

prev_class->switched_from(rq, p);

1018

p->sched_class->switched_to(rq, p);

1018

p->sched_class->switched_to(rq, p);

1019

} else if (oldprio != p->prio || dl_task(p))

1019

} else if (oldprio != p->prio || dl_task(p))

1020

p->sched_class->prio_changed(rq, p, oldprio);

1020

p->sched_class->prio_changed(rq, p, oldprio);

1021

}

1021

}

1022

1023

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)

1023

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)

1024

{

1024

{

1025

const struct sched_class *class;

1025

const struct sched_class *class;

1026

1027

if (p->sched_class == rq->curr->sched_class) {

1027

if (p->sched_class == rq->curr->sched_class) {

1028

rq->curr->sched_class->check_preempt_curr(rq, p, flags);

1028

rq->curr->sched_class->check_preempt_curr(rq, p, flags);

1029

} else {

1029

} else {

1030

for_each_class(class) {

1030

for_each_class(class) {

1031

if (class == rq->curr->sched_class)

1031

if (class == rq->curr->sched_class)

1032

break;

1032

break;

1033

if (class == p->sched_class) {

1033

if (class == p->sched_class) {

1034

resched_curr(rq);

1034

resched_curr(rq);

1035

break;

1035

break;

1036

}

1036

}

1037

}

1037

}

1038

}

1038

}

1039

1040

/*

1040

/*

1041

* A queue event has occurred, and we're going to schedule. In

1041

* A queue event has occurred, and we're going to schedule. In

1042

* this case, we can save a useless back to back clock update.

1042

* this case, we can save a useless back to back clock update.

1043

*/

1043

*/

1044

if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))

1044

if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))

1045

rq->skip_clock_update = 1;

1045

rq->skip_clock_update = 1;

1046

}

1046

}

1047

1048

#ifdef CONFIG_SMP

1048

#ifdef CONFIG_SMP

1049

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

1049

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

1050

{

1050

{

1051

#ifdef CONFIG_SCHED_DEBUG

1051

#ifdef CONFIG_SCHED_DEBUG

1052

/*

1052

/*

1053

* We should never call set_task_cpu() on a blocked task,

1053

* We should never call set_task_cpu() on a blocked task,

1054

* ttwu() will sort out the placement.

1054

* ttwu() will sort out the placement.

1055

*/

1055

*/

1056

WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&

1056

WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&

1057

!(task_preempt_count(p) & PREEMPT_ACTIVE));

1057

!(task_preempt_count(p) & PREEMPT_ACTIVE));

1058

1059

#ifdef CONFIG_LOCKDEP

1059

#ifdef CONFIG_LOCKDEP

1060

/*

1060

/*

1061

* The caller should hold either p->pi_lock or rq->lock, when changing

1061

* The caller should hold either p->pi_lock or rq->lock, when changing

1062

* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.

1062

* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.

1063

*

1063

*

1064

* sched_move_task() holds both and thus holding either pins the cgroup,

1064

* sched_move_task() holds both and thus holding either pins the cgroup,

1065

* see task_group().

1065

* see task_group().

1066

*

1066

*

1067

* Furthermore, all task_rq users should acquire both locks, see

1067

* Furthermore, all task_rq users should acquire both locks, see

1068

* task_rq_lock().

1068

* task_rq_lock().

1069

*/

1069

*/

1070

WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||

1070

WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||

1071

lockdep_is_held(&task_rq(p)->lock)));

1071

lockdep_is_held(&task_rq(p)->lock)));

1072

#endif

1072

#endif

1073

#endif

1073

#endif

1074

1075

trace_sched_migrate_task(p, new_cpu);

1075

trace_sched_migrate_task(p, new_cpu);

1076

1077

if (task_cpu(p) != new_cpu) {

1077

if (task_cpu(p) != new_cpu) {

1078

if (p->sched_class->migrate_task_rq)

1078

if (p->sched_class->migrate_task_rq)

1079

p->sched_class->migrate_task_rq(p, new_cpu);

1079

p->sched_class->migrate_task_rq(p, new_cpu);

1080

p->se.nr_migrations++;

1080

p->se.nr_migrations++;

1081

perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

1081

perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

1082

}

1082

}

1083

1084

__set_task_cpu(p, new_cpu);

1084

__set_task_cpu(p, new_cpu);

1085

}

1085

}

1086

1087

static void __migrate_swap_task(struct task_struct *p, int cpu)

1087

static void __migrate_swap_task(struct task_struct *p, int cpu)

1088

{

1088

{

1089

if (task_on_rq_queued(p)) {

1089

if (task_on_rq_queued(p)) {

1090

struct rq *src_rq, *dst_rq;

1090

struct rq *src_rq, *dst_rq;

1091

1092

src_rq = task_rq(p);

1092

src_rq = task_rq(p);

1093

dst_rq = cpu_rq(cpu);

1093

dst_rq = cpu_rq(cpu);

1094

1095

deactivate_task(src_rq, p, 0);

1095

deactivate_task(src_rq, p, 0);

1096

set_task_cpu(p, cpu);

1096

set_task_cpu(p, cpu);

1097

activate_task(dst_rq, p, 0);

1097

activate_task(dst_rq, p, 0);

1098

check_preempt_curr(dst_rq, p, 0);

1098

check_preempt_curr(dst_rq, p, 0);

1099

} else {

1099

} else {

1100

/*

1100

/*

1101

* Task isn't running anymore; make it appear like we migrated

1101

* Task isn't running anymore; make it appear like we migrated

1102

* it before it went to sleep. This means on wakeup we make the

1102

* it before it went to sleep. This means on wakeup we make the

1103

* previous cpu our targer instead of where it really is.

1103

* previous cpu our targer instead of where it really is.

1104

*/

1104

*/

1105

p->wake_cpu = cpu;

1105

p->wake_cpu = cpu;

1106

}

1106

}

1107

}

1107

}

1108

1109

/*
 * Argument bundle passed from migrate_swap() to migrate_swap_stop().
 */
struct migration_swap_arg {
	/* The two tasks whose CPUs are to be exchanged. */
	struct task_struct *src_task, *dst_task;
	/* CPUs sampled with task_cpu() in migrate_swap(); re-checked under locks. */
	int src_cpu, dst_cpu;
};

1113

1114

static int migrate_swap_stop(void *data)

1114

static int migrate_swap_stop(void *data)

1115

{

1115

{

1116

struct migration_swap_arg *arg = data;

1116

struct migration_swap_arg *arg = data;

1117

struct rq *src_rq, *dst_rq;

1117

struct rq *src_rq, *dst_rq;

1118

int ret = -EAGAIN;

1118

int ret = -EAGAIN;

1119

1120

src_rq = cpu_rq(arg->src_cpu);

1120

src_rq = cpu_rq(arg->src_cpu);

1121

dst_rq = cpu_rq(arg->dst_cpu);

1121

dst_rq = cpu_rq(arg->dst_cpu);

1122

1123

double_raw_lock(&arg->src_task->pi_lock,

1123

double_raw_lock(&arg->src_task->pi_lock,

1124

&arg->dst_task->pi_lock);

1124

&arg->dst_task->pi_lock);

1125

double_rq_lock(src_rq, dst_rq);

1125

double_rq_lock(src_rq, dst_rq);

1126

if (task_cpu(arg->dst_task) != arg->dst_cpu)

1126

if (task_cpu(arg->dst_task) != arg->dst_cpu)

1127

goto unlock;

1127

goto unlock;

1128

1129

if (task_cpu(arg->src_task) != arg->src_cpu)

1129

if (task_cpu(arg->src_task) != arg->src_cpu)

1130

goto unlock;

1130

goto unlock;

1131

1132

if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))

1132

if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))

1133

goto unlock;

1133

goto unlock;

1134

1135

if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))

1135

if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))

1136

goto unlock;

1136

goto unlock;

1137

1138

__migrate_swap_task(arg->src_task, arg->dst_cpu);

1138

__migrate_swap_task(arg->src_task, arg->dst_cpu);

1139

__migrate_swap_task(arg->dst_task, arg->src_cpu);

1139

__migrate_swap_task(arg->dst_task, arg->src_cpu);

1140

1141

ret = 0;

1141

ret = 0;

1142

1143

unlock:

1143

unlock:

1144

double_rq_unlock(src_rq, dst_rq);

1144

double_rq_unlock(src_rq, dst_rq);

1145

raw_spin_unlock(&arg->dst_task->pi_lock);

1145

raw_spin_unlock(&arg->dst_task->pi_lock);

1146

raw_spin_unlock(&arg->src_task->pi_lock);

1146

raw_spin_unlock(&arg->src_task->pi_lock);

1147

1148

return ret;

1148

return ret;

1149

}

1149

}

1150

1151

/*

1151

/*

1152

* Cross migrate two tasks

1152

* Cross migrate two tasks

1153

*/

1153

*/

1154

int migrate_swap(struct task_struct *cur, struct task_struct *p)

1154

int migrate_swap(struct task_struct *cur, struct task_struct *p)

1155

{

1155

{

1156

struct migration_swap_arg arg;

1156

struct migration_swap_arg arg;

1157

int ret = -EINVAL;

1157

int ret = -EINVAL;

1158

1159

arg = (struct migration_swap_arg){

1159

arg = (struct migration_swap_arg){

1160

.src_task = cur,

1160

.src_task = cur,

1161

.src_cpu = task_cpu(cur),

1161

.src_cpu = task_cpu(cur),

1162

.dst_task = p,

1162

.dst_task = p,

1163

.dst_cpu = task_cpu(p),

1163

.dst_cpu = task_cpu(p),

1164

};

1164

};

1165

1166

if (arg.src_cpu == arg.dst_cpu)

1166

if (arg.src_cpu == arg.dst_cpu)

1167

goto out;

1167

goto out;

1168

1169

/*

1169

/*

1170

* These three tests are all lockless; this is OK since all of them

1170

* These three tests are all lockless; this is OK since all of them

1171

* will be re-checked with proper locks held further down the line.

1171

* will be re-checked with proper locks held further down the line.

1172

*/

1172

*/

1173

if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))

1173

if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))

1174

goto out;

1174

goto out;

1175

1176

if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))

1176

if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))

1177

goto out;

1177

goto out;

1178

1179

if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))

1179

if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))

1180

goto out;

1180

goto out;

1181

1182

trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);

1182

trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);

1183

ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

1183

ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

1184

1185

out:

1185

out:

1186

return ret;

1186

return ret;

1187

}

1187

}

1188

1189

/*
 * Argument bundle naming one task and one destination CPU.
 * NOTE(review): presumably the payload handed to migration_cpu_stop()
 * declared below -- confirm against its definition and callers.
 */
struct migration_arg {
	struct task_struct *task;	/* the task concerned */
	int dest_cpu;			/* the CPU it should end up on */
};

/* Forward declaration; the definition lives further down in this file. */
static int migration_cpu_stop(void *data);

1195

1196

/*

1196

/*

1197

* wait_task_inactive - wait for a thread to unschedule.

1197

* wait_task_inactive - wait for a thread to unschedule.

1198

*

1198

*

1199

* If @match_state is nonzero, it's the @p->state value just checked and

1199

* If @match_state is nonzero, it's the @p->state value just checked and

1200

* not expected to change. If it changes, i.e. @p might have woken up,

1200

* not expected to change. If it changes, i.e. @p might have woken up,

1201

* then return zero. When we succeed in waiting for @p to be off its CPU,

1201

* then return zero. When we succeed in waiting for @p to be off its CPU,

1202

* we return a positive number (its total switch count). If a second call

1202

* we return a positive number (its total switch count). If a second call

1203

* a short while later returns the same number, the caller can be sure that

1203

* a short while later returns the same number, the caller can be sure that

1204

* @p has remained unscheduled the whole time.

1204

* @p has remained unscheduled the whole time.

1205

*

1205

*

1206

* The caller must ensure that the task *will* unschedule sometime soon,

1206

* The caller must ensure that the task *will* unschedule sometime soon,

1207

* else this function might spin for a *long* time. This function can't

1207

* else this function might spin for a *long* time. This function can't

1208

* be called with interrupts off, or it may introduce deadlock with

1208

* be called with interrupts off, or it may introduce deadlock with

1209

* smp_call_function() if an IPI is sent by the same process we are

1209

* smp_call_function() if an IPI is sent by the same process we are

1210

* waiting to become inactive.

1210

* waiting to become inactive.

1211

*/

1211

*/

1212

unsigned long wait_task_inactive(struct task_struct *p, long match_state)

1212

unsigned long wait_task_inactive(struct task_struct *p, long match_state)

1213

{

1213

{

1214

unsigned long flags;

1214

unsigned long flags;

1215

int running, queued;

1215

int running, queued;

1216

unsigned long ncsw;

1216

unsigned long ncsw;

1217

struct rq *rq;

1217

struct rq *rq;

1218

1219

for (;;) {

1219

for (;;) {

1220

/*

1220

/*

1221

* We do the initial early heuristics without holding

1221

* We do the initial early heuristics without holding

1222

* any task-queue locks at all. We'll only try to get

1222

* any task-queue locks at all. We'll only try to get

1223

* the runqueue lock when things look like they will

1223

* the runqueue lock when things look like they will

1224

* work out!

1224

* work out!

1225

*/

1225

*/

1226

rq = task_rq(p);

1226

rq = task_rq(p);

1227

1228

/*

1228

/*

1229

* If the task is actively running on another CPU

1229

* If the task is actively running on another CPU

1230

* still, just relax and busy-wait without holding

1230

* still, just relax and busy-wait without holding

1231

* any locks.

1231

* any locks.

1232

*

1232

*

1233

* NOTE! Since we don't hold any locks, it's not

1233

* NOTE! Since we don't hold any locks, it's not

1234

* even sure that "rq" stays as the right runqueue!

1234

* even sure that "rq" stays as the right runqueue!

1235

* But we don't care, since "task_running()" will

1235

* But we don't care, since "task_running()" will

1236

* return false if the runqueue has changed and p

1236

* return false if the runqueue has changed and p

1237

* is actually now running somewhere else!

1237

* is actually now running somewhere else!

1238

*/

1238

*/

1239

while (task_running(rq, p)) {

1239

while (task_running(rq, p)) {

1240

if (match_state && unlikely(p->state != match_state))

1240

if (match_state && unlikely(p->state != match_state))

1241

return 0;

1241

return 0;

1242

cpu_relax();

1242

cpu_relax();

1243

}

1243

}

1244

1245

/*

1245

/*

1246

* Ok, time to look more closely! We need the rq

1246

* Ok, time to look more closely! We need the rq

1247

* lock now, to be *sure*. If we're wrong, we'll

1247

* lock now, to be *sure*. If we're wrong, we'll

1248

* just go back and repeat.

1248

* just go back and repeat.

1249

*/

1249

*/

1250

rq = task_rq_lock(p, &flags);

1250

rq = task_rq_lock(p, &flags);

1251

trace_sched_wait_task(p);

1251

trace_sched_wait_task(p);

1252

running = task_running(rq, p);

1252

running = task_running(rq, p);

1253

queued = task_on_rq_queued(p);

1253

queued = task_on_rq_queued(p);

1254

ncsw = 0;

1254

ncsw = 0;

1255

if (!match_state || p->state == match_state)

1255

if (!match_state || p->state == match_state)

1256

ncsw = p->nvcsw | LONG_MIN; /* sets MSB */

1256

ncsw = p->nvcsw | LONG_MIN; /* sets MSB */

1257

task_rq_unlock(rq, p, &flags);

1257

task_rq_unlock(rq, p, &flags);

1258

1259

/*

1259

/*

1260

* If it changed from the expected state, bail out now.

1260

* If it changed from the expected state, bail out now.

1261

*/

1261

*/

1262

if (unlikely(!ncsw))

1262

if (unlikely(!ncsw))

1263

break;

1263

break;

1264

1265

/*

1265

/*

1266

* Was it really running after all now that we

1266

* Was it really running after all now that we

1267

* checked with the proper locks actually held?

1267

* checked with the proper locks actually held?

1268

*

1268

*

1269

* Oops. Go back and try again..

1269

* Oops. Go back and try again..

1270

*/

1270

*/

1271

if (unlikely(running)) {

1271

if (unlikely(running)) {

1272

cpu_relax();

1272

cpu_relax();

1273

continue;

1273

continue;

1274

}

1274

}

1275

1276

/*

1276

/*

1277

* It's not enough that it's not actively running,

1277

* It's not enough that it's not actively running,

1278

* it must be off the runqueue _entirely_, and not

1278

* it must be off the runqueue _entirely_, and not

1279

* preempted!

1279

* preempted!

1280

*

1280

*

1281

* So if it was still runnable (but just not actively

1281

* So if it was still runnable (but just not actively

1282

* running right now), it's preempted, and we should

1282

* running right now), it's preempted, and we should

1283

* yield - it could be a while.

1283

* yield - it could be a while.

1284

*/

1284

*/

1285

if (unlikely(queued)) {

1285

if (unlikely(queued)) {

1286

ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

1286

ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

1287

1288

set_current_state(TASK_UNINTERRUPTIBLE);

1288

set_current_state(TASK_UNINTERRUPTIBLE);

1289

schedule_hrtimeout(&to, HRTIMER_MODE_REL);

1289

schedule_hrtimeout(&to, HRTIMER_MODE_REL);

1290

continue;

1290

continue;

1291

}

1291

}

1292

1293

/*

1293

/*

1294

* Ahh, all good. It wasn't running, and it wasn't

1294

* Ahh, all good. It wasn't running, and it wasn't

1295

* runnable, which means that it will never become

1295

* runnable, which means that it will never become

1296

* running in the future either. We're all done!

1296

* running in the future either. We're all done!

1297

*/

1297

*/

1298

break;

1298

break;

1299

}

1299

}

1300

1301

return ncsw;

1301

return ncsw;

1302

}

1302

}

1303

1304

/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter kernel-mode,
 * without any delay (to get signals handled).
 *
 * NOTE: the runqueue lock is deliberately not taken.  All that matters is
 * that the remote task enters the kernel.  If the IPI races with a
 * migration of the task to another CPU, no harm is done and the purpose has
 * been achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int target_cpu;

	/* Keep smp_processor_id() stable while we compare against it. */
	preempt_disable();

	target_cpu = task_cpu(p);
	if (target_cpu != smp_processor_id() && task_curr(p)) {
		/* Only a task currently running on a different CPU needs the IPI. */
		smp_send_reschedule(target_cpu);
	}

	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

1328

#endif /* CONFIG_SMP */

1328

#endif /* CONFIG_SMP */

1329

1330

#ifdef CONFIG_SMP

1330

#ifdef CONFIG_SMP

1331

/*

1331

/*

1332

* ->cpus_allowed is protected by both rq->lock and p->pi_lock

1332

* ->cpus_allowed is protected by both rq->lock and p->pi_lock

1333

*/

1333

*/

1334

static int select_fallback_rq(int cpu, struct task_struct *p)

1334

static int select_fallback_rq(int cpu, struct task_struct *p)

1335

{

1335

{

1336

int nid = cpu_to_node(cpu);

1336

int nid = cpu_to_node(cpu);

1337

const struct cpumask *nodemask = NULL;

1337

const struct cpumask *nodemask = NULL;

1338

enum { cpuset, possible, fail } state = cpuset;

1338

enum { cpuset, possible, fail } state = cpuset;

1339

int dest_cpu;

1339

int dest_cpu;

1340

1341

/*

1341

/*

1342

* If the node that the cpu is on has been offlined, cpu_to_node()

1342

* If the node that the cpu is on has been offlined, cpu_to_node()

1343

* will return -1. There is no cpu on the node, and we should

1343

* will return -1. There is no cpu on the node, and we should

1344

* select the cpu on the other node.

1344

* select the cpu on the other node.

1345

*/

1345

*/

1346

if (nid != -1) {

1346

if (nid != -1) {

1347

nodemask = cpumask_of_node(nid);

1347

nodemask = cpumask_of_node(nid);

1348

1349

/* Look for allowed, online CPU in same node. */

1349

/* Look for allowed, online CPU in same node. */

1350

for_each_cpu(dest_cpu, nodemask) {

1350

for_each_cpu(dest_cpu, nodemask) {

1351

if (!cpu_online(dest_cpu))

1351

if (!cpu_online(dest_cpu))

1352

continue;

1352

continue;

1353

if (!cpu_active(dest_cpu))

1353

if (!cpu_active(dest_cpu))

1354

continue;

1354

continue;

1355

if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))

1355

if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))

1356

return dest_cpu;

1356

return dest_cpu;

1357

}

1357

}

1358

}

1358

}

1359

1360

for (;;) {

1360

for (;;) {

1361

/* Any allowed, online CPU? */

1361

/* Any allowed, online CPU? */

1362

for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {

1362

for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {

1363

if (!cpu_online(dest_cpu))

1363

if (!cpu_online(dest_cpu))

1364

continue;

1364

continue;

1365

if (!cpu_active(dest_cpu))

1365

if (!cpu_active(dest_cpu))

1366

continue;

1366

continue;

1367

goto out;

1367

goto out;

1368

}

1368

}

1369

1370

switch (state) {

1370

switch (state) {

1371

case cpuset:

1371

case cpuset:

1372

/* No more Mr. Nice Guy. */

1372

/* No more Mr. Nice Guy. */

1373

cpuset_cpus_allowed_fallback(p);

1373

cpuset_cpus_allowed_fallback(p);

1374

state = possible;

1374

state = possible;

1375

break;

1375

break;

1376

1377

case possible:

1377

case possible:

1378

do_set_cpus_allowed(p, cpu_possible_mask);

1378

do_set_cpus_allowed(p, cpu_possible_mask);

1379

state = fail;

1379

state = fail;

1380

break;

1380

break;

1381

1382

case fail:

1382

case fail:

1383

BUG();

1383

BUG();

1384

break;

1384

break;

1385

}

1385

}

1386

}

1386

}

1387

1388

out:

1388

out:

1389

if (state != cpuset) {

1389

if (state != cpuset) {

1390

/*

1390

/*

1391

* Don't tell them about moving exiting tasks or

1391

* Don't tell them about moving exiting tasks or

1392

* kernel threads (both mm NULL), since they never

1392

* kernel threads (both mm NULL), since they never

1393

* leave kernel.

1393

* leave kernel.

1394

*/

1394

*/

1395

if (p->mm && printk_ratelimit()) {

1395

if (p->mm && printk_ratelimit()) {

1396

printk_deferred("process %d (%s) no longer affine to cpu%d\n",

1396

printk_deferred("process %d (%s) no longer affine to cpu%d\n",

1397

task_pid_nr(p), p->comm, cpu);

1397

task_pid_nr(p), p->comm, cpu);

1398

}

1398

}

1399

}

1399

}

1400

1401

return dest_cpu;

1401

return dest_cpu;

1402

}

1402

}

1403

1404

/*

1404

/*

1405

* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.

1405

* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.

1406

*/

1406

*/

1407

static inline

1407

static inline

1408

int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)

1408

int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)

1409

{

1409

{

1410

cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

1410

cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

1411

1412

/*

1412

/*

1413

* In order not to call set_task_cpu() on a blocking task we need

1413

* In order not to call set_task_cpu() on a blocking task we need

1414

* to rely on ttwu() to place the task on a valid ->cpus_allowed

1414

* to rely on ttwu() to place the task on a valid ->cpus_allowed

1415

* cpu.

1415

* cpu.

1416

*

1416

*

1417

* Since this is common to all placement strategies, this lives here.

1417

* Since this is common to all placement strategies, this lives here.

1418

*

1418

*

1419

* [ this allows ->select_task() to simply return task_cpu(p) and

1419

* [ this allows ->select_task() to simply return task_cpu(p) and

1420

* not worry about this generic constraint ]

1420

* not worry about this generic constraint ]

1421

*/

1421

*/

1422

if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||

1422

if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||

1423

!cpu_online(cpu)))

1423

!cpu_online(cpu)))

1424

cpu = select_fallback_rq(task_cpu(p), p);

1424

cpu = select_fallback_rq(task_cpu(p), p);

1425

1426

return cpu;

1426

return cpu;

1427

}

1427

}

1428

1429

/*
 * update_avg - fold a new sample into a running average.
 * @avg:    the running average, updated in place
 * @sample: the newest observation
 *
 * Moves *avg one eighth of the signed distance towards @sample.
 * NOTE(review): relies on '>>' of a negative s64 being an arithmetic
 * shift -- confirm for the compilers in use.
 */
static void update_avg(u64 *avg, u64 sample)
{
	s64 delta = sample - *avg;

	*avg += delta >> 3;
}

1434

#endif

1434

#endif

1435

1436

/*
 * ttwu_stat - account a wakeup in the scheduler statistics.
 * @p:          the task being woken
 * @cpu:        the CPU the task is being woken on
 * @wake_flags: WF_* flags of this wakeup
 *
 * Compiles to nothing unless CONFIG_SCHEDSTATS is set.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *local_rq = this_rq();

#ifdef CONFIG_SMP
	int waker_cpu = smp_processor_id();

	if (cpu != waker_cpu) {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);

		/*
		 * Charge the first of the waking CPU's domains, in iteration
		 * order, whose span contains the target CPU.
		 */
		rcu_read_lock();
		for_each_domain(waker_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	} else {
		schedstat_inc(local_rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	/* Totals, counted regardless of where the wakeup landed. */
	schedstat_inc(local_rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

1475

1476

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)

1476

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)

1477

{

1477

{

1478

activate_task(rq, p, en_flags);

1478

activate_task(rq, p, en_flags);

1479

p->on_rq = TASK_ON_RQ_QUEUED;

1479

p->on_rq = TASK_ON_RQ_QUEUED;

1480

1481

/* if a worker is waking up, notify workqueue */

1481

/* if a worker is waking up, notify workqueue */

1482

if (p->flags & PF_WQ_WORKER)

1482

if (p->flags & PF_WQ_WORKER)

1483

wq_worker_waking_up(p, cpu_of(rq));

1483

wq_worker_waking_up(p, cpu_of(rq));

1484

}

1484

}

1485

1486

/*

1486

/*

1487

* Mark the task runnable and perform wakeup-preemption.

1487

* Mark the task runnable and perform wakeup-preemption.

1488

*/

1488

*/

1489

static void

1489

static void

1490

ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

1490

ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

1491

{

1491

{

1492

check_preempt_curr(rq, p, wake_flags);

1492

check_preempt_curr(rq, p, wake_flags);

1493

trace_sched_wakeup(p, true);

1493

trace_sched_wakeup(p, true);

1494

1495

p->state = TASK_RUNNING;

1495

p->state = TASK_RUNNING;

1496

#ifdef CONFIG_SMP

1496

#ifdef CONFIG_SMP

1497

if (p->sched_class->task_woken)

1497

if (p->sched_class->task_woken)

1498

p->sched_class->task_woken(rq, p);

1498

p->sched_class->task_woken(rq, p);

1499

1500

if (rq->idle_stamp) {

1500

if (rq->idle_stamp) {

1501

u64 delta = rq_clock(rq) - rq->idle_stamp;

1501

u64 delta = rq_clock(rq) - rq->idle_stamp;

1502

u64 max = 2*rq->max_idle_balance_cost;

1502

u64 max = 2*rq->max_idle_balance_cost;

1503

1504

update_avg(&rq->avg_idle, delta);

1504

update_avg(&rq->avg_idle, delta);

1505

1506

if (rq->avg_idle > max)

1506

if (rq->avg_idle > max)

1507

rq->avg_idle = max;

1507

rq->avg_idle = max;

1508

1509

rq->idle_stamp = 0;

1509

rq->idle_stamp = 0;

1510

}

1510

}

1511

#endif

1511

#endif

1512

}

1512

}

1513

1514

static void

1514

static void

1515

ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)

1515

ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)

1516

{

1516

{

1517

#ifdef CONFIG_SMP

1517

#ifdef CONFIG_SMP

1518

if (p->sched_contributes_to_load)

1518

if (p->sched_contributes_to_load)

1519

rq->nr_uninterruptible--;

1519

rq->nr_uninterruptible--;

1520

#endif

1520

#endif

1521

1522

ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);

1522

ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);

1523

ttwu_do_wakeup(rq, p, wake_flags);

1523

ttwu_do_wakeup(rq, p, wake_flags);

1524

}

1524

}

1525

1526

/*
 * ttwu_remote - 'light' wakeup of a task that is still on its runqueue.
 * @p:          the task being woken
 * @wake_flags: WF_* flags, passed on to ttwu_do_wakeup()
 *
 * Called in case the task @p isn't fully descheduled from its runqueue; in
 * this case we must do a remote wakeup.  It is a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, because the
 * task is still ->on_rq.
 *
 * Return: 1 if @p was found queued and has been woken, 0 otherwise.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq = __task_rq_lock(p);
	int woken = 0;

	if (!task_on_rq_queued(p))
		goto unlock;

	/* check_preempt_curr() may use rq clock */
	update_rq_clock(rq);
	ttwu_do_wakeup(rq, p, wake_flags);
	woken = 1;

unlock:
	__task_rq_unlock(rq);

	return woken;
}

1548

1549

#ifdef CONFIG_SMP

1549

#ifdef CONFIG_SMP

1550

void sched_ttwu_pending(void)

1550

void sched_ttwu_pending(void)

1551

{

1551

{

1552

struct rq *rq = this_rq();

1552

struct rq *rq = this_rq();

1553

struct llist_node *llist = llist_del_all(&rq->wake_list);

1553

struct llist_node *llist = llist_del_all(&rq->wake_list);

1554

struct task_struct *p;

1554

struct task_struct *p;

1555

unsigned long flags;

1555

unsigned long flags;

1556

1557

if (!llist)

1557

if (!llist)

1558

return;

1558

return;

1559

1560

raw_spin_lock_irqsave(&rq->lock, flags);

1560

raw_spin_lock_irqsave(&rq->lock, flags);

1561

1562

while (llist) {

1562

while (llist) {

1563

p = llist_entry(llist, struct task_struct, wake_entry);

1563

p = llist_entry(llist, struct task_struct, wake_entry);

1564

llist = llist_next(llist);

1564

llist = llist_next(llist);

1565

ttwu_do_activate(rq, p, 0);

1565

ttwu_do_activate(rq, p, 0);

1566

}

1566

}

1567

1568

raw_spin_unlock_irqrestore(&rq->lock, flags);

1568

raw_spin_unlock_irqrestore(&rq->lock, flags);

1569

}

1569

}

1570

1571

void scheduler_ipi(void)

1571

void scheduler_ipi(void)

1572

{

1572

{

1573

/*

1573

/*

1574

* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting

1574

* Fold TIF_NEED_RESCHED into the preempt_count; anybody setting

1575

* TIF_NEED_RESCHED remotely (for the first time) will also send

1575

* TIF_NEED_RESCHED remotely (for the first time) will also send

1576

* this IPI.

1576

* this IPI.

1577

*/

1577

*/

1578

preempt_fold_need_resched();

1578

preempt_fold_need_resched();

1579

1580

if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())

1580

if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())

1581

return;

1581

return;

1582

1583

/*

1583

/*

1584

* Not all reschedule IPI handlers call irq_enter/irq_exit, since

1584

* Not all reschedule IPI handlers call irq_enter/irq_exit, since

1585

* traditionally all their work was done from the interrupt return

1585

* traditionally all their work was done from the interrupt return

1586

* path. Now that we actually do some work, we need to make sure

1586

* path. Now that we actually do some work, we need to make sure

1587

* we do call them.

1587

* we do call them.

1588

*

1588

*

1589

* Some archs already do call them, luckily irq_enter/exit nest

1589

* Some archs already do call them, luckily irq_enter/exit nest

1590

* properly.

1590

* properly.

1591

*

1591

*

1592

* Arguably we should visit all archs and update all handlers,

1592

* Arguably we should visit all archs and update all handlers,

1593

* however a fair share of IPIs are still resched only so this would

1593

* however a fair share of IPIs are still resched only so this would

1594

* somewhat pessimize the simple resched case.

1594

* somewhat pessimize the simple resched case.

1595

*/

1595

*/

1596

irq_enter();

1596

irq_enter();

1597

sched_ttwu_pending();

1597

sched_ttwu_pending();

1598

1599

/*

1599

/*

1600

* Check if someone kicked us for doing the nohz idle load balance.

1600

* Check if someone kicked us for doing the nohz idle load balance.

1601

*/

1601

*/

1602

if (unlikely(got_nohz_idle_kick())) {

1602

if (unlikely(got_nohz_idle_kick())) {

1603

this_rq()->idle_balance = 1;

1603

this_rq()->idle_balance = 1;

1604

raise_softirq_irqoff(SCHED_SOFTIRQ);

1604

raise_softirq_irqoff(SCHED_SOFTIRQ);

1605

}

1605

}

1606

irq_exit();

1606

irq_exit();

1607

}

1607

}

1608

1609

static void ttwu_queue_remote(struct task_struct *p, int cpu)

1609

static void ttwu_queue_remote(struct task_struct *p, int cpu)

1610

{

1610

{

1611

struct rq *rq = cpu_rq(cpu);

1611

struct rq *rq = cpu_rq(cpu);

1612

1613

if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {

1613

if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {

1614

if (!set_nr_if_polling(rq->idle))

1614

if (!set_nr_if_polling(rq->idle))

1615

smp_send_reschedule(cpu);

1615

smp_send_reschedule(cpu);

1616

else

1616

else

1617

trace_sched_wake_idle_without_ipi(cpu);

1617

trace_sched_wake_idle_without_ipi(cpu);

1618

}

1618

}

1619

}

1619

}

1620

1621

void wake_up_if_idle(int cpu)

1621

void wake_up_if_idle(int cpu)

1622

{

1622

{

1623

struct rq *rq = cpu_rq(cpu);

1623

struct rq *rq = cpu_rq(cpu);

1624

unsigned long flags;

1624

unsigned long flags;

1625

1626

if (!is_idle_task(rq->curr))

1626

rcu_read_lock();

1627

return;

1628

1627

1628

if (!is_idle_task(rcu_dereference(rq->curr)))

1629

goto out;

1630

1629

if (set_nr_if_polling(rq->idle)) {

1631

if (set_nr_if_polling(rq->idle)) {

1630

trace_sched_wake_idle_without_ipi(cpu);

1632

trace_sched_wake_idle_without_ipi(cpu);

1631

} else {

1633

} else {

1632

raw_spin_lock_irqsave(&rq->lock, flags);

1634

raw_spin_lock_irqsave(&rq->lock, flags);

1633

if (is_idle_task(rq->curr))

1635

if (is_idle_task(rq->curr))

1634

smp_send_reschedule(cpu);

1636

smp_send_reschedule(cpu);

1635

/* Else cpu is not in idle, do nothing here */

1637

/* Else cpu is not in idle, do nothing here */

1636

raw_spin_unlock_irqrestore(&rq->lock, flags);

1638

raw_spin_unlock_irqrestore(&rq->lock, flags);

1637

}

1639

}

1640

1641

out:

1642

rcu_read_unlock();

1638

}

1643

}

1639

1644

1640

bool cpus_share_cache(int this_cpu, int that_cpu)

1645

bool cpus_share_cache(int this_cpu, int that_cpu)

1641

{

1646

{

1642

return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);

1647

return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);

1643

}

1648

}

1644

#endif /* CONFIG_SMP */

1649

#endif /* CONFIG_SMP */

1645

1650

1646

static void ttwu_queue(struct task_struct *p, int cpu)

1651

static void ttwu_queue(struct task_struct *p, int cpu)

1647

{

1652

{

1648

struct rq *rq = cpu_rq(cpu);

1653

struct rq *rq = cpu_rq(cpu);

1649

1654

1650

#if defined(CONFIG_SMP)

1655

#if defined(CONFIG_SMP)

1651

if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {

1656

if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {

1652

sched_clock_cpu(cpu); /* sync clocks x-cpu */

1657

sched_clock_cpu(cpu); /* sync clocks x-cpu */

1653

ttwu_queue_remote(p, cpu);

1658

ttwu_queue_remote(p, cpu);

1654

return;

1659

return;

1655

}

1660

}

1656

#endif

1661

#endif

1657

1662

1658

raw_spin_lock(&rq->lock);

1663

raw_spin_lock(&rq->lock);

1659

ttwu_do_activate(rq, p, 0);

1664

ttwu_do_activate(rq, p, 0);

1660

raw_spin_unlock(&rq->lock);

1665

raw_spin_unlock(&rq->lock);

1661

}

1666

}

1662

1667

1663

/**

1668

/**

1664

* try_to_wake_up - wake up a thread

1669

* try_to_wake_up - wake up a thread

1665

* @p: the thread to be awakened

1670

* @p: the thread to be awakened

1666

* @state: the mask of task states that can be woken

1671

* @state: the mask of task states that can be woken

1667

* @wake_flags: wake modifier flags (WF_*)

1672

* @wake_flags: wake modifier flags (WF_*)

1668

*

1673

*

1669

* Put it on the run-queue if it's not already there. The "current"

1674

* Put it on the run-queue if it's not already there. The "current"

1670

* thread is always on the run-queue (except when the actual

1675

* thread is always on the run-queue (except when the actual

1671

* re-schedule is in progress), and as such you're allowed to do

1676

* re-schedule is in progress), and as such you're allowed to do

1672

* the simpler "current->state = TASK_RUNNING" to mark yourself

1677

* the simpler "current->state = TASK_RUNNING" to mark yourself

1673

* runnable without the overhead of this.

1678

* runnable without the overhead of this.

1674

*

1679

*

1675

* Return: %true if @p was woken up, %false if it was already running.

1680

* Return: %true if @p was woken up, %false if it was already running.

1676

* or @state didn't match @p's state.

1681

* or @state didn't match @p's state.

1677

*/

1682

*/

1678

static int

1683

static int

1679

try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)

1684

try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)

1680

{

1685

{

1681

unsigned long flags;

1686

unsigned long flags;

1682

int cpu, success = 0;

1687

int cpu, success = 0;

1683

1688

1684

/*

1689

/*

1685

* If we are going to wake up a thread waiting for CONDITION we

1690

* If we are going to wake up a thread waiting for CONDITION we

1686

* need to ensure that CONDITION=1 done by the caller can not be

1691

* need to ensure that CONDITION=1 done by the caller can not be

1687

* reordered with p->state check below. This pairs with mb() in

1692

* reordered with p->state check below. This pairs with mb() in

1688

* set_current_state() the waiting thread does.

1693

* set_current_state() the waiting thread does.

1689

*/

1694

*/

1690

smp_mb__before_spinlock();

1695

smp_mb__before_spinlock();

1691

raw_spin_lock_irqsave(&p->pi_lock, flags);

1696

raw_spin_lock_irqsave(&p->pi_lock, flags);

1692

if (!(p->state & state))

1697

if (!(p->state & state))

1693

goto out;

1698

goto out;

1694

1699

1695

success = 1; /* we're going to change ->state */

1700

success = 1; /* we're going to change ->state */

1696

cpu = task_cpu(p);

1701

cpu = task_cpu(p);

1697

1702

1698

if (p->on_rq && ttwu_remote(p, wake_flags))

1703

if (p->on_rq && ttwu_remote(p, wake_flags))

1699

goto stat;

1704

goto stat;

1700

1705

1701

#ifdef CONFIG_SMP

1706

#ifdef CONFIG_SMP

1702

/*

1707

/*

1703

* If the owning (remote) cpu is still in the middle of schedule() with

1708

* If the owning (remote) cpu is still in the middle of schedule() with

1704

* this task as prev, wait until its done referencing the task.

1709

* this task as prev, wait until its done referencing the task.

1705

*/

1710

*/

1706

while (p->on_cpu)

1711

while (p->on_cpu)

1707

cpu_relax();

1712

cpu_relax();

1708

/*

1713

/*

1709

* Pairs with the smp_wmb() in finish_lock_switch().

1714

* Pairs with the smp_wmb() in finish_lock_switch().

1710

*/

1715

*/

1711

smp_rmb();

1716

smp_rmb();

1712

1717

1713

p->sched_contributes_to_load = !!task_contributes_to_load(p);

1718

p->sched_contributes_to_load = !!task_contributes_to_load(p);

1714

p->state = TASK_WAKING;

1719

p->state = TASK_WAKING;

1715

1720

1716

if (p->sched_class->task_waking)

1721

if (p->sched_class->task_waking)

1717

p->sched_class->task_waking(p);

1722

p->sched_class->task_waking(p);

1718

1723

1719

cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);

1724

cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);

1720

if (task_cpu(p) != cpu) {

1725

if (task_cpu(p) != cpu) {

1721

wake_flags |= WF_MIGRATED;

1726

wake_flags |= WF_MIGRATED;

1722

set_task_cpu(p, cpu);

1727

set_task_cpu(p, cpu);

1723

}

1728

}

1724

#endif /* CONFIG_SMP */

1729

#endif /* CONFIG_SMP */

1725

1730

1726

ttwu_queue(p, cpu);

1731

ttwu_queue(p, cpu);

1727

stat:

1732

stat:

1728

ttwu_stat(p, cpu, wake_flags);

1733

ttwu_stat(p, cpu, wake_flags);

1729

out:

1734

out:

1730

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

1735

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

1731

1736

1732

return success;

1737

return success;

1733

}

1738

}

1734

1739

1735

/**

1740

/**

1736

* try_to_wake_up_local - try to wake up a local task with rq lock held

1741

* try_to_wake_up_local - try to wake up a local task with rq lock held

1737

* @p: the thread to be awakened

1742

* @p: the thread to be awakened

1738

*

1743

*

1739

* Put @p on the run-queue if it's not already there. The caller must

1744

* Put @p on the run-queue if it's not already there. The caller must

1740

* ensure that this_rq() is locked, @p is bound to this_rq() and not

1745

* ensure that this_rq() is locked, @p is bound to this_rq() and not

1741

* the current task.

1746

* the current task.

1742

*/

1747

*/

1743

static void try_to_wake_up_local(struct task_struct *p)

1748

static void try_to_wake_up_local(struct task_struct *p)

1744

{

1749

{

1745

struct rq *rq = task_rq(p);

1750

struct rq *rq = task_rq(p);

1746

1751

1747

if (WARN_ON_ONCE(rq != this_rq()) ||

1752

if (WARN_ON_ONCE(rq != this_rq()) ||

1748

WARN_ON_ONCE(p == current))

1753

WARN_ON_ONCE(p == current))

1749

return;

1754

return;

1750

1755

1751

lockdep_assert_held(&rq->lock);

1756

lockdep_assert_held(&rq->lock);

1752

1757

1753

if (!raw_spin_trylock(&p->pi_lock)) {

1758

if (!raw_spin_trylock(&p->pi_lock)) {

1754

raw_spin_unlock(&rq->lock);

1759

raw_spin_unlock(&rq->lock);

1755

raw_spin_lock(&p->pi_lock);

1760

raw_spin_lock(&p->pi_lock);

1756

raw_spin_lock(&rq->lock);

1761

raw_spin_lock(&rq->lock);

1757

}

1762

}

1758

1763

1759

if (!(p->state & TASK_NORMAL))

1764

if (!(p->state & TASK_NORMAL))

1760

goto out;

1765

goto out;

1761

1766

1762

if (!task_on_rq_queued(p))

1767

if (!task_on_rq_queued(p))

1763

ttwu_activate(rq, p, ENQUEUE_WAKEUP);

1768

ttwu_activate(rq, p, ENQUEUE_WAKEUP);

1764

1769

1765

ttwu_do_wakeup(rq, p, 0);

1770

ttwu_do_wakeup(rq, p, 0);

1766

ttwu_stat(p, smp_processor_id(), 0);

1771

ttwu_stat(p, smp_processor_id(), 0);

1767

out:

1772

out:

1768

raw_spin_unlock(&p->pi_lock);

1773

raw_spin_unlock(&p->pi_lock);

1769

}

1774

}

1770

1775

1771

/**

1776

/**

1772

* wake_up_process - Wake up a specific process

1777

* wake_up_process - Wake up a specific process

1773

* @p: The process to be woken up.

1778

* @p: The process to be woken up.

1774

*

1779

*

1775

* Attempt to wake up the nominated process and move it to the set of runnable

1780

* Attempt to wake up the nominated process and move it to the set of runnable

1776

* processes.

1781

* processes.

1777

*

1782

*

1778

* Return: 1 if the process was woken up, 0 if it was already running.

1783

* Return: 1 if the process was woken up, 0 if it was already running.

1779

*

1784

*

1780

* It may be assumed that this function implies a write memory barrier before

1785

* It may be assumed that this function implies a write memory barrier before

1781

* changing the task state if and only if any tasks are woken up.

1786

* changing the task state if and only if any tasks are woken up.

1782

*/

1787

*/

1783

int wake_up_process(struct task_struct *p)

1788

int wake_up_process(struct task_struct *p)

1784

{

1789

{

1785

WARN_ON(task_is_stopped_or_traced(p));

1790

WARN_ON(task_is_stopped_or_traced(p));

1786

return try_to_wake_up(p, TASK_NORMAL, 0);

1791

return try_to_wake_up(p, TASK_NORMAL, 0);

1787

}

1792

}

1788

EXPORT_SYMBOL(wake_up_process);

1793

EXPORT_SYMBOL(wake_up_process);

1789

1794

1790

/*
 * wake_up_state - wake up @p if it sleeps in one of the states in @state.
 *
 * Thin wrapper around try_to_wake_up() with no wake flags; returns its
 * result unchanged.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

1794

1799

1795

/*

1800

/*

1796

* This function clears the sched_dl_entity static params.

1801

* This function clears the sched_dl_entity static params.

1797

*/

1802

*/

1798

void __dl_clear_params(struct task_struct *p)

1803

void __dl_clear_params(struct task_struct *p)

1799

{

1804

{

1800

struct sched_dl_entity *dl_se = &p->dl;

1805

struct sched_dl_entity *dl_se = &p->dl;

1801

1806

1802

dl_se->dl_runtime = 0;

1807

dl_se->dl_runtime = 0;

1803

dl_se->dl_deadline = 0;

1808

dl_se->dl_deadline = 0;

1804

dl_se->dl_period = 0;

1809

dl_se->dl_period = 0;

1805

dl_se->flags = 0;

1810

dl_se->flags = 0;

1806

dl_se->dl_bw = 0;

1811

dl_se->dl_bw = 0;

1807

}

1812

}

1808

1813

1809

/*

1814

/*

1810

* Perform scheduler related setup for a newly forked process p.

1815

* Perform scheduler related setup for a newly forked process p.

1811

* p is forked by current.

1816

* p is forked by current.

1812

*

1817

*

1813

* __sched_fork() is basic setup used by init_idle() too:

1818

* __sched_fork() is basic setup used by init_idle() too:

1814

*/

1819

*/

1815

static void __sched_fork(unsigned long clone_flags, struct task_struct *p)

1820

static void __sched_fork(unsigned long clone_flags, struct task_struct *p)

1816

{

1821

{

1817

p->on_rq = 0;

1822

p->on_rq = 0;

1818

1823

1819

p->se.on_rq = 0;

1824

p->se.on_rq = 0;

1820

p->se.exec_start = 0;

1825

p->se.exec_start = 0;

1821

p->se.sum_exec_runtime = 0;

1826

p->se.sum_exec_runtime = 0;

1822

p->se.prev_sum_exec_runtime = 0;

1827

p->se.prev_sum_exec_runtime = 0;

1823

p->se.nr_migrations = 0;

1828

p->se.nr_migrations = 0;

1824

p->se.vruntime = 0;

1829

p->se.vruntime = 0;

1825

INIT_LIST_HEAD(&p->se.group_node);

1830

INIT_LIST_HEAD(&p->se.group_node);

1826

1831

1827

#ifdef CONFIG_SCHEDSTATS

1832

#ifdef CONFIG_SCHEDSTATS

1828

memset(&p->se.statistics, 0, sizeof(p->se.statistics));

1833

memset(&p->se.statistics, 0, sizeof(p->se.statistics));

1829

#endif

1834

#endif

1830

1835

1831

RB_CLEAR_NODE(&p->dl.rb_node);

1836

RB_CLEAR_NODE(&p->dl.rb_node);

1832

hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

1837

hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

1833

__dl_clear_params(p);

1838

__dl_clear_params(p);

1834

1839

1835

INIT_LIST_HEAD(&p->rt.run_list);

1840

INIT_LIST_HEAD(&p->rt.run_list);

1836

1841

1837

#ifdef CONFIG_PREEMPT_NOTIFIERS

1842

#ifdef CONFIG_PREEMPT_NOTIFIERS

1838

INIT_HLIST_HEAD(&p->preempt_notifiers);

1843

INIT_HLIST_HEAD(&p->preempt_notifiers);

1839

#endif

1844

#endif

1840

1845

1841

#ifdef CONFIG_NUMA_BALANCING

1846

#ifdef CONFIG_NUMA_BALANCING

1842

if (p->mm && atomic_read(&p->mm->mm_users) == 1) {

1847

if (p->mm && atomic_read(&p->mm->mm_users) == 1) {

1843

p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

1848

p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

1844

p->mm->numa_scan_seq = 0;

1849

p->mm->numa_scan_seq = 0;

1845

}

1850

}

1846

1851

1847

if (clone_flags & CLONE_VM)

1852

if (clone_flags & CLONE_VM)

1848

p->numa_preferred_nid = current->numa_preferred_nid;

1853

p->numa_preferred_nid = current->numa_preferred_nid;

1849

else

1854

else

1850

p->numa_preferred_nid = -1;

1855

p->numa_preferred_nid = -1;

1851

1856

1852

p->node_stamp = 0ULL;

1857

p->node_stamp = 0ULL;

1853

p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;

1858

p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;

1854

p->numa_scan_period = sysctl_numa_balancing_scan_delay;

1859

p->numa_scan_period = sysctl_numa_balancing_scan_delay;

1855

p->numa_work.next = &p->numa_work;

1860

p->numa_work.next = &p->numa_work;

1856

p->numa_faults_memory = NULL;

1861

p->numa_faults_memory = NULL;

1857

p->numa_faults_buffer_memory = NULL;

1862

p->numa_faults_buffer_memory = NULL;

1858

p->last_task_numa_placement = 0;

1863

p->last_task_numa_placement = 0;

1859

p->last_sum_exec_runtime = 0;

1864

p->last_sum_exec_runtime = 0;

1860

1865

1861

INIT_LIST_HEAD(&p->numa_entry);

1866

INIT_LIST_HEAD(&p->numa_entry);

1862

p->numa_group = NULL;

1867

p->numa_group = NULL;

1863

#endif /* CONFIG_NUMA_BALANCING */

1868

#endif /* CONFIG_NUMA_BALANCING */

1864

}

1869

}

1865

1870

1866

#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_SCHED_DEBUG
/*
 * With CONFIG_SCHED_DEBUG the on/off state is tracked through the
 * "NUMA" / "NO_NUMA" scheduler feature.
 */
void set_numabalancing_state(bool enabled)
{
	if (enabled)
		sched_feat_set("NUMA");
	else
		sched_feat_set("NO_NUMA");
}
#else
__read_mostly bool numabalancing_enabled;

/* Without CONFIG_SCHED_DEBUG the state is a plain boolean. */
void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
#endif /* CONFIG_SCHED_DEBUG */

#ifdef CONFIG_PROC_SYSCTL
/*
 * sysctl handler for the NUMA balancing switch.
 *
 * Operates on a local copy of @table whose data pointer is redirected to
 * a stack variable, so the new value is only applied (through
 * set_numabalancing_state()) once proc_dointvec_minmax() has succeeded.
 *
 * Returns -EPERM for a write without CAP_SYS_ADMIN, otherwise the result
 * of proc_dointvec_minmax().
 */
int sysctl_numa_balancing(struct ctl_table *table, int write,
			 void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int err;
	int state = numabalancing_enabled;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	t = *table;
	t.data = &state;
	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
	if (err < 0)
		return err;
	if (write)
		set_numabalancing_state(state);
	return err;
}
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_NUMA_BALANCING */

1906

1911

1907

/*

1912

/*

1908

* fork()/clone()-time setup:

1913

* fork()/clone()-time setup:

1909

*/

1914

*/

1910

int sched_fork(unsigned long clone_flags, struct task_struct *p)

1915

int sched_fork(unsigned long clone_flags, struct task_struct *p)

1911

{

1916

{

1912

unsigned long flags;

1917

unsigned long flags;

1913

int cpu = get_cpu();

1918

int cpu = get_cpu();

1914

1919

1915

__sched_fork(clone_flags, p);

1920

__sched_fork(clone_flags, p);

1916

/*

1921

/*

1917

* We mark the process as running here. This guarantees that

1922

* We mark the process as running here. This guarantees that

1918

* nobody will actually run it, and a signal or other external

1923

* nobody will actually run it, and a signal or other external

1919

* event cannot wake it up and insert it on the runqueue either.

1924

* event cannot wake it up and insert it on the runqueue either.

1920

*/

1925

*/

1921

p->state = TASK_RUNNING;

1926

p->state = TASK_RUNNING;

1922

1927

1923

/*

1928

/*

1924

* Make sure we do not leak PI boosting priority to the child.

1929

* Make sure we do not leak PI boosting priority to the child.

1925

*/

1930

*/

1926

p->prio = current->normal_prio;

1931

p->prio = current->normal_prio;

1927

1932

1928

/*

1933

/*

1929

* Revert to default priority/policy on fork if requested.

1934

* Revert to default priority/policy on fork if requested.

1930

*/

1935

*/

1931

if (unlikely(p->sched_reset_on_fork)) {

1936

if (unlikely(p->sched_reset_on_fork)) {

1932

if (task_has_dl_policy(p) || task_has_rt_policy(p)) {

1937

if (task_has_dl_policy(p) || task_has_rt_policy(p)) {

1933

p->policy = SCHED_NORMAL;

1938

p->policy = SCHED_NORMAL;

1934

p->static_prio = NICE_TO_PRIO(0);

1939

p->static_prio = NICE_TO_PRIO(0);

1935

p->rt_priority = 0;

1940

p->rt_priority = 0;

1936

} else if (PRIO_TO_NICE(p->static_prio) < 0)

1941

} else if (PRIO_TO_NICE(p->static_prio) < 0)

1937

p->static_prio = NICE_TO_PRIO(0);

1942

p->static_prio = NICE_TO_PRIO(0);

1938

1943

1939

p->prio = p->normal_prio = __normal_prio(p);

1944

p->prio = p->normal_prio = __normal_prio(p);

1940

set_load_weight(p);

1945

set_load_weight(p);

1941

1946

1942

/*

1947

/*

1943

* We don't need the reset flag anymore after the fork. It has

1948

* We don't need the reset flag anymore after the fork. It has

1944

* fulfilled its duty:

1949

* fulfilled its duty:

1945

*/

1950

*/

1946

p->sched_reset_on_fork = 0;

1951

p->sched_reset_on_fork = 0;

1947

}

1952

}

1948

1953

1949

if (dl_prio(p->prio)) {

1954

if (dl_prio(p->prio)) {

1950

put_cpu();

1955

put_cpu();

1951

return -EAGAIN;

1956

return -EAGAIN;

1952

} else if (rt_prio(p->prio)) {

1957

} else if (rt_prio(p->prio)) {

1953

p->sched_class = &rt_sched_class;

1958

p->sched_class = &rt_sched_class;

1954

} else {

1959

} else {

1955

p->sched_class = &fair_sched_class;

1960

p->sched_class = &fair_sched_class;

1956

}

1961

}

1957

1962

1958

if (p->sched_class->task_fork)

1963

if (p->sched_class->task_fork)

1959

p->sched_class->task_fork(p);

1964

p->sched_class->task_fork(p);

1960

1965

1961

/*

1966

/*

1962

* The child is not yet in the pid-hash so no cgroup attach races,

1967

* The child is not yet in the pid-hash so no cgroup attach races,

1963

* and the cgroup is pinned to this child due to cgroup_fork()

1968

* and the cgroup is pinned to this child due to cgroup_fork()

1964

* is ran before sched_fork().

1969

* is ran before sched_fork().

1965

*

1970

*

1966

* Silence PROVE_RCU.

1971

* Silence PROVE_RCU.

1967

*/

1972

*/

1968

raw_spin_lock_irqsave(&p->pi_lock, flags);

1973

raw_spin_lock_irqsave(&p->pi_lock, flags);

1969

set_task_cpu(p, cpu);

1974

set_task_cpu(p, cpu);

1970

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

1975

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

1971

1976

1972

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

1977

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

1973

if (likely(sched_info_on()))

1978

if (likely(sched_info_on()))

1974

memset(&p->sched_info, 0, sizeof(p->sched_info));

1979

memset(&p->sched_info, 0, sizeof(p->sched_info));

1975

#endif

1980

#endif

1976

#if defined(CONFIG_SMP)

1981

#if defined(CONFIG_SMP)

1977

p->on_cpu = 0;

1982

p->on_cpu = 0;

1978

#endif

1983

#endif

1979

init_task_preempt_count(p);

1984

init_task_preempt_count(p);

1980

#ifdef CONFIG_SMP

1985

#ifdef CONFIG_SMP

1981

plist_node_init(&p->pushable_tasks, MAX_PRIO);

1986

plist_node_init(&p->pushable_tasks, MAX_PRIO);

1982

RB_CLEAR_NODE(&p->pushable_dl_tasks);

1987

RB_CLEAR_NODE(&p->pushable_dl_tasks);

1983

#endif

1988

#endif

1984

1989

1985

put_cpu();

1990

put_cpu();

1986

return 0;

1991

return 0;

1987

}

1992

}

1988

1993

1989

/*
 * to_ratio - express @runtime / @period as a fixed-point ratio scaled by 2^20.
 *
 * Returns 1 << 20 (i.e. a ratio of 1) when @runtime is RUNTIME_INF and
 * 0 when @period is 0.
 */
unsigned long to_ratio(u64 period, u64 runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 20;

	/*
	 * Doing this here saves a lot of checks in all
	 * the calling paths, and returning zero seems
	 * safe for them anyway.
	 */
	if (period == 0)
		return 0;

	return div64_u64(runtime << 20, period);
}

2004

2009

2005

#ifdef CONFIG_SMP

2010

#ifdef CONFIG_SMP

2006

inline struct dl_bw *dl_bw_of(int i)

2011

inline struct dl_bw *dl_bw_of(int i)

2007

{

2012

{

2008

rcu_lockdep_assert(rcu_read_lock_sched_held(),

2013

rcu_lockdep_assert(rcu_read_lock_sched_held(),

2009

"sched RCU must be held");

2014

"sched RCU must be held");

2010

return &cpu_rq(i)->rd->dl_bw;

2015

return &cpu_rq(i)->rd->dl_bw;

2011

}

2016

}

2012

2017

2013

static inline int dl_bw_cpus(int i)

2018

static inline int dl_bw_cpus(int i)

2014

{

2019

{

2015

struct root_domain *rd = cpu_rq(i)->rd;

2020

struct root_domain *rd = cpu_rq(i)->rd;

2016

int cpus = 0;

2021

int cpus = 0;

2017

2022

2018

rcu_lockdep_assert(rcu_read_lock_sched_held(),

2023

rcu_lockdep_assert(rcu_read_lock_sched_held(),

2019

"sched RCU must be held");

2024

"sched RCU must be held");

2020

for_each_cpu_and(i, rd->span, cpu_active_mask)

2025

for_each_cpu_and(i, rd->span, cpu_active_mask)

2021

cpus++;

2026

cpus++;

2022

2027

2023

return cpus;

2028

return cpus;

2024

}

2029

}

2025

#else

2030

#else

2026

inline struct dl_bw *dl_bw_of(int i)

2031

inline struct dl_bw *dl_bw_of(int i)

2027

{

2032

{

2028

return &cpu_rq(i)->dl.dl_bw;

2033

return &cpu_rq(i)->dl.dl_bw;

2029

}

2034

}

2030

2035

2031

static inline int dl_bw_cpus(int i)

2036

static inline int dl_bw_cpus(int i)

2032

{

2037

{

2033

return 1;

2038

return 1;

2034

}

2039

}

2035

#endif

2040

#endif

2036

2041

2037

static inline

2042

static inline

2038

void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)

2043

void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)

2039

{

2044

{

2040

dl_b->total_bw -= tsk_bw;

2045

dl_b->total_bw -= tsk_bw;

2041

}

2046

}

2042

2047

2043

static inline

2048

static inline

2044

void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)

2049

void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)

2045

{

2050

{

2046

dl_b->total_bw += tsk_bw;

2051

dl_b->total_bw += tsk_bw;

2047

}

2052

}

2048

2053

2049

static inline

2054

static inline

2050

bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)

2055

bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)

2051

{

2056

{

2052

return dl_b->bw != -1 &&

2057

return dl_b->bw != -1 &&

2053

dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;

2058

dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;

2054

}

2059

}

2055

2060

2056

/*

2061

/*

2057

* We must be sure that accepting a new task (or allowing changing the

2062

* We must be sure that accepting a new task (or allowing changing the

2058

* parameters of an existing one) is consistent with the bandwidth

2063

* parameters of an existing one) is consistent with the bandwidth

2059

* constraints. If yes, this function also accordingly updates the currently

2064

* constraints. If yes, this function also accordingly updates the currently

2060

* allocated bandwidth to reflect the new situation.

2065

* allocated bandwidth to reflect the new situation.

2061

*

2066

*

2062

* This function is called while holding p's rq->lock.

2067

* This function is called while holding p's rq->lock.

2063

*/

2068

*/

2064

static int dl_overflow(struct task_struct *p, int policy,

2069

static int dl_overflow(struct task_struct *p, int policy,

2065

const struct sched_attr *attr)

2070

const struct sched_attr *attr)

2066

{

2071

{

2067

2072

2068

struct dl_bw *dl_b = dl_bw_of(task_cpu(p));

2073

struct dl_bw *dl_b = dl_bw_of(task_cpu(p));

2069

u64 period = attr->sched_period ?: attr->sched_deadline;

2074

u64 period = attr->sched_period ?: attr->sched_deadline;

2070

u64 runtime = attr->sched_runtime;

2075

u64 runtime = attr->sched_runtime;

2071

u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;

2076

u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;

2072

int cpus, err = -1;

2077

int cpus, err = -1;

2073

2078

2074

if (new_bw == p->dl.dl_bw)

2079

if (new_bw == p->dl.dl_bw)

2075

return 0;

2080

return 0;

2076

2081

2077

/*

2082

/*

2078

* Either if a task, enters, leave, or stays -deadline but changes

2083

* Either if a task, enters, leave, or stays -deadline but changes

2079

* its parameters, we may need to update accordingly the total

2084

* its parameters, we may need to update accordingly the total

2080

* allocated bandwidth of the container.

2085

* allocated bandwidth of the container.

2081

*/

2086

*/

2082

raw_spin_lock(&dl_b->lock);

2087

raw_spin_lock(&dl_b->lock);

2083

cpus = dl_bw_cpus(task_cpu(p));

2088

cpus = dl_bw_cpus(task_cpu(p));

2084

if (dl_policy(policy) && !task_has_dl_policy(p) &&

2089

if (dl_policy(policy) && !task_has_dl_policy(p) &&

2085

!__dl_overflow(dl_b, cpus, 0, new_bw)) {

2090

!__dl_overflow(dl_b, cpus, 0, new_bw)) {

2086

__dl_add(dl_b, new_bw);

2091

__dl_add(dl_b, new_bw);

2087

err = 0;

2092

err = 0;

2088

} else if (dl_policy(policy) && task_has_dl_policy(p) &&

2093

} else if (dl_policy(policy) && task_has_dl_policy(p) &&

2089

!__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {

2094

!__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {

2090

__dl_clear(dl_b, p->dl.dl_bw);

2095

__dl_clear(dl_b, p->dl.dl_bw);

2091

__dl_add(dl_b, new_bw);

2096

__dl_add(dl_b, new_bw);

2092

err = 0;

2097

err = 0;

2093

} else if (!dl_policy(policy) && task_has_dl_policy(p)) {

2098

} else if (!dl_policy(policy) && task_has_dl_policy(p)) {

2094

__dl_clear(dl_b, p->dl.dl_bw);

2099

__dl_clear(dl_b, p->dl.dl_bw);

2095

err = 0;

2100

err = 0;

2096

}

2101

}

2097

raw_spin_unlock(&dl_b->lock);

2102

raw_spin_unlock(&dl_b->lock);

2098

2103

2099

return err;

2104

return err;

2100

}

2105

}

2101

2106

2102

extern void init_dl_bw(struct dl_bw *dl_b);

2103

2108

2104

/*

2109

/*

2105

* wake_up_new_task - wake up a newly created task for the first time.

2110

* wake_up_new_task - wake up a newly created task for the first time.

2106

*

2111

*

2107

* This function will do some initial scheduler statistics housekeeping

2112

* This function will do some initial scheduler statistics housekeeping

2108

* that must be done for every newly created context, then puts the task

2113

* that must be done for every newly created context, then puts the task

2109

* on the runqueue and wakes it.

2114

* on the runqueue and wakes it.

2110

*/

2115

*/

2111

void wake_up_new_task(struct task_struct *p)

2116

void wake_up_new_task(struct task_struct *p)

2112

{

2117

{

2113

unsigned long flags;

2118

unsigned long flags;

2114

struct rq *rq;

2119

struct rq *rq;

2115

2120

2116

raw_spin_lock_irqsave(&p->pi_lock, flags);

2121

raw_spin_lock_irqsave(&p->pi_lock, flags);

2117

#ifdef CONFIG_SMP

2122

#ifdef CONFIG_SMP

2118

/*

2123

/*

2119

* Fork balancing, do it here and not earlier because:

2124

* Fork balancing, do it here and not earlier because:

2120

* - cpus_allowed can change in the fork path

2125

* - cpus_allowed can change in the fork path

2121

* - any previously selected cpu might disappear through hotplug

2126

* - any previously selected cpu might disappear through hotplug

2122

*/

2127

*/

2123

set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));

2128

set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));

2124

#endif

2129

#endif

2125

2130

2126

/* Initialize new task's runnable average */

2131

/* Initialize new task's runnable average */

2127

init_task_runnable_average(p);

2132

init_task_runnable_average(p);

2128

rq = __task_rq_lock(p);

2133

rq = __task_rq_lock(p);

2129

activate_task(rq, p, 0);

2134

activate_task(rq, p, 0);

2130

p->on_rq = TASK_ON_RQ_QUEUED;

2135

p->on_rq = TASK_ON_RQ_QUEUED;

2131

trace_sched_wakeup_new(p, true);

2136

trace_sched_wakeup_new(p, true);

2132

check_preempt_curr(rq, p, WF_FORK);

2137

check_preempt_curr(rq, p, WF_FORK);

2133

#ifdef CONFIG_SMP

2138

#ifdef CONFIG_SMP

2134

if (p->sched_class->task_woken)

2139

if (p->sched_class->task_woken)

2135

p->sched_class->task_woken(rq, p);

2140

p->sched_class->task_woken(rq, p);

2136

#endif

2141

#endif

2137

task_rq_unlock(rq, p, &flags);

2142

task_rq_unlock(rq, p, &flags);

2138

}

2143

}

2139

2144

2140

#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 *
 * The notifier is added to the head of current's notifier list.
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

/* Invoke the sched_in hook of every notifier registered on @curr. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

/*
 * Invoke the sched_out hook of every notifier registered on @curr,
 * passing along the task being switched to.
 */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;

	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

/* No-op stubs so callers need no #ifdefs. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */

2195

2200

2196

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Tracing, accounting and notifier hooks for the outgoing task. */
	trace_sched_switch(prev, next);
	sched_info_switch(rq, prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	/* Locking and architecture hooks for the incoming task. */
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

2220

2225

2221

/**

2226

/**

2222

* finish_task_switch - clean up after a task-switch

2227

* finish_task_switch - clean up after a task-switch

2223

* @rq: runqueue associated with task-switch

2228

* @rq: runqueue associated with task-switch

2224

* @prev: the thread we just switched away from.

2229

* @prev: the thread we just switched away from.

2225

*

2230

*

2226

* finish_task_switch must be called after the context switch, paired

2231

* finish_task_switch must be called after the context switch, paired

2227

* with a prepare_task_switch call before the context switch.

2232

* with a prepare_task_switch call before the context switch.

2228

* finish_task_switch will reconcile locking set up by prepare_task_switch,

2233

* finish_task_switch will reconcile locking set up by prepare_task_switch,

2229

* and do any other architecture-specific cleanup actions.

2234

* and do any other architecture-specific cleanup actions.

2230

*

2235

*

2231

* Note that we may have delayed dropping an mm in context_switch(). If

2236

* Note that we may have delayed dropping an mm in context_switch(). If

2232

* so, we finish that here outside of the runqueue lock. (Doing it

2237

* so, we finish that here outside of the runqueue lock. (Doing it

2233

* with the lock held can cause deadlocks; see schedule() for

2238

* with the lock held can cause deadlocks; see schedule() for

2234

* details.)

2239

* details.)

2235

*/

2240

*/

2236

static void finish_task_switch(struct rq *rq, struct task_struct *prev)

2241

static void finish_task_switch(struct rq *rq, struct task_struct *prev)

2237

__releases(rq->lock)

2242

__releases(rq->lock)

2238

{

2243

{

2239

struct mm_struct *mm = rq->prev_mm;

2244

struct mm_struct *mm = rq->prev_mm;

2240

long prev_state;

2245

long prev_state;

2241

2246

2242

rq->prev_mm = NULL;

2247

rq->prev_mm = NULL;

2243

2248

2244

/*

2249

/*

2245

* A task struct has one reference for the use as "current".

2250

* A task struct has one reference for the use as "current".

2246

* If a task dies, then it sets TASK_DEAD in tsk->state and calls

2251

* If a task dies, then it sets TASK_DEAD in tsk->state and calls

2247

* schedule one last time. The schedule call will never return, and

2252

* schedule one last time. The schedule call will never return, and

2248

* the scheduled task must drop that reference.

2253

* the scheduled task must drop that reference.

2249

* The test for TASK_DEAD must occur while the runqueue locks are

2254

* The test for TASK_DEAD must occur while the runqueue locks are

2250

* still held, otherwise prev could be scheduled on another cpu, die

2255

* still held, otherwise prev could be scheduled on another cpu, die

2251

* there before we look at prev->state, and then the reference would

2256

* there before we look at prev->state, and then the reference would

2252

* be dropped twice.

2257

* be dropped twice.

2253

* Manfred Spraul <manfred@colorfullife.com>

2258

* Manfred Spraul <manfred@colorfullife.com>

2254

*/

2259

*/

2255

prev_state = prev->state;

2260

prev_state = prev->state;

2256

vtime_task_switch(prev);

2261

vtime_task_switch(prev);

2257

finish_arch_switch(prev);

2262

finish_arch_switch(prev);

2258

perf_event_task_sched_in(prev, current);

2263

perf_event_task_sched_in(prev, current);

2259

finish_lock_switch(rq, prev);

2264

finish_lock_switch(rq, prev);

2260

finish_arch_post_lock_switch();

2265

finish_arch_post_lock_switch();

2261

2266

2262

fire_sched_in_preempt_notifiers(current);

2267

fire_sched_in_preempt_notifiers(current);

2263

if (mm)

2268

if (mm)

2264

mmdrop(mm);

2269

mmdrop(mm);

2265

if (unlikely(prev_state == TASK_DEAD)) {

2270

if (unlikely(prev_state == TASK_DEAD)) {

2266

if (prev->sched_class->task_dead)

2271

if (prev->sched_class->task_dead)

2267

prev->sched_class->task_dead(prev);

2272

prev->sched_class->task_dead(prev);

2268

2273

2269

/*

2274

/*

2270

* Remove function-return probe instances associated with this

2275

* Remove function-return probe instances associated with this

2271

* task and put them back on the free list.

2276

* task and put them back on the free list.

2272

*/

2277

*/

2273

kprobe_flush_task(prev);

2278

kprobe_flush_task(prev);

2274

put_task_struct(prev);

2279

put_task_struct(prev);

2275

}

2280

}

2276

2281

2277

tick_nohz_task_switch(current);

2282

tick_nohz_task_switch(current);

2278

}

2283

}

2279

2284

2280

#ifdef CONFIG_SMP

/*
 * Run the current task's sched_class post_schedule hook, if the
 * runqueue has one pending.
 *
 * rq->lock is NOT held, but preemption is disabled.
 */
static inline void post_schedule(struct rq *rq)
{
	unsigned long irq_flags;

	/* Nothing was requested for this runqueue. */
	if (!rq->post_schedule)
		return;

	raw_spin_lock_irqsave(&rq->lock, irq_flags);
	if (rq->curr->sched_class->post_schedule)
		rq->curr->sched_class->post_schedule(rq);
	raw_spin_unlock_irqrestore(&rq->lock, irq_flags);

	rq->post_schedule = 0;
}

#else

/* Empty variant used when CONFIG_SMP is not set. */
static inline void post_schedule(struct rq *rq)
{
}

#endif

2304

2309

2305

/**

2310

/**

2306

* schedule_tail - first thing a freshly forked thread must call.

2311

* schedule_tail - first thing a freshly forked thread must call.

2307

* @prev: the thread we just switched away from.

2312

* @prev: the thread we just switched away from.

2308

*/

2313

*/

2309

asmlinkage __visible void schedule_tail(struct task_struct *prev)

2314

asmlinkage __visible void schedule_tail(struct task_struct *prev)

2310

__releases(rq->lock)

2315

__releases(rq->lock)

2311

{

2316

{

2312

struct rq *rq = this_rq();

2317

struct rq *rq = this_rq();

2313

2318

2314

finish_task_switch(rq, prev);

2319

finish_task_switch(rq, prev);

2315

2320

2316

/*

2321

/*

2317

* FIXME: do we need to worry about rq being invalidated by the

2322

* FIXME: do we need to worry about rq being invalidated by the

2318

* task_switch?

2323

* task_switch?

2319

*/

2324

*/

2320

post_schedule(rq);

2325

post_schedule(rq);

2321

2326

2322

if (current->set_child_tid)

2327

if (current->set_child_tid)

2323

put_user(task_pid_vnr(current), current->set_child_tid);

2328

put_user(task_pid_vnr(current), current->set_child_tid);

2324

}

2329

}

2325

2330

2326

/*

2331

/*

2327

* context_switch - switch to the new MM and the new

2332

* context_switch - switch to the new MM and the new

2328

* thread's register state.

2333

* thread's register state.

2329

*/

2334

*/

2330

static inline void

2335

static inline void

2331

context_switch(struct rq *rq, struct task_struct *prev,

2336

context_switch(struct rq *rq, struct task_struct *prev,

2332

struct task_struct *next)

2337

struct task_struct *next)

2333

{

2338

{

2334

struct mm_struct *mm, *oldmm;

2339

struct mm_struct *mm, *oldmm;

2335

2340

2336

prepare_task_switch(rq, prev, next);

2341

prepare_task_switch(rq, prev, next);

2337

2342

2338

mm = next->mm;

2343

mm = next->mm;

2339

oldmm = prev->active_mm;

2344

oldmm = prev->active_mm;

2340

/*

2345

/*

2341

* For paravirt, this is coupled with an exit in switch_to to

2346

* For paravirt, this is coupled with an exit in switch_to to

2342

* combine the page table reload and the switch backend into

2347

* combine the page table reload and the switch backend into

2343

* one hypercall.

2348

* one hypercall.

2344

*/

2349

*/

2345

arch_start_context_switch(prev);

2350

arch_start_context_switch(prev);

2346

2351

2347

if (!mm) {

2352

if (!mm) {

2348

next->active_mm = oldmm;

2353

next->active_mm = oldmm;

2349

atomic_inc(&oldmm->mm_count);

2354

atomic_inc(&oldmm->mm_count);

2350

enter_lazy_tlb(oldmm, next);

2355

enter_lazy_tlb(oldmm, next);

2351

} else

2356

} else

2352

switch_mm(oldmm, mm, next);

2357

switch_mm(oldmm, mm, next);

2353

2358

2354

if (!prev->mm) {

2359

if (!prev->mm) {

2355

prev->active_mm = NULL;

2360

prev->active_mm = NULL;

2356

rq->prev_mm = oldmm;

2361

rq->prev_mm = oldmm;

2357

}

2362

}

2358

/*

2363

/*

2359

* Since the runqueue lock will be released by the next

2364

* Since the runqueue lock will be released by the next

2360

* task (which is an invalid locking op but in the case

2365

* task (which is an invalid locking op but in the case

2361

* of the scheduler it's an obvious special-case), so we

2366

* of the scheduler it's an obvious special-case), so we

2362

* do an early lockdep release here:

2367

* do an early lockdep release here:

2363

*/

2368

*/

2364

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

2369

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

2365

2370

2366

context_tracking_task_switch(prev, next);

2371

context_tracking_task_switch(prev, next);

2367

/* Here we just switch the register state and the stack. */

2372

/* Here we just switch the register state and the stack. */

2368

switch_to(prev, next, prev);

2373

switch_to(prev, next, prev);

2369

2374

2370

barrier();

2375

barrier();

2371

/*

2376

/*

2372

* this_rq must be evaluated again because prev may have moved

2377

* this_rq must be evaluated again because prev may have moved

2373

* CPUs since it called schedule(), thus the 'rq' on its stack

2378

* CPUs since it called schedule(), thus the 'rq' on its stack

2374

* frame will be invalid.

2379

* frame will be invalid.

2375

*/

2380

*/

2376

finish_task_switch(this_rq(), prev);

2381

finish_task_switch(this_rq(), prev);

2377

}

2382

}

2378

2383

2379

/*

2384

/*

2380

* nr_running and nr_context_switches:

2385

* nr_running and nr_context_switches:

2381

*

2386

*

2382

* externally visible scheduler statistics: current number of runnable

2387

* externally visible scheduler statistics: current number of runnable

2383

* threads, total number of context switches performed since bootup.

2388

* threads, total number of context switches performed since bootup.

2384

*/

2389

*/

2385

unsigned long nr_running(void)

2390

unsigned long nr_running(void)

2386

{

2391

{

2387

unsigned long i, sum = 0;

2392

unsigned long i, sum = 0;

2388

2393

2389

for_each_online_cpu(i)

2394

for_each_online_cpu(i)

2390

sum += cpu_rq(i)->nr_running;

2395

sum += cpu_rq(i)->nr_running;

2391

2396

2392

return sum;

2397

return sum;

2393

}

2398

}

2394

2399

2395

/*

2400

/*

2396

* Check if only the current task is running on the cpu.

2401

* Check if only the current task is running on the cpu.

2397

*/

2402

*/

2398

bool single_task_running(void)

2403

bool single_task_running(void)

2399

{

2404

{

2400

if (cpu_rq(smp_processor_id())->nr_running == 1)

2405

if (cpu_rq(smp_processor_id())->nr_running == 1)

2401

return true;

2406

return true;

2402

else

2407

else

2403

return false;

2408

return false;

2404

}

2409

}

2405

EXPORT_SYMBOL(single_task_running);

2410

EXPORT_SYMBOL(single_task_running);

2406

2411

2407

unsigned long long nr_context_switches(void)

2412

unsigned long long nr_context_switches(void)

2408

{

2413

{

2409

int i;

2414

int i;

2410

unsigned long long sum = 0;

2415

unsigned long long sum = 0;

2411

2416

2412

for_each_possible_cpu(i)

2417

for_each_possible_cpu(i)

2413

sum += cpu_rq(i)->nr_switches;

2418

sum += cpu_rq(i)->nr_switches;

2414

2419

2415

return sum;

2420

return sum;

2416

}

2421

}

2417

2422

2418

unsigned long nr_iowait(void)

2423

unsigned long nr_iowait(void)

2419

{

2424

{

2420

unsigned long i, sum = 0;

2425

unsigned long i, sum = 0;

2421

2426

2422

for_each_possible_cpu(i)

2427

for_each_possible_cpu(i)

2423

sum += atomic_read(&cpu_rq(i)->nr_iowait);

2428

sum += atomic_read(&cpu_rq(i)->nr_iowait);

2424

2429

2425

return sum;

2430

return sum;

2426

}

2431

}

2427

2432

2428

unsigned long nr_iowait_cpu(int cpu)

2433

unsigned long nr_iowait_cpu(int cpu)

2429

{

2434

{

2430

struct rq *this = cpu_rq(cpu);

2435

struct rq *this = cpu_rq(cpu);

2431

return atomic_read(&this->nr_iowait);

2436

return atomic_read(&this->nr_iowait);

2432

}

2437

}

2433

2438

2434

void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)

2439

void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)

2435

{

2440

{

2436

struct rq *this = this_rq();

2441

struct rq *this = this_rq();

2437

*nr_waiters = atomic_read(&this->nr_iowait);

2442

*nr_waiters = atomic_read(&this->nr_iowait);

2438

*load = this->cpu_load[0];

2443

*load = this->cpu_load[0];

2439

}

2444

}

2440

2445

2441

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 *
 * Asks the task's scheduling class for a destination CPU and, if that
 * CPU differs from the current one and is active, hands the migration
 * to the stopper via stop_one_cpu().
 */
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);

	if (dest_cpu != smp_processor_id() && likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		/* pi_lock is dropped before calling into the stopper. */
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}

	/* Staying put: either already on dest_cpu or it is not active. */
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

#endif

2470

2475

2471

DEFINE_PER_CPU(struct kernel_stat, kstat);

2476

DEFINE_PER_CPU(struct kernel_stat, kstat);

2472

DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

2477

DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

2473

2478

2474

EXPORT_PER_CPU_SYMBOL(kstat);

2479

EXPORT_PER_CPU_SYMBOL(kstat);

2475

EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

2480

EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

2476

2481

2477

/*

2482

/*

2478

* Return accounted runtime for the task.

2483

* Return accounted runtime for the task.

2479

* In case the task is currently running, return the runtime plus current's

2484

* In case the task is currently running, return the runtime plus current's

2480

* pending runtime that have not been accounted yet.

2485

* pending runtime that have not been accounted yet.

2481

*/

2486

*/

2482

unsigned long long task_sched_runtime(struct task_struct *p)

2487

unsigned long long task_sched_runtime(struct task_struct *p)

2483

{

2488

{

2484

unsigned long flags;

2489

unsigned long flags;

2485

struct rq *rq;

2490

struct rq *rq;

2486

u64 ns;

2491

u64 ns;

2487

2492

2488

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)

2493

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)

2489

/*

2494

/*

2490

* 64-bit doesn't need locks to atomically read a 64bit value.

2495

* 64-bit doesn't need locks to atomically read a 64bit value.

2491

* So we have a optimization chance when the task's delta_exec is 0.

2496

* So we have a optimization chance when the task's delta_exec is 0.

2492

* Reading ->on_cpu is racy, but this is ok.

2497

* Reading ->on_cpu is racy, but this is ok.

2493

*

2498

*

2494

* If we race with it leaving cpu, we'll take a lock. So we're correct.

2499

* If we race with it leaving cpu, we'll take a lock. So we're correct.

2495

* If we race with it entering cpu, unaccounted time is 0. This is

2500

* If we race with it entering cpu, unaccounted time is 0. This is

2496

* indistinguishable from the read occurring a few cycles earlier.

2501

* indistinguishable from the read occurring a few cycles earlier.

2497

* If we see ->on_cpu without ->on_rq, the task is leaving, and has

2502

* If we see ->on_cpu without ->on_rq, the task is leaving, and has

2498

* been accounted, so we're correct here as well.

2503

* been accounted, so we're correct here as well.

2499

*/

2504

*/

2500

if (!p->on_cpu || !task_on_rq_queued(p))

2505

if (!p->on_cpu || !task_on_rq_queued(p))

2501

return p->se.sum_exec_runtime;

2506

return p->se.sum_exec_runtime;

2502

#endif

2507

#endif

2503

2508

2504

rq = task_rq_lock(p, &flags);

2509

rq = task_rq_lock(p, &flags);

2505

/*

2510

/*

2506

* Must be ->curr _and_ ->on_rq. If dequeued, we would

2511

* Must be ->curr _and_ ->on_rq. If dequeued, we would

2507

* project cycles that may never be accounted to this

2512

* project cycles that may never be accounted to this

2508

* thread, breaking clock_gettime().

2513

* thread, breaking clock_gettime().

2509

*/

2514

*/

2510

if (task_current(rq, p) && task_on_rq_queued(p)) {

2515

if (task_current(rq, p) && task_on_rq_queued(p)) {

2511

update_rq_clock(rq);

2516

update_rq_clock(rq);

2512

p->sched_class->update_curr(rq);

2517

p->sched_class->update_curr(rq);

2513

}

2518

}

2514

ns = p->se.sum_exec_runtime;

2519

ns = p->se.sum_exec_runtime;

2515

task_rq_unlock(rq, p, &flags);

2520

task_rq_unlock(rq, p, &flags);

2516

2521

2517

return ns;

2522

return ns;

2518

}

2523

}

2519

2524

2520

/*

2525

/*

2521

* This function gets called by the timer code, with HZ frequency.

2526

* This function gets called by the timer code, with HZ frequency.

2522

* We call it with interrupts disabled.

2527

* We call it with interrupts disabled.

2523

*/

2528

*/

2524

void scheduler_tick(void)

2529

void scheduler_tick(void)

2525

{

2530

{

2526

int cpu = smp_processor_id();

2531

int cpu = smp_processor_id();

2527

struct rq *rq = cpu_rq(cpu);

2532

struct rq *rq = cpu_rq(cpu);

2528

struct task_struct *curr = rq->curr;

2533

struct task_struct *curr = rq->curr;

2529

2534

2530

sched_clock_tick();

2535

sched_clock_tick();

2531

2536

2532

raw_spin_lock(&rq->lock);

2537

raw_spin_lock(&rq->lock);

2533

update_rq_clock(rq);

2538

update_rq_clock(rq);

2534

curr->sched_class->task_tick(rq, curr, 0);

2539

curr->sched_class->task_tick(rq, curr, 0);

2535

update_cpu_load_active(rq);

2540

update_cpu_load_active(rq);

2536

raw_spin_unlock(&rq->lock);

2541

raw_spin_unlock(&rq->lock);

2537

2542

2538

perf_event_task_tick();

2543

perf_event_task_tick();

2539

2544

2540

#ifdef CONFIG_SMP

2545

#ifdef CONFIG_SMP

2541

rq->idle_balance = idle_cpu(cpu);

2546

rq->idle_balance = idle_cpu(cpu);

2542

trigger_load_balance(rq);

2547

trigger_load_balance(rq);

2543

#endif

2548

#endif

2544

rq_last_tick_reset(rq);

2549

rq_last_tick_reset(rq);

2545

}

2550

}

2546

2551

2547

#ifdef CONFIG_NO_HZ_FULL
/**
 * scheduler_tick_max_deferment
 *
 * Keep at least one tick per second when a single
 * active task is running because the scheduler doesn't
 * yet completely support full dynticks environment.
 *
 * This makes sure that uptime, CFS vruntime, load
 * balancing, etc... continue to move forward, even
 * with a very low granularity.
 *
 * Return: Maximum deferment in nanoseconds.
 */
u64 scheduler_tick_max_deferment(void)
{
	struct rq *rq = this_rq();
	unsigned long now = ACCESS_ONCE(jiffies);
	unsigned long deadline = rq->last_sched_tick + HZ;

	/* The one-second window has already elapsed: no deferment left. */
	if (time_before_eq(deadline, now))
		return 0;

	return jiffies_to_nsecs(deadline - now);
}
#endif

2574

2579

2575

notrace unsigned long get_parent_ip(unsigned long addr)

2580

notrace unsigned long get_parent_ip(unsigned long addr)

2576

{

2581

{

2577

if (in_lock_functions(addr)) {

2582

if (in_lock_functions(addr)) {

2578

addr = CALLER_ADDR2;

2583

addr = CALLER_ADDR2;

2579

if (in_lock_functions(addr))

2584

if (in_lock_functions(addr))

2580

addr = CALLER_ADDR3;

2585

addr = CALLER_ADDR3;

2581

}

2586

}

2582

return addr;

2587

return addr;

2583

}

2588

}

2584

2589

2585

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

2590

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

2586

defined(CONFIG_PREEMPT_TRACER))

2591

defined(CONFIG_PREEMPT_TRACER))

2587

2592

2588

void preempt_count_add(int val)

2593

void preempt_count_add(int val)

2589

{

2594

{

2590

#ifdef CONFIG_DEBUG_PREEMPT

2595

#ifdef CONFIG_DEBUG_PREEMPT

2591

/*

2596

/*

2592

* Underflow?

2597

* Underflow?

2593

*/

2598

*/

2594

if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))

2599

if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))

2595

return;

2600

return;

2596

#endif

2601

#endif

2597

__preempt_count_add(val);

2602

__preempt_count_add(val);

2598

#ifdef CONFIG_DEBUG_PREEMPT

2603

#ifdef CONFIG_DEBUG_PREEMPT

2599

/*

2604

/*

2600

* Spinlock count overflowing soon?

2605

* Spinlock count overflowing soon?

2601

*/

2606

*/

2602

DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=

2607

DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=

2603

PREEMPT_MASK - 10);

2608

PREEMPT_MASK - 10);

2604

#endif

2609

#endif

2605

if (preempt_count() == val) {

2610

if (preempt_count() == val) {

2606

unsigned long ip = get_parent_ip(CALLER_ADDR1);

2611

unsigned long ip = get_parent_ip(CALLER_ADDR1);

2607

#ifdef CONFIG_DEBUG_PREEMPT

2612

#ifdef CONFIG_DEBUG_PREEMPT

2608

current->preempt_disable_ip = ip;

2613

current->preempt_disable_ip = ip;

2609

#endif

2614

#endif

2610

trace_preempt_off(CALLER_ADDR0, ip);

2615

trace_preempt_off(CALLER_ADDR0, ip);

2611

}

2616

}

2612

}

2617

}

2613

EXPORT_SYMBOL(preempt_count_add);

2618

EXPORT_SYMBOL(preempt_count_add);

2614

NOKPROBE_SYMBOL(preempt_count_add);

2619

NOKPROBE_SYMBOL(preempt_count_add);

2615

2620

2616

void preempt_count_sub(int val)

2621

void preempt_count_sub(int val)

2617

{

2622

{

2618

#ifdef CONFIG_DEBUG_PREEMPT

2623

#ifdef CONFIG_DEBUG_PREEMPT

2619

/*

2624

/*

2620

* Underflow?

2625

* Underflow?

2621

*/

2626

*/

2622

if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))

2627

if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))

2623

return;

2628

return;

2624

/*

2629

/*

2625

* Is the spinlock portion underflowing?

2630

* Is the spinlock portion underflowing?

2626

*/

2631

*/

2627

if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&

2632

if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&

2628

!(preempt_count() & PREEMPT_MASK)))

2633

!(preempt_count() & PREEMPT_MASK)))

2629

return;

2634

return;

2630

#endif

2635

#endif

2631

2636

2632

if (preempt_count() == val)

2637

if (preempt_count() == val)

2633

trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));

2638

trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));

2634

__preempt_count_sub(val);

2639

__preempt_count_sub(val);

2635

}

2640

}

2636

EXPORT_SYMBOL(preempt_count_sub);

2641

EXPORT_SYMBOL(preempt_count_sub);

2637

NOKPROBE_SYMBOL(preempt_count_sub);

2642

NOKPROBE_SYMBOL(preempt_count_sub);

2638

2643

2639

#endif

2644

#endif

2640

2645

2641

/*

2646

/*

2642

* Print scheduling while atomic bug:

2647

* Print scheduling while atomic bug:

2643

*/

2648

*/

2644

static noinline void __schedule_bug(struct task_struct *prev)

2649

static noinline void __schedule_bug(struct task_struct *prev)

2645

{

2650

{

2646

if (oops_in_progress)

2651

if (oops_in_progress)

2647

return;

2652

return;

2648

2653

2649

printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",

2654

printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",

2650

prev->comm, prev->pid, preempt_count());

2655

prev->comm, prev->pid, preempt_count());

2651

2656

2652

debug_show_held_locks(prev);

2657

debug_show_held_locks(prev);

2653

print_modules();

2658

print_modules();

2654

if (irqs_disabled())

2659

if (irqs_disabled())

2655

print_irqtrace_events(prev);

2660

print_irqtrace_events(prev);

2656

#ifdef CONFIG_DEBUG_PREEMPT

2661

#ifdef CONFIG_DEBUG_PREEMPT

2657

if (in_atomic_preempt_off()) {

2662

if (in_atomic_preempt_off()) {

2658

pr_err("Preemption disabled at:");

2663

pr_err("Preemption disabled at:");

2659

print_ip_sym(current->preempt_disable_ip);

2664

print_ip_sym(current->preempt_disable_ip);

2660

pr_cont("\n");

2665

pr_cont("\n");

2661

}

2666

}

2662

#endif

2667

#endif

2663

dump_stack();

2668

dump_stack();

2664

add_taint(TAINT_WARN, LOCKDEP_STILL_OK);

2669

add_taint(TAINT_WARN, LOCKDEP_STILL_OK);

2665

}

2670

}

2666

2671

2667

/*

2672

/*

2668

* Various schedule()-time debugging checks and statistics:

2673

* Various schedule()-time debugging checks and statistics:

2669

*/

2674

*/

2670

static inline void schedule_debug(struct task_struct *prev)

2675

static inline void schedule_debug(struct task_struct *prev)

2671

{

2676

{

2672

#ifdef CONFIG_SCHED_STACK_END_CHECK

2677

#ifdef CONFIG_SCHED_STACK_END_CHECK

2673

BUG_ON(unlikely(task_stack_end_corrupted(prev)));

2678

BUG_ON(unlikely(task_stack_end_corrupted(prev)));

2674

#endif

2679

#endif

2675

/*

2680

/*

2676

* Test if we are atomic. Since do_exit() needs to call into

2681

* Test if we are atomic. Since do_exit() needs to call into

2677

* schedule() atomically, we ignore that path. Otherwise whine

2682

* schedule() atomically, we ignore that path. Otherwise whine

2678

* if we are scheduling when we should not.

2683

* if we are scheduling when we should not.

2679

*/

2684

*/

2680

if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))

2685

if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))

2681

__schedule_bug(prev);

2686

__schedule_bug(prev);

2682

rcu_sleep_check();

2687

rcu_sleep_check();

2683

2688

2684

profile_hit(SCHED_PROFILING, __builtin_return_address(0));

2689

profile_hit(SCHED_PROFILING, __builtin_return_address(0));

2685

2690

2686

schedstat_inc(this_rq(), sched_count);

2691

schedstat_inc(this_rq(), sched_count);

2687

}

2692

}

2688

2693

2689

/*

2694

/*

2690

* Pick up the highest-prio task:

2695

* Pick up the highest-prio task:

2691

*/

2696

*/

2692

static inline struct task_struct *

2697

static inline struct task_struct *

2693

pick_next_task(struct rq *rq, struct task_struct *prev)

2698

pick_next_task(struct rq *rq, struct task_struct *prev)

2694

{

2699

{

2695

const struct sched_class *class = &fair_sched_class;

2700

const struct sched_class *class = &fair_sched_class;

2696

struct task_struct *p;

2701

struct task_struct *p;

2697

2702

2698

/*

2703

/*

2699

* Optimization: we know that if all tasks are in

2704

* Optimization: we know that if all tasks are in

2700

* the fair class we can call that function directly:

2705

* the fair class we can call that function directly:

2701

*/

2706

*/

2702

if (likely(prev->sched_class == class &&

2707

if (likely(prev->sched_class == class &&

2703

rq->nr_running == rq->cfs.h_nr_running)) {

2708

rq->nr_running == rq->cfs.h_nr_running)) {

2704

p = fair_sched_class.pick_next_task(rq, prev);

2709

p = fair_sched_class.pick_next_task(rq, prev);

2705

if (unlikely(p == RETRY_TASK))

2710

if (unlikely(p == RETRY_TASK))

2706

goto again;

2711

goto again;

2707

2712

2708

/* assumes fair_sched_class->next == idle_sched_class */

2713

/* assumes fair_sched_class->next == idle_sched_class */

2709

if (unlikely(!p))

2714

if (unlikely(!p))

2710

p = idle_sched_class.pick_next_task(rq, prev);

2715

p = idle_sched_class.pick_next_task(rq, prev);

2711

2716

2712

return p;

2717

return p;

2713

}

2718

}

2714

2719

2715

again:

2720

again:

2716

for_each_class(class) {

2721

for_each_class(class) {

2717

p = class->pick_next_task(rq, prev);

2722

p = class->pick_next_task(rq, prev);

2718

if (p) {

2723

if (p) {

2719

if (unlikely(p == RETRY_TASK))

2724

if (unlikely(p == RETRY_TASK))

2720

goto again;

2725

goto again;

2721

return p;

2726

return p;

2722

}

2727

}

2723

}

2728

}

2724

2729

2725

BUG(); /* the idle class will always have a runnable task */

2730

BUG(); /* the idle class will always have a runnable task */

2726

}

2731

}

2727

2732

2728

/*

2733

/*

2729

* __schedule() is the main scheduler function.

2734

* __schedule() is the main scheduler function.

2730

*

2735

*

2731

* The main means of driving the scheduler and thus entering this function are:

2736

* The main means of driving the scheduler and thus entering this function are:

2732

*

2737

*

2733

* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.

2738

* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.

2734

*

2739

*

2735

* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return

2740

* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return

2736

* paths. For example, see arch/x86/entry_64.S.

2741

* paths. For example, see arch/x86/entry_64.S.

2737

*

2742

*

2738

* To drive preemption between tasks, the scheduler sets the flag in timer

2743

* To drive preemption between tasks, the scheduler sets the flag in timer

2739

* interrupt handler scheduler_tick().

2744

* interrupt handler scheduler_tick().

2740

*

2745

*

2741

* 3. Wakeups don't really cause entry into schedule(). They add a

2746

* 3. Wakeups don't really cause entry into schedule(). They add a

2742

* task to the run-queue and that's it.

2747

* task to the run-queue and that's it.

2743

*

2748

*

2744

* Now, if the new task added to the run-queue preempts the current

2749

* Now, if the new task added to the run-queue preempts the current

2745

* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets

2750

* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets

2746

* called on the nearest possible occasion:

2751

* called on the nearest possible occasion:

2747

*

2752

*

2748

* - If the kernel is preemptible (CONFIG_PREEMPT=y):

2753

* - If the kernel is preemptible (CONFIG_PREEMPT=y):

2749

*

2754

*

2750

* - in syscall or exception context, at the next outmost

2755

* - in syscall or exception context, at the next outmost

2751

* preempt_enable(). (this might be as soon as the wake_up()'s

2756

* preempt_enable(). (this might be as soon as the wake_up()'s

2752

* spin_unlock()!)

2757

* spin_unlock()!)

2753

*

2758

*

2754

* - in IRQ context, return from interrupt-handler to

2759

* - in IRQ context, return from interrupt-handler to

2755

* preemptible context

2760

* preemptible context

2756

*

2761

*

2757

* - If the kernel is not preemptible (CONFIG_PREEMPT is not set)

2762

* - If the kernel is not preemptible (CONFIG_PREEMPT is not set)

2758

* then at the next:

2763

* then at the next:

2759

*

2764

*

2760

* - cond_resched() call

2765

* - cond_resched() call

2761

* - explicit schedule() call

2766

* - explicit schedule() call

2762

* - return from syscall or exception to user-space

2767

* - return from syscall or exception to user-space

2763

* - return from interrupt-handler to user-space

2768

* - return from interrupt-handler to user-space

2764

*/

2769

*/

2765

/*
 * __schedule - one scheduling pass on the local CPU.
 *
 * Picks the next task for this CPU's runqueue and, if it differs from the
 * task that was running, switches to it.  The pass runs with preemption
 * disabled and is repeated for as long as need_resched() is still set once
 * preemption has been re-enabled.
 */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irq(&rq->lock);

	/*
	 * Account the switch as involuntary by default; the voluntary
	 * counter is selected below when prev has a non-zero state and we
	 * did not get here through a PREEMPT_ACTIVE preemption.
	 */
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			/* A signal is pending: keep prev runnable. */
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}

	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
		update_rq_clock(rq);

	next = pick_next_task(rq, prev);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * The context switch has flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else {
		/* Nothing to switch to: just drop the runqueue lock. */
		raw_spin_unlock_irq(&rq->lock);
	}

	post_schedule(rq);

	sched_preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}

2847

2852

2848

static inline void sched_submit_work(struct task_struct *tsk)

2853

static inline void sched_submit_work(struct task_struct *tsk)

2849

{

2854

{

2850

if (!tsk->state || tsk_is_pi_blocked(tsk))

2855

if (!tsk->state || tsk_is_pi_blocked(tsk))

2851

return;

2856

return;

2852

/*

2857

/*

2853

* If we are going to sleep and we have plugged IO queued,

2858

* If we are going to sleep and we have plugged IO queued,

2854

* make sure to submit it to avoid deadlocks.

2859

* make sure to submit it to avoid deadlocks.

2855

*/

2860

*/

2856

if (blk_needs_flush_plug(tsk))

2861

if (blk_needs_flush_plug(tsk))

2857

blk_schedule_flush_plug(tsk);

2862

blk_schedule_flush_plug(tsk);

2858

}

2863

}

2859

2864

2860

/*
 * schedule - main scheduler entry point for the current task.
 *
 * Gives sched_submit_work() a chance to flush pending work for the
 * current task, then runs __schedule().
 */
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
EXPORT_SYMBOL(schedule);

2868

2873

2869

#ifdef CONFIG_CONTEXT_TRACKING
/*
 * schedule_user - schedule() wrapped in exception_enter()/exception_exit().
 *
 * The context state returned by exception_enter() is handed back to
 * exception_exit() once schedule() returns.
 */
asmlinkage __visible void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to set_need_resched(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 *
	 * NB: There are buggy callers of this function.  Ideally we
	 * should warn if prev_state != IN_USER, but that will trigger
	 * too frequently to make sense yet.
	 */
	enum ctx_state prev_state = exception_enter();

	schedule();
	exception_exit(prev_state);
}
#endif

2887

2892

2888

/**

2893

/**

2889

* schedule_preempt_disabled - called with preemption disabled

2894

* schedule_preempt_disabled - called with preemption disabled

2890

*

2895

*

2891

* Returns with preemption disabled. Note: preempt_count must be 1

2896

* Returns with preemption disabled. Note: preempt_count must be 1

2892

*/

2897

*/

2893

void __sched schedule_preempt_disabled(void)

2898

void __sched schedule_preempt_disabled(void)

2894

{

2899

{

2895

sched_preempt_enable_no_resched();

2900

sched_preempt_enable_no_resched();

2896

schedule();

2901

schedule();

2897

preempt_disable();

2902

preempt_disable();

2898

}

2903

}

2899

2904

2900

#ifdef CONFIG_PREEMPT

2905

#ifdef CONFIG_PREEMPT

2901

/*

2906

/*

2902

* this is the entry point to schedule() from in-kernel preemption

2907

* this is the entry point to schedule() from in-kernel preemption

2903

* off of preempt_enable. Kernel preemptions off return from interrupt

2908

* off of preempt_enable. Kernel preemptions off return from interrupt

2904

* occur there and call schedule directly.

2909

* occur there and call schedule directly.

2905

*/

2910

*/

2906

asmlinkage __visible void __sched notrace preempt_schedule(void)

2911

asmlinkage __visible void __sched notrace preempt_schedule(void)

2907

{

2912

{

2908

/*

2913

/*

2909

* If there is a non-zero preempt_count or interrupts are disabled,

2914

* If there is a non-zero preempt_count or interrupts are disabled,

2910

* we do not want to preempt the current task. Just return..

2915

* we do not want to preempt the current task. Just return..

2911

*/

2916

*/

2912

if (likely(!preemptible()))

2917

if (likely(!preemptible()))

2913

return;

2918

return;

2914

2919

2915

do {

2920

do {

2916

__preempt_count_add(PREEMPT_ACTIVE);

2921

__preempt_count_add(PREEMPT_ACTIVE);

2917

__schedule();

2922

__schedule();

2918

__preempt_count_sub(PREEMPT_ACTIVE);

2923

__preempt_count_sub(PREEMPT_ACTIVE);

2919

2924

2920

/*

2925

/*

2921

* Check again in case we missed a preemption opportunity

2926

* Check again in case we missed a preemption opportunity

2922

* between schedule and now.

2927

* between schedule and now.

2923

*/

2928

*/

2924

barrier();

2929

barrier();

2925

} while (need_resched());

2930

} while (need_resched());

2926

}

2931

}

2927

NOKPROBE_SYMBOL(preempt_schedule);

2932

NOKPROBE_SYMBOL(preempt_schedule);

2928

EXPORT_SYMBOL(preempt_schedule);

2933

EXPORT_SYMBOL(preempt_schedule);

2929

2934

2930

#ifdef CONFIG_CONTEXT_TRACKING
/**
 * preempt_schedule_context - preempt_schedule called by tracing
 *
 * The tracing infrastructure uses preempt_enable_notrace to prevent
 * recursion and tracing preempt enabling caused by the tracing
 * infrastructure itself. But as tracing can happen in areas coming
 * from userspace or just about to enter userspace, a preempt enable
 * can occur before user_exit() is called. This will cause the scheduler
 * to be called when the system is still in usermode.
 *
 * To prevent this, the preempt_enable_notrace will use this function
 * instead of preempt_schedule() to exit user context if needed before
 * calling the scheduler.
 */
asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
	enum ctx_state prev_ctx;

	if (likely(!preemptible()))
		return;

	do {
		__preempt_count_add(PREEMPT_ACTIVE);
		/*
		 * Needs preempt disabled in case user_exit() is traced
		 * and the tracer calls preempt_enable_notrace() causing
		 * an infinite recursion.
		 */
		prev_ctx = exception_enter();
		__schedule();
		exception_exit(prev_ctx);

		__preempt_count_sub(PREEMPT_ACTIVE);
		/* Compiler barrier before re-checking need_resched(). */
		barrier();
	} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_context);
#endif /* CONFIG_CONTEXT_TRACKING */

2969

2974

2970

#endif /* CONFIG_PREEMPT */

2975

#endif /* CONFIG_PREEMPT */

2971

2976

2972

/*

2977

/*

2973

* this is the entry point to schedule() from kernel preemption

2978

* this is the entry point to schedule() from kernel preemption

2974

* off of irq context.

2979

* off of irq context.

2975

* Note, that this is called and return with irqs disabled. This will

2980

* Note, that this is called and return with irqs disabled. This will

2976

* protect us against recursive calling from irq.

2981

* protect us against recursive calling from irq.

2977

*/

2982

*/

2978

asmlinkage __visible void __sched preempt_schedule_irq(void)

2983

asmlinkage __visible void __sched preempt_schedule_irq(void)

2979

{

2984

{

2980

enum ctx_state prev_state;

2985

enum ctx_state prev_state;

2981

2986

2982

/* Catch callers which need to be fixed */

2987

/* Catch callers which need to be fixed */

2983

BUG_ON(preempt_count() || !irqs_disabled());

2988

BUG_ON(preempt_count() || !irqs_disabled());

2984

2989

2985

prev_state = exception_enter();

2990

prev_state = exception_enter();

2986

2991

2987

do {

2992

do {

2988

__preempt_count_add(PREEMPT_ACTIVE);

2993

__preempt_count_add(PREEMPT_ACTIVE);

2989

local_irq_enable();

2994

local_irq_enable();

2990

__schedule();

2995

__schedule();

2991

local_irq_disable();

2996

local_irq_disable();

2992

__preempt_count_sub(PREEMPT_ACTIVE);

2997

__preempt_count_sub(PREEMPT_ACTIVE);

2993

2998

2994

/*

2999

/*

2995

* Check again in case we missed a preemption opportunity

3000

* Check again in case we missed a preemption opportunity

2996

* between schedule and now.

3001

* between schedule and now.

2997

*/

3002

*/

2998

barrier();

3003

barrier();

2999

} while (need_resched());

3004

} while (need_resched());

3000

3005

3001

exception_exit(prev_state);

3006

exception_exit(prev_state);

3002

}

3007

}

3003

3008

3004

/*
 * default_wake_function - wake the task stored in a wait-queue entry.
 * @curr: wait-queue entry; its ->private field is passed as the task to wake
 * @mode: forwarded unchanged to try_to_wake_up()
 * @wake_flags: forwarded unchanged to try_to_wake_up()
 * @key: unused here
 *
 * Return: whatever try_to_wake_up() returns.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);

3010

3015

3011

#ifdef CONFIG_RT_MUTEXES

/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. Call site only calls if the priority of the task changed.
 *
 * The task is taken off its runqueue (and put as previous task if it is
 * the one currently running) while ->sched_class and ->prio are updated,
 * then restored, all under the task's runqueue lock.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	int oldprio, queued, running, enqueue_flag = 0;
	struct rq *rq;
	const struct sched_class *prev_class;

	BUG_ON(prio > MAX_PRIO);

	rq = __task_rq_lock(p);

	/*
	 * Idle task boosting is a nono in general. There is one
	 * exception, when PREEMPT_RT and NOHZ is active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
	 */
	if (unlikely(p == rq->idle)) {
		WARN_ON(p != rq->curr);
		WARN_ON(p->pi_blocked_on);
		goto out_unlock;
	}

	trace_sched_pi_setprio(p, prio);
	oldprio = p->prio;
	prev_class = p->sched_class;
	queued = task_on_rq_queued(p);
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, 0);
	if (running)
		put_prev_task(rq, p);

	/*
	 * Boosting conditions are:
	 * 1. -rt task is running and holds mutex A
	 *      --> -dl task blocks on mutex A
	 *
	 * 2. -dl task is running and holds mutex A
	 *      --> -dl task blocks on mutex A and could preempt the
	 *          running task
	 */
	if (dl_prio(prio)) {
		struct task_struct *pi_task = rt_mutex_get_top_task(p);

		if (!dl_prio(p->normal_prio) ||
		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
			p->dl.dl_boosted = 1;
			p->dl.dl_throttled = 0;
			enqueue_flag = ENQUEUE_REPLENISH;
		} else {
			p->dl.dl_boosted = 0;
		}
		p->sched_class = &dl_sched_class;
	} else if (rt_prio(prio)) {
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		/* Numerically larger new prio: requeue at the head. */
		if (oldprio < prio)
			enqueue_flag = ENQUEUE_HEAD;
		p->sched_class = &rt_sched_class;
	} else {
		if (dl_prio(oldprio))
			p->dl.dl_boosted = 0;
		p->sched_class = &fair_sched_class;
	}

	p->prio = prio;

	if (running)
		p->sched_class->set_curr_task(rq);
	if (queued)
		enqueue_task(rq, p, enqueue_flag);

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
	__task_rq_unlock(rq);
}
#endif

3105

3110

3106

void set_user_nice(struct task_struct *p, long nice)

3111

void set_user_nice(struct task_struct *p, long nice)

3107

{

3112

{

3108

int old_prio, delta, queued;

3113

int old_prio, delta, queued;

3109

unsigned long flags;

3114

unsigned long flags;

3110

struct rq *rq;

3115

struct rq *rq;

3111

3116

3112

if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)

3117

if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)

3113

return;

3118

return;

3114

/*

3119

/*

3115

* We have to be careful, if called from sys_setpriority(),

3120

* We have to be careful, if called from sys_setpriority(),

3116

* the task might be in the middle of scheduling on another CPU.

3121

* the task might be in the middle of scheduling on another CPU.

3117

*/

3122

*/

3118

rq = task_rq_lock(p, &flags);

3123

rq = task_rq_lock(p, &flags);

3119

/*

3124

/*

3120

* The RT priorities are set via sched_setscheduler(), but we still

3125

* The RT priorities are set via sched_setscheduler(), but we still

3121

* allow the 'normal' nice value to be set - but as expected

3126

* allow the 'normal' nice value to be set - but as expected

3122

* it wont have any effect on scheduling until the task is

3127

* it wont have any effect on scheduling until the task is

3123

* SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:

3128

* SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:

3124

*/

3129

*/

3125

if (task_has_dl_policy(p) || task_has_rt_policy(p)) {

3130

if (task_has_dl_policy(p) || task_has_rt_policy(p)) {

3126

p->static_prio = NICE_TO_PRIO(nice);

3131

p->static_prio = NICE_TO_PRIO(nice);

3127

goto out_unlock;

3132

goto out_unlock;

3128

}

3133

}

3129

queued = task_on_rq_queued(p);

3134

queued = task_on_rq_queued(p);

3130

if (queued)

3135

if (queued)

3131

dequeue_task(rq, p, 0);

3136

dequeue_task(rq, p, 0);

3132

3137

3133

p->static_prio = NICE_TO_PRIO(nice);

3138

p->static_prio = NICE_TO_PRIO(nice);

3134

set_load_weight(p);

3139

set_load_weight(p);

3135

old_prio = p->prio;

3140

old_prio = p->prio;

3136

p->prio = effective_prio(p);

3141

p->prio = effective_prio(p);

3137

delta = p->prio - old_prio;

3142

delta = p->prio - old_prio;

3138

3143

3139

if (queued) {

3144

if (queued) {

3140

enqueue_task(rq, p, 0);

3145

enqueue_task(rq, p, 0);

3141

/*

3146

/*

3142

* If the task increased its priority or is running and

3147

* If the task increased its priority or is running and

3143

* lowered its priority, then reschedule its CPU:

3148

* lowered its priority, then reschedule its CPU:

3144

*/

3149

*/

3145

if (delta < 0 || (delta > 0 && task_running(rq, p)))

3150

if (delta < 0 || (delta > 0 && task_running(rq, p)))

3146

resched_curr(rq);

3151

resched_curr(rq);

3147

}

3152

}

3148

out_unlock:

3153

out_unlock:

3149

task_rq_unlock(rq, p, &flags);

3154

task_rq_unlock(rq, p, &flags);

3150

}

3155

}

3151

EXPORT_SYMBOL(set_user_nice);

3156

EXPORT_SYMBOL(set_user_nice);

3152

3157

3153

/*

3158

/*

3154

* can_nice - check if a task can reduce its nice value

3159

* can_nice - check if a task can reduce its nice value

3155

* @p: task

3160

* @p: task

3156

* @nice: nice value

3161

* @nice: nice value

3157

*/

3162

*/

3158

int can_nice(const struct task_struct *p, const int nice)

3163

int can_nice(const struct task_struct *p, const int nice)

3159

{

3164

{

3160

/* convert nice value [19,-20] to rlimit style value [1,40] */

3165

/* convert nice value [19,-20] to rlimit style value [1,40] */

3161

int nice_rlim = nice_to_rlimit(nice);

3166

int nice_rlim = nice_to_rlimit(nice);

3162

3167

3163

return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||

3168

return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||

3164

capable(CAP_SYS_NICE));

3169

capable(CAP_SYS_NICE));

3165

}

3170

}

3166

3171

3167

#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 *
 * Return: 0 on success, -EPERM when a negative increment is refused by
 * can_nice(), or the non-zero value returned by security_task_setnice().
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
	nice = task_nice(current) + increment;

	/* Keep the result inside the valid nice range before checking it. */
	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}

#endif

3201

3206

3202

/**

3207

/**

3203

* task_prio - return the priority value of a given task.

3208

* task_prio - return the priority value of a given task.

3204

* @p: the task in question.

3209

* @p: the task in question.

3205

*

3210

*

3206

* Return: The priority value as seen by users in /proc.

3211

* Return: The priority value as seen by users in /proc.

3207

* RT tasks are offset by -200. Normal tasks are centered

3212

* RT tasks are offset by -200. Normal tasks are centered

3208

* around 0, value goes from -16 to +15.

3213

* around 0, value goes from -16 to +15.

3209

*/

3214

*/

3210

int task_prio(const struct task_struct *p)

3215

int task_prio(const struct task_struct *p)

3211

{

3216

{

3212

return p->prio - MAX_RT_PRIO;

3217

return p->prio - MAX_RT_PRIO;

3213

}

3218

}

3214

3219

3215

/**

3220

/**

3216

* idle_cpu - is a given cpu idle currently?

3221

* idle_cpu - is a given cpu idle currently?

3217

* @cpu: the processor in question.

3222

* @cpu: the processor in question.

3218

*

3223

*

3219

* Return: 1 if the CPU is currently idle. 0 otherwise.

3224

* Return: 1 if the CPU is currently idle. 0 otherwise.

3220

*/

3225

*/

3221

int idle_cpu(int cpu)

3226

int idle_cpu(int cpu)

3222

{

3227

{

3223

struct rq *rq = cpu_rq(cpu);

3228

struct rq *rq = cpu_rq(cpu);

3224

3229

3225

if (rq->curr != rq->idle)

3230

if (rq->curr != rq->idle)

3226

return 0;

3231

return 0;

3227

3232

3228

if (rq->nr_running)

3233

if (rq->nr_running)

3229

return 0;

3234

return 0;

3230

3235

3231

#ifdef CONFIG_SMP

3236

#ifdef CONFIG_SMP

3232

if (!llist_empty(&rq->wake_list))

3237

if (!llist_empty(&rq->wake_list))

3233

return 0;

3238

return 0;

3234

#endif

3239

#endif

3235

3240

3236

return 1;

3241

return 1;

3237

}

3242

}

3238

3243

3239

/**

3244

/**

3240

* idle_task - return the idle task for a given cpu.

3245

* idle_task - return the idle task for a given cpu.

3241

* @cpu: the processor in question.

3246

* @cpu: the processor in question.

3242

*

3247

*

3243

* Return: The idle task for the cpu @cpu.

3248

* Return: The idle task for the cpu @cpu.

3244

*/

3249

*/

3245

struct task_struct *idle_task(int cpu)

3250

struct task_struct *idle_task(int cpu)

3246

{

3251

{

3247

return cpu_rq(cpu)->idle;

3252

return cpu_rq(cpu)->idle;

3248

}

3253

}

3249

3254

3250

/**

3255

/**

3251

* find_process_by_pid - find a process with a matching PID value.

3256

* find_process_by_pid - find a process with a matching PID value.

3252

* @pid: the pid in question.

3257

* @pid: the pid in question.

3253

*

3258

*

3254

* The task of @pid, if found. %NULL otherwise.

3259

* The task of @pid, if found. %NULL otherwise.

3255

*/

3260

*/

3256

static struct task_struct *find_process_by_pid(pid_t pid)

3261

static struct task_struct *find_process_by_pid(pid_t pid)

3257

{

3262

{

3258

return pid ? find_task_by_vpid(pid) : current;

3263

return pid ? find_task_by_vpid(pid) : current;

3259

}

3264

}

3260

3265

3261

/*

3266

/*

3262

* This function initializes the sched_dl_entity of a newly becoming

3267

* This function initializes the sched_dl_entity of a newly becoming

3263

* SCHED_DEADLINE task.

3268

* SCHED_DEADLINE task.

3264

*

3269

*

3265

* Only the static values are considered here, the actual runtime and the

3270

* Only the static values are considered here, the actual runtime and the

3266

* absolute deadline will be properly calculated when the task is enqueued

3271

* absolute deadline will be properly calculated when the task is enqueued

3267

* for the first time with its new policy.

3272

* for the first time with its new policy.

3268

*/

3273

*/

3269

static void

3274

static void

3270

__setparam_dl(struct task_struct *p, const struct sched_attr *attr)

3275

__setparam_dl(struct task_struct *p, const struct sched_attr *attr)

3271

{

3276

{

3272

struct sched_dl_entity *dl_se = &p->dl;

3277

struct sched_dl_entity *dl_se = &p->dl;

3273

3278

3274

init_dl_task_timer(dl_se);

3279

init_dl_task_timer(dl_se);

3275

dl_se->dl_runtime = attr->sched_runtime;

3280

dl_se->dl_runtime = attr->sched_runtime;

3276

dl_se->dl_deadline = attr->sched_deadline;

3281

dl_se->dl_deadline = attr->sched_deadline;

3277

dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;

3282

dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;

3278

dl_se->flags = attr->sched_flags;

3283

dl_se->flags = attr->sched_flags;

3279

dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);

3284

dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);

3280

dl_se->dl_throttled = 0;

3285

dl_se->dl_throttled = 0;

3281

dl_se->dl_new = 1;

3286

dl_se->dl_new = 1;

3282

dl_se->dl_yielded = 0;

3287

dl_se->dl_yielded = 0;

3283

}

3288

}

3284

3289

3285

/*
 * sched_setparam() passes in -1 for its policy, to let the functions
 * it calls know not to change it.
 */
#define SETPARAM_POLICY	(-1)

3290

3295

3291

static void __setscheduler_params(struct task_struct *p,

3296

static void __setscheduler_params(struct task_struct *p,

3292

const struct sched_attr *attr)

3297

const struct sched_attr *attr)

3293

{

3298

{

3294

int policy = attr->sched_policy;

3299

int policy = attr->sched_policy;

3295

3300

3296

if (policy == SETPARAM_POLICY)

3301

if (policy == SETPARAM_POLICY)

3297

policy = p->policy;

3302

policy = p->policy;

3298

3303

3299

p->policy = policy;

3304

p->policy = policy;

3300

3305

3301

if (dl_policy(policy))

3306

if (dl_policy(policy))

3302

__setparam_dl(p, attr);

3307

__setparam_dl(p, attr);

3303

else if (fair_policy(policy))

3308

else if (fair_policy(policy))

3304

p->static_prio = NICE_TO_PRIO(attr->sched_nice);

3309

p->static_prio = NICE_TO_PRIO(attr->sched_nice);

3305

3310

3306

/*

3311

/*

3307

* __sched_setscheduler() ensures attr->sched_priority == 0 when

3312

* __sched_setscheduler() ensures attr->sched_priority == 0 when

3308

* !rt_policy. Always setting this ensures that things like

3313

* !rt_policy. Always setting this ensures that things like

3309

* getparam()/getattr() don't report silly values for !rt tasks.

3314

* getparam()/getattr() don't report silly values for !rt tasks.

3310

*/

3315

*/

3311

p->rt_priority = attr->sched_priority;

3316

p->rt_priority = attr->sched_priority;

3312

p->normal_prio = normal_prio(p);

3317

p->normal_prio = normal_prio(p);

3313

set_load_weight(p);

3318

set_load_weight(p);

3314

}

3319

}

3315

3320

3316

/* Actually do priority change: must hold pi & rq lock. */

3321

/* Actually do priority change: must hold pi & rq lock. */

3317

static void __setscheduler(struct rq *rq, struct task_struct *p,

3322

static void __setscheduler(struct rq *rq, struct task_struct *p,

3318

const struct sched_attr *attr)

3323

const struct sched_attr *attr)

3319

{

3324

{

3320

__setscheduler_params(p, attr);

3325

__setscheduler_params(p, attr);

3321

3326

3322

/*

3327

/*

3323

* If we get here, there was no pi waiters boosting the

3328

* If we get here, there was no pi waiters boosting the

3324

* task. It is safe to use the normal prio.

3329

* task. It is safe to use the normal prio.

3325

*/

3330

*/

3326

p->prio = normal_prio(p);

3331

p->prio = normal_prio(p);

3327

3332

3328

if (dl_prio(p->prio))

3333

if (dl_prio(p->prio))

3329

p->sched_class = &dl_sched_class;

3334

p->sched_class = &dl_sched_class;

3330

else if (rt_prio(p->prio))

3335

else if (rt_prio(p->prio))

3331

p->sched_class = &rt_sched_class;

3336

p->sched_class = &rt_sched_class;

3332

else

3337

else

3333

p->sched_class = &fair_sched_class;

3338

p->sched_class = &fair_sched_class;

3334

}

3339

}

3335

3340

3336

static void

3341

static void

3337

__getparam_dl(struct task_struct *p, struct sched_attr *attr)

3342

__getparam_dl(struct task_struct *p, struct sched_attr *attr)

3338

{

3343

{

3339

struct sched_dl_entity *dl_se = &p->dl;

3344

struct sched_dl_entity *dl_se = &p->dl;

3340

3345

3341

attr->sched_priority = p->rt_priority;

3346

attr->sched_priority = p->rt_priority;

3342

attr->sched_runtime = dl_se->dl_runtime;

3347

attr->sched_runtime = dl_se->dl_runtime;

3343

attr->sched_deadline = dl_se->dl_deadline;

3348

attr->sched_deadline = dl_se->dl_deadline;

3344

attr->sched_period = dl_se->dl_period;

3349

attr->sched_period = dl_se->dl_period;

3345

attr->sched_flags = dl_se->flags;

3350

attr->sched_flags = dl_se->flags;

3346

}

3351

}

3347

3352

3348

/*

3353

/*

3349

* This function validates the new parameters of a -deadline task.

3354

* This function validates the new parameters of a -deadline task.

3350

* We ask for the deadline not being zero, and greater or equal

3355

* We ask for the deadline not being zero, and greater or equal

3351

* than the runtime, as well as the period of being zero or

3356

* than the runtime, as well as the period of being zero or

3352

* greater than deadline. Furthermore, we have to be sure that

3357

* greater than deadline. Furthermore, we have to be sure that

3353

* user parameters are above the internal resolution of 1us (we

3358

* user parameters are above the internal resolution of 1us (we

3354

* check sched_runtime only since it is always the smaller one) and

3359

* check sched_runtime only since it is always the smaller one) and

3355

* below 2^63 ns (we have to check both sched_deadline and

3360

* below 2^63 ns (we have to check both sched_deadline and

3356

* sched_period, as the latter can be zero).

3361

* sched_period, as the latter can be zero).

3357

*/

3362

*/

3358

static bool

3363

static bool

3359

__checkparam_dl(const struct sched_attr *attr)

3364

__checkparam_dl(const struct sched_attr *attr)

3360

{

3365

{

3361

/* deadline != 0 */

3366

/* deadline != 0 */

3362

if (attr->sched_deadline == 0)

3367

if (attr->sched_deadline == 0)

3363

return false;

3368

return false;

3364

3369

3365

/*

3370

/*

3366

* Since we truncate DL_SCALE bits, make sure we're at least

3371

* Since we truncate DL_SCALE bits, make sure we're at least

3367

* that big.

3372

* that big.

3368

*/

3373

*/

3369

if (attr->sched_runtime < (1ULL << DL_SCALE))

3374

if (attr->sched_runtime < (1ULL << DL_SCALE))

3370

return false;

3375

return false;

3371

3376

3372

/*

3377

/*

3373

* Since we use the MSB for wrap-around and sign issues, make

3378

* Since we use the MSB for wrap-around and sign issues, make

3374

* sure it's not set (mind that period can be equal to zero).

3379

* sure it's not set (mind that period can be equal to zero).

3375

*/

3380

*/

3376

if (attr->sched_deadline & (1ULL << 63) ||

3381

if (attr->sched_deadline & (1ULL << 63) ||

3377

attr->sched_period & (1ULL << 63))

3382

attr->sched_period & (1ULL << 63))

3378

return false;

3383

return false;

3379

3384

3380

/* runtime <= deadline <= period (if period != 0) */

3385

/* runtime <= deadline <= period (if period != 0) */

3381

if ((attr->sched_period != 0 &&

3386

if ((attr->sched_period != 0 &&

3382

attr->sched_period < attr->sched_deadline) ||

3387

attr->sched_period < attr->sched_deadline) ||

3383

attr->sched_deadline < attr->sched_runtime)

3388

attr->sched_deadline < attr->sched_runtime)

3384

return false;

3389

return false;

3385

3390

3386

return true;

3391

return true;

3387

}

3392

}

3388

3393

3389

/*

3394

/*

3390

* check the target process has a UID that matches the current process's

3395

* check the target process has a UID that matches the current process's

3391

*/

3396

*/

3392

static bool check_same_owner(struct task_struct *p)

3397

static bool check_same_owner(struct task_struct *p)

3393

{

3398

{

3394

const struct cred *cred = current_cred(), *pcred;

3399

const struct cred *cred = current_cred(), *pcred;

3395

bool match;

3400

bool match;

3396

3401

3397

rcu_read_lock();

3402

rcu_read_lock();

3398

pcred = __task_cred(p);

3403

pcred = __task_cred(p);

3399

match = (uid_eq(cred->euid, pcred->euid) ||

3404

match = (uid_eq(cred->euid, pcred->euid) ||

3400

uid_eq(cred->euid, pcred->uid));

3405

uid_eq(cred->euid, pcred->uid));

3401

rcu_read_unlock();

3406

rcu_read_unlock();

3402

return match;

3407

return match;

3403

}

3408

}

3404

3409

3405

static int __sched_setscheduler(struct task_struct *p,

3410

static int __sched_setscheduler(struct task_struct *p,

3406

const struct sched_attr *attr,

3411

const struct sched_attr *attr,

3407

bool user)

3412

bool user)

3408

{

3413

{

3409

int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :

3414

int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :

3410

MAX_RT_PRIO - 1 - attr->sched_priority;

3415

MAX_RT_PRIO - 1 - attr->sched_priority;

3411

int retval, oldprio, oldpolicy = -1, queued, running;

3416

int retval, oldprio, oldpolicy = -1, queued, running;

3412

int policy = attr->sched_policy;

3417

int policy = attr->sched_policy;

3413

unsigned long flags;

3418

unsigned long flags;

3414

const struct sched_class *prev_class;

3419

const struct sched_class *prev_class;

3415

struct rq *rq;

3420

struct rq *rq;

3416

int reset_on_fork;

3421

int reset_on_fork;

3417

3422

3418

/* may grab non-irq protected spin_locks */

3423

/* may grab non-irq protected spin_locks */

3419

BUG_ON(in_interrupt());

3424

BUG_ON(in_interrupt());

3420

recheck:

3425

recheck:

3421

/* double check policy once rq lock held */

3426

/* double check policy once rq lock held */

3422

if (policy < 0) {

3427

if (policy < 0) {

3423

reset_on_fork = p->sched_reset_on_fork;

3428

reset_on_fork = p->sched_reset_on_fork;

3424

policy = oldpolicy = p->policy;

3429

policy = oldpolicy = p->policy;

3425

} else {

3430

} else {

3426

reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

3431

reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

3427

3432

3428

if (policy != SCHED_DEADLINE &&

3433

if (policy != SCHED_DEADLINE &&

3429

policy != SCHED_FIFO && policy != SCHED_RR &&

3434

policy != SCHED_FIFO && policy != SCHED_RR &&

3430

policy != SCHED_NORMAL && policy != SCHED_BATCH &&

3435

policy != SCHED_NORMAL && policy != SCHED_BATCH &&

3431

policy != SCHED_IDLE)

3436

policy != SCHED_IDLE)

3432

return -EINVAL;

3437

return -EINVAL;

3433

}

3438

}

3434

3439

3435

if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))

3440

if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))

3436

return -EINVAL;

3441

return -EINVAL;

3437

3442

3438

/*

3443

/*

3439

* Valid priorities for SCHED_FIFO and SCHED_RR are

3444

* Valid priorities for SCHED_FIFO and SCHED_RR are

3440

* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,

3445

* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,

3441

* SCHED_BATCH and SCHED_IDLE is 0.

3446

* SCHED_BATCH and SCHED_IDLE is 0.

3442

*/

3447

*/

3443

if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||

3448

if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||

3444

(!p->mm && attr->sched_priority > MAX_RT_PRIO-1))

3449

(!p->mm && attr->sched_priority > MAX_RT_PRIO-1))

3445

return -EINVAL;

3450

return -EINVAL;

3446

if ((dl_policy(policy) && !__checkparam_dl(attr)) ||

3451

if ((dl_policy(policy) && !__checkparam_dl(attr)) ||

3447

(rt_policy(policy) != (attr->sched_priority != 0)))

3452

(rt_policy(policy) != (attr->sched_priority != 0)))

3448

return -EINVAL;

3453

return -EINVAL;

3449

3454

3450

/*

3455

/*

3451

* Allow unprivileged RT tasks to decrease priority:

3456

* Allow unprivileged RT tasks to decrease priority:

3452

*/

3457

*/

3453

if (user && !capable(CAP_SYS_NICE)) {

3458

if (user && !capable(CAP_SYS_NICE)) {

3454

if (fair_policy(policy)) {

3459

if (fair_policy(policy)) {

3455

if (attr->sched_nice < task_nice(p) &&

3460

if (attr->sched_nice < task_nice(p) &&

3456

!can_nice(p, attr->sched_nice))

3461

!can_nice(p, attr->sched_nice))

3457

return -EPERM;

3462

return -EPERM;

3458

}

3463

}

3459

3464

3460

if (rt_policy(policy)) {

3465

if (rt_policy(policy)) {

3461

unsigned long rlim_rtprio =

3466

unsigned long rlim_rtprio =

3462

task_rlimit(p, RLIMIT_RTPRIO);

3467

task_rlimit(p, RLIMIT_RTPRIO);

3463

3468

3464

/* can't set/change the rt policy */

3469

/* can't set/change the rt policy */

3465

if (policy != p->policy && !rlim_rtprio)

3470

if (policy != p->policy && !rlim_rtprio)

3466

return -EPERM;

3471

return -EPERM;

3467

3472

3468

/* can't increase priority */

3473

/* can't increase priority */

3469

if (attr->sched_priority > p->rt_priority &&

3474

if (attr->sched_priority > p->rt_priority &&

3470

attr->sched_priority > rlim_rtprio)

3475

attr->sched_priority > rlim_rtprio)

3471

return -EPERM;

3476

return -EPERM;

3472

}

3477

}

3473

3478

3474

/*

3479

/*

3475

* Can't set/change SCHED_DEADLINE policy at all for now

3480

* Can't set/change SCHED_DEADLINE policy at all for now

3476

* (safest behavior); in the future we would like to allow

3481

* (safest behavior); in the future we would like to allow

3477

* unprivileged DL tasks to increase their relative deadline

3482

* unprivileged DL tasks to increase their relative deadline

3478

* or reduce their runtime (both ways reducing utilization)

3483

* or reduce their runtime (both ways reducing utilization)

3479

*/

3484

*/

3480

if (dl_policy(policy))

3485

if (dl_policy(policy))

3481

return -EPERM;

3486

return -EPERM;

3482

3487

3483

/*

3488

/*

3484

* Treat SCHED_IDLE as nice 20. Only allow a switch to

3489

* Treat SCHED_IDLE as nice 20. Only allow a switch to

3485

* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.

3490

* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.

3486

*/

3491

*/

3487

if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {

3492

if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {

3488

if (!can_nice(p, task_nice(p)))

3493

if (!can_nice(p, task_nice(p)))

3489

return -EPERM;

3494

return -EPERM;

3490

}

3495

}

3491

3496

3492

/* can't change other user's priorities */

3497

/* can't change other user's priorities */

3493

if (!check_same_owner(p))

3498

if (!check_same_owner(p))

3494

return -EPERM;

3499

return -EPERM;

3495

3500

3496

/* Normal users shall not reset the sched_reset_on_fork flag */

3501

/* Normal users shall not reset the sched_reset_on_fork flag */

3497

if (p->sched_reset_on_fork && !reset_on_fork)

3502

if (p->sched_reset_on_fork && !reset_on_fork)

3498

return -EPERM;

3503

return -EPERM;

3499

}

3504

}

3500

3505

3501

if (user) {

3506

if (user) {

3502

retval = security_task_setscheduler(p);

3507

retval = security_task_setscheduler(p);

3503

if (retval)

3508

if (retval)

3504

return retval;

3509

return retval;

3505

}

3510

}

3506

3511

3507

/*

3512

/*

3508

* make sure no PI-waiters arrive (or leave) while we are

3513

* make sure no PI-waiters arrive (or leave) while we are

3509

* changing the priority of the task:

3514

* changing the priority of the task:

3510

*

3515

*

3511

* To be able to change p->policy safely, the appropriate

3516

* To be able to change p->policy safely, the appropriate

3512

* runqueue lock must be held.

3517

* runqueue lock must be held.

3513

*/

3518

*/

3514

rq = task_rq_lock(p, &flags);

3519

rq = task_rq_lock(p, &flags);

3515

3520

3516

/*

3521

/*

3517

* Changing the policy of the stop threads its a very bad idea

3522

* Changing the policy of the stop threads its a very bad idea

3518

*/

3523

*/

3519

if (p == rq->stop) {

3524

if (p == rq->stop) {

3520

task_rq_unlock(rq, p, &flags);

3525

task_rq_unlock(rq, p, &flags);

3521

return -EINVAL;

3526

return -EINVAL;

3522

}

3527

}

3523

3528

3524

/*

3529

/*

3525

* If not changing anything there's no need to proceed further,

3530

* If not changing anything there's no need to proceed further,

3526

* but store a possible modification of reset_on_fork.

3531

* but store a possible modification of reset_on_fork.

3527

*/

3532

*/

3528

if (unlikely(policy == p->policy)) {

3533

if (unlikely(policy == p->policy)) {

3529

if (fair_policy(policy) && attr->sched_nice != task_nice(p))

3534

if (fair_policy(policy) && attr->sched_nice != task_nice(p))

3530

goto change;

3535

goto change;

3531

if (rt_policy(policy) && attr->sched_priority != p->rt_priority)

3536

if (rt_policy(policy) && attr->sched_priority != p->rt_priority)

3532

goto change;

3537

goto change;

3533

if (dl_policy(policy))

3538

if (dl_policy(policy))

3534

goto change;

3539

goto change;

3535

3540

3536

p->sched_reset_on_fork = reset_on_fork;

3541

p->sched_reset_on_fork = reset_on_fork;

3537

task_rq_unlock(rq, p, &flags);

3542

task_rq_unlock(rq, p, &flags);

3538

return 0;

3543

return 0;

3539

}

3544

}

3540

change:

3545

change:

3541

3546

3542

if (user) {

3547

if (user) {

3543

#ifdef CONFIG_RT_GROUP_SCHED

3548

#ifdef CONFIG_RT_GROUP_SCHED

3544

/*

3549

/*

3545

* Do not allow realtime tasks into groups that have no runtime

3550

* Do not allow realtime tasks into groups that have no runtime

3546

* assigned.

3551

* assigned.

3547

*/

3552

*/

3548

if (rt_bandwidth_enabled() && rt_policy(policy) &&

3553

if (rt_bandwidth_enabled() && rt_policy(policy) &&

3549

task_group(p)->rt_bandwidth.rt_runtime == 0 &&

3554

task_group(p)->rt_bandwidth.rt_runtime == 0 &&

3550

!task_group_is_autogroup(task_group(p))) {

3555

!task_group_is_autogroup(task_group(p))) {

3551

task_rq_unlock(rq, p, &flags);

3556

task_rq_unlock(rq, p, &flags);

3552

return -EPERM;

3557

return -EPERM;

3553

}

3558

}

3554

#endif

3559

#endif

3555

#ifdef CONFIG_SMP

3560

#ifdef CONFIG_SMP

3556

if (dl_bandwidth_enabled() && dl_policy(policy)) {

3561

if (dl_bandwidth_enabled() && dl_policy(policy)) {

3557

cpumask_t *span = rq->rd->span;

3562

cpumask_t *span = rq->rd->span;

3558

3563

3559

/*

3564

/*

3560

* Don't allow tasks with an affinity mask smaller than

3565

* Don't allow tasks with an affinity mask smaller than

3561

* the entire root_domain to become SCHED_DEADLINE. We

3566

* the entire root_domain to become SCHED_DEADLINE. We

3562

* will also fail if there's no bandwidth available.

3567

* will also fail if there's no bandwidth available.

3563

*/

3568

*/

3564

if (!cpumask_subset(span, &p->cpus_allowed) ||

3569

if (!cpumask_subset(span, &p->cpus_allowed) ||

3565

rq->rd->dl_bw.bw == 0) {

3570

rq->rd->dl_bw.bw == 0) {

3566

task_rq_unlock(rq, p, &flags);

3571

task_rq_unlock(rq, p, &flags);

3567

return -EPERM;

3572

return -EPERM;

3568

}

3573

}

3569

}

3574

}

3570

#endif

3575

#endif

3571

}

3576

}

3572

3577

3573

/* recheck policy now with rq lock held */

3578

/* recheck policy now with rq lock held */

3574

if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {

3579

if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {

3575

policy = oldpolicy = -1;

3580

policy = oldpolicy = -1;

3576

task_rq_unlock(rq, p, &flags);

3581

task_rq_unlock(rq, p, &flags);

3577

goto recheck;

3582

goto recheck;

3578

}

3583

}

3579

3584

3580

/*

3585

/*

3581

* If setscheduling to SCHED_DEADLINE (or changing the parameters

3586

* If setscheduling to SCHED_DEADLINE (or changing the parameters

3582

* of a SCHED_DEADLINE task) we need to check if enough bandwidth

3587

* of a SCHED_DEADLINE task) we need to check if enough bandwidth

3583

* is available.

3588

* is available.

3584

*/

3589

*/

3585

if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {

3590

if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {

3586

task_rq_unlock(rq, p, &flags);

3591

task_rq_unlock(rq, p, &flags);

3587

return -EBUSY;

3592

return -EBUSY;

3588

}

3593

}

3589

3594

3590

p->sched_reset_on_fork = reset_on_fork;

3595

p->sched_reset_on_fork = reset_on_fork;

3591

oldprio = p->prio;

3596

oldprio = p->prio;

3592

3597

3593

/*

3598

/*

3594

* Special case for priority boosted tasks.

3599

* Special case for priority boosted tasks.

3595

*

3600

*

3596

* If the new priority is lower or equal (user space view)

3601

* If the new priority is lower or equal (user space view)

3597

* than the current (boosted) priority, we just store the new

3602

* than the current (boosted) priority, we just store the new

3598

* normal parameters and do not touch the scheduler class and

3603

* normal parameters and do not touch the scheduler class and

3599

* the runqueue. This will be done when the task deboost

3604

* the runqueue. This will be done when the task deboost

3600

* itself.

3605

* itself.

3601

*/

3606

*/

3602

if (rt_mutex_check_prio(p, newprio)) {

3607

if (rt_mutex_check_prio(p, newprio)) {

3603

__setscheduler_params(p, attr);

3608

__setscheduler_params(p, attr);

3604

task_rq_unlock(rq, p, &flags);

3609

task_rq_unlock(rq, p, &flags);

3605

return 0;

3610

return 0;

3606

}

3611

}

3607

3612

3608

queued = task_on_rq_queued(p);

3613

queued = task_on_rq_queued(p);

3609

running = task_current(rq, p);

3614

running = task_current(rq, p);

3610

if (queued)

3615

if (queued)

3611

dequeue_task(rq, p, 0);

3616

dequeue_task(rq, p, 0);

3612

if (running)

3617

if (running)

3613

put_prev_task(rq, p);

3618

put_prev_task(rq, p);

3614

3619

3615

prev_class = p->sched_class;

3620

prev_class = p->sched_class;

3616

__setscheduler(rq, p, attr);

3621

__setscheduler(rq, p, attr);

3617

3622

3618

if (running)

3623

if (running)

3619

p->sched_class->set_curr_task(rq);

3624

p->sched_class->set_curr_task(rq);

3620

if (queued) {

3625

if (queued) {

3621

/*

3626

/*

3622

* We enqueue to tail when the priority of a task is

3627

* We enqueue to tail when the priority of a task is

3623

* increased (user space view).

3628

* increased (user space view).

3624

*/

3629

*/

3625

enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);

3630

enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);

3626

}

3631

}

3627

3632

3628

check_class_changed(rq, p, prev_class, oldprio);

3633

check_class_changed(rq, p, prev_class, oldprio);

3629

task_rq_unlock(rq, p, &flags);

3634

task_rq_unlock(rq, p, &flags);

3630

3635

3631

rt_mutex_adjust_pi(p);

3636

rt_mutex_adjust_pi(p);

3632

3637

3633

return 0;

3638

return 0;

3634

}

3639

}

3635

3640

3636

/*
 * Build a sched_attr from the legacy (policy, sched_param) pair and hand
 * it to __sched_setscheduler(). The nice value is taken from the task's
 * current static priority; @check is passed on as the 'user' argument.
 */
static int _sched_setscheduler(struct task_struct *p, int policy,
			       const struct sched_param *param, bool check)
{
	struct sched_attr attr = {
		.sched_policy   = policy,
		.sched_priority = param->sched_priority,
		.sched_nice	= PRIO_TO_NICE(p->static_prio),
	};

	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
		policy &= ~SCHED_RESET_ON_FORK;
		attr.sched_policy = policy;
	}

	return __sched_setscheduler(p, &attr, check);
}

3654

/**

3659

/**

3655

* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.

3660

* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.

3656

* @p: the task in question.

3661

* @p: the task in question.

3657

* @policy: new policy.

3662

* @policy: new policy.

3658

* @param: structure containing the new RT priority.

3663

* @param: structure containing the new RT priority.

3659

*

3664

*

3660

* Return: 0 on success. An error code otherwise.

3665

* Return: 0 on success. An error code otherwise.

3661

*

3666

*

3662

* NOTE that the task may be already dead.

3667

* NOTE that the task may be already dead.

3663

*/

3668

*/

3664

int sched_setscheduler(struct task_struct *p, int policy,

3669

int sched_setscheduler(struct task_struct *p, int policy,

3665

const struct sched_param *param)

3670

const struct sched_param *param)

3666

{

3671

{

3667

return _sched_setscheduler(p, policy, param, true);

3672

return _sched_setscheduler(p, policy, param, true);

3668

}

3673

}

3669

EXPORT_SYMBOL_GPL(sched_setscheduler);

3674

EXPORT_SYMBOL_GPL(sched_setscheduler);

3670

3675

3671

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)

3676

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)

3672

{

3677

{

3673

return __sched_setscheduler(p, attr, true);

3678

return __sched_setscheduler(p, attr, true);

3674

}

3679

}

3675

EXPORT_SYMBOL_GPL(sched_setattr);

3680

EXPORT_SYMBOL_GPL(sched_setattr);

3676

3681

3677

/**

3682

/**

3678

* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.

3683

* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.

3679

* @p: the task in question.

3684

* @p: the task in question.

3680

* @policy: new policy.

3685

* @policy: new policy.

3681

* @param: structure containing the new RT priority.

3686

* @param: structure containing the new RT priority.

3682

*

3687

*

3683

* Just like sched_setscheduler, only don't bother checking if the

3688

* Just like sched_setscheduler, only don't bother checking if the

3684

* current context has permission. For example, this is needed in

3689

* current context has permission. For example, this is needed in

3685

* stop_machine(): we create temporary high priority worker threads,

3690

* stop_machine(): we create temporary high priority worker threads,

3686

* but our caller might not have that capability.

3691

* but our caller might not have that capability.

3687

*

3692

*

3688

* Return: 0 on success. An error code otherwise.

3693

* Return: 0 on success. An error code otherwise.

3689

*/

3694

*/

3690

int sched_setscheduler_nocheck(struct task_struct *p, int policy,

3695

int sched_setscheduler_nocheck(struct task_struct *p, int policy,

3691

const struct sched_param *param)

3696

const struct sched_param *param)

3692

{

3697

{

3693

return _sched_setscheduler(p, policy, param, false);

3698

return _sched_setscheduler(p, policy, param, false);

3694

}

3699

}

3695

3700

3696

static int

3701

static int

3697

do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)

3702

do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)

3698

{

3703

{

3699

struct sched_param lparam;

3704

struct sched_param lparam;

3700

struct task_struct *p;

3705

struct task_struct *p;

3701

int retval;

3706

int retval;

3702

3707

3703

if (!param || pid < 0)

3708

if (!param || pid < 0)

3704

return -EINVAL;

3709

return -EINVAL;

3705

if (copy_from_user(&lparam, param, sizeof(struct sched_param)))

3710

if (copy_from_user(&lparam, param, sizeof(struct sched_param)))

3706

return -EFAULT;

3711

return -EFAULT;

3707

3712

3708

rcu_read_lock();

3713

rcu_read_lock();

3709

retval = -ESRCH;

3714

retval = -ESRCH;

3710

p = find_process_by_pid(pid);

3715

p = find_process_by_pid(pid);

3711

if (p != NULL)

3716

if (p != NULL)

3712

retval = sched_setscheduler(p, policy, &lparam);

3717

retval = sched_setscheduler(p, policy, &lparam);

3713

rcu_read_unlock();

3718

rcu_read_unlock();

3714

3719

3715

return retval;

3720

return retval;

3716

}

3721

}

3717

3722

3718

/*

3723

/*

3719

* Mimics kernel/events/core.c perf_copy_attr().

3724

* Mimics kernel/events/core.c perf_copy_attr().

3720

*/

3725

*/

3721

static int sched_copy_attr(struct sched_attr __user *uattr,

3726

static int sched_copy_attr(struct sched_attr __user *uattr,

3722

struct sched_attr *attr)

3727

struct sched_attr *attr)

3723

{

3728

{

3724

u32 size;

3729

u32 size;

3725

int ret;

3730

int ret;

3726

3731

3727

if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))

3732

if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))

3728

return -EFAULT;

3733

return -EFAULT;

3729

3734

3730

/*

3735

/*

3731

* zero the full structure, so that a short copy will be nice.

3736

* zero the full structure, so that a short copy will be nice.

3732

*/

3737

*/

3733

memset(attr, 0, sizeof(*attr));

3738

memset(attr, 0, sizeof(*attr));

3734

3739

3735

ret = get_user(size, &uattr->size);

3740

ret = get_user(size, &uattr->size);

3736

if (ret)

3741

if (ret)

3737

return ret;

3742

return ret;

3738

3743

3739

if (size > PAGE_SIZE) /* silly large */

3744

if (size > PAGE_SIZE) /* silly large */

3740

goto err_size;

3745

goto err_size;

3741

3746

3742

if (!size) /* abi compat */

3747

if (!size) /* abi compat */

3743

size = SCHED_ATTR_SIZE_VER0;

3748

size = SCHED_ATTR_SIZE_VER0;

3744

3749

3745

if (size < SCHED_ATTR_SIZE_VER0)

3750

if (size < SCHED_ATTR_SIZE_VER0)

3746

goto err_size;

3751

goto err_size;

3747

3752

3748

/*

3753

/*

3749

* If we're handed a bigger struct than we know of,

3754

* If we're handed a bigger struct than we know of,

3750

* ensure all the unknown bits are 0 - i.e. new

3755

* ensure all the unknown bits are 0 - i.e. new

3751

* user-space does not rely on any kernel feature

3756

* user-space does not rely on any kernel feature

3752

* extensions we dont know about yet.

3757

* extensions we dont know about yet.

3753

*/

3758

*/

3754

if (size > sizeof(*attr)) {

3759

if (size > sizeof(*attr)) {

3755

unsigned char __user *addr;

3760

unsigned char __user *addr;

3756

unsigned char __user *end;

3761

unsigned char __user *end;

3757

unsigned char val;

3762

unsigned char val;

3758

3763

3759

addr = (void __user *)uattr + sizeof(*attr);

3764

addr = (void __user *)uattr + sizeof(*attr);

3760

end = (void __user *)uattr + size;

3765

end = (void __user *)uattr + size;

3761

3766

3762

for (; addr < end; addr++) {

3767

for (; addr < end; addr++) {

3763

ret = get_user(val, addr);

3768

ret = get_user(val, addr);

3764

if (ret)

3769

if (ret)

3765

return ret;

3770

return ret;

3766

if (val)

3771

if (val)

3767

goto err_size;

3772

goto err_size;

3768

}

3773

}

3769

size = sizeof(*attr);

3774

size = sizeof(*attr);

3770

}

3775

}

3771

3776

3772

ret = copy_from_user(attr, uattr, size);

3777

ret = copy_from_user(attr, uattr, size);

3773

if (ret)

3778

if (ret)

3774

return -EFAULT;

3779

return -EFAULT;

3775

3780

3776

/*

3781

/*

3777

* XXX: do we want to be lenient like existing syscalls; or do we want

3782

* XXX: do we want to be lenient like existing syscalls; or do we want

3778

* to be strict and return an error on out-of-bounds values?

3783

* to be strict and return an error on out-of-bounds values?

3779

*/

3784

*/

3780

attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

3785

attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

3781

3786

3782

return 0;

3787

return 0;

3783

3788

3784

err_size:

3789

err_size:

3785

put_user(sizeof(*attr), &uattr->size);

3790

put_user(sizeof(*attr), &uattr->size);

3786

return -E2BIG;

3791

return -E2BIG;

3787

}

3792

}

3788

3793

3789

/**

3794

/**

3790

* sys_sched_setscheduler - set/change the scheduler policy and RT priority

3795

* sys_sched_setscheduler - set/change the scheduler policy and RT priority

3791

* @pid: the pid in question.

3796

* @pid: the pid in question.

3792

* @policy: new policy.

3797

* @policy: new policy.

3793

* @param: structure containing the new RT priority.

3798

* @param: structure containing the new RT priority.

3794

*

3799

*

3795

* Return: 0 on success. An error code otherwise.

3800

* Return: 0 on success. An error code otherwise.

3796

*/

3801

*/

3797

SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,

3802

SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,

3798

struct sched_param __user *, param)

3803

struct sched_param __user *, param)

3799

{

3804

{

3800

/* negative values for policy are not valid */

3805

/* negative values for policy are not valid */

3801

if (policy < 0)

3806

if (policy < 0)

3802

return -EINVAL;

3807

return -EINVAL;

3803

3808

3804

return do_sched_setscheduler(pid, policy, param);

3809

return do_sched_setscheduler(pid, policy, param);

3805

}

3810

}

3806

3811

3807

/**

3812

/**

3808

* sys_sched_setparam - set/change the RT priority of a thread

3813

* sys_sched_setparam - set/change the RT priority of a thread

3809

* @pid: the pid in question.

3814

* @pid: the pid in question.

3810

* @param: structure containing the new RT priority.

3815

* @param: structure containing the new RT priority.

3811

*

3816

*

3812

* Return: 0 on success. An error code otherwise.

3817

* Return: 0 on success. An error code otherwise.

3813

*/

3818

*/

3814

SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)

3819

SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)

3815

{

3820

{

3816

return do_sched_setscheduler(pid, SETPARAM_POLICY, param);

3821

return do_sched_setscheduler(pid, SETPARAM_POLICY, param);

3817

}

3822

}

3818

3823

3819

/**

3824

/**

3820

* sys_sched_setattr - same as above, but with extended sched_attr

3825

* sys_sched_setattr - same as above, but with extended sched_attr

3821

* @pid: the pid in question.

3826

* @pid: the pid in question.

3822

* @uattr: structure containing the extended parameters.

3827

* @uattr: structure containing the extended parameters.

3823

* @flags: for future extension.

3828

* @flags: for future extension.

3824

*/

3829

*/

3825

SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,

3830

SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,

3826

unsigned int, flags)

3831

unsigned int, flags)

3827

{

3832

{

3828

struct sched_attr attr;

3833

struct sched_attr attr;

3829

struct task_struct *p;

3834

struct task_struct *p;

3830

int retval;

3835

int retval;

3831

3836

3832

if (!uattr || pid < 0 || flags)

3837

if (!uattr || pid < 0 || flags)

3833

return -EINVAL;

3838

return -EINVAL;

3834

3839

3835

retval = sched_copy_attr(uattr, &attr);

3840

retval = sched_copy_attr(uattr, &attr);

3836

if (retval)

3841

if (retval)

3837

return retval;

3842

return retval;

3838

3843

3839

if ((int)attr.sched_policy < 0)

3844

if ((int)attr.sched_policy < 0)

3840

return -EINVAL;

3845

return -EINVAL;

3841

3846

3842

rcu_read_lock();

3847

rcu_read_lock();

3843

retval = -ESRCH;

3848

retval = -ESRCH;

3844

p = find_process_by_pid(pid);

3849

p = find_process_by_pid(pid);

3845

if (p != NULL)

3850

if (p != NULL)

3846

retval = sched_setattr(p, &attr);

3851

retval = sched_setattr(p, &attr);

3847

rcu_read_unlock();

3852

rcu_read_unlock();

3848

3853

3849

return retval;

3854

return retval;

3850

}

3855

}

3851

3856

3852

/**

3857

/**

3853

* sys_sched_getscheduler - get the policy (scheduling class) of a thread

3858

* sys_sched_getscheduler - get the policy (scheduling class) of a thread

3854

* @pid: the pid in question.

3859

* @pid: the pid in question.

3855

*

3860

*

3856

* Return: On success, the policy of the thread. Otherwise, a negative error

3861

* Return: On success, the policy of the thread. Otherwise, a negative error

3857

* code.

3862

* code.

3858

*/

3863

*/

3859

SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)

3864

SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)

3860

{

3865

{

3861

struct task_struct *p;

3866

struct task_struct *p;

3862

int retval;

3867

int retval;

3863

3868

3864

if (pid < 0)

3869

if (pid < 0)

3865

return -EINVAL;

3870

return -EINVAL;

3866

3871

3867

retval = -ESRCH;

3872

retval = -ESRCH;

3868

rcu_read_lock();

3873

rcu_read_lock();

3869

p = find_process_by_pid(pid);

3874

p = find_process_by_pid(pid);

3870

if (p) {

3875

if (p) {

3871

retval = security_task_getscheduler(p);

3876

retval = security_task_getscheduler(p);

3872

if (!retval)

3877

if (!retval)

3873

retval = p->policy

3878

retval = p->policy

3874

| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);

3879

| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);

3875

}

3880

}

3876

rcu_read_unlock();

3881

rcu_read_unlock();

3877

return retval;

3882

return retval;

3878

}

3883

}

3879

3884

3880

/**

3885

/**

3881

* sys_sched_getparam - get the RT priority of a thread

3886

* sys_sched_getparam - get the RT priority of a thread

3882

* @pid: the pid in question.

3887

* @pid: the pid in question.

3883

* @param: structure containing the RT priority.

3888

* @param: structure containing the RT priority.

3884

*

3889

*

3885

* Return: On success, 0 and the RT priority is in @param. Otherwise, an error

3890

* Return: On success, 0 and the RT priority is in @param. Otherwise, an error

3886

* code.

3891

* code.

3887

*/

3892

*/

3888

SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)

3893

SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)

3889

{

3894

{

3890

struct sched_param lp = { .sched_priority = 0 };

3895

struct sched_param lp = { .sched_priority = 0 };

3891

struct task_struct *p;

3896

struct task_struct *p;

3892

int retval;

3897

int retval;

3893

3898

3894

if (!param || pid < 0)

3899

if (!param || pid < 0)

3895

return -EINVAL;

3900

return -EINVAL;

3896

3901

3897

rcu_read_lock();

3902

rcu_read_lock();

3898

p = find_process_by_pid(pid);

3903

p = find_process_by_pid(pid);

3899

retval = -ESRCH;

3904

retval = -ESRCH;

3900

if (!p)

3905

if (!p)

3901

goto out_unlock;

3906

goto out_unlock;

3902

3907

3903

retval = security_task_getscheduler(p);

3908

retval = security_task_getscheduler(p);

3904

if (retval)

3909

if (retval)

3905

goto out_unlock;

3910

goto out_unlock;

3906

3911

3907

if (task_has_rt_policy(p))

3912

if (task_has_rt_policy(p))

3908

lp.sched_priority = p->rt_priority;

3913

lp.sched_priority = p->rt_priority;

3909

rcu_read_unlock();

3914

rcu_read_unlock();

3910

3915

3911

/*

3916

/*

3912

* This one might sleep, we cannot do it with a spinlock held ...

3917

* This one might sleep, we cannot do it with a spinlock held ...

3913

*/

3918

*/

3914

retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

3919

retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

3915

3920

3916

return retval;

3921

return retval;

3917

3922

3918

out_unlock:

3923

out_unlock:

3919

rcu_read_unlock();

3924

rcu_read_unlock();

3920

return retval;

3925

return retval;

3921

}

3926

}

3922

3927

3923

/*
 * Copy the kernel's view of a task's attributes out to user space.
 *
 * @uattr: user buffer to fill.
 * @attr:  kernel copy of the attributes; attr->size is lowered to @usize
 *         when user space passed a smaller structure.
 * @usize: size of the structure user space knows about.
 *
 * Returns 0 on success, -EFAULT if the user buffer is not writable or the
 * copy fails, and -EFBIG if a field that does not fit into @usize bytes
 * is non-zero.
 */
static int sched_read_attr(struct sched_attr __user *uattr,
			   struct sched_attr *attr,
			   unsigned int usize)
{
	if (!access_ok(VERIFY_WRITE, uattr, usize))
		return -EFAULT;

	/*
	 * If we're handed a smaller struct than we know of, make sure
	 * every byte beyond it is 0 - i.e. old user-space does not get
	 * incomplete information.
	 */
	if (usize < sizeof(*attr)) {
		const unsigned char *bytes = (const unsigned char *)attr;
		size_t off;

		for (off = usize; off < sizeof(*attr); off++) {
			if (bytes[off])
				return -EFBIG;
		}

		attr->size = usize;
	}

	return copy_to_user(uattr, attr, attr->size) ? -EFAULT : 0;
}

3958

3963

3959

/**

3964

/**

3960

* sys_sched_getattr - similar to sched_getparam, but with sched_attr

3965

* sys_sched_getattr - similar to sched_getparam, but with sched_attr

3961

* @pid: the pid in question.

3966

* @pid: the pid in question.

3962

* @uattr: structure containing the extended parameters.

3967

* @uattr: structure containing the extended parameters.

3963

* @size: sizeof(attr) for fwd/bwd comp.

3968

* @size: sizeof(attr) for fwd/bwd comp.

3964

* @flags: for future extension.

3969

* @flags: for future extension.

3965

*/

3970

*/

3966

SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,

3971

SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,

3967

unsigned int, size, unsigned int, flags)

3972

unsigned int, size, unsigned int, flags)

3968

{

3973

{

3969

struct sched_attr attr = {

3974

struct sched_attr attr = {

3970

.size = sizeof(struct sched_attr),

3975

.size = sizeof(struct sched_attr),

3971

};

3976

};

3972

struct task_struct *p;

3977

struct task_struct *p;

3973

int retval;

3978

int retval;

3974

3979

3975

if (!uattr || pid < 0 || size > PAGE_SIZE ||

3980

if (!uattr || pid < 0 || size > PAGE_SIZE ||

3976

size < SCHED_ATTR_SIZE_VER0 || flags)

3981

size < SCHED_ATTR_SIZE_VER0 || flags)

3977

return -EINVAL;

3982

return -EINVAL;

3978

3983

3979

rcu_read_lock();

3984

rcu_read_lock();

3980

p = find_process_by_pid(pid);

3985

p = find_process_by_pid(pid);

3981

retval = -ESRCH;

3986

retval = -ESRCH;

3982

if (!p)

3987

if (!p)

3983

goto out_unlock;

3988

goto out_unlock;

3984

3989

3985

retval = security_task_getscheduler(p);

3990

retval = security_task_getscheduler(p);

3986

if (retval)

3991

if (retval)

3987

goto out_unlock;

3992

goto out_unlock;

3988

3993

3989

attr.sched_policy = p->policy;

3994

attr.sched_policy = p->policy;

3990

if (p->sched_reset_on_fork)

3995

if (p->sched_reset_on_fork)

3991

attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;

3996

attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;

3992

if (task_has_dl_policy(p))

3997

if (task_has_dl_policy(p))

3993

__getparam_dl(p, &attr);

3998

__getparam_dl(p, &attr);

3994

else if (task_has_rt_policy(p))

3999

else if (task_has_rt_policy(p))

3995

attr.sched_priority = p->rt_priority;

4000

attr.sched_priority = p->rt_priority;

3996

else

4001

else

3997

attr.sched_nice = task_nice(p);

4002

attr.sched_nice = task_nice(p);

3998

4003

3999

rcu_read_unlock();

4004

rcu_read_unlock();

4000

4005

4001

retval = sched_read_attr(uattr, &attr, size);

4006

retval = sched_read_attr(uattr, &attr, size);

4002

return retval;

4007

return retval;

4003

4008

4004

out_unlock:

4009

out_unlock:

4005

rcu_read_unlock();

4010

rcu_read_unlock();

4006

return retval;

4011

return retval;

4007

}

4012

}

4008

4013

4009

/*
 * Set the CPU affinity of the task identified by @pid to @in_mask,
 * restricted to the CPUs its cpuset allows.
 *
 * Returns 0 on success or a negative error code: -ESRCH if no such task,
 * -EINVAL for PF_NO_SETAFFINITY tasks, -ENOMEM if a cpumask cannot be
 * allocated, -EPERM if the caller neither passes check_same_owner() nor
 * has CAP_SYS_NICE, -EBUSY if a -deadline task would lose CPUs of its
 * root domain, or whatever security_task_setscheduler() /
 * set_cpus_allowed_ptr() report.
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpus_allowed, new_mask;
	struct task_struct *p;
	int retval;

	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (!p) {
		rcu_read_unlock();
		return -ESRCH;
	}
	/* Prevent p going away once the RCU section is left */
	get_task_struct(p);
	rcu_read_unlock();

	retval = -EINVAL;
	if (p->flags & PF_NO_SETAFFINITY)
		goto out_put_task;

	retval = -ENOMEM;
	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
		goto out_put_task;
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		goto out_free_cpus_allowed;

	if (!check_same_owner(p)) {
		bool may_nice;

		/* The task's credentials are only looked at under RCU. */
		rcu_read_lock();
		may_nice = ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE);
		rcu_read_unlock();

		if (!may_nice) {
			retval = -EPERM;
			goto out_free_new_mask;
		}
	}

	retval = security_task_setscheduler(p);
	if (retval)
		goto out_free_new_mask;

	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, in_mask, cpus_allowed);

	/*
	 * Since bandwidth control happens on root_domain basis,
	 * if admission test is enabled, we only admit -deadline
	 * tasks allowed to run on all the CPUs in the task's
	 * root_domain.
	 */
#ifdef CONFIG_SMP
	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
		bool covers_span;

		rcu_read_lock();
		covers_span = cpumask_subset(task_rq(p)->rd->span, new_mask);
		rcu_read_unlock();

		if (!covers_span) {
			retval = -EBUSY;
			goto out_free_new_mask;
		}
	}
#endif

	for (;;) {
		retval = set_cpus_allowed_ptr(p, new_mask);
		if (retval)
			break;

		cpuset_cpus_allowed(p, cpus_allowed);
		if (cpumask_subset(new_mask, cpus_allowed))
			break;

		/*
		 * We must have raced with a concurrent cpuset
		 * update. Just reset the cpus_allowed to the
		 * cpuset's cpus_allowed and try again.
		 */
		cpumask_copy(new_mask, cpus_allowed);
	}

out_free_new_mask:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
out_put_task:
	put_task_struct(p);
	return retval;
}

4097

4102

4098

/*
 * Copy a user-supplied CPU bitmask of @len bytes into @new_mask.
 *
 * A mask shorter than cpumask_size() is copied into a cleared @new_mask;
 * a longer one is truncated to cpumask_size() bytes.
 *
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	const size_t mask_bytes = cpumask_size();

	if (len < mask_bytes)
		cpumask_clear(new_mask);
	else if (len > mask_bytes)
		len = mask_bytes;

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}

4108

4113

4109

/**

4114

/**

4110

* sys_sched_setaffinity - set the cpu affinity of a process

4115

* sys_sched_setaffinity - set the cpu affinity of a process

4111

* @pid: pid of the process

4116

* @pid: pid of the process

4112

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

4117

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

4113

* @user_mask_ptr: user-space pointer to the new cpu mask

4118

* @user_mask_ptr: user-space pointer to the new cpu mask

4114

*

4119

*

4115

* Return: 0 on success. An error code otherwise.

4120

* Return: 0 on success. An error code otherwise.

4116

*/

4121

*/

4117

SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,

4122

SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,

4118

unsigned long __user *, user_mask_ptr)

4123

unsigned long __user *, user_mask_ptr)

4119

{

4124

{

4120

cpumask_var_t new_mask;

4125

cpumask_var_t new_mask;

4121

int retval;

4126

int retval;

4122

4127

4123

if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))

4128

if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))

4124

return -ENOMEM;

4129

return -ENOMEM;

4125

4130

4126

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);

4131

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);

4127

if (retval == 0)

4132

if (retval == 0)

4128

retval = sched_setaffinity(pid, new_mask);

4133

retval = sched_setaffinity(pid, new_mask);

4129

free_cpumask_var(new_mask);

4134

free_cpumask_var(new_mask);

4130

return retval;

4135

return retval;

4131

}

4136

}

4132

4137

4133

/*
 * Store in @mask the CPUs the task identified by @pid is allowed on,
 * restricted to cpu_active_mask.
 *
 * Returns 0 on success, -ESRCH if no task matches @pid, or the error
 * reported by security_task_getscheduler().
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *task;
	unsigned long irq_flags;
	int err;

	rcu_read_lock();

	task = find_process_by_pid(pid);
	err = task ? security_task_getscheduler(task) : -ESRCH;
	if (!err) {
		/* cpus_allowed is read with the task's pi_lock held. */
		raw_spin_lock_irqsave(&task->pi_lock, irq_flags);
		cpumask_and(mask, &task->cpus_allowed, cpu_active_mask);
		raw_spin_unlock_irqrestore(&task->pi_lock, irq_flags);
	}

	rcu_read_unlock();

	return err;
}

4159

4164

4160

/**

4165

/**

4161

* sys_sched_getaffinity - get the cpu affinity of a process

4166

* sys_sched_getaffinity - get the cpu affinity of a process

4162

* @pid: pid of the process

4167

* @pid: pid of the process

4163

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

4168

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

4164

* @user_mask_ptr: user-space pointer to hold the current cpu mask

4169

* @user_mask_ptr: user-space pointer to hold the current cpu mask

4165

*

4170

*

4166

* Return: 0 on success. An error code otherwise.

4171

* Return: 0 on success. An error code otherwise.

4167

*/

4172

*/

4168

SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,

4173

SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,

4169

unsigned long __user *, user_mask_ptr)

4174

unsigned long __user *, user_mask_ptr)

4170

{

4175

{

4171

int ret;

4176

int ret;

4172

cpumask_var_t mask;

4177

cpumask_var_t mask;

4173

4178

4174

if ((len * BITS_PER_BYTE) < nr_cpu_ids)

4179

if ((len * BITS_PER_BYTE) < nr_cpu_ids)

4175

return -EINVAL;

4180

return -EINVAL;

4176

if (len & (sizeof(unsigned long)-1))

4181

if (len & (sizeof(unsigned long)-1))

4177

return -EINVAL;

4182

return -EINVAL;

4178

4183

4179

if (!alloc_cpumask_var(&mask, GFP_KERNEL))

4184

if (!alloc_cpumask_var(&mask, GFP_KERNEL))

4180

return -ENOMEM;

4185

return -ENOMEM;

4181

4186

4182

ret = sched_getaffinity(pid, mask);

4187

ret = sched_getaffinity(pid, mask);

4183

if (ret == 0) {

4188

if (ret == 0) {

4184

size_t retlen = min_t(size_t, len, cpumask_size());

4189

size_t retlen = min_t(size_t, len, cpumask_size());

4185

4190

4186

if (copy_to_user(user_mask_ptr, mask, retlen))

4191

if (copy_to_user(user_mask_ptr, mask, retlen))

4187

ret = -EFAULT;

4192

ret = -EFAULT;

4188

else

4193

else

4189

ret = retlen;

4194

ret = retlen;

4190

}

4195

}

4191

free_cpumask_var(mask);

4196

free_cpumask_var(mask);

4192

4197

4193

return ret;

4198

return ret;

4194

}

4199

}

4195

4200

4196

/**

4201

/**

4197

* sys_sched_yield - yield the current processor to other threads.

4202

* sys_sched_yield - yield the current processor to other threads.

4198

*

4203

*

4199

* This function yields the current CPU to other tasks. If there are no

4204

* This function yields the current CPU to other tasks. If there are no

4200

* other threads running on this CPU then this function will return.

4205

* other threads running on this CPU then this function will return.

4201

*

4206

*

4202

* Return: 0.

4207

* Return: 0.

4203

*/

4208

*/

4204

SYSCALL_DEFINE0(sched_yield)

4209

SYSCALL_DEFINE0(sched_yield)

4205

{

4210

{

4206

struct rq *rq = this_rq_lock();

4211

struct rq *rq = this_rq_lock();

4207

4212

4208

schedstat_inc(rq, yld_count);

4213

schedstat_inc(rq, yld_count);

4209

current->sched_class->yield_task(rq);

4214

current->sched_class->yield_task(rq);

4210

4215

4211

/*

4216

/*

4212

* Since we are going to call schedule() anyway, there's

4217

* Since we are going to call schedule() anyway, there's

4213

* no need to preempt or enable interrupts:

4218

* no need to preempt or enable interrupts:

4214

*/

4219

*/

4215

__release(rq->lock);

4220

__release(rq->lock);

4216

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

4221

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

4217

do_raw_spin_unlock(&rq->lock);

4222

do_raw_spin_unlock(&rq->lock);

4218

sched_preempt_enable_no_resched();

4223

sched_preempt_enable_no_resched();

4219

4224

4220

schedule();

4225

schedule();

4221

4226

4222

return 0;

4227

return 0;

4223

}

4228

}

4224

4229

4225

static void __cond_resched(void)

4230

static void __cond_resched(void)

4226

{

4231

{

4227

__preempt_count_add(PREEMPT_ACTIVE);

4232

__preempt_count_add(PREEMPT_ACTIVE);

4228

__schedule();

4233

__schedule();

4229

__preempt_count_sub(PREEMPT_ACTIVE);

4234

__preempt_count_sub(PREEMPT_ACTIVE);

4230

}

4235

}

4231

4236

4232

/*
 * Reschedule if should_resched() says so.
 *
 * Returns 1 if __cond_resched() was called, 0 otherwise.
 */
int __sched _cond_resched(void)
{
	if (!should_resched())
		return 0;

	__cond_resched();
	return 1;
}
EXPORT_SYMBOL(_cond_resched);

4241

4246

4242

/*

4247

/*

4243

* __cond_resched_lock() - if a reschedule is pending, drop the given lock,

4248

* __cond_resched_lock() - if a reschedule is pending, drop the given lock,

4244

* call schedule, and on return reacquire the lock.

4249

* call schedule, and on return reacquire the lock.

4245

*

4250

*

4246

* This works OK both with and without CONFIG_PREEMPT. We do strange low-level

4251

* This works OK both with and without CONFIG_PREEMPT. We do strange low-level

4247

* operations here to prevent schedule() from being called twice (once via

4252

* operations here to prevent schedule() from being called twice (once via

4248

* spin_unlock(), once by hand).

4253

* spin_unlock(), once by hand).

4249

*/

4254

*/

4250

int __cond_resched_lock(spinlock_t *lock)

4255

int __cond_resched_lock(spinlock_t *lock)

4251

{

4256

{

4252

int resched = should_resched();

4257

int resched = should_resched();

4253

int ret = 0;

4258

int ret = 0;

4254

4259

4255

lockdep_assert_held(lock);

4260

lockdep_assert_held(lock);

4256

4261

4257

if (spin_needbreak(lock) || resched) {

4262

if (spin_needbreak(lock) || resched) {

4258

spin_unlock(lock);

4263

spin_unlock(lock);

4259

if (resched)

4264

if (resched)

4260

__cond_resched();

4265

__cond_resched();

4261

else

4266

else

4262

cpu_relax();

4267

cpu_relax();

4263

ret = 1;

4268

ret = 1;

4264

spin_lock(lock);

4269

spin_lock(lock);

4265

}

4270

}

4266

return ret;

4271

return ret;

4267

}

4272

}

4268

EXPORT_SYMBOL(__cond_resched_lock);

4273

EXPORT_SYMBOL(__cond_resched_lock);

4269

4274

4270

/*
 * Softirq flavour of cond_resched(): bottom halves are enabled around the
 * reschedule and disabled again afterwards. Must be called from softirq
 * context (BUG otherwise).
 *
 * Returns 1 if a reschedule happened, 0 otherwise.
 */
int __sched __cond_resched_softirq(void)
{
	BUG_ON(!in_softirq());

	if (!should_resched())
		return 0;

	local_bh_enable();
	__cond_resched();
	local_bh_disable();

	return 1;
}
EXPORT_SYMBOL(__cond_resched_softirq);

4283

4288

4284

/**

4289

/**

4285

* yield - yield the current processor to other threads.

4290

* yield - yield the current processor to other threads.

4286

*

4291

*

4287

* Do not ever use this function, there's a 99% chance you're doing it wrong.

4292

* Do not ever use this function, there's a 99% chance you're doing it wrong.

4288

*

4293

*

4289

* The scheduler is at all times free to pick the calling task as the most

4294

* The scheduler is at all times free to pick the calling task as the most

4290

* eligible task to run, if removing the yield() call from your code breaks

4295

* eligible task to run, if removing the yield() call from your code breaks

4291

* it, its already broken.

4296

* it, its already broken.

4292

*

4297

*

4293

* Typical broken usage is:

4298

* Typical broken usage is:

4294

*

4299

*

4295

* while (!event)

4300

* while (!event)

4296

* yield();

4301

* yield();

4297

*

4302

*

4298

* where one assumes that yield() will let 'the other' process run that will

4303

* where one assumes that yield() will let 'the other' process run that will

4299

* make event true. If the current task is a SCHED_FIFO task that will never

4304

* make event true. If the current task is a SCHED_FIFO task that will never

4300

* happen. Never use yield() as a progress guarantee!!

4305

* happen. Never use yield() as a progress guarantee!!

4301

*

4306

*

4302

* If you want to use yield() to wait for something, use wait_event().

4307

* If you want to use yield() to wait for something, use wait_event().

4303

* If you want to use yield() to be 'nice' for others, use cond_resched().

4308

* If you want to use yield() to be 'nice' for others, use cond_resched().

4304

* If you still want to use yield(), do not!

4309

* If you still want to use yield(), do not!

4305

*/

4310

*/

4306

void __sched yield(void)

4311

void __sched yield(void)

4307

{

4312

{

4308

set_current_state(TASK_RUNNING);

4313

set_current_state(TASK_RUNNING);

4309

sys_sched_yield();

4314

sys_sched_yield();

4310

}

4315

}

4311

EXPORT_SYMBOL(yield);

4316

EXPORT_SYMBOL(yield);

4312

4317

4313

/**

4318

/**

4314

* yield_to - yield the current processor to another thread in

4319

* yield_to - yield the current processor to another thread in

4315

* your thread group, or accelerate that thread toward the

4320

* your thread group, or accelerate that thread toward the

4316

* processor it's on.

4321

* processor it's on.

4317

* @p: target task

4322

* @p: target task

4318

* @preempt: whether task preemption is allowed or not

4323

* @preempt: whether task preemption is allowed or not

4319

*

4324

*

4320

* It's the caller's job to ensure that the target task struct

4325

* It's the caller's job to ensure that the target task struct

4321

* can't go away on us before we can do any checks.

4326

* can't go away on us before we can do any checks.

4322

*

4327

*

4323

* Return:

4328

* Return:

4324

* true (>0) if we indeed boosted the target task.

4329

* true (>0) if we indeed boosted the target task.

4325

* false (0) if we failed to boost the target.

4330

* false (0) if we failed to boost the target.

4326

* -ESRCH if there's no task to yield to.

4331

* -ESRCH if there's no task to yield to.

4327

*/

4332

*/

4328

int __sched yield_to(struct task_struct *p, bool preempt)

4333

int __sched yield_to(struct task_struct *p, bool preempt)

4329

{

4334

{

4330

struct task_struct *curr = current;

4335

struct task_struct *curr = current;

4331

struct rq *rq, *p_rq;

4336

struct rq *rq, *p_rq;

4332

unsigned long flags;

4337

unsigned long flags;

4333

int yielded = 0;

4338

int yielded = 0;

4334

4339

4335

local_irq_save(flags);

4340

local_irq_save(flags);

4336

rq = this_rq();

4341

rq = this_rq();

4337

4342

4338

again:

4343

again:

4339

p_rq = task_rq(p);

4344

p_rq = task_rq(p);

4340

/*

4345

/*

4341

* If we're the only runnable task on the rq and target rq also

4346

* If we're the only runnable task on the rq and target rq also

4342

* has only one task, there's absolutely no point in yielding.

4347

* has only one task, there's absolutely no point in yielding.

4343

*/

4348

*/

4344

if (rq->nr_running == 1 && p_rq->nr_running == 1) {

4349

if (rq->nr_running == 1 && p_rq->nr_running == 1) {

4345

yielded = -ESRCH;

4350

yielded = -ESRCH;

4346

goto out_irq;

4351

goto out_irq;

4347

}

4352

}

4348

4353

4349

double_rq_lock(rq, p_rq);

4354

double_rq_lock(rq, p_rq);

4350

if (task_rq(p) != p_rq) {

4355

if (task_rq(p) != p_rq) {

4351

double_rq_unlock(rq, p_rq);

4356

double_rq_unlock(rq, p_rq);

4352

goto again;

4357

goto again;

4353

}

4358

}

4354

4359

4355

if (!curr->sched_class->yield_to_task)

4360

if (!curr->sched_class->yield_to_task)

4356

goto out_unlock;

4361

goto out_unlock;

4357

4362

4358

if (curr->sched_class != p->sched_class)

4363

if (curr->sched_class != p->sched_class)

4359

goto out_unlock;

4364

goto out_unlock;

4360

4365

4361

if (task_running(p_rq, p) || p->state)

4366

if (task_running(p_rq, p) || p->state)

4362

goto out_unlock;

4367

goto out_unlock;

4363

4368

4364

yielded = curr->sched_class->yield_to_task(rq, p, preempt);

4369

yielded = curr->sched_class->yield_to_task(rq, p, preempt);

4365

if (yielded) {

4370

if (yielded) {

4366

schedstat_inc(rq, yld_count);

4371

schedstat_inc(rq, yld_count);

4367

/*

4372

/*

4368

* Make p's CPU reschedule; pick_next_entity takes care of

4373

* Make p's CPU reschedule; pick_next_entity takes care of

4369

* fairness.

4374

* fairness.

4370

*/

4375

*/

4371

if (preempt && rq != p_rq)

4376

if (preempt && rq != p_rq)

4372

resched_curr(p_rq);

4377

resched_curr(p_rq);

4373

}

4378

}

4374

4379

4375

out_unlock:

4380

out_unlock:

4376

double_rq_unlock(rq, p_rq);

4381

double_rq_unlock(rq, p_rq);

4377

out_irq:

4382

out_irq:

4378

local_irq_restore(flags);

4383

local_irq_restore(flags);

4379

4384

4380

if (yielded > 0)

4385

if (yielded > 0)

4381

schedule();

4386

schedule();

4382

4387

4383

return yielded;

4388

return yielded;

4384

}

4389

}

4385

EXPORT_SYMBOL_GPL(yield_to);

4390

EXPORT_SYMBOL_GPL(yield_to);

4386

4391

4387

/*

4392

/*

4388

* This task is about to go to sleep on IO. Increment rq->nr_iowait so

4393

* This task is about to go to sleep on IO. Increment rq->nr_iowait so

4389

* that process accounting knows that this is a task in IO wait state.

4394

* that process accounting knows that this is a task in IO wait state.

4390

*/

4395

*/

4391

void __sched io_schedule(void)

4396

void __sched io_schedule(void)

4392

{

4397

{

4393

struct rq *rq = raw_rq();

4398

struct rq *rq = raw_rq();

4394

4399

4395

delayacct_blkio_start();

4400

delayacct_blkio_start();

4396

atomic_inc(&rq->nr_iowait);

4401

atomic_inc(&rq->nr_iowait);

4397

blk_flush_plug(current);

4402

blk_flush_plug(current);

4398

current->in_iowait = 1;

4403

current->in_iowait = 1;

4399

schedule();

4404

schedule();

4400

current->in_iowait = 0;

4405

current->in_iowait = 0;

4401

atomic_dec(&rq->nr_iowait);

4406

atomic_dec(&rq->nr_iowait);

4402

delayacct_blkio_end();

4407

delayacct_blkio_end();

4403

}

4408

}

4404

EXPORT_SYMBOL(io_schedule);

4409

EXPORT_SYMBOL(io_schedule);

4405

4410

4406

/*
 * Sleep in IO wait state for at most @timeout.
 *
 * Raises rq->nr_iowait and current->in_iowait around a call to
 * schedule_timeout(@timeout), with the wait bracketed by
 * delayacct_blkio_start()/_end().
 *
 * Return: whatever schedule_timeout() returned.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = raw_rq();
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	/* Flush current's block plug before going to sleep. */
	blk_flush_plug(current);
	current->in_iowait = 1;
	ret = schedule_timeout(timeout);
	current->in_iowait = 0;
	/* Decrement on the rq pointer sampled before sleeping. */
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}

4421

4426

4422

/**

4427

/**

4423

* sys_sched_get_priority_max - return maximum RT priority.

4428

* sys_sched_get_priority_max - return maximum RT priority.

4424

* @policy: scheduling class.

4429

* @policy: scheduling class.

4425

*

4430

*

4426

* Return: On success, this syscall returns the maximum

4431

* Return: On success, this syscall returns the maximum

4427

* rt_priority that can be used by a given scheduling class.

4432

* rt_priority that can be used by a given scheduling class.

4428

* On failure, a negative error code is returned.

4433

* On failure, a negative error code is returned.

4429

*/

4434

*/

4430

SYSCALL_DEFINE1(sched_get_priority_max, int, policy)

4435

SYSCALL_DEFINE1(sched_get_priority_max, int, policy)

4431

{

4436

{

4432

int ret = -EINVAL;

4437

int ret = -EINVAL;

4433

4438

4434

switch (policy) {

4439

switch (policy) {

4435

case SCHED_FIFO:

4440

case SCHED_FIFO:

4436

case SCHED_RR:

4441

case SCHED_RR:

4437

ret = MAX_USER_RT_PRIO-1;

4442

ret = MAX_USER_RT_PRIO-1;

4438

break;

4443

break;

4439

case SCHED_DEADLINE:

4444

case SCHED_DEADLINE:

4440

case SCHED_NORMAL:

4445

case SCHED_NORMAL:

4441

case SCHED_BATCH:

4446

case SCHED_BATCH:

4442

case SCHED_IDLE:

4447

case SCHED_IDLE:

4443

ret = 0;

4448

ret = 0;

4444

break;

4449

break;

4445

}

4450

}

4446

return ret;

4451

return ret;

4447

}

4452

}

4448

4453

4449

/**

4454

/**

4450

* sys_sched_get_priority_min - return minimum RT priority.

4455

* sys_sched_get_priority_min - return minimum RT priority.

4451

* @policy: scheduling class.

4456

* @policy: scheduling class.

4452

*

4457

*

4453

* Return: On success, this syscall returns the minimum

4458

* Return: On success, this syscall returns the minimum

4454

* rt_priority that can be used by a given scheduling class.

4459

* rt_priority that can be used by a given scheduling class.

4455

* On failure, a negative error code is returned.

4460

* On failure, a negative error code is returned.

4456

*/

4461

*/

4457

SYSCALL_DEFINE1(sched_get_priority_min, int, policy)

4462

SYSCALL_DEFINE1(sched_get_priority_min, int, policy)

4458

{

4463

{

4459

int ret = -EINVAL;

4464

int ret = -EINVAL;

4460

4465

4461

switch (policy) {

4466

switch (policy) {

4462

case SCHED_FIFO:

4467

case SCHED_FIFO:

4463

case SCHED_RR:

4468

case SCHED_RR:

4464

ret = 1;

4469

ret = 1;

4465

break;

4470

break;

4466

case SCHED_DEADLINE:

4471

case SCHED_DEADLINE:

4467

case SCHED_NORMAL:

4472

case SCHED_NORMAL:

4468

case SCHED_BATCH:

4473

case SCHED_BATCH:

4469

case SCHED_IDLE:

4474

case SCHED_IDLE:

4470

ret = 0;

4475

ret = 0;

4471

}

4476

}

4472

return ret;

4477

return ret;

4473

}

4478

}

4474

4479

4475

/**

4480

/**

4476

* sys_sched_rr_get_interval - return the default timeslice of a process.

4481

* sys_sched_rr_get_interval - return the default timeslice of a process.

4477

* @pid: pid of the process.

4482

* @pid: pid of the process.

4478

* @interval: userspace pointer to the timeslice value.

4483

* @interval: userspace pointer to the timeslice value.

4479

*

4484

*

4480

* this syscall writes the default timeslice value of a given process

4485

* this syscall writes the default timeslice value of a given process

4481

* into the user-space timespec buffer. A value of '0' means infinity.

4486

* into the user-space timespec buffer. A value of '0' means infinity.

4482

*

4487

*

4483

* Return: On success, 0 and the timeslice is in @interval. Otherwise,

4488

* Return: On success, 0 and the timeslice is in @interval. Otherwise,

4484

* an error code.

4489

* an error code.

4485

*/

4490

*/

4486

SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

4491

SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

4487

struct timespec __user *, interval)

4492

struct timespec __user *, interval)

4488

{

4493

{

4489

struct task_struct *p;

4494

struct task_struct *p;

4490

unsigned int time_slice;

4495

unsigned int time_slice;

4491

unsigned long flags;

4496

unsigned long flags;

4492

struct rq *rq;

4497

struct rq *rq;

4493

int retval;

4498

int retval;

4494

struct timespec t;

4499

struct timespec t;

4495

4500

4496

if (pid < 0)

4501

if (pid < 0)

4497

return -EINVAL;

4502

return -EINVAL;

4498

4503

4499

retval = -ESRCH;

4504

retval = -ESRCH;

4500

rcu_read_lock();

4505

rcu_read_lock();

4501

p = find_process_by_pid(pid);

4506

p = find_process_by_pid(pid);

4502

if (!p)

4507

if (!p)

4503

goto out_unlock;

4508

goto out_unlock;

4504

4509

4505

retval = security_task_getscheduler(p);

4510

retval = security_task_getscheduler(p);

4506

if (retval)

4511

if (retval)

4507

goto out_unlock;

4512

goto out_unlock;

4508

4513

4509

rq = task_rq_lock(p, &flags);

4514

rq = task_rq_lock(p, &flags);

4510

time_slice = 0;

4515

time_slice = 0;

4511

if (p->sched_class->get_rr_interval)

4516

if (p->sched_class->get_rr_interval)

4512

time_slice = p->sched_class->get_rr_interval(rq, p);

4517

time_slice = p->sched_class->get_rr_interval(rq, p);

4513

task_rq_unlock(rq, p, &flags);

4518

task_rq_unlock(rq, p, &flags);

4514

4519

4515

rcu_read_unlock();

4520

rcu_read_unlock();

4516

jiffies_to_timespec(time_slice, &t);

4521

jiffies_to_timespec(time_slice, &t);

4517

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

4522

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

4518

return retval;

4523

return retval;

4519

4524

4520

out_unlock:

4525

out_unlock:

4521

rcu_read_unlock();

4526

rcu_read_unlock();

4522

return retval;

4527

return retval;

4523

}

4528

}

4524

4529

4525

static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

4530

static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

4526

4531

4527

void sched_show_task(struct task_struct *p)

4532

void sched_show_task(struct task_struct *p)

4528

{

4533

{

4529

unsigned long free = 0;

4534

unsigned long free = 0;

4530

int ppid;

4535

int ppid;

4531

unsigned state;

4536

unsigned state;

4532

4537

4533

state = p->state ? __ffs(p->state) + 1 : 0;

4538

state = p->state ? __ffs(p->state) + 1 : 0;

4534

printk(KERN_INFO "%-15.15s %c", p->comm,

4539

printk(KERN_INFO "%-15.15s %c", p->comm,

4535

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

4540

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

4536

#if BITS_PER_LONG == 32

4541

#if BITS_PER_LONG == 32

4537

if (state == TASK_RUNNING)

4542

if (state == TASK_RUNNING)

4538

printk(KERN_CONT " running ");

4543

printk(KERN_CONT " running ");

4539

else

4544

else

4540

printk(KERN_CONT " %08lx ", thread_saved_pc(p));

4545

printk(KERN_CONT " %08lx ", thread_saved_pc(p));

4541

#else

4546

#else

4542

if (state == TASK_RUNNING)

4547

if (state == TASK_RUNNING)

4543

printk(KERN_CONT " running task ");

4548

printk(KERN_CONT " running task ");

4544

else

4549

else

4545

printk(KERN_CONT " %016lx ", thread_saved_pc(p));

4550

printk(KERN_CONT " %016lx ", thread_saved_pc(p));

4546

#endif

4551

#endif

4547

#ifdef CONFIG_DEBUG_STACK_USAGE

4552

#ifdef CONFIG_DEBUG_STACK_USAGE

4548

free = stack_not_used(p);

4553

free = stack_not_used(p);

4549

#endif

4554

#endif

4550

rcu_read_lock();

4555

rcu_read_lock();

4551

ppid = task_pid_nr(rcu_dereference(p->real_parent));

4556

ppid = task_pid_nr(rcu_dereference(p->real_parent));

4552

rcu_read_unlock();

4557

rcu_read_unlock();

4553

printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,

4558

printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,

4554

task_pid_nr(p), ppid,

4559

task_pid_nr(p), ppid,

4555

(unsigned long)task_thread_info(p)->flags);

4560

(unsigned long)task_thread_info(p)->flags);

4556

4561

4557

print_worker_info(KERN_INFO, p);

4562

print_worker_info(KERN_INFO, p);

4558

show_stack(p, NULL);

4563

show_stack(p, NULL);

4559

}

4564

}

4560

4565

4561

void show_state_filter(unsigned long state_filter)

4566

void show_state_filter(unsigned long state_filter)

4562

{

4567

{

4563

struct task_struct *g, *p;

4568

struct task_struct *g, *p;

4564

4569

4565

#if BITS_PER_LONG == 32

4570

#if BITS_PER_LONG == 32

4566

printk(KERN_INFO

4571

printk(KERN_INFO

4567

" task PC stack pid father\n");

4572

" task PC stack pid father\n");

4568

#else

4573

#else

4569

printk(KERN_INFO

4574

printk(KERN_INFO

4570

" task PC stack pid father\n");

4575

" task PC stack pid father\n");

4571

#endif

4576

#endif

4572

rcu_read_lock();

4577

rcu_read_lock();

4573

for_each_process_thread(g, p) {

4578

for_each_process_thread(g, p) {

4574

/*

4579

/*

4575

* reset the NMI-timeout, listing all files on a slow

4580

* reset the NMI-timeout, listing all files on a slow

4576

* console might take a lot of time:

4581

* console might take a lot of time:

4577

*/

4582

*/

4578

touch_nmi_watchdog();

4583

touch_nmi_watchdog();

4579

if (!state_filter || (p->state & state_filter))

4584

if (!state_filter || (p->state & state_filter))

4580

sched_show_task(p);

4585

sched_show_task(p);

4581

}

4586

}

4582

4587

4583

touch_all_softlockup_watchdogs();

4588

touch_all_softlockup_watchdogs();

4584

4589

4585

#ifdef CONFIG_SCHED_DEBUG

4590

#ifdef CONFIG_SCHED_DEBUG

4586

sysrq_sched_debug_show();

4591

sysrq_sched_debug_show();

4587

#endif

4592

#endif

4588

rcu_read_unlock();

4593

rcu_read_unlock();

4589

/*

4594

/*

4590

* Only show locks if all tasks are dumped:

4595

* Only show locks if all tasks are dumped:

4591

*/

4596

*/

4592

if (!state_filter)

4597

if (!state_filter)

4593

debug_show_all_locks();

4598

debug_show_all_locks();

4594

}

4599

}

4595

4600

4596

void init_idle_bootup_task(struct task_struct *idle)

4601

void init_idle_bootup_task(struct task_struct *idle)

4597

{

4602

{

4598

idle->sched_class = &idle_sched_class;

4603

idle->sched_class = &idle_sched_class;

4599

}

4604

}

4600

4605

4601

/**

4606

/**

4602

* init_idle - set up an idle thread for a given CPU

4607

* init_idle - set up an idle thread for a given CPU

4603

* @idle: task in question

4608

* @idle: task in question

4604

* @cpu: cpu the idle task belongs to

4609

* @cpu: cpu the idle task belongs to

4605

*

4610

*

4606

* NOTE: this function does not set the idle thread's NEED_RESCHED

4611

* NOTE: this function does not set the idle thread's NEED_RESCHED

4607

* flag, to make booting more robust.

4612

* flag, to make booting more robust.

4608

*/

4613

*/

4609

void init_idle(struct task_struct *idle, int cpu)

4614

void init_idle(struct task_struct *idle, int cpu)

4610

{

4615

{

4611

struct rq *rq = cpu_rq(cpu);

4616

struct rq *rq = cpu_rq(cpu);

4612

unsigned long flags;

4617

unsigned long flags;

4613

4618

4614

raw_spin_lock_irqsave(&rq->lock, flags);

4619

raw_spin_lock_irqsave(&rq->lock, flags);

4615

4620

4616

__sched_fork(0, idle);

4621

__sched_fork(0, idle);

4617

idle->state = TASK_RUNNING;

4622

idle->state = TASK_RUNNING;

4618

idle->se.exec_start = sched_clock();

4623

idle->se.exec_start = sched_clock();

4619

4624

4620

do_set_cpus_allowed(idle, cpumask_of(cpu));

4625

do_set_cpus_allowed(idle, cpumask_of(cpu));

4621

/*

4626

/*

4622

* We're having a chicken and egg problem, even though we are

4627

* We're having a chicken and egg problem, even though we are

4623

* holding rq->lock, the cpu isn't yet set to this cpu so the

4628

* holding rq->lock, the cpu isn't yet set to this cpu so the

4624

* lockdep check in task_group() will fail.

4629

* lockdep check in task_group() will fail.

4625

*

4630

*

4626

* Similar case to sched_fork(). / Alternatively we could

4631

* Similar case to sched_fork(). / Alternatively we could

4627

* use task_rq_lock() here and obtain the other rq->lock.

4632

* use task_rq_lock() here and obtain the other rq->lock.

4628

*

4633

*

4629

* Silence PROVE_RCU

4634

* Silence PROVE_RCU

4630

*/

4635

*/

4631

rcu_read_lock();

4636

rcu_read_lock();

4632

__set_task_cpu(idle, cpu);

4637

__set_task_cpu(idle, cpu);

4633

rcu_read_unlock();

4638

rcu_read_unlock();

4634

4639

4635

rq->curr = rq->idle = idle;

4640

rq->curr = rq->idle = idle;

4636

idle->on_rq = TASK_ON_RQ_QUEUED;

4641

idle->on_rq = TASK_ON_RQ_QUEUED;

4637

#if defined(CONFIG_SMP)

4642

#if defined(CONFIG_SMP)

4638

idle->on_cpu = 1;

4643

idle->on_cpu = 1;

4639

#endif

4644

#endif

4640

raw_spin_unlock_irqrestore(&rq->lock, flags);

4645

raw_spin_unlock_irqrestore(&rq->lock, flags);

4641

4646

4642

/* Set the preempt count _outside_ the spinlocks! */

4647

/* Set the preempt count _outside_ the spinlocks! */

4643

init_idle_preempt_count(idle, cpu);

4648

init_idle_preempt_count(idle, cpu);

4644

4649

4645

/*

4650

/*

4646

* The idle tasks have their own, simple scheduling class:

4651

* The idle tasks have their own, simple scheduling class:

4647

*/

4652

*/

4648

idle->sched_class = &idle_sched_class;

4653

idle->sched_class = &idle_sched_class;

4649

ftrace_graph_init_idle_task(idle, cpu);

4654

ftrace_graph_init_idle_task(idle, cpu);

4650

vtime_init_idle(idle, cpu);

4655

vtime_init_idle(idle, cpu);

4651

#if defined(CONFIG_SMP)

4656

#if defined(CONFIG_SMP)

4652

sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);

4657

sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);

4653

#endif

4658

#endif

4654

}

4659

}

4655

4660

4656

#ifdef CONFIG_SMP

4661

#ifdef CONFIG_SMP

4657

/*

4662

/*

4658

* move_queued_task - move a queued task to new rq.

4663

* move_queued_task - move a queued task to new rq.

4659

*

4664

*

4660

* Returns (locked) new rq. Old rq's lock is released.

4665

* Returns (locked) new rq. Old rq's lock is released.

4661

*/

4666

*/

4662

static struct rq *move_queued_task(struct task_struct *p, int new_cpu)

4667

static struct rq *move_queued_task(struct task_struct *p, int new_cpu)

4663

{

4668

{

4664

struct rq *rq = task_rq(p);

4669

struct rq *rq = task_rq(p);

4665

4670

4666

lockdep_assert_held(&rq->lock);

4671

lockdep_assert_held(&rq->lock);

4667

4672

4668

dequeue_task(rq, p, 0);

4673

dequeue_task(rq, p, 0);

4669

p->on_rq = TASK_ON_RQ_MIGRATING;

4674

p->on_rq = TASK_ON_RQ_MIGRATING;

4670

set_task_cpu(p, new_cpu);

4675

set_task_cpu(p, new_cpu);

4671

raw_spin_unlock(&rq->lock);

4676

raw_spin_unlock(&rq->lock);

4672

4677

4673

rq = cpu_rq(new_cpu);

4678

rq = cpu_rq(new_cpu);

4674

4679

4675

raw_spin_lock(&rq->lock);

4680

raw_spin_lock(&rq->lock);

4676

BUG_ON(task_cpu(p) != new_cpu);

4681

BUG_ON(task_cpu(p) != new_cpu);

4677

p->on_rq = TASK_ON_RQ_QUEUED;

4682

p->on_rq = TASK_ON_RQ_QUEUED;

4678

enqueue_task(rq, p, 0);

4683

enqueue_task(rq, p, 0);

4679

check_preempt_curr(rq, p, 0);

4684

check_preempt_curr(rq, p, 0);

4680

4685

4681

return rq;

4686

return rq;

4682

}

4687

}

4683

4688

4684

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)

4689

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)

4685

{

4690

{

4686

if (p->sched_class && p->sched_class->set_cpus_allowed)

4691

if (p->sched_class && p->sched_class->set_cpus_allowed)

4687

p->sched_class->set_cpus_allowed(p, new_mask);

4692

p->sched_class->set_cpus_allowed(p, new_mask);

4688

4693

4689

cpumask_copy(&p->cpus_allowed, new_mask);

4694

cpumask_copy(&p->cpus_allowed, new_mask);

4690

p->nr_cpus_allowed = cpumask_weight(new_mask);

4695

p->nr_cpus_allowed = cpumask_weight(new_mask);

4691

}

4696

}

4692

4697

4693

/*

4698

/*

4694

* This is how migration works:

4699

* This is how migration works:

4695

*

4700

*

4696

* 1) we invoke migration_cpu_stop() on the target CPU using

4701

* 1) we invoke migration_cpu_stop() on the target CPU using

4697

* stop_one_cpu().

4702

* stop_one_cpu().

4698

* 2) stopper starts to run (implicitly forcing the migrated thread

4703

* 2) stopper starts to run (implicitly forcing the migrated thread

4699

* off the CPU)

4704

* off the CPU)

4700

* 3) it checks whether the migrated task is still in the wrong runqueue.

4705

* 3) it checks whether the migrated task is still in the wrong runqueue.

4701

* 4) if it's in the wrong runqueue then the migration thread removes

4706

* 4) if it's in the wrong runqueue then the migration thread removes

4702

* it and puts it into the right queue.

4707

* it and puts it into the right queue.

4703

* 5) stopper completes and stop_one_cpu() returns and the migration

4708

* 5) stopper completes and stop_one_cpu() returns and the migration

4704

* is done.

4709

* is done.

4705

*/

4710

*/

4706

4711

4707

/*

4712

/*

4708

* Change a given task's CPU affinity. Migrate the thread to a

4713

* Change a given task's CPU affinity. Migrate the thread to a

4709

* proper CPU and schedule it away if the CPU it's executing on

4714

* proper CPU and schedule it away if the CPU it's executing on

4710

* is removed from the allowed bitmask.

4715

* is removed from the allowed bitmask.

4711

*

4716

*

4712

* NOTE: the caller must have a valid reference to the task, the

4717

* NOTE: the caller must have a valid reference to the task, the

4713

* task must not exit() & deallocate itself prematurely. The

4718

* task must not exit() & deallocate itself prematurely. The

4714

* call is not atomic; no spinlocks may be held.

4719

* call is not atomic; no spinlocks may be held.

4715

*/

4720

*/

4716

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)

4721

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)

4717

{

4722

{

4718

unsigned long flags;

4723

unsigned long flags;

4719

struct rq *rq;

4724

struct rq *rq;

4720

unsigned int dest_cpu;

4725

unsigned int dest_cpu;

4721

int ret = 0;

4726

int ret = 0;

4722

4727

4723

rq = task_rq_lock(p, &flags);

4728

rq = task_rq_lock(p, &flags);

4724

4729

4725

if (cpumask_equal(&p->cpus_allowed, new_mask))

4730

if (cpumask_equal(&p->cpus_allowed, new_mask))

4726

goto out;

4731

goto out;

4727

4732

4728

if (!cpumask_intersects(new_mask, cpu_active_mask)) {

4733

if (!cpumask_intersects(new_mask, cpu_active_mask)) {

4729

ret = -EINVAL;

4734

ret = -EINVAL;

4730

goto out;

4735

goto out;

4731

}

4736

}

4732

4737

4733

do_set_cpus_allowed(p, new_mask);

4738

do_set_cpus_allowed(p, new_mask);

4734

4739

4735

/* Can the task run on the task's current CPU? If so, we're done */

4740

/* Can the task run on the task's current CPU? If so, we're done */

4736

if (cpumask_test_cpu(task_cpu(p), new_mask))

4741

if (cpumask_test_cpu(task_cpu(p), new_mask))

4737

goto out;

4742

goto out;

4738

4743

4739

dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);

4744

dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);

4740

if (task_running(rq, p) || p->state == TASK_WAKING) {

4745

if (task_running(rq, p) || p->state == TASK_WAKING) {

4741

struct migration_arg arg = { p, dest_cpu };

4746

struct migration_arg arg = { p, dest_cpu };

4742

/* Need help from migration thread: drop lock and wait. */

4747

/* Need help from migration thread: drop lock and wait. */

4743

task_rq_unlock(rq, p, &flags);

4748

task_rq_unlock(rq, p, &flags);

4744

stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);

4749

stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);

4745

tlb_migrate_finish(p->mm);

4750

tlb_migrate_finish(p->mm);

4746

return 0;

4751

return 0;

4747

} else if (task_on_rq_queued(p))

4752

} else if (task_on_rq_queued(p))

4748

rq = move_queued_task(p, dest_cpu);

4753

rq = move_queued_task(p, dest_cpu);

4749

out:

4754

out:

4750

task_rq_unlock(rq, p, &flags);

4755

task_rq_unlock(rq, p, &flags);

4751

4756

4752

return ret;

4757

return ret;

4753

}

4758

}

4754

EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

4759

EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

4755

4760

4756

/*

4761

/*

4757

* Move (not current) task off this cpu, onto dest cpu. We're doing

4762

* Move (not current) task off this cpu, onto dest cpu. We're doing

4758

* this because either it can't run here any more (set_cpus_allowed()

4763

* this because either it can't run here any more (set_cpus_allowed()

4759

* away from this CPU, or CPU going down), or because we're

4764

* away from this CPU, or CPU going down), or because we're

4760

* attempting to rebalance this task on exec (sched_exec).

4765

* attempting to rebalance this task on exec (sched_exec).

4761

*

4766

*

4762

* So we race with normal scheduler movements, but that's OK, as long

4767

* So we race with normal scheduler movements, but that's OK, as long

4763

* as the task is no longer on this CPU.

4768

* as the task is no longer on this CPU.

4764

*

4769

*

4765

* Returns non-zero if task was successfully migrated.

4770

* Returns non-zero if task was successfully migrated.

4766

*/

4771

*/

4767

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

4772

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

4768

{

4773

{

4769

struct rq *rq;

4774

struct rq *rq;

4770

int ret = 0;

4775

int ret = 0;

4771

4776

4772

if (unlikely(!cpu_active(dest_cpu)))

4777

if (unlikely(!cpu_active(dest_cpu)))

4773

return ret;

4778

return ret;

4774

4779

4775

rq = cpu_rq(src_cpu);

4780

rq = cpu_rq(src_cpu);

4776

4781

4777

raw_spin_lock(&p->pi_lock);

4782

raw_spin_lock(&p->pi_lock);

4778

raw_spin_lock(&rq->lock);

4783

raw_spin_lock(&rq->lock);

4779

/* Already moved. */

4784

/* Already moved. */

4780

if (task_cpu(p) != src_cpu)

4785

if (task_cpu(p) != src_cpu)

4781

goto done;

4786

goto done;

4782

4787

4783

/* Affinity changed (again). */

4788

/* Affinity changed (again). */

4784

if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))

4789

if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))

4785

goto fail;

4790

goto fail;

4786

4791

4787

/*

4792

/*

4788

* If we're not on a rq, the next wake-up will ensure we're

4793

* If we're not on a rq, the next wake-up will ensure we're

4789

* placed properly.

4794

* placed properly.

4790

*/

4795

*/

4791

if (task_on_rq_queued(p))

4796

if (task_on_rq_queued(p))

4792

rq = move_queued_task(p, dest_cpu);

4797

rq = move_queued_task(p, dest_cpu);

4793

done:

4798

done:

4794

ret = 1;

4799

ret = 1;

4795

fail:

4800

fail:

4796

raw_spin_unlock(&rq->lock);

4801

raw_spin_unlock(&rq->lock);

4797

raw_spin_unlock(&p->pi_lock);

4802

raw_spin_unlock(&p->pi_lock);

4798

return ret;

4803

return ret;

4799

}

4804

}

4800

4805

4801

#ifdef CONFIG_NUMA_BALANCING

4806

#ifdef CONFIG_NUMA_BALANCING

4802

/* Migrate current task p to target_cpu */

4807

/* Migrate current task p to target_cpu */

4803

int migrate_task_to(struct task_struct *p, int target_cpu)

4808

int migrate_task_to(struct task_struct *p, int target_cpu)

4804

{

4809

{

4805

struct migration_arg arg = { p, target_cpu };

4810

struct migration_arg arg = { p, target_cpu };

4806

int curr_cpu = task_cpu(p);

4811

int curr_cpu = task_cpu(p);

4807

4812

4808

if (curr_cpu == target_cpu)

4813

if (curr_cpu == target_cpu)

4809

return 0;

4814

return 0;

4810

4815

4811

if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))

4816

if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))

4812

return -EINVAL;

4817

return -EINVAL;

4813

4818

4814

/* TODO: This is not properly updating schedstats */

4819

/* TODO: This is not properly updating schedstats */

4815

4820

4816

trace_sched_move_numa(p, curr_cpu, target_cpu);

4821

trace_sched_move_numa(p, curr_cpu, target_cpu);

4817

return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);

4822

return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);

4818

}

4823

}

4819

4824

4820

/*

4825

/*

4821

* Requeue a task on a given node and accurately track the number of NUMA

4826

* Requeue a task on a given node and accurately track the number of NUMA

4822

* tasks on the runqueues

4827

* tasks on the runqueues

4823

*/

4828

*/

4824

void sched_setnuma(struct task_struct *p, int nid)

4829

void sched_setnuma(struct task_struct *p, int nid)

4825

{

4830

{

4826

struct rq *rq;

4831

struct rq *rq;

4827

unsigned long flags;

4832

unsigned long flags;

4828

bool queued, running;

4833

bool queued, running;

4829

4834

4830

rq = task_rq_lock(p, &flags);

4835

rq = task_rq_lock(p, &flags);

4831

queued = task_on_rq_queued(p);

4836

queued = task_on_rq_queued(p);

4832

running = task_current(rq, p);

4837

running = task_current(rq, p);

4833

4838

4834

if (queued)

4839

if (queued)

4835

dequeue_task(rq, p, 0);

4840

dequeue_task(rq, p, 0);

4836

if (running)

4841

if (running)

4837

put_prev_task(rq, p);

4842

put_prev_task(rq, p);

4838

4843

4839

p->numa_preferred_nid = nid;

4844

p->numa_preferred_nid = nid;

4840

4845

4841

if (running)

4846

if (running)

4842

p->sched_class->set_curr_task(rq);

4847

p->sched_class->set_curr_task(rq);

4843

if (queued)

4848

if (queued)

4844

enqueue_task(rq, p, 0);

4849

enqueue_task(rq, p, 0);

4845

task_rq_unlock(rq, p, &flags);

4850

task_rq_unlock(rq, p, &flags);

4846

}

4851

}

4847

#endif

4852

#endif

4848

4853

4849

/*

4854

/*

4850

* migration_cpu_stop - this will be executed by a highprio stopper thread

4855

* migration_cpu_stop - this will be executed by a highprio stopper thread

4851

* and performs thread migration by bumping thread off CPU then

4856

* and performs thread migration by bumping thread off CPU then

4852

* 'pushing' onto another runqueue.

4857

* 'pushing' onto another runqueue.

4853

*/

4858

*/

4854

static int migration_cpu_stop(void *data)

4859

static int migration_cpu_stop(void *data)

4855

{

4860

{

4856

struct migration_arg *arg = data;

4861

struct migration_arg *arg = data;

4857

4862

4858

/*

4863

/*

4859

* The original target cpu might have gone down and we might

4864

* The original target cpu might have gone down and we might

4860

* be on another cpu but it doesn't matter.

4865

* be on another cpu but it doesn't matter.

4861

*/

4866

*/

4862

local_irq_disable();

4867

local_irq_disable();

4863

/*

4868

/*

4864

* We need to explicitly wake pending tasks before running

4869

* We need to explicitly wake pending tasks before running

4865

* __migrate_task() such that we will not miss enforcing cpus_allowed

4870

* __migrate_task() such that we will not miss enforcing cpus_allowed

4866

* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.

4871

* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.

4867

*/

4872

*/

4868

sched_ttwu_pending();

4873

sched_ttwu_pending();

4869

__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);

4874

__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);

4870

local_irq_enable();

4875

local_irq_enable();

4871

return 0;

4876

return 0;

4872

}

4877

}

4873

4878

4874

#ifdef CONFIG_HOTPLUG_CPU

4879

#ifdef CONFIG_HOTPLUG_CPU

4875

4880

4876

/*

4881

/*

4877

* Ensures that the idle task is using init_mm right before its cpu goes

4882

* Ensures that the idle task is using init_mm right before its cpu goes

4878

* offline.

4883

* offline.

4879

*/

4884

*/

4880

void idle_task_exit(void)

4885

void idle_task_exit(void)

4881

{

4886

{

4882

struct mm_struct *mm = current->active_mm;

4887

struct mm_struct *mm = current->active_mm;

4883

4888

4884

BUG_ON(cpu_online(smp_processor_id()));

4889

BUG_ON(cpu_online(smp_processor_id()));

4885

4890

4886

if (mm != &init_mm) {

4891

if (mm != &init_mm) {

4887

switch_mm(mm, &init_mm, current);

4892

switch_mm(mm, &init_mm, current);

4888

finish_arch_post_lock_switch();

4893

finish_arch_post_lock_switch();

4889

}

4894

}

4890

mmdrop(mm);

4895

mmdrop(mm);

4891

}

4896

}

4892

4897

4893

/*

4898

/*

4894

* Since this CPU is going 'away' for a while, fold any nr_active delta

4899

* Since this CPU is going 'away' for a while, fold any nr_active delta

4895

* we might have. Assumes we're called after migrate_tasks() so that the

4900

* we might have. Assumes we're called after migrate_tasks() so that the

4896

* nr_active count is stable.

4901

* nr_active count is stable.

4897

*

4902

*

4898

* Also see the comment "Global load-average calculations".

4903

* Also see the comment "Global load-average calculations".

4899

*/

4904

*/

4900

static void calc_load_migrate(struct rq *rq)

4905

static void calc_load_migrate(struct rq *rq)

4901

{

4906

{

4902

long delta = calc_load_fold_active(rq);

4907

long delta = calc_load_fold_active(rq);

4903

if (delta)

4908

if (delta)

4904

atomic_long_add(delta, &calc_load_tasks);

4909

atomic_long_add(delta, &calc_load_tasks);

4905

}

4910

}

4906

4911

4907

/* Intentionally empty put_prev_task hook, used by fake_sched_class. */
static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
{
}

4910

4915

4911

static const struct sched_class fake_sched_class = {

4916

static const struct sched_class fake_sched_class = {

4912

.put_prev_task = put_prev_task_fake,

4917

.put_prev_task = put_prev_task_fake,

4913

};

4918

};

4914

4919

4915

static struct task_struct fake_task = {

4920

static struct task_struct fake_task = {

4916

/*

4921

/*

4917

* Avoid pull_{rt,dl}_task()

4922

* Avoid pull_{rt,dl}_task()

4918

*/

4923

*/

4919

.prio = MAX_PRIO + 1,

4924

.prio = MAX_PRIO + 1,

4920

.sched_class = &fake_sched_class,

4925

.sched_class = &fake_sched_class,

4921

};

4926

};

4922

4927

4923

/*

4928

/*

4924

* Migrate all tasks from the rq, sleeping tasks will be migrated by

4929

* Migrate all tasks from the rq, sleeping tasks will be migrated by

4925

* try_to_wake_up()->select_task_rq().

4930

* try_to_wake_up()->select_task_rq().

4926

*

4931

*

4927

* Called with rq->lock held even though we're in stop_machine() and

4932

* Called with rq->lock held even though we're in stop_machine() and

4928

* there's no concurrency possible, we hold the required locks anyway

4933

* there's no concurrency possible, we hold the required locks anyway

4929

* because of lock validation efforts.

4934

* because of lock validation efforts.

4930

*/

4935

*/

4931

static void migrate_tasks(unsigned int dead_cpu)

4936

static void migrate_tasks(unsigned int dead_cpu)

4932

{

4937

{

4933

struct rq *rq = cpu_rq(dead_cpu);

4938

struct rq *rq = cpu_rq(dead_cpu);

4934

struct task_struct *next, *stop = rq->stop;

4939

struct task_struct *next, *stop = rq->stop;

4935

int dest_cpu;

4940

int dest_cpu;

4936

4941

4937

/*

4942

/*

4938

* Fudge the rq selection such that the below task selection loop

4943

* Fudge the rq selection such that the below task selection loop

4939

* doesn't get stuck on the currently eligible stop task.

4944

* doesn't get stuck on the currently eligible stop task.

4940

*

4945

*

4941

* We're currently inside stop_machine() and the rq is either stuck

4946

* We're currently inside stop_machine() and the rq is either stuck

4942

* in the stop_machine_cpu_stop() loop, or we're executing this code,

4947

* in the stop_machine_cpu_stop() loop, or we're executing this code,

4943

* either way we should never end up calling schedule() until we're

4948

* either way we should never end up calling schedule() until we're

4944

* done here.

4949

* done here.

4945

*/

4950

*/

4946

rq->stop = NULL;

4951

rq->stop = NULL;

4947

4952

4948

/*

4953

/*

4949

* put_prev_task() and pick_next_task() sched

4954

* put_prev_task() and pick_next_task() sched

4950

* class method both need to have an up-to-date

4955

* class method both need to have an up-to-date

4951

* value of rq->clock[_task]

4956

* value of rq->clock[_task]

4952

*/

4957

*/

4953

update_rq_clock(rq);

4958

update_rq_clock(rq);

4954

4959

4955

for ( ; ; ) {

4960

for ( ; ; ) {

4956

/*

4961

/*

4957

* There's this thread running, bail when that's the only

4962

* There's this thread running, bail when that's the only

4958

* remaining thread.

4963

* remaining thread.

4959

*/

4964

*/

4960

if (rq->nr_running == 1)

4965

if (rq->nr_running == 1)

4961

break;

4966

break;

4962

4967

4963

next = pick_next_task(rq, &fake_task);

4968

next = pick_next_task(rq, &fake_task);

4964

BUG_ON(!next);

4969

BUG_ON(!next);

4965

next->sched_class->put_prev_task(rq, next);

4970

next->sched_class->put_prev_task(rq, next);

4966

4971

4967

/* Find suitable destination for @next, with force if needed. */

4972

/* Find suitable destination for @next, with force if needed. */

4968

dest_cpu = select_fallback_rq(dead_cpu, next);

4973

dest_cpu = select_fallback_rq(dead_cpu, next);

4969

raw_spin_unlock(&rq->lock);

4974

raw_spin_unlock(&rq->lock);

4970

4975

4971

__migrate_task(next, dead_cpu, dest_cpu);

4976

__migrate_task(next, dead_cpu, dest_cpu);

4972

4977

4973

raw_spin_lock(&rq->lock);

4978

raw_spin_lock(&rq->lock);

4974

}

4979

}

4975

4980

4976

rq->stop = stop;

4981

rq->stop = stop;

4977

}

4982

}

4978

4983

4979

#endif /* CONFIG_HOTPLUG_CPU */

4984

#endif /* CONFIG_HOTPLUG_CPU */

4980

4985

4981

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

4986

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

4982

4987

4983

/*
 * "sched_domain" directory entry. Its ->child is filled in by
 * register_sched_domain_sysctl() and released by
 * unregister_sched_domain_sysctl().
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}	/* terminator */
};

4990

4995

4991

/* Root table: a "kernel" directory whose child is sd_ctl_dir. */
static struct ctl_table sd_ctl_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{}	/* terminator */
};

4999

5004

5000

static struct ctl_table *sd_alloc_ctl_entry(int n)

5005

static struct ctl_table *sd_alloc_ctl_entry(int n)

5001

{

5006

{

5002

struct ctl_table *entry =

5007

struct ctl_table *entry =

5003

kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

5008

kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

5004

5009

5005

return entry;

5010

return entry;

5006

}

5011

}

5007

5012

5008

static void sd_free_ctl_entry(struct ctl_table **tablep)

5013

static void sd_free_ctl_entry(struct ctl_table **tablep)

5009

{

5014

{

5010

struct ctl_table *entry;

5015

struct ctl_table *entry;

5011

5016

5012

/*

5017

/*

5013

* In the intermediate directories, both the child directory and

5018

* In the intermediate directories, both the child directory and

5014

* procname are dynamically allocated and could fail but the mode

5019

* procname are dynamically allocated and could fail but the mode

5015

* will always be set. In the lowest directory the names are

5020

* will always be set. In the lowest directory the names are

5016

* static strings and all have proc handlers.

5021

* static strings and all have proc handlers.

5017

*/

5022

*/

5018

for (entry = *tablep; entry->mode; entry++) {

5023

for (entry = *tablep; entry->mode; entry++) {

5019

if (entry->child)

5024

if (entry->child)

5020

sd_free_ctl_entry(&entry->child);

5025

sd_free_ctl_entry(&entry->child);

5021

if (entry->proc_handler == NULL)

5026

if (entry->proc_handler == NULL)

5022

kfree(entry->procname);

5027

kfree(entry->procname);

5023

}

5028

}

5024

5029

5025

kfree(*tablep);

5030

kfree(*tablep);

5026

*tablep = NULL;

5031

*tablep = NULL;

5027

}

5032

}

5028

5033

5029

/*
 * Lower/upper bounds; set_table_entry() points ->extra1/->extra2 at these
 * when its load_idx argument is true.
 */
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;

5031

5036

5032

static void

5037

static void

5033

set_table_entry(struct ctl_table *entry,

5038

set_table_entry(struct ctl_table *entry,

5034

const char *procname, void *data, int maxlen,

5039

const char *procname, void *data, int maxlen,

5035

umode_t mode, proc_handler *proc_handler,

5040

umode_t mode, proc_handler *proc_handler,

5036

bool load_idx)

5041

bool load_idx)

5037

{

5042

{

5038

entry->procname = procname;

5043

entry->procname = procname;

5039

entry->data = data;

5044

entry->data = data;

5040

entry->maxlen = maxlen;

5045

entry->maxlen = maxlen;

5041

entry->mode = mode;

5046

entry->mode = mode;

5042

entry->proc_handler = proc_handler;

5047

entry->proc_handler = proc_handler;

5043

5048

5044

if (load_idx) {

5049

if (load_idx) {

5045

entry->extra1 = &min_load_idx;

5050

entry->extra1 = &min_load_idx;

5046

entry->extra2 = &max_load_idx;

5051

entry->extra2 = &max_load_idx;

5047

}

5052

}

5048

}

5053

}

5049

5054

5050

static struct ctl_table *

5055

static struct ctl_table *

5051

sd_alloc_ctl_domain_table(struct sched_domain *sd)

5056

sd_alloc_ctl_domain_table(struct sched_domain *sd)

5052

{

5057

{

5053

struct ctl_table *table = sd_alloc_ctl_entry(14);

5058

struct ctl_table *table = sd_alloc_ctl_entry(14);

5054

5059

5055

if (table == NULL)

5060

if (table == NULL)

5056

return NULL;

5061

return NULL;

5057

5062

5058

set_table_entry(&table[0], "min_interval", &sd->min_interval,

5063

set_table_entry(&table[0], "min_interval", &sd->min_interval,

5059

sizeof(long), 0644, proc_doulongvec_minmax, false);

5064

sizeof(long), 0644, proc_doulongvec_minmax, false);

5060

set_table_entry(&table[1], "max_interval", &sd->max_interval,

5065

set_table_entry(&table[1], "max_interval", &sd->max_interval,

5061

sizeof(long), 0644, proc_doulongvec_minmax, false);

5066

sizeof(long), 0644, proc_doulongvec_minmax, false);

5062

set_table_entry(&table[2], "busy_idx", &sd->busy_idx,

5067

set_table_entry(&table[2], "busy_idx", &sd->busy_idx,

5063

sizeof(int), 0644, proc_dointvec_minmax, true);

5068

sizeof(int), 0644, proc_dointvec_minmax, true);

5064

set_table_entry(&table[3], "idle_idx", &sd->idle_idx,

5069

set_table_entry(&table[3], "idle_idx", &sd->idle_idx,

5065

sizeof(int), 0644, proc_dointvec_minmax, true);

5070

sizeof(int), 0644, proc_dointvec_minmax, true);

5066

set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,

5071

set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,

5067

sizeof(int), 0644, proc_dointvec_minmax, true);

5072

sizeof(int), 0644, proc_dointvec_minmax, true);

5068

set_table_entry(&table[5], "wake_idx", &sd->wake_idx,

5073

set_table_entry(&table[5], "wake_idx", &sd->wake_idx,

5069

sizeof(int), 0644, proc_dointvec_minmax, true);

5074

sizeof(int), 0644, proc_dointvec_minmax, true);

5070

set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,

5075

set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,

5071

sizeof(int), 0644, proc_dointvec_minmax, true);

5076

sizeof(int), 0644, proc_dointvec_minmax, true);

5072

set_table_entry(&table[7], "busy_factor", &sd->busy_factor,

5077

set_table_entry(&table[7], "busy_factor", &sd->busy_factor,

5073

sizeof(int), 0644, proc_dointvec_minmax, false);

5078

sizeof(int), 0644, proc_dointvec_minmax, false);

5074

set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,

5079

set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,

5075

sizeof(int), 0644, proc_dointvec_minmax, false);

5080

sizeof(int), 0644, proc_dointvec_minmax, false);

5076

set_table_entry(&table[9], "cache_nice_tries",

5081

set_table_entry(&table[9], "cache_nice_tries",

5077

&sd->cache_nice_tries,

5082

&sd->cache_nice_tries,

5078

sizeof(int), 0644, proc_dointvec_minmax, false);

5083

sizeof(int), 0644, proc_dointvec_minmax, false);

5079

set_table_entry(&table[10], "flags", &sd->flags,

5084

set_table_entry(&table[10], "flags", &sd->flags,

5080

sizeof(int), 0644, proc_dointvec_minmax, false);

5085

sizeof(int), 0644, proc_dointvec_minmax, false);

5081

set_table_entry(&table[11], "max_newidle_lb_cost",

5086

set_table_entry(&table[11], "max_newidle_lb_cost",

5082

&sd->max_newidle_lb_cost,

5087

&sd->max_newidle_lb_cost,

5083

sizeof(long), 0644, proc_doulongvec_minmax, false);

5088

sizeof(long), 0644, proc_doulongvec_minmax, false);

5084

set_table_entry(&table[12], "name", sd->name,

5089

set_table_entry(&table[12], "name", sd->name,

5085

CORENAME_MAX_SIZE, 0444, proc_dostring, false);

5090

CORENAME_MAX_SIZE, 0444, proc_dostring, false);

5086

/* &table[13] is terminator */

5091

/* &table[13] is terminator */

5087

5092

5088

return table;

5093

return table;

5089

}

5094

}

5090

5095

5091

static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)

5096

static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)

5092

{

5097

{

5093

struct ctl_table *entry, *table;

5098

struct ctl_table *entry, *table;

5094

struct sched_domain *sd;

5099

struct sched_domain *sd;

5095

int domain_num = 0, i;

5100

int domain_num = 0, i;

5096

char buf[32];

5101

char buf[32];

5097

5102

5098

for_each_domain(cpu, sd)

5103

for_each_domain(cpu, sd)

5099

domain_num++;

5104

domain_num++;

5100

entry = table = sd_alloc_ctl_entry(domain_num + 1);

5105

entry = table = sd_alloc_ctl_entry(domain_num + 1);

5101

if (table == NULL)

5106

if (table == NULL)

5102

return NULL;

5107

return NULL;

5103

5108

5104

i = 0;

5109

i = 0;

5105

for_each_domain(cpu, sd) {

5110

for_each_domain(cpu, sd) {

5106

snprintf(buf, 32, "domain%d", i);

5111

snprintf(buf, 32, "domain%d", i);

5107

entry->procname = kstrdup(buf, GFP_KERNEL);

5112

entry->procname = kstrdup(buf, GFP_KERNEL);

5108

entry->mode = 0555;

5113

entry->mode = 0555;

5109

entry->child = sd_alloc_ctl_domain_table(sd);

5114

entry->child = sd_alloc_ctl_domain_table(sd);

5110

entry++;

5115

entry++;

5111

i++;

5116

i++;

5112

}

5117

}

5113

return table;

5118

return table;

5114

}

5119

}

5115

5120

5116

static struct ctl_table_header *sd_sysctl_header;

5121

static struct ctl_table_header *sd_sysctl_header;

5117

static void register_sched_domain_sysctl(void)

5122

static void register_sched_domain_sysctl(void)

5118

{

5123

{

5119

int i, cpu_num = num_possible_cpus();

5124

int i, cpu_num = num_possible_cpus();

5120

struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);

5125

struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);

5121

char buf[32];

5126

char buf[32];

5122

5127

5123

WARN_ON(sd_ctl_dir[0].child);

5128

WARN_ON(sd_ctl_dir[0].child);

5124

sd_ctl_dir[0].child = entry;

5129

sd_ctl_dir[0].child = entry;

5125

5130

5126

if (entry == NULL)

5131

if (entry == NULL)

5127

return;

5132

return;

5128

5133

5129

for_each_possible_cpu(i) {

5134

for_each_possible_cpu(i) {

5130

snprintf(buf, 32, "cpu%d", i);

5135

snprintf(buf, 32, "cpu%d", i);

5131

entry->procname = kstrdup(buf, GFP_KERNEL);

5136

entry->procname = kstrdup(buf, GFP_KERNEL);

5132

entry->mode = 0555;

5137

entry->mode = 0555;

5133

entry->child = sd_alloc_ctl_cpu_table(i);

5138

entry->child = sd_alloc_ctl_cpu_table(i);

5134

entry++;

5139

entry++;

5135

}

5140

}

5136

5141

5137

WARN_ON(sd_sysctl_header);

5142

WARN_ON(sd_sysctl_header);

5138

sd_sysctl_header = register_sysctl_table(sd_ctl_root);

5143

sd_sysctl_header = register_sysctl_table(sd_ctl_root);

5139

}

5144

}

5140

5145

5141

/* may be called multiple times per register */

5146

/* may be called multiple times per register */

5142

static void unregister_sched_domain_sysctl(void)

5147

static void unregister_sched_domain_sysctl(void)

5143

{

5148

{

5144

if (sd_sysctl_header)

5149

if (sd_sysctl_header)

5145

unregister_sysctl_table(sd_sysctl_header);

5150

unregister_sysctl_table(sd_sysctl_header);

5146

sd_sysctl_header = NULL;

5151

sd_sysctl_header = NULL;

5147

if (sd_ctl_dir[0].child)

5152

if (sd_ctl_dir[0].child)

5148

sd_free_ctl_entry(&sd_ctl_dir[0].child);

5153

sd_free_ctl_entry(&sd_ctl_dir[0].child);

5149

}

5154

}

5150

#else

5155

#else

5151

/*
 * Empty stubs used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both
 * enabled (this is the #else branch of that condition).
 */
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}

5157

#endif

5162

#endif

5158

5163

5159

static void set_rq_online(struct rq *rq)

5164

static void set_rq_online(struct rq *rq)

5160

{

5165

{

5161

if (!rq->online) {

5166

if (!rq->online) {

5162

const struct sched_class *class;

5167

const struct sched_class *class;

5163

5168

5164

cpumask_set_cpu(rq->cpu, rq->rd->online);

5169

cpumask_set_cpu(rq->cpu, rq->rd->online);

5165

rq->online = 1;

5170

rq->online = 1;

5166

5171

5167

for_each_class(class) {

5172

for_each_class(class) {

5168

if (class->rq_online)

5173

if (class->rq_online)

5169

class->rq_online(rq);

5174

class->rq_online(rq);

5170

}

5175

}

5171

}

5176

}

5172

}

5177

}

5173

5178

5174

static void set_rq_offline(struct rq *rq)

5179

static void set_rq_offline(struct rq *rq)

5175

{

5180

{

5176

if (rq->online) {

5181

if (rq->online) {

5177

const struct sched_class *class;

5182

const struct sched_class *class;

5178

5183

5179

for_each_class(class) {

5184

for_each_class(class) {

5180

if (class->rq_offline)

5185

if (class->rq_offline)

5181

class->rq_offline(rq);

5186

class->rq_offline(rq);

5182

}

5187

}

5183

5188

5184

cpumask_clear_cpu(rq->cpu, rq->rd->online);

5189

cpumask_clear_cpu(rq->cpu, rq->rd->online);

5185

rq->online = 0;

5190

rq->online = 0;

5186

}

5191

}

5187

}

5192

}

5188

5193

5189

/*
 * migration_call - CPU hotplug notifier callback for the scheduler.
 *
 * Handles CPU_UP_PREPARE and CPU_ONLINE when a CPU is added and, with
 * CONFIG_HOTPLUG_CPU, CPU_DYING and CPU_DEAD when one is removed. The
 * CPU_TASKS_FROZEN bit is masked off before dispatching.
 *
 * @nfb:    notifier block (unused here)
 * @action: hotplug event
 * @hcpu:   CPU number encoded in a pointer
 *
 * Always returns NOTIFY_OK.
 */
static int
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;
	unsigned long flags;
	struct rq *rq = cpu_rq(cpu);

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
		rq->calc_load_update = calc_load_update;
		break;

	case CPU_ONLINE:
		/* Update our root-domain */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

			set_rq_online(rq);
		}
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		break;

#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DYING:
		sched_ttwu_pending();
		/* Update our root-domain */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_offline(rq);
		}
		/* Runs with rq->lock held, as migrate_tasks() expects. */
		migrate_tasks(cpu);
		BUG_ON(rq->nr_running != 1); /* the migration thread */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
		break;

	case CPU_DEAD:
		calc_load_migrate(rq);
		break;
#endif
	}

	/* Runs for every event, including ones not handled above. */
	update_max_interval();

	return NOTIFY_OK;
}

5241

5246

5242

/*
 * Register at high priority so that task migration (migrate_tasks(),
 * called from migration_call()) happens before everything else. This has
 * to be lower priority than the notifier in the perf_event subsystem,
 * though.
 */
static struct notifier_block migration_notifier = {
	.notifier_call = migration_call,
	.priority = CPU_PRI_MIGRATION,
};

5251

5256

5252

/* Stamp the current CPU's runqueue age_stamp with that CPU's sched clock. */
static void __cpuinit set_cpu_rq_start_time(void)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);

	this_rq->age_stamp = sched_clock_cpu(this_cpu);
}

5258

5263

5259

static int sched_cpu_active(struct notifier_block *nfb,

5264

static int sched_cpu_active(struct notifier_block *nfb,

5260

unsigned long action, void *hcpu)

5265

unsigned long action, void *hcpu)

5261

{

5266

{

5262

switch (action & ~CPU_TASKS_FROZEN) {

5267

switch (action & ~CPU_TASKS_FROZEN) {

5263

case CPU_STARTING:

5268

case CPU_STARTING:

5264

set_cpu_rq_start_time();

5269

set_cpu_rq_start_time();

5265

return NOTIFY_OK;

5270

return NOTIFY_OK;

5266

case CPU_DOWN_FAILED:

5271

case CPU_DOWN_FAILED:

5267

set_cpu_active((long)hcpu, true);

5272

set_cpu_active((long)hcpu, true);

5268

return NOTIFY_OK;

5273

return NOTIFY_OK;

5269

default:

5274

default:

5270

return NOTIFY_DONE;

5275

return NOTIFY_DONE;

5271

}

5276

}

5272

}

5277

}

5273

5278

5274

static int sched_cpu_inactive(struct notifier_block *nfb,

5279

static int sched_cpu_inactive(struct notifier_block *nfb,

5275

unsigned long action, void *hcpu)

5280

unsigned long action, void *hcpu)

5276

{

5281

{

5277

unsigned long flags;

5282

unsigned long flags;

5278

long cpu = (long)hcpu;

5283

long cpu = (long)hcpu;

5279

struct dl_bw *dl_b;

5284

struct dl_bw *dl_b;

5280

5285

5281

switch (action & ~CPU_TASKS_FROZEN) {

5286

switch (action & ~CPU_TASKS_FROZEN) {

5282

case CPU_DOWN_PREPARE:

5287

case CPU_DOWN_PREPARE:

5283

set_cpu_active(cpu, false);

5288

set_cpu_active(cpu, false);

5284

5289

5285

/* explicitly allow suspend */

5290

/* explicitly allow suspend */

5286

if (!(action & CPU_TASKS_FROZEN)) {

5291

if (!(action & CPU_TASKS_FROZEN)) {

5287

bool overflow;

5292

bool overflow;

5288

int cpus;

5293

int cpus;

5289

5294

5290

rcu_read_lock_sched();

5295

rcu_read_lock_sched();

5291

dl_b = dl_bw_of(cpu);

5296

dl_b = dl_bw_of(cpu);

5292

5297

5293

raw_spin_lock_irqsave(&dl_b->lock, flags);

5298

raw_spin_lock_irqsave(&dl_b->lock, flags);

5294

cpus = dl_bw_cpus(cpu);

5299

cpus = dl_bw_cpus(cpu);

5295

overflow = __dl_overflow(dl_b, cpus, 0, 0);

5300

overflow = __dl_overflow(dl_b, cpus, 0, 0);

5296

raw_spin_unlock_irqrestore(&dl_b->lock, flags);

5301

raw_spin_unlock_irqrestore(&dl_b->lock, flags);

5297

5302

5298

rcu_read_unlock_sched();

5303

rcu_read_unlock_sched();

5299

5304

5300

if (overflow)

5305

if (overflow)

5301

return notifier_from_errno(-EBUSY);

5306

return notifier_from_errno(-EBUSY);

5302

}

5307

}

5303

return NOTIFY_OK;

5308

return NOTIFY_OK;

5304

}

5309

}

5305

5310

5306

return NOTIFY_DONE;

5311

return NOTIFY_DONE;

5307

}

5312

}

5308

5313

5309

/*
 * Early init: replay CPU_UP_PREPARE and CPU_ONLINE by hand for the CPU we
 * are running on, then register the hotplug notifiers. Always returns 0.
 */
static int __init migration_init(void)
{
	void *boot_cpu = (void *)(long)smp_processor_id();
	int ret;

	/* Initialize migration for the boot CPU */
	ret = migration_call(&migration_notifier, CPU_UP_PREPARE, boot_cpu);
	BUG_ON(ret == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, boot_cpu);
	register_cpu_notifier(&migration_notifier);

	/* Register cpu active notifiers */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
early_initcall(migration_init);

5327

#endif

5332

#endif

5328

5333

5329

#ifdef CONFIG_SMP

5334

#ifdef CONFIG_SMP

5330

5335

5331

/*
 * Scratch cpumask; sched_domain_debug() passes it to
 * sched_domain_debug_one() as the group mask.
 * NOTE(review): the original annotation names sched_domains_mutex,
 * presumably the lock protecting it - confirm against callers.
 */
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */

5332

5337

5333

#ifdef CONFIG_SCHED_DEBUG

5338

#ifdef CONFIG_SCHED_DEBUG

5334

5339

5335

/* Non-zero once "sched_debug" was seen as an early parameter. */
static __read_mostly int sched_debug_enabled;

/*
 * early_param handler for "sched_debug": turns the flag on. The parameter
 * value in @str is ignored. Always returns 0.
 */
static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;

	return 0;
}
early_param("sched_debug", sched_debug_setup);

5344

5349

5345

static inline bool sched_debug(void)

5350

static inline bool sched_debug(void)

5346

{

5351

{

5347

return sched_debug_enabled;

5352

return sched_debug_enabled;

5348

}

5353

}

5349

5354

5350

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

5355

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

5351

struct cpumask *groupmask)

5356

struct cpumask *groupmask)

5352

{

5357

{

5353

struct sched_group *group = sd->groups;

5358

struct sched_group *group = sd->groups;

5354

char str[256];

5359

char str[256];

5355

5360

5356

cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));

5361

cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));

5357

cpumask_clear(groupmask);

5362

cpumask_clear(groupmask);

5358

5363

5359

printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

5364

printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

5360

5365

5361

if (!(sd->flags & SD_LOAD_BALANCE)) {

5366

if (!(sd->flags & SD_LOAD_BALANCE)) {

5362

printk("does not load-balance\n");

5367

printk("does not load-balance\n");

5363

if (sd->parent)

5368

if (sd->parent)

5364

printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"

5369

printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"

5365

" has parent");

5370

" has parent");

5366

return -1;

5371

return -1;

5367

}

5372

}

5368

5373

5369

printk(KERN_CONT "span %s level %s\n", str, sd->name);

5374

printk(KERN_CONT "span %s level %s\n", str, sd->name);

5370

5375

5371

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {

5376

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {

5372

printk(KERN_ERR "ERROR: domain->span does not contain "

5377

printk(KERN_ERR "ERROR: domain->span does not contain "

5373

"CPU%d\n", cpu);

5378

"CPU%d\n", cpu);

5374

}

5379

}

5375

if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {

5380

if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {

5376

printk(KERN_ERR "ERROR: domain->groups does not contain"

5381

printk(KERN_ERR "ERROR: domain->groups does not contain"

5377

" CPU%d\n", cpu);

5382

" CPU%d\n", cpu);

5378

}

5383

}

5379

5384

5380

printk(KERN_DEBUG "%*s groups:", level + 1, "");

5385

printk(KERN_DEBUG "%*s groups:", level + 1, "");

5381

do {

5386

do {

5382

if (!group) {

5387

if (!group) {

5383

printk("\n");

5388

printk("\n");

5384

printk(KERN_ERR "ERROR: group is NULL\n");

5389

printk(KERN_ERR "ERROR: group is NULL\n");

5385

break;

5390

break;

5386

}

5391

}

5387

5392

5388

/*

5393

/*

5389

* Even though we initialize ->capacity to something semi-sane,

5394

* Even though we initialize ->capacity to something semi-sane,

5390

* we leave capacity_orig unset. This allows us to detect if

5395

* we leave capacity_orig unset. This allows us to detect if

5391

* domain iteration is still funny without causing /0 traps.

5396

* domain iteration is still funny without causing /0 traps.

5392

*/

5397

*/

5393

if (!group->sgc->capacity_orig) {

5398

if (!group->sgc->capacity_orig) {

5394

printk(KERN_CONT "\n");

5399

printk(KERN_CONT "\n");

5395

printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");

5400

printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");

5396

break;

5401

break;

5397

}

5402

}

5398

5403

5399

if (!cpumask_weight(sched_group_cpus(group))) {

5404

if (!cpumask_weight(sched_group_cpus(group))) {

5400

printk(KERN_CONT "\n");

5405

printk(KERN_CONT "\n");

5401

printk(KERN_ERR "ERROR: empty group\n");

5406

printk(KERN_ERR "ERROR: empty group\n");

5402

break;

5407

break;

5403

}

5408

}

5404

5409

5405

if (!(sd->flags & SD_OVERLAP) &&

5410

if (!(sd->flags & SD_OVERLAP) &&

5406

cpumask_intersects(groupmask, sched_group_cpus(group))) {

5411

cpumask_intersects(groupmask, sched_group_cpus(group))) {

5407

printk(KERN_CONT "\n");

5412

printk(KERN_CONT "\n");

5408

printk(KERN_ERR "ERROR: repeated CPUs\n");

5413

printk(KERN_ERR "ERROR: repeated CPUs\n");

5409

break;

5414

break;

5410

}

5415

}

5411

5416

5412

cpumask_or(groupmask, groupmask, sched_group_cpus(group));

5417

cpumask_or(groupmask, groupmask, sched_group_cpus(group));

5413

5418

5414

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

5419

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

5415

5420

5416

printk(KERN_CONT " %s", str);

5421

printk(KERN_CONT " %s", str);

5417

if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {

5422

if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {

5418

printk(KERN_CONT " (cpu_capacity = %d)",

5423

printk(KERN_CONT " (cpu_capacity = %d)",

5419

group->sgc->capacity);

5424

group->sgc->capacity);

5420

}

5425

}

5421

5426

5422

group = group->next;

5427

group = group->next;

5423

} while (group != sd->groups);

5428

} while (group != sd->groups);

5424

printk(KERN_CONT "\n");

5429

printk(KERN_CONT "\n");

5425

5430

5426

if (!cpumask_equal(sched_domain_span(sd), groupmask))

5431

if (!cpumask_equal(sched_domain_span(sd), groupmask))

5427

printk(KERN_ERR "ERROR: groups don't span domain->span\n");

5432

printk(KERN_ERR "ERROR: groups don't span domain->span\n");

5428

5433

5429

if (sd->parent &&

5434

if (sd->parent &&

5430

!cpumask_subset(groupmask, sched_domain_span(sd->parent)))

5435

!cpumask_subset(groupmask, sched_domain_span(sd->parent)))

5431

printk(KERN_ERR "ERROR: parent span is not a superset "

5436

printk(KERN_ERR "ERROR: parent span is not a superset "

5432

"of domain->span\n");

5437

"of domain->span\n");

5433

return 0;

5438

return 0;

5434

}

5439

}

5435

5440

5436

static void sched_domain_debug(struct sched_domain *sd, int cpu)

5441

static void sched_domain_debug(struct sched_domain *sd, int cpu)

5437

{

5442

{

5438

int level = 0;

5443

int level = 0;

5439

5444

5440

if (!sched_debug_enabled)

5445

if (!sched_debug_enabled)

5441

return;

5446

return;

5442

5447

5443

if (!sd) {

5448

if (!sd) {

5444

printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);

5449

printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);

5445

return;

5450

return;

5446

}

5451

}

5447

5452

5448

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

5453

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

5449

5454

5450

for (;;) {

5455

for (;;) {

5451

if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))

5456

if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))

5452

break;

5457

break;

5453

level++;

5458

level++;

5454

sd = sd->parent;

5459

sd = sd->parent;

5455

if (!sd)

5460

if (!sd)

5456

break;

5461

break;

5457

}

5462

}

5458

}

5463

}

5459

#else /* !CONFIG_SCHED_DEBUG */

5464

#else /* !CONFIG_SCHED_DEBUG */

5460

/* !CONFIG_SCHED_DEBUG: sched_domain_debug() expands to an empty statement. */
# define sched_domain_debug(sd, cpu) do { } while (0)
/* !CONFIG_SCHED_DEBUG: debugging is never enabled. */
static inline bool sched_debug(void)
{
	return false;
}

5465

#endif /* CONFIG_SCHED_DEBUG */

5470

#endif /* CONFIG_SCHED_DEBUG */

5466

5471

5467

static int sd_degenerate(struct sched_domain *sd)

5472

static int sd_degenerate(struct sched_domain *sd)

5468

{

5473

{

5469

if (cpumask_weight(sched_domain_span(sd)) == 1)

5474

if (cpumask_weight(sched_domain_span(sd)) == 1)

5470

return 1;

5475

return 1;

5471

5476

5472

/* Following flags need at least 2 groups */

5477

/* Following flags need at least 2 groups */

5473

if (sd->flags & (SD_LOAD_BALANCE |

5478

if (sd->flags & (SD_LOAD_BALANCE |

5474

SD_BALANCE_NEWIDLE |

5479

SD_BALANCE_NEWIDLE |

5475

SD_BALANCE_FORK |

5480

SD_BALANCE_FORK |

5476

SD_BALANCE_EXEC |

5481

SD_BALANCE_EXEC |

5477

SD_SHARE_CPUCAPACITY |

5482

SD_SHARE_CPUCAPACITY |

5478

SD_SHARE_PKG_RESOURCES |

5483

SD_SHARE_PKG_RESOURCES |

5479

SD_SHARE_POWERDOMAIN)) {

5484

SD_SHARE_POWERDOMAIN)) {

5480

if (sd->groups != sd->groups->next)

5485

if (sd->groups != sd->groups->next)

5481

return 0;

5486

return 0;

5482

}

5487

}

5483

5488

5484

/* Following flags don't use groups */

5489

/* Following flags don't use groups */

5485

if (sd->flags & (SD_WAKE_AFFINE))

5490

if (sd->flags & (SD_WAKE_AFFINE))

5486

return 0;

5491

return 0;

5487

5492

5488

return 1;

5493

return 1;

5489

}

5494

}

5490

5495

5491

static int

5496

static int

5492

sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

5497

sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

5493

{

5498

{

5494

unsigned long cflags = sd->flags, pflags = parent->flags;

5499

unsigned long cflags = sd->flags, pflags = parent->flags;

5495

5500

5496

if (sd_degenerate(parent))

5501

if (sd_degenerate(parent))

5497

return 1;

5502

return 1;

5498

5503

5499

if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))

5504

if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))

5500

return 0;

5505

return 0;

5501

5506

5502

/* Flags needing groups don't count if only 1 group in parent */

5507

/* Flags needing groups don't count if only 1 group in parent */

5503

if (parent->groups == parent->groups->next) {

5508

if (parent->groups == parent->groups->next) {

5504

pflags &= ~(SD_LOAD_BALANCE |

5509

pflags &= ~(SD_LOAD_BALANCE |

5505

SD_BALANCE_NEWIDLE |

5510

SD_BALANCE_NEWIDLE |

5506

SD_BALANCE_FORK |

5511

SD_BALANCE_FORK |

5507

SD_BALANCE_EXEC |

5512

SD_BALANCE_EXEC |

5508

SD_SHARE_CPUCAPACITY |

5513

SD_SHARE_CPUCAPACITY |

5509

SD_SHARE_PKG_RESOURCES |

5514

SD_SHARE_PKG_RESOURCES |

5510

SD_PREFER_SIBLING |

5515

SD_PREFER_SIBLING |

5511

SD_SHARE_POWERDOMAIN);

5516

SD_SHARE_POWERDOMAIN);

5512

if (nr_node_ids == 1)

5517

if (nr_node_ids == 1)

5513

pflags &= ~SD_SERIALIZE;

5518

pflags &= ~SD_SERIALIZE;

5514

}

5519

}

5515

if (~cflags & pflags)

5520

if (~cflags & pflags)

5516

return 0;

5521

return 0;

5517

5522

5518

return 1;

5523

return 1;

5519

}

5524

}

5520

5525

5521

static void free_rootdomain(struct rcu_head *rcu)

5526

static void free_rootdomain(struct rcu_head *rcu)

5522

{

5527

{

5523

struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

5528

struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

5524

5529

5525

cpupri_cleanup(&rd->cpupri);

5530

cpupri_cleanup(&rd->cpupri);

5526

cpudl_cleanup(&rd->cpudl);

5531

cpudl_cleanup(&rd->cpudl);

5527

free_cpumask_var(rd->dlo_mask);

5532

free_cpumask_var(rd->dlo_mask);

5528

free_cpumask_var(rd->rto_mask);

5533

free_cpumask_var(rd->rto_mask);

5529

free_cpumask_var(rd->online);

5534

free_cpumask_var(rd->online);

5530

free_cpumask_var(rd->span);

5535

free_cpumask_var(rd->span);

5531

kfree(rd);

5536

kfree(rd);

5532

}

5537

}

5533

5538

5534

static void rq_attach_root(struct rq *rq, struct root_domain *rd)

5539

static void rq_attach_root(struct rq *rq, struct root_domain *rd)

5535

{

5540

{

5536

struct root_domain *old_rd = NULL;

5541

struct root_domain *old_rd = NULL;

5537

unsigned long flags;

5542

unsigned long flags;

5538

5543

5539

raw_spin_lock_irqsave(&rq->lock, flags);

5544

raw_spin_lock_irqsave(&rq->lock, flags);

5540

5545

5541

if (rq->rd) {

5546

if (rq->rd) {

5542

old_rd = rq->rd;

5547

old_rd = rq->rd;

5543

5548

5544

if (cpumask_test_cpu(rq->cpu, old_rd->online))

5549

if (cpumask_test_cpu(rq->cpu, old_rd->online))

5545

set_rq_offline(rq);

5550

set_rq_offline(rq);

5546

5551

5547

cpumask_clear_cpu(rq->cpu, old_rd->span);

5552

cpumask_clear_cpu(rq->cpu, old_rd->span);

5548

5553

5549

/*

5554

/*

5550

* If we dont want to free the old_rd yet then

5555

* If we dont want to free the old_rd yet then

5551

* set old_rd to NULL to skip the freeing later

5556

* set old_rd to NULL to skip the freeing later

5552

* in this function:

5557

* in this function:

5553

*/

5558

*/

5554

if (!atomic_dec_and_test(&old_rd->refcount))

5559

if (!atomic_dec_and_test(&old_rd->refcount))

5555

old_rd = NULL;

5560

old_rd = NULL;

5556

}

5561

}

5557

5562

5558

atomic_inc(&rd->refcount);

5563

atomic_inc(&rd->refcount);

5559

rq->rd = rd;

5564

rq->rd = rd;

5560

5565

5561

cpumask_set_cpu(rq->cpu, rd->span);

5566

cpumask_set_cpu(rq->cpu, rd->span);

5562

if (cpumask_test_cpu(rq->cpu, cpu_active_mask))

5567

if (cpumask_test_cpu(rq->cpu, cpu_active_mask))

5563

set_rq_online(rq);

5568

set_rq_online(rq);

5564

5569

5565

raw_spin_unlock_irqrestore(&rq->lock, flags);

5570

raw_spin_unlock_irqrestore(&rq->lock, flags);

5566

5571

5567

if (old_rd)

5572

if (old_rd)

5568

call_rcu_sched(&old_rd->rcu, free_rootdomain);

5573

call_rcu_sched(&old_rd->rcu, free_rootdomain);

5569

}

5574

}

5570

5575

5571

static int init_rootdomain(struct root_domain *rd)

5576

static int init_rootdomain(struct root_domain *rd)

5572

{

5577

{

5573

memset(rd, 0, sizeof(*rd));

5578

memset(rd, 0, sizeof(*rd));

5574

5579

5575

if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))

5580

if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))

5576

goto out;

5581

goto out;

5577

if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))

5582

if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))

5578

goto free_span;

5583

goto free_span;

5579

if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))

5584

if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))

5580

goto free_online;

5585

goto free_online;

5581

if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))

5586

if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))

5582

goto free_dlo_mask;

5587

goto free_dlo_mask;

5583

5588

5584

init_dl_bw(&rd->dl_bw);

5589

init_dl_bw(&rd->dl_bw);

5585

if (cpudl_init(&rd->cpudl) != 0)

5590

if (cpudl_init(&rd->cpudl) != 0)

5586

goto free_dlo_mask;

5591

goto free_dlo_mask;

5587

5592

5588

if (cpupri_init(&rd->cpupri) != 0)

5593

if (cpupri_init(&rd->cpupri) != 0)

5589

goto free_rto_mask;

5594

goto free_rto_mask;

5590

return 0;

5595

return 0;

5591

5596

5592

free_rto_mask:

5597

free_rto_mask:

5593

free_cpumask_var(rd->rto_mask);

5598

free_cpumask_var(rd->rto_mask);

5594

free_dlo_mask:

5599

free_dlo_mask:

5595

free_cpumask_var(rd->dlo_mask);

5600

free_cpumask_var(rd->dlo_mask);

5596

free_online:

5601

free_online:

5597

free_cpumask_var(rd->online);

5602

free_cpumask_var(rd->online);

5598

free_span:

5603

free_span:

5599

free_cpumask_var(rd->span);

5604

free_cpumask_var(rd->span);

5600

out:

5605

out:

5601

return -ENOMEM;

5606

return -ENOMEM;

5602

}

5607

}

5603

5608

5604

/*

5609

/*

5605

* By default the system creates a single root-domain with all cpus as

5610

* By default the system creates a single root-domain with all cpus as

5606

* members (mimicking the global state we have today).

5611

* members (mimicking the global state we have today).

5607

*/

5612

*/

5608

struct root_domain def_root_domain;

5613

struct root_domain def_root_domain;

5609

5614

5610

static void init_defrootdomain(void)

5615

static void init_defrootdomain(void)

5611

{

5616

{

5612

init_rootdomain(&def_root_domain);

5617

init_rootdomain(&def_root_domain);

5613

5618

5614

atomic_set(&def_root_domain.refcount, 1);

5619

atomic_set(&def_root_domain.refcount, 1);

5615

}

5620

}

5616

5621

5617

static struct root_domain *alloc_rootdomain(void)

5622

static struct root_domain *alloc_rootdomain(void)

5618

{

5623

{

5619

struct root_domain *rd;

5624

struct root_domain *rd;

5620

5625

5621

rd = kmalloc(sizeof(*rd), GFP_KERNEL);

5626

rd = kmalloc(sizeof(*rd), GFP_KERNEL);

5622

if (!rd)

5627

if (!rd)

5623

return NULL;

5628

return NULL;

5624

5629

5625

if (init_rootdomain(rd) != 0) {

5630

if (init_rootdomain(rd) != 0) {

5626

kfree(rd);

5631

kfree(rd);

5627

return NULL;

5632

return NULL;

5628

}

5633

}

5629

5634

5630

return rd;

5635

return rd;

5631

}

5636

}

5632

5637

5633

static void free_sched_groups(struct sched_group *sg, int free_sgc)

5638

static void free_sched_groups(struct sched_group *sg, int free_sgc)

5634

{

5639

{

5635

struct sched_group *tmp, *first;

5640

struct sched_group *tmp, *first;

5636

5641

5637

if (!sg)

5642

if (!sg)

5638

return;

5643

return;

5639

5644

5640

first = sg;

5645

first = sg;

5641

do {

5646

do {

5642

tmp = sg->next;

5647

tmp = sg->next;

5643

5648

5644

if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))

5649

if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))

5645

kfree(sg->sgc);

5650

kfree(sg->sgc);

5646

5651

5647

kfree(sg);

5652

kfree(sg);

5648

sg = tmp;

5653

sg = tmp;

5649

} while (sg != first);

5654

} while (sg != first);

5650

}

5655

}

5651

5656

5652

static void free_sched_domain(struct rcu_head *rcu)

5657

static void free_sched_domain(struct rcu_head *rcu)

5653

{

5658

{

5654

struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

5659

struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

5655

5660

5656

/*

5661

/*

5657

* If its an overlapping domain it has private groups, iterate and

5662

* If its an overlapping domain it has private groups, iterate and

5658

* nuke them all.

5663

* nuke them all.

5659

*/

5664

*/

5660

if (sd->flags & SD_OVERLAP) {

5665

if (sd->flags & SD_OVERLAP) {

5661

free_sched_groups(sd->groups, 1);

5666

free_sched_groups(sd->groups, 1);

5662

} else if (atomic_dec_and_test(&sd->groups->ref)) {

5667

} else if (atomic_dec_and_test(&sd->groups->ref)) {

5663

kfree(sd->groups->sgc);

5668

kfree(sd->groups->sgc);

5664

kfree(sd->groups);

5669

kfree(sd->groups);

5665

}

5670

}

5666

kfree(sd);

5671

kfree(sd);

5667

}

5672

}

5668

5673

5669

static void destroy_sched_domain(struct sched_domain *sd, int cpu)

5674

static void destroy_sched_domain(struct sched_domain *sd, int cpu)

5670

{

5675

{

5671

call_rcu(&sd->rcu, free_sched_domain);

5676

call_rcu(&sd->rcu, free_sched_domain);

5672

}

5677

}

5673

5678

5674

static void destroy_sched_domains(struct sched_domain *sd, int cpu)

5679

static void destroy_sched_domains(struct sched_domain *sd, int cpu)

5675

{

5680

{

5676

for (; sd; sd = sd->parent)

5681

for (; sd; sd = sd->parent)

5677

destroy_sched_domain(sd, cpu);

5682

destroy_sched_domain(sd, cpu);

5678

}

5683

}

5679

5684

5680

/*

5685

/*

5681

* Keep a special pointer to the highest sched_domain that has

5686

* Keep a special pointer to the highest sched_domain that has

5682

* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this

5687

* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this

5683

* allows us to avoid some pointer chasing select_idle_sibling().

5688

* allows us to avoid some pointer chasing select_idle_sibling().

5684

*

5689

*

5685

* Also keep a unique ID per domain (we use the first cpu number in

5690

* Also keep a unique ID per domain (we use the first cpu number in

5686

* the cpumask of the domain), this allows us to quickly tell if

5691

* the cpumask of the domain), this allows us to quickly tell if

5687

* two cpus are in the same cache domain, see cpus_share_cache().

5692

* two cpus are in the same cache domain, see cpus_share_cache().

5688

*/

5693

*/

5689

DEFINE_PER_CPU(struct sched_domain *, sd_llc);

5694

DEFINE_PER_CPU(struct sched_domain *, sd_llc);

5690

DEFINE_PER_CPU(int, sd_llc_size);

5695

DEFINE_PER_CPU(int, sd_llc_size);

5691

DEFINE_PER_CPU(int, sd_llc_id);

5696

DEFINE_PER_CPU(int, sd_llc_id);

5692

DEFINE_PER_CPU(struct sched_domain *, sd_numa);

5697

DEFINE_PER_CPU(struct sched_domain *, sd_numa);

5693

DEFINE_PER_CPU(struct sched_domain *, sd_busy);

5698

DEFINE_PER_CPU(struct sched_domain *, sd_busy);

5694

DEFINE_PER_CPU(struct sched_domain *, sd_asym);

5699

DEFINE_PER_CPU(struct sched_domain *, sd_asym);

5695

5700

5696

static void update_top_cache_domain(int cpu)

5701

static void update_top_cache_domain(int cpu)

5697

{

5702

{

5698

struct sched_domain *sd;

5703

struct sched_domain *sd;

5699

struct sched_domain *busy_sd = NULL;

5704

struct sched_domain *busy_sd = NULL;

5700

int id = cpu;

5705

int id = cpu;

5701

int size = 1;

5706

int size = 1;

5702

5707

5703

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);

5708

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);

5704

if (sd) {

5709

if (sd) {

5705

id = cpumask_first(sched_domain_span(sd));

5710

id = cpumask_first(sched_domain_span(sd));

5706

size = cpumask_weight(sched_domain_span(sd));

5711

size = cpumask_weight(sched_domain_span(sd));

5707

busy_sd = sd->parent; /* sd_busy */

5712

busy_sd = sd->parent; /* sd_busy */

5708

}

5713

}

5709

rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

5714

rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

5710

5715

5711

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);

5716

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);

5712

per_cpu(sd_llc_size, cpu) = size;

5717

per_cpu(sd_llc_size, cpu) = size;

5713

per_cpu(sd_llc_id, cpu) = id;

5718

per_cpu(sd_llc_id, cpu) = id;

5714

5719

5715

sd = lowest_flag_domain(cpu, SD_NUMA);

5720

sd = lowest_flag_domain(cpu, SD_NUMA);

5716

rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

5721

rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

5717

5722

5718

sd = highest_flag_domain(cpu, SD_ASYM_PACKING);

5723

sd = highest_flag_domain(cpu, SD_ASYM_PACKING);

5719

rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

5724

rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

5720

}

5725

}

5721

5726

5722

/*

5727

/*

5723

* Attach the domain 'sd' to 'cpu' as its base domain. Callers must

5728

* Attach the domain 'sd' to 'cpu' as its base domain. Callers must

5724

* hold the hotplug lock.

5729

* hold the hotplug lock.

5725

*/

5730

*/

5726

static void

5731

static void

5727

cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

5732

cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

5728

{

5733

{

5729

struct rq *rq = cpu_rq(cpu);

5734

struct rq *rq = cpu_rq(cpu);

5730

struct sched_domain *tmp;

5735

struct sched_domain *tmp;

5731

5736

5732

/* Remove the sched domains which do not contribute to scheduling. */

5737

/* Remove the sched domains which do not contribute to scheduling. */

5733

for (tmp = sd; tmp; ) {

5738

for (tmp = sd; tmp; ) {

5734

struct sched_domain *parent = tmp->parent;

5739

struct sched_domain *parent = tmp->parent;

5735

if (!parent)

5740

if (!parent)

5736

break;

5741

break;

5737

5742

5738

if (sd_parent_degenerate(tmp, parent)) {

5743

if (sd_parent_degenerate(tmp, parent)) {

5739

tmp->parent = parent->parent;

5744

tmp->parent = parent->parent;

5740

if (parent->parent)

5745

if (parent->parent)

5741

parent->parent->child = tmp;

5746

parent->parent->child = tmp;

5742

/*

5747

/*

5743

* Transfer SD_PREFER_SIBLING down in case of a

5748

* Transfer SD_PREFER_SIBLING down in case of a

5744

* degenerate parent; the spans match for this

5749

* degenerate parent; the spans match for this

5745

* so the property transfers.

5750

* so the property transfers.

5746

*/

5751

*/

5747

if (parent->flags & SD_PREFER_SIBLING)

5752

if (parent->flags & SD_PREFER_SIBLING)

5748

tmp->flags |= SD_PREFER_SIBLING;

5753

tmp->flags |= SD_PREFER_SIBLING;

5749

destroy_sched_domain(parent, cpu);

5754

destroy_sched_domain(parent, cpu);

5750

} else

5755

} else

5751

tmp = tmp->parent;

5756

tmp = tmp->parent;

5752

}

5757

}

5753

5758

5754

if (sd && sd_degenerate(sd)) {

5759

if (sd && sd_degenerate(sd)) {

5755

tmp = sd;

5760

tmp = sd;

5756

sd = sd->parent;

5761

sd = sd->parent;

5757

destroy_sched_domain(tmp, cpu);

5762

destroy_sched_domain(tmp, cpu);

5758

if (sd)

5763

if (sd)

5759

sd->child = NULL;

5764

sd->child = NULL;

5760

}

5765

}

5761

5766

5762

sched_domain_debug(sd, cpu);

5767

sched_domain_debug(sd, cpu);

5763

5768

5764

rq_attach_root(rq, rd);

5769

rq_attach_root(rq, rd);

5765

tmp = rq->sd;

5770

tmp = rq->sd;

5766

rcu_assign_pointer(rq->sd, sd);

5771

rcu_assign_pointer(rq->sd, sd);

5767

destroy_sched_domains(tmp, cpu);

5772

destroy_sched_domains(tmp, cpu);

5768

5773

5769

update_top_cache_domain(cpu);

5774

update_top_cache_domain(cpu);

5770

}

5775

}

5771

5776

5772

/* cpus with isolated domains */

5777

/* cpus with isolated domains */

5773

static cpumask_var_t cpu_isolated_map;

5778

static cpumask_var_t cpu_isolated_map;

5774

5779

5775

/* Setup the mask of cpus configured for isolated domains */

5780

/* Setup the mask of cpus configured for isolated domains */

5776

static int __init isolated_cpu_setup(char *str)

5781

static int __init isolated_cpu_setup(char *str)

5777

{

5782

{

5778

alloc_bootmem_cpumask_var(&cpu_isolated_map);

5783

alloc_bootmem_cpumask_var(&cpu_isolated_map);

5779

cpulist_parse(str, cpu_isolated_map);

5784

cpulist_parse(str, cpu_isolated_map);

5780

return 1;

5785

return 1;

5781

}

5786

}

5782

5787

5783

__setup("isolcpus=", isolated_cpu_setup);

5788

__setup("isolcpus=", isolated_cpu_setup);

5784

5789

5785

struct s_data {

5790

struct s_data {

5786

struct sched_domain ** __percpu sd;

5791

struct sched_domain ** __percpu sd;

5787

struct root_domain *rd;

5792

struct root_domain *rd;

5788

};

5793

};

5789

5794

5790

/*
 * Allocation stages.
 * NOTE(review): presumably names how far allocation of a struct s_data
 * got, so that error paths know what to undo; confirm against the users.
 */
enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};

5796

5801

5797

/*

5802

/*

5798

* Build an iteration mask that can exclude certain CPUs from the upwards

5803

* Build an iteration mask that can exclude certain CPUs from the upwards

5799

* domain traversal.

5804

* domain traversal.

5800

*

5805

*

5801

* Asymmetric node setups can result in situations where the domain tree is of

5806

* Asymmetric node setups can result in situations where the domain tree is of

5802

* unequal depth, make sure to skip domains that already cover the entire

5807

* unequal depth, make sure to skip domains that already cover the entire

5803

* range.

5808

* range.

5804

*

5809

*

5805

* In that case build_sched_domains() will have terminated the iteration early

5810

* In that case build_sched_domains() will have terminated the iteration early

5806

* and our sibling sd spans will be empty. Domains should always include the

5811

* and our sibling sd spans will be empty. Domains should always include the

5807

* cpu they're built on, so check that.

5812

* cpu they're built on, so check that.

5808

*

5813

*

5809

*/

5814

*/

5810

static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)

5815

static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)

5811

{

5816

{

5812

const struct cpumask *span = sched_domain_span(sd);

5817

const struct cpumask *span = sched_domain_span(sd);

5813

struct sd_data *sdd = sd->private;

5818

struct sd_data *sdd = sd->private;

5814

struct sched_domain *sibling;

5819

struct sched_domain *sibling;

5815

int i;

5820

int i;

5816

5821

5817

for_each_cpu(i, span) {

5822

for_each_cpu(i, span) {

5818

sibling = *per_cpu_ptr(sdd->sd, i);

5823

sibling = *per_cpu_ptr(sdd->sd, i);

5819

if (!cpumask_test_cpu(i, sched_domain_span(sibling)))

5824

if (!cpumask_test_cpu(i, sched_domain_span(sibling)))

5820

continue;

5825

continue;

5821

5826

5822

cpumask_set_cpu(i, sched_group_mask(sg));

5827

cpumask_set_cpu(i, sched_group_mask(sg));

5823

}

5828

}

5824

}

5829

}

5825

5830

5826

/*
 * Return the canonical balance cpu for this group, this is the first cpu
 * of this group that's also in the iteration mask.
 */
int group_balance_cpu(struct sched_group *sg)
{
	const struct cpumask *members = sched_group_cpus(sg);
	const struct cpumask *iter_mask = sched_group_mask(sg);

	return cpumask_first_and(members, iter_mask);
}

5834

5839

5835

static int

5840

static int

5836

build_overlap_sched_groups(struct sched_domain *sd, int cpu)

5841

build_overlap_sched_groups(struct sched_domain *sd, int cpu)

5837

{

5842

{

5838

struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;

5843

struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;

5839

const struct cpumask *span = sched_domain_span(sd);

5844

const struct cpumask *span = sched_domain_span(sd);

5840

struct cpumask *covered = sched_domains_tmpmask;

5845

struct cpumask *covered = sched_domains_tmpmask;

5841

struct sd_data *sdd = sd->private;

5846

struct sd_data *sdd = sd->private;

5842

struct sched_domain *sibling;

5847

struct sched_domain *sibling;

5843

int i;

5848

int i;

5844

5849

5845

cpumask_clear(covered);

5850

cpumask_clear(covered);

5846

5851

5847

for_each_cpu(i, span) {

5852

for_each_cpu(i, span) {

5848

struct cpumask *sg_span;

5853

struct cpumask *sg_span;

5849

5854

5850

if (cpumask_test_cpu(i, covered))

5855

if (cpumask_test_cpu(i, covered))

5851

continue;

5856

continue;

5852

5857

5853

sibling = *per_cpu_ptr(sdd->sd, i);

5858

sibling = *per_cpu_ptr(sdd->sd, i);

5854

5859

5855

/* See the comment near build_group_mask(). */

5860

/* See the comment near build_group_mask(). */

5856

if (!cpumask_test_cpu(i, sched_domain_span(sibling)))

5861

if (!cpumask_test_cpu(i, sched_domain_span(sibling)))

5857

continue;

5862

continue;

5858

5863

5859

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

5864

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

5860

GFP_KERNEL, cpu_to_node(cpu));

5865

GFP_KERNEL, cpu_to_node(cpu));

5861

5866

5862

if (!sg)

5867

if (!sg)

5863

goto fail;

5868

goto fail;

5864

5869

5865

sg_span = sched_group_cpus(sg);

5870

sg_span = sched_group_cpus(sg);

5866

if (sibling->child)

5871

if (sibling->child)

5867

cpumask_copy(sg_span, sched_domain_span(sibling->child));

5872

cpumask_copy(sg_span, sched_domain_span(sibling->child));

5868

else

5873

else

5869

cpumask_set_cpu(i, sg_span);

5874

cpumask_set_cpu(i, sg_span);

5870

5875

5871

cpumask_or(covered, covered, sg_span);

5876

cpumask_or(covered, covered, sg_span);

5872

5877

5873

sg->sgc = *per_cpu_ptr(sdd->sgc, i);

5878

sg->sgc = *per_cpu_ptr(sdd->sgc, i);

5874

if (atomic_inc_return(&sg->sgc->ref) == 1)

5879

if (atomic_inc_return(&sg->sgc->ref) == 1)

5875

build_group_mask(sd, sg);

5880

build_group_mask(sd, sg);

5876

5881

5877

/*

5882

/*

5878

* Initialize sgc->capacity such that even if we mess up the

5883

* Initialize sgc->capacity such that even if we mess up the

5879

* domains and no possible iteration will get us here, we won't

5884

* domains and no possible iteration will get us here, we won't

5880

* die on a /0 trap.

5885

* die on a /0 trap.

5881

*/

5886

*/

5882

sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);

5887

sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);

5883

sg->sgc->capacity_orig = sg->sgc->capacity;

5888

sg->sgc->capacity_orig = sg->sgc->capacity;

5884

5889

5885

/*

5890

/*

5886

* Make sure the first group of this domain contains the

5891

* Make sure the first group of this domain contains the

5887

* canonical balance cpu. Otherwise the sched_domain iteration

5892

* canonical balance cpu. Otherwise the sched_domain iteration

5888

* breaks. See update_sg_lb_stats().

5893

* breaks. See update_sg_lb_stats().

5889

*/

5894

*/

5890

if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||

5895

if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||

5891

group_balance_cpu(sg) == cpu)

5896

group_balance_cpu(sg) == cpu)

5892

groups = sg;

5897

groups = sg;

5893

5898

5894

if (!first)

5899

if (!first)

5895

first = sg;

5900

first = sg;

5896

if (last)

5901

if (last)

5897

last->next = sg;

5902

last->next = sg;

5898

last = sg;

5903

last = sg;

5899

last->next = first;

5904

last->next = first;

5900

}

5905

}

5901

sd->groups = groups;

5906

sd->groups = groups;

5902

5907

5903

return 0;

5908

return 0;

5904

5909

5905

fail:

5910

fail:

5906

free_sched_groups(first, 0);

5911

free_sched_groups(first, 0);

5907

5912

5908

return -ENOMEM;

5913

return -ENOMEM;

5909

}

5914

}

5910

5915

5911

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)

5916

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)

5912

{

5917

{

5913

struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);

5918

struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);

5914

struct sched_domain *child = sd->child;

5919

struct sched_domain *child = sd->child;

5915

5920

5916

if (child)

5921

if (child)

5917

cpu = cpumask_first(sched_domain_span(child));

5922

cpu = cpumask_first(sched_domain_span(child));

5918

5923

5919

if (sg) {

5924

if (sg) {

5920

*sg = *per_cpu_ptr(sdd->sg, cpu);

5925

*sg = *per_cpu_ptr(sdd->sg, cpu);

5921

(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);

5926

(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);

5922

atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */

5927

atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */

5923

}

5928

}

5924

5929

5925

return cpu;

5930

return cpu;

5926

}

5931

}

5927

5932

5928

/*

5933

/*

5929

* build_sched_groups will build a circular linked list of the groups

5934

* build_sched_groups will build a circular linked list of the groups

5930

* covered by the given span, and will set each group's ->cpumask correctly,

5935

* covered by the given span, and will set each group's ->cpumask correctly,

5931

* and ->cpu_capacity to 0.

5936

* and ->cpu_capacity to 0.

5932

*

5937

*

5933

* Assumes the sched_domain tree is fully constructed

5938

* Assumes the sched_domain tree is fully constructed

5934

*/

5939

*/

5935

static int

5940

static int

5936

build_sched_groups(struct sched_domain *sd, int cpu)

5941

build_sched_groups(struct sched_domain *sd, int cpu)

5937

{

5942

{

5938

struct sched_group *first = NULL, *last = NULL;

5943

struct sched_group *first = NULL, *last = NULL;

5939

struct sd_data *sdd = sd->private;

5944

struct sd_data *sdd = sd->private;

5940

const struct cpumask *span = sched_domain_span(sd);

5945

const struct cpumask *span = sched_domain_span(sd);

5941

struct cpumask *covered;

5946

struct cpumask *covered;

5942

int i;

5947

int i;

5943

5948

5944

get_group(cpu, sdd, &sd->groups);

5949

get_group(cpu, sdd, &sd->groups);

5945

atomic_inc(&sd->groups->ref);

5950

atomic_inc(&sd->groups->ref);

5946

5951

5947

if (cpu != cpumask_first(span))

5952

if (cpu != cpumask_first(span))

5948

return 0;

5953

return 0;

5949

5954

5950

lockdep_assert_held(&sched_domains_mutex);

5955

lockdep_assert_held(&sched_domains_mutex);

5951

covered = sched_domains_tmpmask;

5956

covered = sched_domains_tmpmask;

5952

5957

5953

cpumask_clear(covered);

5958

cpumask_clear(covered);

5954

5959

5955

for_each_cpu(i, span) {

5960

for_each_cpu(i, span) {

5956

struct sched_group *sg;

5961

struct sched_group *sg;

5957

int group, j;

5962

int group, j;

5958

5963

5959

if (cpumask_test_cpu(i, covered))

5964

if (cpumask_test_cpu(i, covered))

5960

continue;

5965

continue;

5961

5966

5962

group = get_group(i, sdd, &sg);

5967

group = get_group(i, sdd, &sg);

5963

cpumask_setall(sched_group_mask(sg));

5968

cpumask_setall(sched_group_mask(sg));

5964

5969

5965

for_each_cpu(j, span) {

5970

for_each_cpu(j, span) {

5966

if (get_group(j, sdd, NULL) != group)

5971

if (get_group(j, sdd, NULL) != group)

5967

continue;

5972

continue;

5968

5973

5969

cpumask_set_cpu(j, covered);

5974

cpumask_set_cpu(j, covered);

5970

cpumask_set_cpu(j, sched_group_cpus(sg));

5975

cpumask_set_cpu(j, sched_group_cpus(sg));

5971

}

5976

}

5972

5977

5973

if (!first)

5978

if (!first)

5974

first = sg;

5979

first = sg;

5975

if (last)

5980

if (last)

5976

last->next = sg;

5981

last->next = sg;

5977

last = sg;

5982

last = sg;

5978

}

5983

}

5979

last->next = first;

5984

last->next = first;

5980

5985

5981

return 0;

5986

return 0;

5982

}

5987

}

5983

5988

5984

/*

5989

/*

5985

* Initialize sched groups cpu_capacity.

5990

* Initialize sched groups cpu_capacity.

5986

*

5991

*

5987

* cpu_capacity indicates the capacity of sched group, which is used while

5992

* cpu_capacity indicates the capacity of sched group, which is used while

5988

* distributing the load between different sched groups in a sched domain.

5993

* distributing the load between different sched groups in a sched domain.

5989

* Typically cpu_capacity for all the groups in a sched domain will be same

5994

* Typically cpu_capacity for all the groups in a sched domain will be same

5990

* unless there are asymmetries in the topology. If there are asymmetries,

5995

* unless there are asymmetries in the topology. If there are asymmetries,

5991

* group having more cpu_capacity will pickup more load compared to the

5996

* group having more cpu_capacity will pickup more load compared to the

5992

* group having less cpu_capacity.

5997

* group having less cpu_capacity.

5993

*/

5998

*/

5994

static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)

5999

static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)

5995

{

6000

{

5996

struct sched_group *sg = sd->groups;

6001

struct sched_group *sg = sd->groups;

5997

6002

5998

WARN_ON(!sg);

6003

WARN_ON(!sg);

5999

6004

6000

do {

6005

do {

6001

sg->group_weight = cpumask_weight(sched_group_cpus(sg));

6006

sg->group_weight = cpumask_weight(sched_group_cpus(sg));

6002

sg = sg->next;

6007

sg = sg->next;

6003

} while (sg != sd->groups);

6008

} while (sg != sd->groups);

6004

6009

6005

if (cpu != group_balance_cpu(sg))

6010

if (cpu != group_balance_cpu(sg))

6006

return;

6011

return;

6007

6012

6008

update_group_capacity(sd, cpu);

6013

update_group_capacity(sd, cpu);

6009

atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);

6014

atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);

6010

}

6015

}

6011

6016

6012

/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

/*
 * Level requested through the "relax_domain_level=" boot parameter.
 * A negative value means no default was requested.
 */
static int default_relax_domain_level = -1;

/* NOTE(review): presumably the deepest domain level built; set elsewhere. */
int sched_domain_level_max;

6019

6024

6020

static int __init setup_relax_domain_level(char *str)

6025

static int __init setup_relax_domain_level(char *str)

6021

{

6026

{

6022

if (kstrtoint(str, 0, &default_relax_domain_level))

6027

if (kstrtoint(str, 0, &default_relax_domain_level))

6023

pr_warn("Unable to set relax_domain_level\n");

6028

pr_warn("Unable to set relax_domain_level\n");

6024

6029

6025

return 1;

6030

return 1;

6026

}

6031

}

6027

__setup("relax_domain_level=", setup_relax_domain_level);

6032

__setup("relax_domain_level=", setup_relax_domain_level);

6028

6033

6029

static void set_domain_attribute(struct sched_domain *sd,

6034

static void set_domain_attribute(struct sched_domain *sd,

6030

struct sched_domain_attr *attr)

6035

struct sched_domain_attr *attr)

6031

{

6036

{

6032

int request;

6037

int request;

6033

6038

6034

if (!attr || attr->relax_domain_level < 0) {

6039

if (!attr || attr->relax_domain_level < 0) {

6035

if (default_relax_domain_level < 0)

6040

if (default_relax_domain_level < 0)

6036

return;

6041

return;

6037

else

6042

else

6038

request = default_relax_domain_level;

6043

request = default_relax_domain_level;

6039

} else

6044

} else

6040

request = attr->relax_domain_level;

6045

request = attr->relax_domain_level;

6041

if (request < sd->level) {

6046

if (request < sd->level) {

6042

/* turn off idle balance on this domain */

6047

/* turn off idle balance on this domain */

6043

sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

6048

sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

6044

} else {

6049

} else {

6045

/* turn on idle balance on this domain */

6050

/* turn on idle balance on this domain */

6046

sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

6051

sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

6047

}

6052

}

6048

}

6053

}

6049

6054

6050

static void __sdt_free(const struct cpumask *cpu_map);

6055

static void __sdt_free(const struct cpumask *cpu_map);

6051

static int __sdt_alloc(const struct cpumask *cpu_map);

6056

static int __sdt_alloc(const struct cpumask *cpu_map);

6052

6057

6053

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,

6058

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,

6054

const struct cpumask *cpu_map)

6059

const struct cpumask *cpu_map)

6055

{

6060

{

6056

switch (what) {

6061

switch (what) {

6057

case sa_rootdomain:

6062

case sa_rootdomain:

6058

if (!atomic_read(&d->rd->refcount))

6063

if (!atomic_read(&d->rd->refcount))

6059

free_rootdomain(&d->rd->rcu); /* fall through */

6064

free_rootdomain(&d->rd->rcu); /* fall through */

6060

case sa_sd:

6065

case sa_sd:

6061

free_percpu(d->sd); /* fall through */

6066

free_percpu(d->sd); /* fall through */

6062

case sa_sd_storage:

6067

case sa_sd_storage:

6063

__sdt_free(cpu_map); /* fall through */

6068

__sdt_free(cpu_map); /* fall through */

6064

case sa_none:

6069

case sa_none:

6065

break;

6070

break;

6066

}

6071

}

6067

}

6072

}

6068

6073

6069

static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,

6074

static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,

6070

const struct cpumask *cpu_map)

6075

const struct cpumask *cpu_map)

6071

{

6076

{

6072

memset(d, 0, sizeof(*d));

6077

memset(d, 0, sizeof(*d));

6073

6078

6074

if (__sdt_alloc(cpu_map))

6079

if (__sdt_alloc(cpu_map))

6075

return sa_sd_storage;

6080

return sa_sd_storage;

6076

d->sd = alloc_percpu(struct sched_domain *);

6081

d->sd = alloc_percpu(struct sched_domain *);

6077

if (!d->sd)

6082

if (!d->sd)

6078

return sa_sd_storage;

6083

return sa_sd_storage;

6079

d->rd = alloc_rootdomain();

6084

d->rd = alloc_rootdomain();

6080

if (!d->rd)

6085

if (!d->rd)

6081

return sa_sd;

6086

return sa_sd;

6082

return sa_rootdomain;

6087

return sa_rootdomain;

6083

}

6088

}

6084

6089

6085

/*

6090

/*

6086

* NULL the sd_data elements we've used to build the sched_domain and

6091

* NULL the sd_data elements we've used to build the sched_domain and

6087

* sched_group structure so that the subsequent __free_domain_allocs()

6092

* sched_group structure so that the subsequent __free_domain_allocs()

6088

* will not free the data we're using.

6093

* will not free the data we're using.

6089

*/

6094

*/

6090

static void claim_allocations(int cpu, struct sched_domain *sd)

6095

static void claim_allocations(int cpu, struct sched_domain *sd)

6091

{

6096

{

6092

struct sd_data *sdd = sd->private;

6097

struct sd_data *sdd = sd->private;

6093

6098

6094

WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);

6099

WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);

6095

*per_cpu_ptr(sdd->sd, cpu) = NULL;

6100

*per_cpu_ptr(sdd->sd, cpu) = NULL;

6096

6101

6097

if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))

6102

if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))

6098

*per_cpu_ptr(sdd->sg, cpu) = NULL;

6103

*per_cpu_ptr(sdd->sg, cpu) = NULL;

6099

6104

6100

if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))

6105

if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))

6101

*per_cpu_ptr(sdd->sgc, cpu) = NULL;

6106

*per_cpu_ptr(sdd->sgc, cpu) = NULL;

6102

}

6107

}

6103

6108

6104

#ifdef CONFIG_NUMA

6109

#ifdef CONFIG_NUMA

6105

static int sched_domains_numa_levels;

6110

static int sched_domains_numa_levels;

6106

static int *sched_domains_numa_distance;

6111

static int *sched_domains_numa_distance;

6107

static struct cpumask ***sched_domains_numa_masks;

6112

static struct cpumask ***sched_domains_numa_masks;

6108

static int sched_domains_curr_level;

6113

static int sched_domains_curr_level;

6109

#endif

6114

#endif

6110

6115

6111

/*

6116

/*

6112

* SD_flags allowed in topology descriptions.

6117

* SD_flags allowed in topology descriptions.

6113

*

6118

*

6114

* SD_SHARE_CPUCAPACITY - describes SMT topologies

6119

* SD_SHARE_CPUCAPACITY - describes SMT topologies

6115

* SD_SHARE_PKG_RESOURCES - describes shared caches

6120

* SD_SHARE_PKG_RESOURCES - describes shared caches

6116

* SD_NUMA - describes NUMA topologies

6121

* SD_NUMA - describes NUMA topologies

6117

* SD_SHARE_POWERDOMAIN - describes shared power domain

6122

* SD_SHARE_POWERDOMAIN - describes shared power domain

6118

*

6123

*

6119

* Odd one out:

6124

* Odd one out:

6120

* SD_ASYM_PACKING - describes SMT quirks

6125

* SD_ASYM_PACKING - describes SMT quirks

6121

*/

6126

*/

6122

#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY |		\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA |			\
	 SD_SHARE_POWERDOMAIN |		\
	 SD_ASYM_PACKING)

6128

6133

6129

static struct sched_domain *

6134

static struct sched_domain *

6130

sd_init(struct sched_domain_topology_level *tl, int cpu)

6135

sd_init(struct sched_domain_topology_level *tl, int cpu)

6131

{

6136

{

6132

struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

6137

struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

6133

int sd_weight, sd_flags = 0;

6138

int sd_weight, sd_flags = 0;

6134

6139

6135

#ifdef CONFIG_NUMA

6140

#ifdef CONFIG_NUMA

6136

/*

6141

/*

6137

* Ugly hack to pass state to sd_numa_mask()...

6142

* Ugly hack to pass state to sd_numa_mask()...

6138

*/

6143

*/

6139

sched_domains_curr_level = tl->numa_level;

6144

sched_domains_curr_level = tl->numa_level;

6140

#endif

6145

#endif

6141

6146

6142

sd_weight = cpumask_weight(tl->mask(cpu));

6147

sd_weight = cpumask_weight(tl->mask(cpu));

6143

6148

6144

if (tl->sd_flags)

6149

if (tl->sd_flags)

6145

sd_flags = (*tl->sd_flags)();

6150

sd_flags = (*tl->sd_flags)();

6146

if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,

6151

if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,

6147

"wrong sd_flags in topology description\n"))

6152

"wrong sd_flags in topology description\n"))

6148

sd_flags &= ~TOPOLOGY_SD_FLAGS;

6153

sd_flags &= ~TOPOLOGY_SD_FLAGS;

6149

6154

6150

*sd = (struct sched_domain){

6155

*sd = (struct sched_domain){

6151

.min_interval = sd_weight,

6156

.min_interval = sd_weight,

6152

.max_interval = 2*sd_weight,

6157

.max_interval = 2*sd_weight,

6153

.busy_factor = 32,

6158

.busy_factor = 32,

6154

.imbalance_pct = 125,

6159

.imbalance_pct = 125,

6155

6160

6156

.cache_nice_tries = 0,

6161

.cache_nice_tries = 0,

6157

.busy_idx = 0,

6162

.busy_idx = 0,

6158

.idle_idx = 0,

6163

.idle_idx = 0,

6159

.newidle_idx = 0,

6164

.newidle_idx = 0,

6160

.wake_idx = 0,

6165

.wake_idx = 0,

6161

.forkexec_idx = 0,

6166

.forkexec_idx = 0,

6162

6167

6163

.flags = 1*SD_LOAD_BALANCE

6168

.flags = 1*SD_LOAD_BALANCE

6164

| 1*SD_BALANCE_NEWIDLE

6169

| 1*SD_BALANCE_NEWIDLE

6165

| 1*SD_BALANCE_EXEC

6170

| 1*SD_BALANCE_EXEC

6166

| 1*SD_BALANCE_FORK

6171

| 1*SD_BALANCE_FORK

6167

| 0*SD_BALANCE_WAKE

6172

| 0*SD_BALANCE_WAKE

6168

| 1*SD_WAKE_AFFINE

6173

| 1*SD_WAKE_AFFINE

6169

| 0*SD_SHARE_CPUCAPACITY

6174

| 0*SD_SHARE_CPUCAPACITY

6170

| 0*SD_SHARE_PKG_RESOURCES

6175

| 0*SD_SHARE_PKG_RESOURCES

6171

| 0*SD_SERIALIZE

6176

| 0*SD_SERIALIZE

6172

| 0*SD_PREFER_SIBLING

6177

| 0*SD_PREFER_SIBLING

6173

| 0*SD_NUMA

6178

| 0*SD_NUMA

6174

| sd_flags

6179

| sd_flags

6175

,

6180

,

6176

6181

6177

.last_balance = jiffies,

6182

.last_balance = jiffies,

6178

.balance_interval = sd_weight,

6183

.balance_interval = sd_weight,

6179

.smt_gain = 0,

6184

.smt_gain = 0,

6180

.max_newidle_lb_cost = 0,

6185

.max_newidle_lb_cost = 0,

6181

.next_decay_max_lb_cost = jiffies,

6186

.next_decay_max_lb_cost = jiffies,

6182

#ifdef CONFIG_SCHED_DEBUG

6187

#ifdef CONFIG_SCHED_DEBUG

6183

.name = tl->name,

6188

.name = tl->name,

6184

#endif

6189

#endif

6185

};

6190

};

6186

6191

6187

/*

6192

/*

6188

* Convert topological properties into behaviour.

6193

* Convert topological properties into behaviour.

6189

*/

6194

*/

6190

6195

6191

if (sd->flags & SD_SHARE_CPUCAPACITY) {

6196

if (sd->flags & SD_SHARE_CPUCAPACITY) {

6192

sd->imbalance_pct = 110;

6197

sd->imbalance_pct = 110;

6193

sd->smt_gain = 1178; /* ~15% */

6198

sd->smt_gain = 1178; /* ~15% */

6194

6199

6195

} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {

6200

} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {

6196

sd->imbalance_pct = 117;

6201

sd->imbalance_pct = 117;

6197

sd->cache_nice_tries = 1;

6202

sd->cache_nice_tries = 1;

6198

sd->busy_idx = 2;

6203

sd->busy_idx = 2;

6199

6204

6200

#ifdef CONFIG_NUMA

6205

#ifdef CONFIG_NUMA

6201

} else if (sd->flags & SD_NUMA) {

6206

} else if (sd->flags & SD_NUMA) {

6202

sd->cache_nice_tries = 2;

6207

sd->cache_nice_tries = 2;

6203

sd->busy_idx = 3;

6208

sd->busy_idx = 3;

6204

sd->idle_idx = 2;

6209

sd->idle_idx = 2;

6205

6210

6206

sd->flags |= SD_SERIALIZE;

6211

sd->flags |= SD_SERIALIZE;

6207

if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {

6212

if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {

6208

sd->flags &= ~(SD_BALANCE_EXEC |

6213

sd->flags &= ~(SD_BALANCE_EXEC |

6209

SD_BALANCE_FORK |

6214

SD_BALANCE_FORK |

6210

SD_WAKE_AFFINE);

6215

SD_WAKE_AFFINE);

6211

}

6216

}

6212

6217

6213

#endif

6218

#endif

6214

} else {

6219

} else {

6215

sd->flags |= SD_PREFER_SIBLING;

6220

sd->flags |= SD_PREFER_SIBLING;

6216

sd->cache_nice_tries = 1;

6221

sd->cache_nice_tries = 1;

6217

sd->busy_idx = 2;

6222

sd->busy_idx = 2;

6218

sd->idle_idx = 1;

6223

sd->idle_idx = 1;

6219

}

6224

}

6220

6225

6221

sd->private = &tl->data;

6226

sd->private = &tl->data;

6222

6227

6223

return sd;

6228

return sd;

6224

}

6229

}

6225

6230

6226

/*

6231

/*

6227

* Topology list, bottom-up.

6232

* Topology list, bottom-up.

6228

*/

6233

*/

6229

static struct sched_domain_topology_level default_topology[] = {

6234

static struct sched_domain_topology_level default_topology[] = {

6230

#ifdef CONFIG_SCHED_SMT

6235

#ifdef CONFIG_SCHED_SMT

6231

{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },

6236

{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },

6232

#endif

6237

#endif

6233

#ifdef CONFIG_SCHED_MC

6238

#ifdef CONFIG_SCHED_MC

6234

{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },

6239

{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },

6235

#endif

6240

#endif

6236

{ cpu_cpu_mask, SD_INIT_NAME(DIE) },

6241

{ cpu_cpu_mask, SD_INIT_NAME(DIE) },

6237

{ NULL, },

6242

{ NULL, },

6238

};

6243

};

6239

6244

6240

struct sched_domain_topology_level *sched_domain_topology = default_topology;

6245

struct sched_domain_topology_level *sched_domain_topology = default_topology;

6241

6246

6242

/*
 * Walk @tl over the sched_domain_topology table, stopping at the first
 * entry whose ->mask is NULL. @tl is parenthesised so that any pointer
 * lvalue expression (e.g. *pp) can be used as the cursor.
 */
#define for_each_sd_topology(tl)			\
	for ((tl) = sched_domain_topology; (tl)->mask; (tl)++)

6244

6249

6245

void set_sched_topology(struct sched_domain_topology_level *tl)

6250

void set_sched_topology(struct sched_domain_topology_level *tl)

6246

{

6251

{

6247

sched_domain_topology = tl;

6252

sched_domain_topology = tl;

6248

}

6253

}

6249

6254

6250

#ifdef CONFIG_NUMA

6255

#ifdef CONFIG_NUMA

6251

6256

6252

static const struct cpumask *sd_numa_mask(int cpu)

6257

static const struct cpumask *sd_numa_mask(int cpu)

6253

{

6258

{

6254

return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];

6259

return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];

6255

}

6260

}

6256

6261

6257

static void sched_numa_warn(const char *str)

6262

static void sched_numa_warn(const char *str)

6258

{

6263

{

6259

static int done = false;

6264

static int done = false;

6260

int i,j;

6265

int i,j;

6261

6266

6262

if (done)

6267

if (done)

6263

return;

6268

return;

6264

6269

6265

done = true;

6270

done = true;

6266

6271

6267

printk(KERN_WARNING "ERROR: %s\n\n", str);

6272

printk(KERN_WARNING "ERROR: %s\n\n", str);

6268

6273

6269

for (i = 0; i < nr_node_ids; i++) {

6274

for (i = 0; i < nr_node_ids; i++) {

6270

printk(KERN_WARNING " ");

6275

printk(KERN_WARNING " ");

6271

for (j = 0; j < nr_node_ids; j++)

6276

for (j = 0; j < nr_node_ids; j++)

6272

printk(KERN_CONT "%02d ", node_distance(i,j));

6277

printk(KERN_CONT "%02d ", node_distance(i,j));

6273

printk(KERN_CONT "\n");

6278

printk(KERN_CONT "\n");

6274

}

6279

}

6275

printk(KERN_WARNING "\n");

6280

printk(KERN_WARNING "\n");

6276

}

6281

}

6277

6282

6278

static bool find_numa_distance(int distance)

6283

static bool find_numa_distance(int distance)

6279

{

6284

{

6280

int i;

6285

int i;

6281

6286

6282

if (distance == node_distance(0, 0))

6287

if (distance == node_distance(0, 0))

6283

return true;

6288

return true;

6284

6289

6285

for (i = 0; i < sched_domains_numa_levels; i++) {

6290

for (i = 0; i < sched_domains_numa_levels; i++) {

6286

if (sched_domains_numa_distance[i] == distance)

6291

if (sched_domains_numa_distance[i] == distance)

6287

return true;

6292

return true;

6288

}

6293

}

6289

6294

6290

return false;

6295

return false;

6291

}

6296

}

6292

6297

6293

static void sched_init_numa(void)

6298

static void sched_init_numa(void)

6294

{

6299

{

6295

int next_distance, curr_distance = node_distance(0, 0);

6300

int next_distance, curr_distance = node_distance(0, 0);

6296

struct sched_domain_topology_level *tl;

6301

struct sched_domain_topology_level *tl;

6297

int level = 0;

6302

int level = 0;

6298

int i, j, k;

6303

int i, j, k;

6299

6304

6300

sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);

6305

sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);

6301

if (!sched_domains_numa_distance)

6306

if (!sched_domains_numa_distance)

6302

return;

6307

return;

6303

6308

6304

/*

6309

/*

6305

* O(nr_nodes^2) deduplicating selection sort -- in order to find the

6310

* O(nr_nodes^2) deduplicating selection sort -- in order to find the

6306

* unique distances in the node_distance() table.

6311

* unique distances in the node_distance() table.

6307

*

6312

*

6308

* Assumes node_distance(0,j) includes all distances in

6313

* Assumes node_distance(0,j) includes all distances in

6309

* node_distance(i,j) in order to avoid cubic time.

6314

* node_distance(i,j) in order to avoid cubic time.

6310

*/

6315

*/

6311

next_distance = curr_distance;

6316

next_distance = curr_distance;

6312

for (i = 0; i < nr_node_ids; i++) {

6317

for (i = 0; i < nr_node_ids; i++) {

6313

for (j = 0; j < nr_node_ids; j++) {

6318

for (j = 0; j < nr_node_ids; j++) {

6314

for (k = 0; k < nr_node_ids; k++) {

6319

for (k = 0; k < nr_node_ids; k++) {

6315

int distance = node_distance(i, k);

6320

int distance = node_distance(i, k);

6316

6321

6317

if (distance > curr_distance &&

6322

if (distance > curr_distance &&

6318

(distance < next_distance ||

6323

(distance < next_distance ||

6319

next_distance == curr_distance))

6324

next_distance == curr_distance))

6320

next_distance = distance;

6325

next_distance = distance;

6321

6326

6322

/*

6327

/*

6323

* While not a strong assumption it would be nice to know

6328

* While not a strong assumption it would be nice to know

6324

* about cases where if node A is connected to B, B is not

6329

* about cases where if node A is connected to B, B is not

6325

* equally connected to A.

6330

* equally connected to A.

6326

*/

6331

*/

6327

if (sched_debug() && node_distance(k, i) != distance)

6332

if (sched_debug() && node_distance(k, i) != distance)

6328

sched_numa_warn("Node-distance not symmetric");

6333

sched_numa_warn("Node-distance not symmetric");

6329

6334

6330

if (sched_debug() && i && !find_numa_distance(distance))

6335

if (sched_debug() && i && !find_numa_distance(distance))

6331

sched_numa_warn("Node-0 not representative");

6336

sched_numa_warn("Node-0 not representative");

6332

}

6337

}

6333

if (next_distance != curr_distance) {

6338

if (next_distance != curr_distance) {

6334

sched_domains_numa_distance[level++] = next_distance;

6339

sched_domains_numa_distance[level++] = next_distance;

6335

sched_domains_numa_levels = level;

6340

sched_domains_numa_levels = level;

6336

curr_distance = next_distance;

6341

curr_distance = next_distance;

6337

} else break;

6342

} else break;

6338

}

6343

}

6339

6344

6340

/*

6345

/*

6341

* In case of sched_debug() we verify the above assumption.

6346

* In case of sched_debug() we verify the above assumption.

6342

*/

6347

*/

6343

if (!sched_debug())

6348

if (!sched_debug())

6344

break;

6349

break;

6345

}

6350

}

6346

6351

6347

if (!level)

6352

if (!level)

6348

return;

6353

return;

6349

6354

6350

/*

6355

/*

6351

* 'level' contains the number of unique distances, excluding the

6356

* 'level' contains the number of unique distances, excluding the

6352

* identity distance node_distance(i,i).

6357

* identity distance node_distance(i,i).

6353

*

6358

*

6354

* The sched_domains_numa_distance[] array includes the actual distance

6359

* The sched_domains_numa_distance[] array includes the actual distance

6355

* numbers.

6360

* numbers.

6356

*/

6361

*/

6357

6362

6358

/*

6363

/*

6359

* Here, we should temporarily reset sched_domains_numa_levels to 0.

6364

* Here, we should temporarily reset sched_domains_numa_levels to 0.

6360

* If it fails to allocate memory for array sched_domains_numa_masks[][],

6365

* If it fails to allocate memory for array sched_domains_numa_masks[][],

6361

* the array will contain less then 'level' members. This could be

6366

* the array will contain less then 'level' members. This could be

6362

* dangerous when we use it to iterate array sched_domains_numa_masks[][]

6367

* dangerous when we use it to iterate array sched_domains_numa_masks[][]

6363

* in other functions.

6368

* in other functions.

6364

*

6369

*

6365

* We reset it to 'level' at the end of this function.

6370

* We reset it to 'level' at the end of this function.

6366

*/

6371

*/

6367

sched_domains_numa_levels = 0;

6372

sched_domains_numa_levels = 0;

6368

6373

6369

sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);

6374

sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);

6370

if (!sched_domains_numa_masks)

6375

if (!sched_domains_numa_masks)

6371

return;

6376

return;

6372

6377

6373

/*

6378

/*

6374

* Now for each level, construct a mask per node which contains all

6379

* Now for each level, construct a mask per node which contains all

6375

* cpus of nodes that are that many hops away from us.

6380

* cpus of nodes that are that many hops away from us.

6376

*/

6381

*/

6377

for (i = 0; i < level; i++) {

6382

for (i = 0; i < level; i++) {

6378

sched_domains_numa_masks[i] =

6383

sched_domains_numa_masks[i] =

6379

kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);

6384

kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);

6380

if (!sched_domains_numa_masks[i])

6385

if (!sched_domains_numa_masks[i])

6381

return;

6386

return;

6382

6387

6383

for (j = 0; j < nr_node_ids; j++) {

6388

for (j = 0; j < nr_node_ids; j++) {

6384

struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);

6389

struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);

6385

if (!mask)

6390

if (!mask)

6386

return;

6391

return;

6387

6392

6388

sched_domains_numa_masks[i][j] = mask;

6393

sched_domains_numa_masks[i][j] = mask;

6389

6394

6390

for (k = 0; k < nr_node_ids; k++) {

6395

for (k = 0; k < nr_node_ids; k++) {

6391

if (node_distance(j, k) > sched_domains_numa_distance[i])

6396

if (node_distance(j, k) > sched_domains_numa_distance[i])

6392

continue;

6397

continue;

6393

6398

6394

cpumask_or(mask, mask, cpumask_of_node(k));

6399

cpumask_or(mask, mask, cpumask_of_node(k));

6395

}

6400

}

6396

}

6401

}

6397

}

6402

}

6398

6403

6399

/* Compute default topology size */

6404

/* Compute default topology size */

6400

for (i = 0; sched_domain_topology[i].mask; i++);

6405

for (i = 0; sched_domain_topology[i].mask; i++);

6401

6406

6402

tl = kzalloc((i + level + 1) *

6407

tl = kzalloc((i + level + 1) *

6403

sizeof(struct sched_domain_topology_level), GFP_KERNEL);

6408

sizeof(struct sched_domain_topology_level), GFP_KERNEL);

6404

if (!tl)

6409

if (!tl)

6405

return;

6410

return;

6406

6411

6407

/*

6412

/*

6408

* Copy the default topology bits..

6413

* Copy the default topology bits..

6409

*/

6414

*/

6410

for (i = 0; sched_domain_topology[i].mask; i++)

6415

for (i = 0; sched_domain_topology[i].mask; i++)

6411

tl[i] = sched_domain_topology[i];

6416

tl[i] = sched_domain_topology[i];

6412

6417

6413

/*

6418

/*

6414

* .. and append 'j' levels of NUMA goodness.

6419

* .. and append 'j' levels of NUMA goodness.

6415

*/

6420

*/

6416

for (j = 0; j < level; i++, j++) {

6421

for (j = 0; j < level; i++, j++) {

6417

tl[i] = (struct sched_domain_topology_level){

6422

tl[i] = (struct sched_domain_topology_level){

6418

.mask = sd_numa_mask,

6423

.mask = sd_numa_mask,

6419

.sd_flags = cpu_numa_flags,

6424

.sd_flags = cpu_numa_flags,

6420

.flags = SDTL_OVERLAP,

6425

.flags = SDTL_OVERLAP,

6421

.numa_level = j,

6426

.numa_level = j,

6422

SD_INIT_NAME(NUMA)

6427

SD_INIT_NAME(NUMA)

6423

};

6428

};

6424

}

6429

}

6425

6430

6426

sched_domain_topology = tl;

6431

sched_domain_topology = tl;

6427

6432

6428

sched_domains_numa_levels = level;

6433

sched_domains_numa_levels = level;

6429

}

6434

}

6430

6435

6431

static void sched_domains_numa_masks_set(int cpu)

6436

static void sched_domains_numa_masks_set(int cpu)

6432

{

6437

{

6433

int i, j;

6438

int i, j;

6434

int node = cpu_to_node(cpu);

6439

int node = cpu_to_node(cpu);

6435

6440

6436

for (i = 0; i < sched_domains_numa_levels; i++) {

6441

for (i = 0; i < sched_domains_numa_levels; i++) {

6437

for (j = 0; j < nr_node_ids; j++) {

6442

for (j = 0; j < nr_node_ids; j++) {

6438

if (node_distance(j, node) <= sched_domains_numa_distance[i])

6443

if (node_distance(j, node) <= sched_domains_numa_distance[i])

6439

cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);

6444

cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);

6440

}

6445

}

6441

}

6446

}

6442

}

6447

}

6443

6448

6444

static void sched_domains_numa_masks_clear(int cpu)

6449

static void sched_domains_numa_masks_clear(int cpu)

6445

{

6450

{

6446

int i, j;

6451

int i, j;

6447

for (i = 0; i < sched_domains_numa_levels; i++) {

6452

for (i = 0; i < sched_domains_numa_levels; i++) {

6448

for (j = 0; j < nr_node_ids; j++)

6453

for (j = 0; j < nr_node_ids; j++)

6449

cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);

6454

cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);

6450

}

6455

}

6451

}

6456

}

6452

6457

6453

/*

6458

/*

6454

* Update sched_domains_numa_masks[level][node] array when new cpus

6459

* Update sched_domains_numa_masks[level][node] array when new cpus

6455

* are onlined.

6460

* are onlined.

6456

*/

6461

*/

6457

static int sched_domains_numa_masks_update(struct notifier_block *nfb,

6462

static int sched_domains_numa_masks_update(struct notifier_block *nfb,

6458

unsigned long action,

6463

unsigned long action,

6459

void *hcpu)

6464

void *hcpu)

6460

{

6465

{

6461

int cpu = (long)hcpu;

6466

int cpu = (long)hcpu;

6462

6467

6463

switch (action & ~CPU_TASKS_FROZEN) {

6468

switch (action & ~CPU_TASKS_FROZEN) {

6464

case CPU_ONLINE:

6469

case CPU_ONLINE:

6465

sched_domains_numa_masks_set(cpu);

6470

sched_domains_numa_masks_set(cpu);

6466

break;

6471

break;

6467

6472

6468

case CPU_DEAD:

6473

case CPU_DEAD:

6469

sched_domains_numa_masks_clear(cpu);

6474

sched_domains_numa_masks_clear(cpu);

6470

break;

6475

break;

6471

6476

6472

default:

6477

default:

6473

return NOTIFY_DONE;

6478

return NOTIFY_DONE;

6474

}

6479

}

6475

6480

6476

return NOTIFY_OK;

6481

return NOTIFY_OK;

6477

}

6482

}

6478

#else

6483

#else

6479

/* !CONFIG_NUMA build: nothing to initialise. */
static inline void sched_init_numa(void) { }

6482

6487

6483

static int sched_domains_numa_masks_update(struct notifier_block *nfb,

6488

static int sched_domains_numa_masks_update(struct notifier_block *nfb,

6484

unsigned long action,

6489

unsigned long action,

6485

void *hcpu)

6490

void *hcpu)

6486

{

6491

{

6487

return 0;

6492

return 0;

6488

}

6493

}

6489

#endif /* CONFIG_NUMA */

6494

#endif /* CONFIG_NUMA */

6490

6495

6491

static int __sdt_alloc(const struct cpumask *cpu_map)

6496

static int __sdt_alloc(const struct cpumask *cpu_map)

6492

{

6497

{

6493

struct sched_domain_topology_level *tl;

6498

struct sched_domain_topology_level *tl;

6494

int j;

6499

int j;

6495

6500

6496

for_each_sd_topology(tl) {

6501

for_each_sd_topology(tl) {

6497

struct sd_data *sdd = &tl->data;

6502

struct sd_data *sdd = &tl->data;

6498

6503

6499

sdd->sd = alloc_percpu(struct sched_domain *);

6504

sdd->sd = alloc_percpu(struct sched_domain *);

6500

if (!sdd->sd)

6505

if (!sdd->sd)

6501

return -ENOMEM;

6506

return -ENOMEM;

6502

6507

6503

sdd->sg = alloc_percpu(struct sched_group *);

6508

sdd->sg = alloc_percpu(struct sched_group *);

6504

if (!sdd->sg)

6509

if (!sdd->sg)

6505

return -ENOMEM;

6510

return -ENOMEM;

6506

6511

6507

sdd->sgc = alloc_percpu(struct sched_group_capacity *);

6512

sdd->sgc = alloc_percpu(struct sched_group_capacity *);

6508

if (!sdd->sgc)

6513

if (!sdd->sgc)

6509

return -ENOMEM;

6514

return -ENOMEM;

6510

6515

6511

for_each_cpu(j, cpu_map) {

6516

for_each_cpu(j, cpu_map) {

6512

struct sched_domain *sd;

6517

struct sched_domain *sd;

6513

struct sched_group *sg;

6518

struct sched_group *sg;

6514

struct sched_group_capacity *sgc;

6519

struct sched_group_capacity *sgc;

6515

6520

6516

sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),

6521

sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),

6517

GFP_KERNEL, cpu_to_node(j));

6522

GFP_KERNEL, cpu_to_node(j));

6518

if (!sd)

6523

if (!sd)

6519

return -ENOMEM;

6524

return -ENOMEM;

6520

6525

6521

*per_cpu_ptr(sdd->sd, j) = sd;

6526

*per_cpu_ptr(sdd->sd, j) = sd;

6522

6527

6523

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

6528

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

6524

GFP_KERNEL, cpu_to_node(j));

6529

GFP_KERNEL, cpu_to_node(j));

6525

if (!sg)

6530

if (!sg)

6526

return -ENOMEM;

6531

return -ENOMEM;

6527

6532

6528

sg->next = sg;

6533

sg->next = sg;

6529

6534

6530

*per_cpu_ptr(sdd->sg, j) = sg;

6535

*per_cpu_ptr(sdd->sg, j) = sg;

6531

6536

6532

sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),

6537

sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),

6533

GFP_KERNEL, cpu_to_node(j));

6538

GFP_KERNEL, cpu_to_node(j));

6534

if (!sgc)

6539

if (!sgc)

6535

return -ENOMEM;

6540

return -ENOMEM;

6536

6541

6537

*per_cpu_ptr(sdd->sgc, j) = sgc;

6542

*per_cpu_ptr(sdd->sgc, j) = sgc;

6538

}

6543

}

6539

}

6544

}

6540

6545

6541

return 0;

6546

return 0;

6542

}

6547

}

6543

6548

6544

static void __sdt_free(const struct cpumask *cpu_map)

6549

static void __sdt_free(const struct cpumask *cpu_map)

6545

{

6550

{

6546

struct sched_domain_topology_level *tl;

6551

struct sched_domain_topology_level *tl;

6547

int j;

6552

int j;

6548

6553

6549

for_each_sd_topology(tl) {

6554

for_each_sd_topology(tl) {

6550

struct sd_data *sdd = &tl->data;

6555

struct sd_data *sdd = &tl->data;

6551

6556

6552

for_each_cpu(j, cpu_map) {

6557

for_each_cpu(j, cpu_map) {

6553

struct sched_domain *sd;

6558

struct sched_domain *sd;

6554

6559

6555

if (sdd->sd) {

6560

if (sdd->sd) {

6556

sd = *per_cpu_ptr(sdd->sd, j);

6561

sd = *per_cpu_ptr(sdd->sd, j);

6557

if (sd && (sd->flags & SD_OVERLAP))

6562

if (sd && (sd->flags & SD_OVERLAP))

6558

free_sched_groups(sd->groups, 0);

6563

free_sched_groups(sd->groups, 0);

6559

kfree(*per_cpu_ptr(sdd->sd, j));

6564

kfree(*per_cpu_ptr(sdd->sd, j));

6560

}

6565

}

6561

6566

6562

if (sdd->sg)

6567

if (sdd->sg)

6563

kfree(*per_cpu_ptr(sdd->sg, j));

6568

kfree(*per_cpu_ptr(sdd->sg, j));

6564

if (sdd->sgc)

6569

if (sdd->sgc)

6565

kfree(*per_cpu_ptr(sdd->sgc, j));

6570

kfree(*per_cpu_ptr(sdd->sgc, j));

6566

}

6571

}

6567

free_percpu(sdd->sd);

6572

free_percpu(sdd->sd);

6568

sdd->sd = NULL;

6573

sdd->sd = NULL;

6569

free_percpu(sdd->sg);

6574

free_percpu(sdd->sg);

6570

sdd->sg = NULL;

6575

sdd->sg = NULL;

6571

free_percpu(sdd->sgc);

6576

free_percpu(sdd->sgc);

6572

sdd->sgc = NULL;

6577

sdd->sgc = NULL;

6573

}

6578

}

6574

}

6579

}

6575

6580

6576

struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,

6581

struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,

6577

const struct cpumask *cpu_map, struct sched_domain_attr *attr,

6582

const struct cpumask *cpu_map, struct sched_domain_attr *attr,

6578

struct sched_domain *child, int cpu)

6583

struct sched_domain *child, int cpu)

6579

{

6584

{

6580

struct sched_domain *sd = sd_init(tl, cpu);

6585

struct sched_domain *sd = sd_init(tl, cpu);

6581

if (!sd)

6586

if (!sd)

6582

return child;

6587

return child;

6583

6588

6584

cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));

6589

cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));

6585

if (child) {

6590

if (child) {

6586

sd->level = child->level + 1;

6591

sd->level = child->level + 1;

6587

sched_domain_level_max = max(sched_domain_level_max, sd->level);

6592

sched_domain_level_max = max(sched_domain_level_max, sd->level);

6588

child->parent = sd;

6593

child->parent = sd;

6589

sd->child = child;

6594

sd->child = child;

6590

6595

6591

if (!cpumask_subset(sched_domain_span(child),

6596

if (!cpumask_subset(sched_domain_span(child),

6592

sched_domain_span(sd))) {

6597

sched_domain_span(sd))) {

6593

pr_err("BUG: arch topology borken\n");

6598

pr_err("BUG: arch topology borken\n");

6594

#ifdef CONFIG_SCHED_DEBUG

6599

#ifdef CONFIG_SCHED_DEBUG

6595

pr_err(" the %s domain not a subset of the %s domain\n",

6600

pr_err(" the %s domain not a subset of the %s domain\n",

6596

child->name, sd->name);

6601

child->name, sd->name);

6597

#endif

6602

#endif

6598

/* Fixup, ensure @sd has at least @child cpus. */

6603

/* Fixup, ensure @sd has at least @child cpus. */

6599

cpumask_or(sched_domain_span(sd),

6604

cpumask_or(sched_domain_span(sd),

6600

sched_domain_span(sd),

6605

sched_domain_span(sd),

6601

sched_domain_span(child));

6606

sched_domain_span(child));

6602

}

6607

}

6603

6608

6604

}

6609

}

6605

set_domain_attribute(sd, attr);

6610

set_domain_attribute(sd, attr);

6606

6611

6607

return sd;

6612

return sd;

6608

}

6613

}

6609

6614

6610

/*

6615

/*

6611

* Build sched domains for a given set of cpus and attach the sched domains

6616

* Build sched domains for a given set of cpus and attach the sched domains

6612

* to the individual cpus

6617

* to the individual cpus

6613

*/

6618

*/

6614

static int build_sched_domains(const struct cpumask *cpu_map,

6619

static int build_sched_domains(const struct cpumask *cpu_map,

6615

struct sched_domain_attr *attr)

6620

struct sched_domain_attr *attr)

6616

{

6621

{

6617

enum s_alloc alloc_state;

6622

enum s_alloc alloc_state;

6618

struct sched_domain *sd;

6623

struct sched_domain *sd;

6619

struct s_data d;

6624

struct s_data d;

6620

int i, ret = -ENOMEM;

6625

int i, ret = -ENOMEM;

6621

6626

6622

alloc_state = __visit_domain_allocation_hell(&d, cpu_map);

6627

alloc_state = __visit_domain_allocation_hell(&d, cpu_map);

6623

if (alloc_state != sa_rootdomain)

6628

if (alloc_state != sa_rootdomain)

6624

goto error;

6629

goto error;

6625

6630

6626

/* Set up domains for cpus specified by the cpu_map. */

6631

/* Set up domains for cpus specified by the cpu_map. */

6627

for_each_cpu(i, cpu_map) {

6632

for_each_cpu(i, cpu_map) {

6628

struct sched_domain_topology_level *tl;

6633

struct sched_domain_topology_level *tl;

6629

6634

6630

sd = NULL;

6635

sd = NULL;

6631

for_each_sd_topology(tl) {

6636

for_each_sd_topology(tl) {

6632

sd = build_sched_domain(tl, cpu_map, attr, sd, i);

6637

sd = build_sched_domain(tl, cpu_map, attr, sd, i);

6633

if (tl == sched_domain_topology)

6638

if (tl == sched_domain_topology)

6634

*per_cpu_ptr(d.sd, i) = sd;

6639

*per_cpu_ptr(d.sd, i) = sd;

6635

if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))

6640

if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))

6636

sd->flags |= SD_OVERLAP;

6641

sd->flags |= SD_OVERLAP;

6637

if (cpumask_equal(cpu_map, sched_domain_span(sd)))

6642

if (cpumask_equal(cpu_map, sched_domain_span(sd)))

6638

break;

6643

break;

6639

}

6644

}

6640

}

6645

}

6641

6646

6642

/* Build the groups for the domains */

6647

/* Build the groups for the domains */

6643

for_each_cpu(i, cpu_map) {

6648

for_each_cpu(i, cpu_map) {

6644

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

6649

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

6645

sd->span_weight = cpumask_weight(sched_domain_span(sd));

6650

sd->span_weight = cpumask_weight(sched_domain_span(sd));

6646

if (sd->flags & SD_OVERLAP) {

6651

if (sd->flags & SD_OVERLAP) {

6647

if (build_overlap_sched_groups(sd, i))

6652

if (build_overlap_sched_groups(sd, i))

6648

goto error;

6653

goto error;

6649

} else {

6654

} else {

6650

if (build_sched_groups(sd, i))

6655

if (build_sched_groups(sd, i))

6651

goto error;

6656

goto error;

6652

}

6657

}

6653

}

6658

}

6654

}

6659

}

6655

6660

6656

/* Calculate CPU capacity for physical packages and nodes */

6661

/* Calculate CPU capacity for physical packages and nodes */

6657

for (i = nr_cpumask_bits-1; i >= 0; i--) {

6662

for (i = nr_cpumask_bits-1; i >= 0; i--) {

6658

if (!cpumask_test_cpu(i, cpu_map))

6663

if (!cpumask_test_cpu(i, cpu_map))

6659

continue;

6664

continue;

6660

6665

6661

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

6666

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

6662

claim_allocations(i, sd);

6667

claim_allocations(i, sd);

6663

init_sched_groups_capacity(i, sd);

6668

init_sched_groups_capacity(i, sd);

6664

}

6669

}

6665

}

6670

}

6666

6671

6667

/* Attach the domains */

6672

/* Attach the domains */

6668

rcu_read_lock();

6673

rcu_read_lock();

6669

for_each_cpu(i, cpu_map) {

6674

for_each_cpu(i, cpu_map) {

6670

sd = *per_cpu_ptr(d.sd, i);

6675

sd = *per_cpu_ptr(d.sd, i);

6671

cpu_attach_domain(sd, d.rd, i);

6676

cpu_attach_domain(sd, d.rd, i);

6672

}

6677

}

6673

rcu_read_unlock();

6678

rcu_read_unlock();

6674

6679

6675

ret = 0;

6680

ret = 0;

6676

error:

6681

error:

6677

__free_domain_allocs(&d, alloc_state, cpu_map);

6682

__free_domain_allocs(&d, alloc_state, cpu_map);

6678

return ret;

6683

return ret;

6679

}

6684

}

6680

6685

6681

static cpumask_var_t *doms_cur;	/* current sched domains */
static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
				/* attributes of custom domains in 'doms_cur' */

/*
 * Special case: If a kmalloc of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

6692

6697

6693

/*

6698

/*

6694

* arch_update_cpu_topology lets virtualized architectures update the

6699

* arch_update_cpu_topology lets virtualized architectures update the

6695

* cpu core maps. It is supposed to return 1 if the topology changed

6700

* cpu core maps. It is supposed to return 1 if the topology changed

6696

* or 0 if it stayed the same.

6701

* or 0 if it stayed the same.

6697

*/

6702

*/

6698

int __weak arch_update_cpu_topology(void)

6703

int __weak arch_update_cpu_topology(void)

6699

{

6704

{

6700

return 0;

6705

return 0;

6701

}

6706

}

6702

6707

6703

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)

6708

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)

6704

{

6709

{

6705

int i;

6710

int i;

6706

cpumask_var_t *doms;

6711

cpumask_var_t *doms;

6707

6712

6708

doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);

6713

doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);

6709

if (!doms)

6714

if (!doms)

6710

return NULL;

6715

return NULL;

6711

for (i = 0; i < ndoms; i++) {

6716

for (i = 0; i < ndoms; i++) {

6712

if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {

6717

if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {

6713

free_sched_domains(doms, i);

6718

free_sched_domains(doms, i);

6714

return NULL;

6719

return NULL;

6715

}

6720

}

6716

}

6721

}

6717

return doms;

6722

return doms;

6718

}

6723

}

6719

6724

6720

/*
 * Release the first @ndoms cpumasks of @doms and then the array itself.
 */
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int i;
	for (i = 0; i < ndoms; i++)
		free_cpumask_var(doms[i]);
	kfree(doms);
}

6727

6732

6728

/*

6733

/*

6729

* Set up scheduler domains and groups. Callers must hold the hotplug lock.

6734

* Set up scheduler domains and groups. Callers must hold the hotplug lock.

6730

* For now this just excludes isolated cpus, but could be used to

6735

* For now this just excludes isolated cpus, but could be used to

6731

* exclude other special cases in the future.

6736

* exclude other special cases in the future.

6732

*/

6737

*/

6733

static int init_sched_domains(const struct cpumask *cpu_map)

6738

static int init_sched_domains(const struct cpumask *cpu_map)

6734

{

6739

{

6735

int err;

6740

int err;

6736

6741

6737

arch_update_cpu_topology();

6742

arch_update_cpu_topology();

6738

ndoms_cur = 1;

6743

ndoms_cur = 1;

6739

doms_cur = alloc_sched_domains(ndoms_cur);

6744

doms_cur = alloc_sched_domains(ndoms_cur);

6740

if (!doms_cur)

6745

if (!doms_cur)

6741

doms_cur = &fallback_doms;

6746

doms_cur = &fallback_doms;

6742

cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);

6747

cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);

6743

err = build_sched_domains(doms_cur[0], NULL);

6748

err = build_sched_domains(doms_cur[0], NULL);

6744

register_sched_domain_sysctl();

6749

register_sched_domain_sysctl();

6745

6750

6746

return err;

6751

return err;

6747

}

6752

}

6748

6753

6749

/*

6754

/*

6750

* Detach sched domains from a group of cpus specified in cpu_map

6755

* Detach sched domains from a group of cpus specified in cpu_map

6751

* These cpus will now be attached to the NULL domain

6756

* These cpus will now be attached to the NULL domain

6752

*/

6757

*/

6753

static void detach_destroy_domains(const struct cpumask *cpu_map)

6758

static void detach_destroy_domains(const struct cpumask *cpu_map)

6754

{

6759

{

6755

int i;

6760

int i;

6756

6761

6757

rcu_read_lock();

6762

rcu_read_lock();

6758

for_each_cpu(i, cpu_map)

6763

for_each_cpu(i, cpu_map)

6759

cpu_attach_domain(NULL, &def_root_domain, i);

6764

cpu_attach_domain(NULL, &def_root_domain, i);

6760

rcu_read_unlock();

6765

rcu_read_unlock();

6761

}

6766

}

6762

6767

6763

/* handle null as "default" */

6768

/* handle null as "default" */

6764

static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,

6769

static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,

6765

struct sched_domain_attr *new, int idx_new)

6770

struct sched_domain_attr *new, int idx_new)

6766

{

6771

{

6767

struct sched_domain_attr tmp;

6772

struct sched_domain_attr tmp;

6768

6773

6769

/* fast path */

6774

/* fast path */

6770

if (!new && !cur)

6775

if (!new && !cur)

6771

return 1;

6776

return 1;

6772

6777

6773

tmp = SD_ATTR_INIT;

6778

tmp = SD_ATTR_INIT;

6774

return !memcmp(cur ? (cur + idx_cur) : &tmp,

6779

return !memcmp(cur ? (cur + idx_cur) : &tmp,

6775

new ? (new + idx_new) : &tmp,

6780

new ? (new + idx_new) : &tmp,

6776

sizeof(struct sched_domain_attr));

6781

sizeof(struct sched_domain_attr));

6777

}

6782

}

6778

6783

6779

/*

6784

/*

6780

* Partition sched domains as specified by the 'ndoms_new'

6785

* Partition sched domains as specified by the 'ndoms_new'

6781

* cpumasks in the array doms_new[] of cpumasks. This compares

6786

* cpumasks in the array doms_new[] of cpumasks. This compares

6782

* doms_new[] to the current sched domain partitioning, doms_cur[].

6787

* doms_new[] to the current sched domain partitioning, doms_cur[].

6783

* It destroys each deleted domain and builds each new domain.

6788

* It destroys each deleted domain and builds each new domain.

6784

*

6789

*

6785

* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.

6790

* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.

6786

* The masks don't intersect (don't overlap.) We should setup one

6791

* The masks don't intersect (don't overlap.) We should setup one

6787

* sched domain for each mask. CPUs not in any of the cpumasks will

6792

* sched domain for each mask. CPUs not in any of the cpumasks will

6788

* not be load balanced. If the same cpumask appears both in the

6793

* not be load balanced. If the same cpumask appears both in the

6789

* current 'doms_cur' domains and in the new 'doms_new', we can leave

6794

* current 'doms_cur' domains and in the new 'doms_new', we can leave

6790

* it as it is.

6795

* it as it is.

6791

*

6796

*

6792

* The passed in 'doms_new' should be allocated using

6797

* The passed in 'doms_new' should be allocated using

6793

* alloc_sched_domains. This routine takes ownership of it and will

6798

* alloc_sched_domains. This routine takes ownership of it and will

6794

* free_sched_domains it when done with it. If the caller failed the

6799

* free_sched_domains it when done with it. If the caller failed the

6795

* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,

6800

* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,

6796

* and partition_sched_domains() will fallback to the single partition

6801

* and partition_sched_domains() will fallback to the single partition

6797

* 'fallback_doms', it also forces the domains to be rebuilt.

6802

* 'fallback_doms', it also forces the domains to be rebuilt.

6798

*

6803

*

6799

* If doms_new == NULL it will be replaced with cpu_online_mask.

6804

* If doms_new == NULL it will be replaced with cpu_online_mask.

6800

* ndoms_new == 0 is a special case for destroying existing domains,

6805

* ndoms_new == 0 is a special case for destroying existing domains,

6801

* and it will not create the default domain.

6806

* and it will not create the default domain.

6802

*

6807

*

6803

* Call with hotplug lock held

6808

* Call with hotplug lock held

6804

*/

6809

*/

6805

void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],

6810

void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],

6806

struct sched_domain_attr *dattr_new)

6811

struct sched_domain_attr *dattr_new)

6807

{

6812

{

6808

int i, j, n;

6813

int i, j, n;

6809

int new_topology;

6814

int new_topology;

6810

6815

6811

mutex_lock(&sched_domains_mutex);

6816

mutex_lock(&sched_domains_mutex);

6812

6817

6813

/* always unregister in case we don't destroy any domains */

6818

/* always unregister in case we don't destroy any domains */

6814

unregister_sched_domain_sysctl();

6819

unregister_sched_domain_sysctl();

6815

6820

6816

/* Let architecture update cpu core mappings. */

6821

/* Let architecture update cpu core mappings. */

6817

new_topology = arch_update_cpu_topology();

6822

new_topology = arch_update_cpu_topology();

6818

6823

6819

n = doms_new ? ndoms_new : 0;

6824

n = doms_new ? ndoms_new : 0;

6820

6825

6821

/* Destroy deleted domains */

6826

/* Destroy deleted domains */

6822

for (i = 0; i < ndoms_cur; i++) {

6827

for (i = 0; i < ndoms_cur; i++) {

6823

for (j = 0; j < n && !new_topology; j++) {

6828

for (j = 0; j < n && !new_topology; j++) {

6824

if (cpumask_equal(doms_cur[i], doms_new[j])

6829

if (cpumask_equal(doms_cur[i], doms_new[j])

6825

&& dattrs_equal(dattr_cur, i, dattr_new, j))

6830

&& dattrs_equal(dattr_cur, i, dattr_new, j))

6826

goto match1;

6831

goto match1;

6827

}

6832

}

6828

/* no match - a current sched domain not in new doms_new[] */

6833

/* no match - a current sched domain not in new doms_new[] */

6829

detach_destroy_domains(doms_cur[i]);

6834

detach_destroy_domains(doms_cur[i]);

6830

match1:

6835

match1:

6831

;

6836

;

6832

}

6837

}

6833

6838

6834

n = ndoms_cur;

6839

n = ndoms_cur;

6835

if (doms_new == NULL) {

6840

if (doms_new == NULL) {

6836

n = 0;

6841

n = 0;

6837

doms_new = &fallback_doms;

6842

doms_new = &fallback_doms;

6838

cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);

6843

cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);

6839

WARN_ON_ONCE(dattr_new);

6844

WARN_ON_ONCE(dattr_new);

6840

}

6845

}

6841

6846

6842

/* Build new domains */

6847

/* Build new domains */

6843

for (i = 0; i < ndoms_new; i++) {

6848

for (i = 0; i < ndoms_new; i++) {

6844

for (j = 0; j < n && !new_topology; j++) {

6849

for (j = 0; j < n && !new_topology; j++) {

6845

if (cpumask_equal(doms_new[i], doms_cur[j])

6850

if (cpumask_equal(doms_new[i], doms_cur[j])

6846

&& dattrs_equal(dattr_new, i, dattr_cur, j))

6851

&& dattrs_equal(dattr_new, i, dattr_cur, j))

6847

goto match2;

6852

goto match2;

6848

}

6853

}

6849

/* no match - add a new doms_new */

6854

/* no match - add a new doms_new */

6850

build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);

6855

build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);

6851

match2:

6856

match2:

6852

;

6857

;

6853

}

6858

}

6854

6859

6855

/* Remember the new sched domains */

6860

/* Remember the new sched domains */

6856

if (doms_cur != &fallback_doms)

6861

if (doms_cur != &fallback_doms)

6857

free_sched_domains(doms_cur, ndoms_cur);

6862

free_sched_domains(doms_cur, ndoms_cur);

6858

kfree(dattr_cur); /* kfree(NULL) is safe */

6863

kfree(dattr_cur); /* kfree(NULL) is safe */

6859

doms_cur = doms_new;

6864

doms_cur = doms_new;

6860

dattr_cur = dattr_new;

6865

dattr_cur = dattr_new;

6861

ndoms_cur = ndoms_new;

6866

ndoms_cur = ndoms_new;

6862

6867

6863

register_sched_domain_sysctl();

6868

register_sched_domain_sysctl();

6864

6869

6865

mutex_unlock(&sched_domains_mutex);

6870

mutex_unlock(&sched_domains_mutex);

6866

}

6871

}

6867

6872

6868

static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */

6873

static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */

6869

6874

6870

/*

6875

/*

6871

* Update cpusets according to cpu_active mask. If cpusets are

6876

* Update cpusets according to cpu_active mask. If cpusets are

6872

* disabled, cpuset_update_active_cpus() becomes a simple wrapper

6877

* disabled, cpuset_update_active_cpus() becomes a simple wrapper

6873

* around partition_sched_domains().

6878

* around partition_sched_domains().

6874

*

6879

*

6875

* If we come here as part of a suspend/resume, don't touch cpusets because we

6880

* If we come here as part of a suspend/resume, don't touch cpusets because we

6876

* want to restore it back to its original state upon resume anyway.

6881

* want to restore it back to its original state upon resume anyway.

6877

*/

6882

*/

6878

static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,

6883

static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,

6879

void *hcpu)

6884

void *hcpu)

6880

{

6885

{

6881

switch (action) {

6886

switch (action) {

6882

case CPU_ONLINE_FROZEN:

6887

case CPU_ONLINE_FROZEN:

6883

case CPU_DOWN_FAILED_FROZEN:

6888

case CPU_DOWN_FAILED_FROZEN:

6884

6889

6885

/*

6890

/*

6886

* num_cpus_frozen tracks how many CPUs are involved in suspend

6891

* num_cpus_frozen tracks how many CPUs are involved in suspend

6887

* resume sequence. As long as this is not the last online

6892

* resume sequence. As long as this is not the last online

6888

* operation in the resume sequence, just build a single sched

6893

* operation in the resume sequence, just build a single sched

6889

* domain, ignoring cpusets.

6894

* domain, ignoring cpusets.

6890

*/

6895

*/

6891

num_cpus_frozen--;

6896

num_cpus_frozen--;

6892

if (likely(num_cpus_frozen)) {

6897

if (likely(num_cpus_frozen)) {

6893

partition_sched_domains(1, NULL, NULL);

6898

partition_sched_domains(1, NULL, NULL);

6894

break;

6899

break;

6895

}

6900

}

6896

6901

6897

/*

6902

/*

6898

* This is the last CPU online operation. So fall through and

6903

* This is the last CPU online operation. So fall through and

6899

* restore the original sched domains by considering the

6904

* restore the original sched domains by considering the

6900

* cpuset configurations.

6905

* cpuset configurations.

6901

*/

6906

*/

6902

6907

6903

case CPU_ONLINE:

6908

case CPU_ONLINE:

6904

case CPU_DOWN_FAILED:

6909

case CPU_DOWN_FAILED:

6905

cpuset_update_active_cpus(true);

6910

cpuset_update_active_cpus(true);

6906

break;

6911

break;

6907

default:

6912

default:

6908

return NOTIFY_DONE;

6913

return NOTIFY_DONE;

6909

}

6914

}

6910

return NOTIFY_OK;

6915

return NOTIFY_OK;

6911

}

6916

}

6912

6917

6913

static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,

6918

static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,

6914

void *hcpu)

6919

void *hcpu)

6915

{

6920

{

6916

switch (action) {

6921

switch (action) {

6917

case CPU_DOWN_PREPARE:

6922

case CPU_DOWN_PREPARE:

6918

cpuset_update_active_cpus(false);

6923

cpuset_update_active_cpus(false);

6919

break;

6924

break;

6920

case CPU_DOWN_PREPARE_FROZEN:

6925

case CPU_DOWN_PREPARE_FROZEN:

6921

num_cpus_frozen++;

6926

num_cpus_frozen++;

6922

partition_sched_domains(1, NULL, NULL);

6927

partition_sched_domains(1, NULL, NULL);

6923

break;

6928

break;

6924

default:

6929

default:

6925

return NOTIFY_DONE;

6930

return NOTIFY_DONE;

6926

}

6931

}

6927

return NOTIFY_OK;

6932

return NOTIFY_OK;

6928

}

6933

}

6929

6934

6930

/*
 * SMP variant: build the initial sched domains over cpu_active_mask,
 * register the scheduler's cpu-hotplug notifiers, move the calling task
 * onto the non-isolated cpus and initialise the RT and deadline classes.
 *
 * NOTE(review): the return values of both alloc_cpumask_var() calls and of
 * init_sched_domains() are not checked here.
 */
void __init sched_init_smp(void)
{
	cpumask_var_t non_isolated_cpus;

	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	sched_init_numa();

	/*
	 * There's no userspace yet to cause hotplug operations; hence all the
	 * cpu masks are stable and all blatant races in the below code cannot
	 * happen.
	 */
	mutex_lock(&sched_domains_mutex);
	init_sched_domains(cpu_active_mask);
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
	/* Every cpu isolated: fall back to the cpu we are running on. */
	if (cpumask_empty(non_isolated_cpus))
		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
	mutex_unlock(&sched_domains_mutex);

	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);

	init_hrtick();

	/* Move init over to a non-isolated CPU */
	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
		BUG();
	sched_init_granularity();
	free_cpumask_var(non_isolated_cpus);

	init_sched_rt_class();
	init_sched_dl_class();
}
#else
/* !CONFIG_SMP variant: only the scheduling granularity needs setting up. */
void __init sched_init_smp(void)
{
	sched_init_granularity();
}
#endif /* CONFIG_SMP */

6972

6977

6973

/* Timer-migration sysctl value; initialised to 1. */
const_debug unsigned int sysctl_timer_migration = 1;

6974

6979

6975

int in_sched_functions(unsigned long addr)

6980

int in_sched_functions(unsigned long addr)

6976

{

6981

{

6977

return in_lock_functions(addr) ||

6982

return in_lock_functions(addr) ||

6978

(addr >= (unsigned long)__sched_text_start

6983

(addr >= (unsigned long)__sched_text_start

6979

&& addr < (unsigned long)__sched_text_end);

6984

&& addr < (unsigned long)__sched_text_end);

6980

}

6985

}

6981

6986

6982

#ifdef CONFIG_CGROUP_SCHED
/*
 * Default task group.
 * Every task in system belongs to this group at bootup.
 */
struct task_group root_task_group;
LIST_HEAD(task_groups);
#endif

6990

6995

6991

/* Per-cpu cpumask declared here; its definition is not in this part of the file. */
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);

6992

6997

6993

void __init sched_init(void)

6998

void __init sched_init(void)

6994

{

6999

{

6995

int i, j;

7000

int i, j;

6996

unsigned long alloc_size = 0, ptr;

7001

unsigned long alloc_size = 0, ptr;

6997

7002

6998

#ifdef CONFIG_FAIR_GROUP_SCHED

7003

#ifdef CONFIG_FAIR_GROUP_SCHED

6999

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

7004

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

7000

#endif

7005

#endif

7001

#ifdef CONFIG_RT_GROUP_SCHED

7006

#ifdef CONFIG_RT_GROUP_SCHED

7002

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

7007

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

7003

#endif

7008

#endif

7004

#ifdef CONFIG_CPUMASK_OFFSTACK

7009

#ifdef CONFIG_CPUMASK_OFFSTACK

7005

alloc_size += num_possible_cpus() * cpumask_size();

7010

alloc_size += num_possible_cpus() * cpumask_size();

7006

#endif

7011

#endif

7007

if (alloc_size) {

7012

if (alloc_size) {

7008

ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

7013

ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

7009

7014

7010

#ifdef CONFIG_FAIR_GROUP_SCHED

7015

#ifdef CONFIG_FAIR_GROUP_SCHED

7011

root_task_group.se = (struct sched_entity **)ptr;

7016

root_task_group.se = (struct sched_entity **)ptr;

7012

ptr += nr_cpu_ids * sizeof(void **);

7017

ptr += nr_cpu_ids * sizeof(void **);

7013

7018

7014

root_task_group.cfs_rq = (struct cfs_rq **)ptr;

7019

root_task_group.cfs_rq = (struct cfs_rq **)ptr;

7015

ptr += nr_cpu_ids * sizeof(void **);

7020

ptr += nr_cpu_ids * sizeof(void **);

7016

7021

7017

#endif /* CONFIG_FAIR_GROUP_SCHED */

7022

#endif /* CONFIG_FAIR_GROUP_SCHED */

7018

#ifdef CONFIG_RT_GROUP_SCHED

7023

#ifdef CONFIG_RT_GROUP_SCHED

7019

root_task_group.rt_se = (struct sched_rt_entity **)ptr;

7024

root_task_group.rt_se = (struct sched_rt_entity **)ptr;

7020

ptr += nr_cpu_ids * sizeof(void **);

7025

ptr += nr_cpu_ids * sizeof(void **);

7021

7026

7022

root_task_group.rt_rq = (struct rt_rq **)ptr;

7027

root_task_group.rt_rq = (struct rt_rq **)ptr;

7023

ptr += nr_cpu_ids * sizeof(void **);

7028

ptr += nr_cpu_ids * sizeof(void **);

7024

7029

7025

#endif /* CONFIG_RT_GROUP_SCHED */

7030

#endif /* CONFIG_RT_GROUP_SCHED */

7026

#ifdef CONFIG_CPUMASK_OFFSTACK

7031

#ifdef CONFIG_CPUMASK_OFFSTACK

7027

for_each_possible_cpu(i) {

7032

for_each_possible_cpu(i) {

7028

per_cpu(load_balance_mask, i) = (void *)ptr;

7033

per_cpu(load_balance_mask, i) = (void *)ptr;

7029

ptr += cpumask_size();

7034

ptr += cpumask_size();

7030

}

7035

}

7031

#endif /* CONFIG_CPUMASK_OFFSTACK */

7036

#endif /* CONFIG_CPUMASK_OFFSTACK */

7032

}

7037

}

7033

7038

7034

init_rt_bandwidth(&def_rt_bandwidth,

7039

init_rt_bandwidth(&def_rt_bandwidth,

7035

global_rt_period(), global_rt_runtime());

7040

global_rt_period(), global_rt_runtime());

7036

init_dl_bandwidth(&def_dl_bandwidth,

7041

init_dl_bandwidth(&def_dl_bandwidth,

7037

global_rt_period(), global_rt_runtime());

7042

global_rt_period(), global_rt_runtime());

7038

7043

7039

#ifdef CONFIG_SMP

7044

#ifdef CONFIG_SMP

7040

init_defrootdomain();

7045

init_defrootdomain();

7041

#endif

7046

#endif

7042

7047

7043

#ifdef CONFIG_RT_GROUP_SCHED

7048

#ifdef CONFIG_RT_GROUP_SCHED

7044

init_rt_bandwidth(&root_task_group.rt_bandwidth,

7049

init_rt_bandwidth(&root_task_group.rt_bandwidth,

7045

global_rt_period(), global_rt_runtime());

7050

global_rt_period(), global_rt_runtime());

7046

#endif /* CONFIG_RT_GROUP_SCHED */

7051

#endif /* CONFIG_RT_GROUP_SCHED */

7047

7052

7048

#ifdef CONFIG_CGROUP_SCHED

7053

#ifdef CONFIG_CGROUP_SCHED

7049

list_add(&root_task_group.list, &task_groups);

7054

list_add(&root_task_group.list, &task_groups);

7050

INIT_LIST_HEAD(&root_task_group.children);

7055

INIT_LIST_HEAD(&root_task_group.children);

7051

INIT_LIST_HEAD(&root_task_group.siblings);

7056

INIT_LIST_HEAD(&root_task_group.siblings);

7052

autogroup_init(&init_task);

7057

autogroup_init(&init_task);

7053

7058

7054

#endif /* CONFIG_CGROUP_SCHED */

7059

#endif /* CONFIG_CGROUP_SCHED */

7055

7060

7056

for_each_possible_cpu(i) {

7061

for_each_possible_cpu(i) {

7057

struct rq *rq;

7062

struct rq *rq;

7058

7063

7059

rq = cpu_rq(i);

7064

rq = cpu_rq(i);

7060

raw_spin_lock_init(&rq->lock);

7065

raw_spin_lock_init(&rq->lock);

7061

rq->nr_running = 0;

7066

rq->nr_running = 0;

7062

rq->calc_load_active = 0;

7067

rq->calc_load_active = 0;

7063

rq->calc_load_update = jiffies + LOAD_FREQ;

7068

rq->calc_load_update = jiffies + LOAD_FREQ;

7064

init_cfs_rq(&rq->cfs);

7069

init_cfs_rq(&rq->cfs);

7065

init_rt_rq(&rq->rt, rq);

7070

init_rt_rq(&rq->rt, rq);

7066

init_dl_rq(&rq->dl, rq);

7071

init_dl_rq(&rq->dl, rq);

7067

#ifdef CONFIG_FAIR_GROUP_SCHED

7072

#ifdef CONFIG_FAIR_GROUP_SCHED

7068

root_task_group.shares = ROOT_TASK_GROUP_LOAD;

7073

root_task_group.shares = ROOT_TASK_GROUP_LOAD;

7069

INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);

7074

INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);

7070

/*

7075

/*

7071

* How much cpu bandwidth does root_task_group get?

7076

* How much cpu bandwidth does root_task_group get?

7072

*

7077

*

7073

* In case of task-groups formed thr' the cgroup filesystem, it

7078

* In case of task-groups formed thr' the cgroup filesystem, it

7074

* gets 100% of the cpu resources in the system. This overall

7079

* gets 100% of the cpu resources in the system. This overall

7075

* system cpu resource is divided among the tasks of

7080

* system cpu resource is divided among the tasks of

7076

* root_task_group and its child task-groups in a fair manner,

7081

* root_task_group and its child task-groups in a fair manner,

7077

* based on each entity's (task or task-group's) weight

7082

* based on each entity's (task or task-group's) weight

7078

* (se->load.weight).

7083

* (se->load.weight).

7079

*

7084

*

7080

* In other words, if root_task_group has 10 tasks of weight

7085

* In other words, if root_task_group has 10 tasks of weight

7081

* 1024) and two child groups A0 and A1 (of weight 1024 each),

7086

* 1024) and two child groups A0 and A1 (of weight 1024 each),

7082

* then A0's share of the cpu resource is:

7087

* then A0's share of the cpu resource is:

7083

*

7088

*

7084

* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%

7089

* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%

7085

*

7090

*

7086

* We achieve this by letting root_task_group's tasks sit

7091

* We achieve this by letting root_task_group's tasks sit

7087

* directly in rq->cfs (i.e root_task_group->se[] = NULL).

7092

* directly in rq->cfs (i.e root_task_group->se[] = NULL).

7088

*/

7093

*/

7089

init_cfs_bandwidth(&root_task_group.cfs_bandwidth);

7094

init_cfs_bandwidth(&root_task_group.cfs_bandwidth);

7090

init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);

7095

init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);

7091

#endif /* CONFIG_FAIR_GROUP_SCHED */

7096

#endif /* CONFIG_FAIR_GROUP_SCHED */

7092

7097

7093

rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;

7098

rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;

7094

#ifdef CONFIG_RT_GROUP_SCHED

7099

#ifdef CONFIG_RT_GROUP_SCHED

7095

init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);

7100

init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);

7096

#endif

7101

#endif

7097

7102

7098

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)

7103

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)

7099

rq->cpu_load[j] = 0;

7104

rq->cpu_load[j] = 0;

7100

7105

7101

rq->last_load_update_tick = jiffies;

7106

rq->last_load_update_tick = jiffies;

7102

7107

7103

#ifdef CONFIG_SMP

7108

#ifdef CONFIG_SMP

7104

rq->sd = NULL;

7109

rq->sd = NULL;

7105

rq->rd = NULL;

7110

rq->rd = NULL;

7106

rq->cpu_capacity = SCHED_CAPACITY_SCALE;

7111

rq->cpu_capacity = SCHED_CAPACITY_SCALE;

7107

rq->post_schedule = 0;

7112

rq->post_schedule = 0;

7108

rq->active_balance = 0;

7113

rq->active_balance = 0;

7109

rq->next_balance = jiffies;

7114

rq->next_balance = jiffies;

7110

rq->push_cpu = 0;

7115

rq->push_cpu = 0;

7111

rq->cpu = i;

7116

rq->cpu = i;

7112

rq->online = 0;

7117

rq->online = 0;

7113

rq->idle_stamp = 0;

7118

rq->idle_stamp = 0;

7114

rq->avg_idle = 2*sysctl_sched_migration_cost;

7119

rq->avg_idle = 2*sysctl_sched_migration_cost;

7115

rq->max_idle_balance_cost = sysctl_sched_migration_cost;

7120

rq->max_idle_balance_cost = sysctl_sched_migration_cost;

7116

7121

7117

INIT_LIST_HEAD(&rq->cfs_tasks);

7122

INIT_LIST_HEAD(&rq->cfs_tasks);

7118

7123

7119

rq_attach_root(rq, &def_root_domain);

7124

rq_attach_root(rq, &def_root_domain);

7120

#ifdef CONFIG_NO_HZ_COMMON

7125

#ifdef CONFIG_NO_HZ_COMMON

7121

rq->nohz_flags = 0;

7126

rq->nohz_flags = 0;

7122

#endif

7127

#endif

7123

#ifdef CONFIG_NO_HZ_FULL

7128

#ifdef CONFIG_NO_HZ_FULL

7124

rq->last_sched_tick = 0;

7129

rq->last_sched_tick = 0;

7125

#endif

7130

#endif

7126

#endif

7131

#endif

7127

init_rq_hrtick(rq);

7132

init_rq_hrtick(rq);

7128

atomic_set(&rq->nr_iowait, 0);

7133

atomic_set(&rq->nr_iowait, 0);

7129

}

7134

}

7130

7135

7131

set_load_weight(&init_task);

7136

set_load_weight(&init_task);

7132

7137

7133

#ifdef CONFIG_PREEMPT_NOTIFIERS

7138

#ifdef CONFIG_PREEMPT_NOTIFIERS

7134

INIT_HLIST_HEAD(&init_task.preempt_notifiers);

7139

INIT_HLIST_HEAD(&init_task.preempt_notifiers);

7135

#endif

7140

#endif

7136

7141

7137

/*

7142

/*

7138

* The boot idle thread does lazy MMU switching as well:

7143

* The boot idle thread does lazy MMU switching as well:

7139

*/

7144

*/

7140

atomic_inc(&init_mm.mm_count);

7145

atomic_inc(&init_mm.mm_count);

7141

enter_lazy_tlb(&init_mm, current);

7146

enter_lazy_tlb(&init_mm, current);

7142

7147

7143

/*

7148

/*

7144

* Make us the idle thread. Technically, schedule() should not be

7149

* Make us the idle thread. Technically, schedule() should not be

7145

* called from this thread, however somewhere below it might be,

7150

* called from this thread, however somewhere below it might be,

7146

* but because we are the idle thread, we just pick up running again

7151

* but because we are the idle thread, we just pick up running again

7147

* when this runqueue becomes "idle".

7152

* when this runqueue becomes "idle".

7148

*/

7153

*/

7149

init_idle(current, smp_processor_id());

7154

init_idle(current, smp_processor_id());

7150

7155

7151

calc_load_update = jiffies + LOAD_FREQ;

7156

calc_load_update = jiffies + LOAD_FREQ;

7152

7157

7153

/*

7158

/*

7154

* During early bootup we pretend to be a normal task:

7159

* During early bootup we pretend to be a normal task:

7155

*/

7160

*/

7156

current->sched_class = &fair_sched_class;

7161

current->sched_class = &fair_sched_class;

7157

7162

7158

#ifdef CONFIG_SMP

7163

#ifdef CONFIG_SMP

7159

zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);

7164

zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);

7160

/* May be allocated at isolcpus cmdline parse time */

7165

/* May be allocated at isolcpus cmdline parse time */

7161

if (cpu_isolated_map == NULL)

7166

if (cpu_isolated_map == NULL)

7162

zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);

7167

zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);

7163

idle_thread_set_boot_cpu();

7168

idle_thread_set_boot_cpu();

7164

set_cpu_rq_start_time();

7169

set_cpu_rq_start_time();

7165

#endif

7170

#endif

7166

init_sched_fair_class();

7171

init_sched_fair_class();

7167

7172

7168

scheduler_running = 1;

7173

scheduler_running = 1;

7169

}

7174

}

7170

7175

7171

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP

7176

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP

7172

static inline int preempt_count_equals(int preempt_offset)

7177

static inline int preempt_count_equals(int preempt_offset)

7173

{

7178

{

7174

int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();

7179

int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();

7175

7180

7176

return (nested == preempt_offset);

7181

return (nested == preempt_offset);

7177

}

7182

}

7178

7183

7179

void __might_sleep(const char *file, int line, int preempt_offset)

7184

void __might_sleep(const char *file, int line, int preempt_offset)

7180

{

7185

{

7181

static unsigned long prev_jiffy; /* ratelimiting */

7186

static unsigned long prev_jiffy; /* ratelimiting */

7182

7187

7183

rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */

7188

rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */

7184

if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&

7189

if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&

7185

!is_idle_task(current)) ||

7190

!is_idle_task(current)) ||

7186

system_state != SYSTEM_RUNNING || oops_in_progress)

7191

system_state != SYSTEM_RUNNING || oops_in_progress)

7187

return;

7192

return;

7188

if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)

7193

if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)

7189

return;

7194

return;

7190

prev_jiffy = jiffies;

7195

prev_jiffy = jiffies;

7191

7196

7192

printk(KERN_ERR

7197

printk(KERN_ERR

7193

"BUG: sleeping function called from invalid context at %s:%d\n",

7198

"BUG: sleeping function called from invalid context at %s:%d\n",

7194

file, line);

7199

file, line);

7195

printk(KERN_ERR

7200

printk(KERN_ERR

7196

"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",

7201

"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",

7197

in_atomic(), irqs_disabled(),

7202

in_atomic(), irqs_disabled(),

7198

current->pid, current->comm);

7203

current->pid, current->comm);

7199

7204

7200

debug_show_held_locks(current);

7205

debug_show_held_locks(current);

7201

if (irqs_disabled())

7206

if (irqs_disabled())

7202

print_irqtrace_events(current);

7207

print_irqtrace_events(current);

7203

#ifdef CONFIG_DEBUG_PREEMPT

7208

#ifdef CONFIG_DEBUG_PREEMPT

7204

if (!preempt_count_equals(preempt_offset)) {

7209

if (!preempt_count_equals(preempt_offset)) {

7205

pr_err("Preemption disabled at:");

7210

pr_err("Preemption disabled at:");

7206

print_ip_sym(current->preempt_disable_ip);

7211

print_ip_sym(current->preempt_disable_ip);

7207

pr_cont("\n");

7212

pr_cont("\n");

7208

}

7213

}

7209

#endif

7214

#endif

7210

dump_stack();

7215

dump_stack();

7211

}

7216

}

7212

EXPORT_SYMBOL(__might_sleep);

7217

EXPORT_SYMBOL(__might_sleep);

7213

#endif

7218

#endif

7214

7219

7215

#ifdef CONFIG_MAGIC_SYSRQ

7220

#ifdef CONFIG_MAGIC_SYSRQ

7216

static void normalize_task(struct rq *rq, struct task_struct *p)

7221

static void normalize_task(struct rq *rq, struct task_struct *p)

7217

{

7222

{

7218

const struct sched_class *prev_class = p->sched_class;

7223

const struct sched_class *prev_class = p->sched_class;

7219

struct sched_attr attr = {

7224

struct sched_attr attr = {

7220

.sched_policy = SCHED_NORMAL,

7225

.sched_policy = SCHED_NORMAL,

7221

};

7226

};

7222

int old_prio = p->prio;

7227

int old_prio = p->prio;

7223

int queued;

7228

int queued;

7224

7229

7225

queued = task_on_rq_queued(p);

7230

queued = task_on_rq_queued(p);

7226

if (queued)

7231

if (queued)

7227

dequeue_task(rq, p, 0);

7232

dequeue_task(rq, p, 0);

7228

__setscheduler(rq, p, &attr);

7233

__setscheduler(rq, p, &attr);

7229

if (queued) {

7234

if (queued) {

7230

enqueue_task(rq, p, 0);

7235

enqueue_task(rq, p, 0);

7231

resched_curr(rq);

7236

resched_curr(rq);

7232

}

7237

}

7233

7238

7234

check_class_changed(rq, p, prev_class, old_prio);

7239

check_class_changed(rq, p, prev_class, old_prio);

7235

}

7240

}

7236

7241

7237

void normalize_rt_tasks(void)

7242

void normalize_rt_tasks(void)

7238

{

7243

{

7239

struct task_struct *g, *p;

7244

struct task_struct *g, *p;

7240

unsigned long flags;

7245

unsigned long flags;

7241

struct rq *rq;

7246

struct rq *rq;

7242

7247

7243

read_lock(&tasklist_lock);

7248

read_lock(&tasklist_lock);

7244

for_each_process_thread(g, p) {

7249

for_each_process_thread(g, p) {

7245

/*

7250

/*

7246

* Only normalize user tasks:

7251

* Only normalize user tasks:

7247

*/

7252

*/

7248

if (p->flags & PF_KTHREAD)

7253

if (p->flags & PF_KTHREAD)

7249

continue;

7254

continue;

7250

7255

7251

p->se.exec_start = 0;

7256

p->se.exec_start = 0;

7252

#ifdef CONFIG_SCHEDSTATS

7257

#ifdef CONFIG_SCHEDSTATS

7253

p->se.statistics.wait_start = 0;

7258

p->se.statistics.wait_start = 0;

7254

p->se.statistics.sleep_start = 0;

7259

p->se.statistics.sleep_start = 0;

7255

p->se.statistics.block_start = 0;

7260

p->se.statistics.block_start = 0;

7256

#endif

7261

#endif

7257

7262

7258

if (!dl_task(p) && !rt_task(p)) {

7263

if (!dl_task(p) && !rt_task(p)) {

7259

/*

7264

/*

7260

* Renice negative nice level userspace

7265

* Renice negative nice level userspace

7261

* tasks back to 0:

7266

* tasks back to 0:

7262

*/

7267

*/

7263

if (task_nice(p) < 0)

7268

if (task_nice(p) < 0)

7264

set_user_nice(p, 0);

7269

set_user_nice(p, 0);

7265

continue;

7270

continue;

7266

}

7271

}

7267

7272

7268

rq = task_rq_lock(p, &flags);

7273

rq = task_rq_lock(p, &flags);

7269

normalize_task(rq, p);

7274

normalize_task(rq, p);

7270

task_rq_unlock(rq, p, &flags);

7275

task_rq_unlock(rq, p, &flags);

7271

}

7276

}

7272

read_unlock(&tasklist_lock);

7277

read_unlock(&tasklist_lock);

7273

}

7278

}

7274

7279

7275

#endif /* CONFIG_MAGIC_SYSRQ */

7280

#endif /* CONFIG_MAGIC_SYSRQ */

7276

7281

7277

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)

7282

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)

7278

/*

7283

/*

7279

* These functions are only useful for the IA64 MCA handling, or kdb.

7284

* These functions are only useful for the IA64 MCA handling, or kdb.

7280

*

7285

*

7281

* They can only be called when the whole system has been

7286

* They can only be called when the whole system has been

7282

* stopped - every CPU needs to be quiescent, and no scheduling

7287

* stopped - every CPU needs to be quiescent, and no scheduling

7283

* activity can take place. Using them for anything else would

7288

* activity can take place. Using them for anything else would

7284

* be a serious bug, and as a result, they aren't even visible

7289

* be a serious bug, and as a result, they aren't even visible

7285

* under any other configuration.

7290

* under any other configuration.

7286

*/

7291

*/

7287

7292

7288

/**
 * curr_task - look up the task currently running on a cpu.
 * @cpu: the processor in question.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The current task for @cpu, as reported by cpu_curr().
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *task = cpu_curr(cpu);

	return task;
}

7300

7305

7301

#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */

7306

#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */

7302

7307

7303

#ifdef CONFIG_IA64

7308

#ifdef CONFIG_IA64

7304

/**

7309

/**

7305

* set_curr_task - set the current task for a given cpu.

7310

* set_curr_task - set the current task for a given cpu.

7306

* @cpu: the processor in question.

7311

* @cpu: the processor in question.

7307

* @p: the task pointer to set.

7312

* @p: the task pointer to set.

7308

*

7313

*

7309

* Description: This function must only be used when non-maskable interrupts

7314

* Description: This function must only be used when non-maskable interrupts

7310

* are serviced on a separate stack. It allows the architecture to switch the

7315

* are serviced on a separate stack. It allows the architecture to switch the

7311

* notion of the current task on a cpu in a non-blocking manner. This function

7316

* notion of the current task on a cpu in a non-blocking manner. This function

7312

* must be called with all CPU's synchronized, and interrupts disabled, the

7317

* must be called with all CPU's synchronized, and interrupts disabled, the

7313

* and caller must save the original value of the current task (see

7318

* and caller must save the original value of the current task (see

7314

* curr_task() above) and restore that value before reenabling interrupts and

7319

* curr_task() above) and restore that value before reenabling interrupts and

7315

* re-starting the system.

7320

* re-starting the system.

7316

*

7321

*

7317

* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!

7322

* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!

7318

*/

7323

*/

7319

void set_curr_task(int cpu, struct task_struct *p)

7324

void set_curr_task(int cpu, struct task_struct *p)

7320

{

7325

{

7321

cpu_curr(cpu) = p;

7326

cpu_curr(cpu) = p;

7322

}

7327

}

7323

7328

7324

#endif

7329

#endif

7325

7330

7326

#ifdef CONFIG_CGROUP_SCHED

7331

#ifdef CONFIG_CGROUP_SCHED

7327

/* task_group_lock serializes the addition/removal of task groups */

7332

/* task_group_lock serializes the addition/removal of task groups */

7328

static DEFINE_SPINLOCK(task_group_lock);

7333

static DEFINE_SPINLOCK(task_group_lock);

7329

7334

7330

/*
 * Tear down a task group: release its fair and RT per-class state and its
 * autogroup state, then free the task_group structure itself.
 */
static void free_sched_group(struct task_group *tg)
{
	/* Per-class and autogroup state first ... */
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	autogroup_free(tg);

	/* ... the container goes last. */
	kfree(tg);
}

7337

7342

7338

/* allocate runqueue etc for a new task group */

7343

/* allocate runqueue etc for a new task group */

7339

struct task_group *sched_create_group(struct task_group *parent)

7344

struct task_group *sched_create_group(struct task_group *parent)

7340

{

7345

{

7341

struct task_group *tg;

7346

struct task_group *tg;

7342

7347

7343

tg = kzalloc(sizeof(*tg), GFP_KERNEL);

7348

tg = kzalloc(sizeof(*tg), GFP_KERNEL);

7344

if (!tg)

7349

if (!tg)

7345

return ERR_PTR(-ENOMEM);

7350

return ERR_PTR(-ENOMEM);

7346

7351

7347

if (!alloc_fair_sched_group(tg, parent))

7352

if (!alloc_fair_sched_group(tg, parent))

7348

goto err;

7353

goto err;

7349

7354

7350

if (!alloc_rt_sched_group(tg, parent))

7355

if (!alloc_rt_sched_group(tg, parent))

7351

goto err;

7356

goto err;

7352

7357

7353

return tg;

7358

return tg;

7354

7359

7355

err:

7360

err:

7356

free_sched_group(tg);

7361

free_sched_group(tg);

7357

return ERR_PTR(-ENOMEM);

7362

return ERR_PTR(-ENOMEM);

7358

}

7363

}

7359

7364

7360

void sched_online_group(struct task_group *tg, struct task_group *parent)

7365

void sched_online_group(struct task_group *tg, struct task_group *parent)

7361

{

7366

{

7362

unsigned long flags;

7367

unsigned long flags;

7363

7368

7364

spin_lock_irqsave(&task_group_lock, flags);

7369

spin_lock_irqsave(&task_group_lock, flags);

7365

list_add_rcu(&tg->list, &task_groups);

7370

list_add_rcu(&tg->list, &task_groups);

7366

7371

7367

WARN_ON(!parent); /* root should already exist */

7372

WARN_ON(!parent); /* root should already exist */

7368

7373

7369

tg->parent = parent;

7374

tg->parent = parent;

7370

INIT_LIST_HEAD(&tg->children);

7375

INIT_LIST_HEAD(&tg->children);

7371

list_add_rcu(&tg->siblings, &parent->children);

7376

list_add_rcu(&tg->siblings, &parent->children);

7372

spin_unlock_irqrestore(&task_group_lock, flags);

7377

spin_unlock_irqrestore(&task_group_lock, flags);

7373

}

7378

}

7374

7379

7375

/* rcu callback to free various structures associated with a task group */

7380

/* rcu callback to free various structures associated with a task group */

7376

static void free_sched_group_rcu(struct rcu_head *rhp)

7381

static void free_sched_group_rcu(struct rcu_head *rhp)

7377

{

7382

{

7378

/* now it should be safe to free those cfs_rqs */

7383

/* now it should be safe to free those cfs_rqs */

7379

free_sched_group(container_of(rhp, struct task_group, rcu));

7384

free_sched_group(container_of(rhp, struct task_group, rcu));

7380

}

7385

}

7381

7386

7382

/* Destroy runqueue etc associated with a task group */

7387

/* Destroy runqueue etc associated with a task group */

7383

void sched_destroy_group(struct task_group *tg)

7388

void sched_destroy_group(struct task_group *tg)

7384

{

7389

{

7385

/* wait for possible concurrent references to cfs_rqs complete */

7390

/* wait for possible concurrent references to cfs_rqs complete */

7386

call_rcu(&tg->rcu, free_sched_group_rcu);

7391

call_rcu(&tg->rcu, free_sched_group_rcu);

7387

}

7392

}

7388

7393

7389

void sched_offline_group(struct task_group *tg)

7394

void sched_offline_group(struct task_group *tg)

7390

{

7395

{

7391

unsigned long flags;

7396

unsigned long flags;

7392

int i;

7397

int i;

7393

7398

7394

/* end participation in shares distribution */

7399

/* end participation in shares distribution */

7395

for_each_possible_cpu(i)

7400

for_each_possible_cpu(i)

7396

unregister_fair_sched_group(tg, i);

7401

unregister_fair_sched_group(tg, i);

7397

7402

7398

spin_lock_irqsave(&task_group_lock, flags);

7403

spin_lock_irqsave(&task_group_lock, flags);

7399

list_del_rcu(&tg->list);

7404

list_del_rcu(&tg->list);

7400

list_del_rcu(&tg->siblings);

7405

list_del_rcu(&tg->siblings);

7401

spin_unlock_irqrestore(&task_group_lock, flags);

7406

spin_unlock_irqrestore(&task_group_lock, flags);

7402

}

7407

}

7403

7408

7404

/* change task's runqueue when it moves between groups.

7409

/* change task's runqueue when it moves between groups.

7405

* The caller of this function should have put the task in its new group

7410

* The caller of this function should have put the task in its new group

7406

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

7411

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

7407

* reflect its new group.

7412

* reflect its new group.

7408

*/

7413

*/

7409

void sched_move_task(struct task_struct *tsk)

7414

void sched_move_task(struct task_struct *tsk)

7410

{

7415

{

7411

struct task_group *tg;

7416

struct task_group *tg;

7412

int queued, running;

7417

int queued, running;

7413

unsigned long flags;

7418

unsigned long flags;

7414

struct rq *rq;

7419

struct rq *rq;

7415

7420

7416

rq = task_rq_lock(tsk, &flags);

7421

rq = task_rq_lock(tsk, &flags);

7417

7422

7418

running = task_current(rq, tsk);

7423

running = task_current(rq, tsk);

7419

queued = task_on_rq_queued(tsk);

7424

queued = task_on_rq_queued(tsk);

7420

7425

7421

if (queued)

7426

if (queued)

7422

dequeue_task(rq, tsk, 0);

7427

dequeue_task(rq, tsk, 0);

7423

if (unlikely(running))

7428

if (unlikely(running))

7424

put_prev_task(rq, tsk);

7429

put_prev_task(rq, tsk);

7425

7430

7426

/*

7431

/*

7427

* All callers are synchronized by task_rq_lock(); we do not use RCU

7432

* All callers are synchronized by task_rq_lock(); we do not use RCU

7428

* which is pointless here. Thus, we pass "true" to task_css_check()

7433

* which is pointless here. Thus, we pass "true" to task_css_check()

7429

* to prevent lockdep warnings.

7434

* to prevent lockdep warnings.

7430

*/

7435

*/

7431

tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),

7436

tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),

7432

struct task_group, css);

7437

struct task_group, css);

7433

tg = autogroup_task_group(tsk, tg);

7438

tg = autogroup_task_group(tsk, tg);

7434

tsk->sched_task_group = tg;

7439

tsk->sched_task_group = tg;

7435

7440

7436

#ifdef CONFIG_FAIR_GROUP_SCHED

7441

#ifdef CONFIG_FAIR_GROUP_SCHED

7437

if (tsk->sched_class->task_move_group)

7442

if (tsk->sched_class->task_move_group)

7438

tsk->sched_class->task_move_group(tsk, queued);

7443

tsk->sched_class->task_move_group(tsk, queued);

7439

else

7444

else

7440

#endif

7445

#endif

7441

set_task_rq(tsk, task_cpu(tsk));

7446

set_task_rq(tsk, task_cpu(tsk));

7442

7447

7443

if (unlikely(running))

7448

if (unlikely(running))

7444

tsk->sched_class->set_curr_task(rq);

7449

tsk->sched_class->set_curr_task(rq);

7445

if (queued)

7450

if (queued)

7446

enqueue_task(rq, tsk, 0);

7451

enqueue_task(rq, tsk, 0);

7447

7452

7448

task_rq_unlock(rq, tsk, &flags);

7453

task_rq_unlock(rq, tsk, &flags);

7449

}

7454

}

7450

#endif /* CONFIG_CGROUP_SCHED */

7455

#endif /* CONFIG_CGROUP_SCHED */

7451

7456

7452

#ifdef CONFIG_RT_GROUP_SCHED

7457

#ifdef CONFIG_RT_GROUP_SCHED

7453

/*

7458

/*

7454

* Ensure that the real time constraints are schedulable.

7459

* Ensure that the real time constraints are schedulable.

7455

*/

7460

*/

7456

static DEFINE_MUTEX(rt_constraints_mutex);

7461

static DEFINE_MUTEX(rt_constraints_mutex);

7457

7462

7458

/* Must be called with tasklist_lock held */

7463

/* Must be called with tasklist_lock held */

7459

static inline int tg_has_rt_tasks(struct task_group *tg)

7464

static inline int tg_has_rt_tasks(struct task_group *tg)

7460

{

7465

{

7461

struct task_struct *g, *p;

7466

struct task_struct *g, *p;

7462

7467

7463

for_each_process_thread(g, p) {

7468

for_each_process_thread(g, p) {

7464

if (rt_task(p) && task_group(p) == tg)

7469

if (rt_task(p) && task_group(p) == tg)

7465

return 1;

7470

return 1;

7466

}

7471

}

7467

7472

7468

return 0;

7473

return 0;

7469

}

7474

}

7470

7475

7471

struct rt_schedulable_data {

7476

struct rt_schedulable_data {

7472

struct task_group *tg;

7477

struct task_group *tg;

7473

u64 rt_period;

7478

u64 rt_period;

7474

u64 rt_runtime;

7479

u64 rt_runtime;

7475

};

7480

};

7476

7481

7477

static int tg_rt_schedulable(struct task_group *tg, void *data)

7482

static int tg_rt_schedulable(struct task_group *tg, void *data)

7478

{

7483

{

7479

struct rt_schedulable_data *d = data;

7484

struct rt_schedulable_data *d = data;

7480

struct task_group *child;

7485

struct task_group *child;

7481

unsigned long total, sum = 0;

7486

unsigned long total, sum = 0;

7482

u64 period, runtime;

7487

u64 period, runtime;

7483

7488

7484

period = ktime_to_ns(tg->rt_bandwidth.rt_period);

7489

period = ktime_to_ns(tg->rt_bandwidth.rt_period);

7485

runtime = tg->rt_bandwidth.rt_runtime;

7490

runtime = tg->rt_bandwidth.rt_runtime;

7486

7491

7487

if (tg == d->tg) {

7492

if (tg == d->tg) {

7488

period = d->rt_period;

7493

period = d->rt_period;

7489

runtime = d->rt_runtime;

7494

runtime = d->rt_runtime;

7490

}

7495

}

7491

7496

7492

/*

7497

/*

7493

* Cannot have more runtime than the period.

7498

* Cannot have more runtime than the period.

7494

*/

7499

*/

7495

if (runtime > period && runtime != RUNTIME_INF)

7500

if (runtime > period && runtime != RUNTIME_INF)

7496

return -EINVAL;

7501

return -EINVAL;

7497

7502

7498

/*

7503

/*

7499

* Ensure we don't starve existing RT tasks.

7504

* Ensure we don't starve existing RT tasks.

7500

*/

7505

*/

7501

if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))

7506

if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))

7502

return -EBUSY;

7507

return -EBUSY;

7503

7508

7504

total = to_ratio(period, runtime);

7509

total = to_ratio(period, runtime);

7505

7510

7506

/*

7511

/*

7507

* Nobody can have more than the global setting allows.

7512

* Nobody can have more than the global setting allows.

7508

*/

7513

*/

7509

if (total > to_ratio(global_rt_period(), global_rt_runtime()))

7514

if (total > to_ratio(global_rt_period(), global_rt_runtime()))

7510

return -EINVAL;

7515

return -EINVAL;

7511

7516

7512

/*

7517

/*

7513

* The sum of our children's runtime should not exceed our own.

7518

* The sum of our children's runtime should not exceed our own.

7514

*/

7519

*/

7515

list_for_each_entry_rcu(child, &tg->children, siblings) {

7520

list_for_each_entry_rcu(child, &tg->children, siblings) {

7516

period = ktime_to_ns(child->rt_bandwidth.rt_period);

7521

period = ktime_to_ns(child->rt_bandwidth.rt_period);

7517

runtime = child->rt_bandwidth.rt_runtime;

7522

runtime = child->rt_bandwidth.rt_runtime;

7518

7523

7519

if (child == d->tg) {

7524

if (child == d->tg) {

7520

period = d->rt_period;

7525

period = d->rt_period;

7521

runtime = d->rt_runtime;

7526

runtime = d->rt_runtime;

7522

}

7527

}

7523

7528

7524

sum += to_ratio(period, runtime);

7529

sum += to_ratio(period, runtime);

7525

}

7530

}

7526

7531

7527

if (sum > total)

7532

if (sum > total)

7528

return -EINVAL;

7533

return -EINVAL;

7529

7534

7530

return 0;

7535

return 0;

7531

}

7536

}

7532

7537

7533

/*
 * Check whether giving @tg the RT bandwidth @period / @runtime keeps the
 * task group tree schedulable.  tg_rt_schedulable() is run over the tree
 * through walk_tg_tree() inside an RCU read-side section, and the walk's
 * return value is passed back to the caller.
 */
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	struct rt_schedulable_data request = {
		.tg		= tg,
		.rt_period	= period,
		.rt_runtime	= runtime,
	};
	int status;

	rcu_read_lock();
	status = walk_tg_tree(tg_rt_schedulable, tg_nop, &request);
	rcu_read_unlock();

	return status;
}

7549

7554

7550

/*
 * tg_set_rt_bandwidth - validate and install an RT period/runtime pair.
 * @tg: group being reconfigured
 * @rt_period: new period in nanoseconds
 * @rt_runtime: new runtime in nanoseconds, or RUNTIME_INF
 *
 * Obviously broken settings are rejected before any lock is taken.  The
 * remaining checks are done by __rt_schedulable() under
 * rt_constraints_mutex and tasklist_lock.  On success the new values are
 * stored in the group's rt_bandwidth, and the runtime is copied into the
 * group's rt_rq of every possible cpu.
 *
 * Return: 0 on success, -EINVAL for a zero period or a zero runtime on the
 * root group, otherwise the error reported by __rt_schedulable().
 */
static int tg_set_rt_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int cpu, err = 0;

	/*
	 * Refuse a zero runtime for the root group: tg_rt_schedulable() caps
	 * the children's combined ratio at their parent's, so this would
	 * leave no RT runtime to hand out anywhere in the hierarchy.
	 */
	if (tg == &root_task_group && rt_runtime == 0)
		return -EINVAL;

	/* A zero-length period does not make any sense. */
	if (rt_period == 0)
		return -EINVAL;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);	/* required by tg_has_rt_tasks() */
	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (err)
		goto unlock;

	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	/* Propagate the new runtime to the group's per-cpu RT runqueues. */
	for_each_possible_cpu(cpu) {
		struct rt_rq *rt_rq = tg->rt_rq[cpu];

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return err;
}

7579

7584

7580

static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)

7585

static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)

7581

{

7586

{

7582

u64 rt_runtime, rt_period;

7587

u64 rt_runtime, rt_period;

7583

7588

7584

rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);

7589

rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);

7585

rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;

7590

rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;

7586

if (rt_runtime_us < 0)

7591

if (rt_runtime_us < 0)

7587

rt_runtime = RUNTIME_INF;

7592

rt_runtime = RUNTIME_INF;

7588

7593

7589

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);

7594

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);

7590

}

7595

}

7591

7596

7592

static long sched_group_rt_runtime(struct task_group *tg)

7597

static long sched_group_rt_runtime(struct task_group *tg)

7593

{

7598

{

7594

u64 rt_runtime_us;

7599

u64 rt_runtime_us;

7595

7600

7596

if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)

7601

if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)

7597

return -1;

7602

return -1;

7598

7603

7599

rt_runtime_us = tg->rt_bandwidth.rt_runtime;

7604

rt_runtime_us = tg->rt_bandwidth.rt_runtime;

7600

do_div(rt_runtime_us, NSEC_PER_USEC);

7605

do_div(rt_runtime_us, NSEC_PER_USEC);

7601

return rt_runtime_us;

7606

return rt_runtime_us;

7602

}

7607

}

7603

7608

7604

static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)

7609

static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)

7605

{

7610

{

7606

u64 rt_runtime, rt_period;

7611

u64 rt_runtime, rt_period;

7607

7612

7608

rt_period = (u64)rt_period_us * NSEC_PER_USEC;

7613

rt_period = (u64)rt_period_us * NSEC_PER_USEC;

7609

rt_runtime = tg->rt_bandwidth.rt_runtime;

7614

rt_runtime = tg->rt_bandwidth.rt_runtime;

7610

7615

7611

if (rt_period == 0)

7616

if (rt_period == 0)

7612

return -EINVAL;

7617

return -EINVAL;

7613

7618

7614

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);

7619

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);

7615

}

7620

}

7616

7621

7617

static long sched_group_rt_period(struct task_group *tg)

7622

static long sched_group_rt_period(struct task_group *tg)

7618

{

7623

{

7619

u64 rt_period_us;

7624

u64 rt_period_us;

7620

7625

7621

rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);

7626

rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);

7622

do_div(rt_period_us, NSEC_PER_USEC);

7627

do_div(rt_period_us, NSEC_PER_USEC);

7623

return rt_period_us;

7628

return rt_period_us;

7624

}

7629

}

7625

#endif /* CONFIG_RT_GROUP_SCHED */

7630

#endif /* CONFIG_RT_GROUP_SCHED */

7626

7631

7627

#ifdef CONFIG_RT_GROUP_SCHED

7632

#ifdef CONFIG_RT_GROUP_SCHED

7628

static int sched_rt_global_constraints(void)

7633

static int sched_rt_global_constraints(void)

7629

{

7634

{

7630

int ret = 0;

7635

int ret = 0;

7631

7636

7632

mutex_lock(&rt_constraints_mutex);

7637

mutex_lock(&rt_constraints_mutex);

7633

read_lock(&tasklist_lock);

7638

read_lock(&tasklist_lock);

7634

ret = __rt_schedulable(NULL, 0, 0);

7639

ret = __rt_schedulable(NULL, 0, 0);

7635

read_unlock(&tasklist_lock);

7640

read_unlock(&tasklist_lock);

7636

mutex_unlock(&rt_constraints_mutex);

7641

mutex_unlock(&rt_constraints_mutex);

7637

7642

7638

return ret;

7643

return ret;

7639

}

7644

}

7640

7645

7641

static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)

7646

static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)

7642

{

7647

{

7643

/* Don't accept realtime tasks when there is no way for them to run */

7648

/* Don't accept realtime tasks when there is no way for them to run */

7644

if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)

7649

if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)

7645

return 0;

7650

return 0;

7646

7651

7647

return 1;

7652

return 1;

7648

}

7653

}

7649

7654

7650

#else /* !CONFIG_RT_GROUP_SCHED */

7655

#else /* !CONFIG_RT_GROUP_SCHED */

7651

static int sched_rt_global_constraints(void)

7656

static int sched_rt_global_constraints(void)

7652

{

7657

{

7653

unsigned long flags;

7658

unsigned long flags;

7654

int i, ret = 0;

7659

int i, ret = 0;

7655

7660

7656

raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);

7661

raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);

7657

for_each_possible_cpu(i) {

7662

for_each_possible_cpu(i) {

7658

struct rt_rq *rt_rq = &cpu_rq(i)->rt;

7663

struct rt_rq *rt_rq = &cpu_rq(i)->rt;

7659

7664

7660

raw_spin_lock(&rt_rq->rt_runtime_lock);

7665

raw_spin_lock(&rt_rq->rt_runtime_lock);

7661

rt_rq->rt_runtime = global_rt_runtime();

7666

rt_rq->rt_runtime = global_rt_runtime();

7662

raw_spin_unlock(&rt_rq->rt_runtime_lock);

7667

raw_spin_unlock(&rt_rq->rt_runtime_lock);

7663

}

7668

}

7664

raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

7669

raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

7665

7670

7666

return ret;

7671

return ret;

7667

}

7672

}

7668

#endif /* CONFIG_RT_GROUP_SCHED */

7673

#endif /* CONFIG_RT_GROUP_SCHED */

7669

7674

7670

static int sched_dl_global_constraints(void)

7675

static int sched_dl_global_constraints(void)

7671

{

7676

{

7672

u64 runtime = global_rt_runtime();

7677

u64 runtime = global_rt_runtime();

7673

u64 period = global_rt_period();

7678

u64 period = global_rt_period();

7674

u64 new_bw = to_ratio(period, runtime);

7679

u64 new_bw = to_ratio(period, runtime);

7675

struct dl_bw *dl_b;

7680

struct dl_bw *dl_b;

7676

int cpu, ret = 0;

7681

int cpu, ret = 0;

7677

unsigned long flags;

7682

unsigned long flags;

7678

7683

7679

/*

7684

/*

7680

* Here we want to check the bandwidth not being set to some

7685

* Here we want to check the bandwidth not being set to some

7681

* value smaller than the currently allocated bandwidth in

7686

* value smaller than the currently allocated bandwidth in

7682

* any of the root_domains.

7687

* any of the root_domains.

7683

*

7688

*

7684

* FIXME: Cycling on all the CPUs is overdoing, but simpler than

7689

* FIXME: Cycling on all the CPUs is overdoing, but simpler than

7685

* cycling on root_domains... Discussion on different/better

7690

* cycling on root_domains... Discussion on different/better

7686

* solutions is welcome!

7691

* solutions is welcome!

7687

*/

7692

*/

7688

for_each_possible_cpu(cpu) {

7693

for_each_possible_cpu(cpu) {

7689

rcu_read_lock_sched();

7694

rcu_read_lock_sched();

7690

dl_b = dl_bw_of(cpu);

7695

dl_b = dl_bw_of(cpu);

7691

7696

7692

raw_spin_lock_irqsave(&dl_b->lock, flags);

7697

raw_spin_lock_irqsave(&dl_b->lock, flags);

7693

if (new_bw < dl_b->total_bw)

7698

if (new_bw < dl_b->total_bw)

7694

ret = -EBUSY;

7699

ret = -EBUSY;

7695

raw_spin_unlock_irqrestore(&dl_b->lock, flags);

7700

raw_spin_unlock_irqrestore(&dl_b->lock, flags);

7696

7701

7697

rcu_read_unlock_sched();

7702

rcu_read_unlock_sched();

7698

7703

7699

if (ret)

7704

if (ret)

7700

break;

7705

break;

7701

}

7706

}

7702

7707

7703

return ret;

7708

return ret;

7704

}

7709

}

7705

7710

7706

static void sched_dl_do_global(void)

7711

static void sched_dl_do_global(void)

7707

{

7712

{

7708

u64 new_bw = -1;

7713

u64 new_bw = -1;

7709

struct dl_bw *dl_b;

7714

struct dl_bw *dl_b;

7710

int cpu;

7715

int cpu;

7711

unsigned long flags;

7716

unsigned long flags;

7712

7717

7713

def_dl_bandwidth.dl_period = global_rt_period();

7718

def_dl_bandwidth.dl_period = global_rt_period();

7714

def_dl_bandwidth.dl_runtime = global_rt_runtime();

7719

def_dl_bandwidth.dl_runtime = global_rt_runtime();

7715

7720

7716

if (global_rt_runtime() != RUNTIME_INF)

7721

if (global_rt_runtime() != RUNTIME_INF)

7717

new_bw = to_ratio(global_rt_period(), global_rt_runtime());

7722

new_bw = to_ratio(global_rt_period(), global_rt_runtime());

7718

7723

7719

/*

7724

/*

7720

* FIXME: As above...

7725

* FIXME: As above...

7721

*/

7726

*/

7722

for_each_possible_cpu(cpu) {

7727

for_each_possible_cpu(cpu) {

7723

rcu_read_lock_sched();

7728

rcu_read_lock_sched();

7724

dl_b = dl_bw_of(cpu);

7729

dl_b = dl_bw_of(cpu);

7725

7730

7726

raw_spin_lock_irqsave(&dl_b->lock, flags);

7731

raw_spin_lock_irqsave(&dl_b->lock, flags);

7727

dl_b->bw = new_bw;

7732

dl_b->bw = new_bw;

7728

raw_spin_unlock_irqrestore(&dl_b->lock, flags);

7733

raw_spin_unlock_irqrestore(&dl_b->lock, flags);

7729

7734

7730

rcu_read_unlock_sched();

7735

rcu_read_unlock_sched();

7731

}

7736

}

7732

}

7737

}

7733

7738

7734

static int sched_rt_global_validate(void)

7739

static int sched_rt_global_validate(void)

7735

{

7740

{

7736

if (sysctl_sched_rt_period <= 0)

7741

if (sysctl_sched_rt_period <= 0)

7737

return -EINVAL;

7742

return -EINVAL;

7738

7743

7739

if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&

7744

if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&

7740

(sysctl_sched_rt_runtime > sysctl_sched_rt_period))

7745

(sysctl_sched_rt_runtime > sysctl_sched_rt_period))

7741

return -EINVAL;

7746

return -EINVAL;

7742

7747

7743

return 0;

7748

return 0;

7744

}

7749

}

7745

7750

7746

static void sched_rt_do_global(void)

7751

static void sched_rt_do_global(void)

7747

{

7752

{

7748

def_rt_bandwidth.rt_runtime = global_rt_runtime();

7753

def_rt_bandwidth.rt_runtime = global_rt_runtime();

7749

def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());

7754

def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());

7750

}

7755

}

7751

7756

7752

int sched_rt_handler(struct ctl_table *table, int write,

7757

int sched_rt_handler(struct ctl_table *table, int write,

7753

void __user *buffer, size_t *lenp,

7758

void __user *buffer, size_t *lenp,

7754

loff_t *ppos)

7759

loff_t *ppos)

7755

{

7760

{

7756

int old_period, old_runtime;

7761

int old_period, old_runtime;

7757

static DEFINE_MUTEX(mutex);

7762

static DEFINE_MUTEX(mutex);

7758

int ret;

7763

int ret;

7759

7764

7760

mutex_lock(&mutex);

7765

mutex_lock(&mutex);

7761

old_period = sysctl_sched_rt_period;

7766

old_period = sysctl_sched_rt_period;

7762

old_runtime = sysctl_sched_rt_runtime;

7767

old_runtime = sysctl_sched_rt_runtime;

7763

7768

7764

ret = proc_dointvec(table, write, buffer, lenp, ppos);

7769

ret = proc_dointvec(table, write, buffer, lenp, ppos);

7765

7770

7766

if (!ret && write) {

7771

if (!ret && write) {

7767

ret = sched_rt_global_validate();

7772

ret = sched_rt_global_validate();

7768

if (ret)

7773

if (ret)

7769

goto undo;

7774

goto undo;

7770

7775

7771

ret = sched_rt_global_constraints();

7776

ret = sched_rt_global_constraints();

7772

if (ret)

7777

if (ret)

7773

goto undo;

7778

goto undo;

7774

7779

7775

ret = sched_dl_global_constraints();

7780

ret = sched_dl_global_constraints();

7776

if (ret)

7781

if (ret)

7777

goto undo;

7782

goto undo;

7778

7783

7779

sched_rt_do_global();

7784

sched_rt_do_global();

7780

sched_dl_do_global();

7785

sched_dl_do_global();

7781

}

7786

}

7782

if (0) {

7787

if (0) {

7783

undo:

7788

undo:

7784

sysctl_sched_rt_period = old_period;

7789

sysctl_sched_rt_period = old_period;

7785

sysctl_sched_rt_runtime = old_runtime;

7790

sysctl_sched_rt_runtime = old_runtime;

7786

}

7791

}

7787

mutex_unlock(&mutex);

7792

mutex_unlock(&mutex);

7788

7793

7789

return ret;

7794

return ret;

7790

}

7795

}

7791

7796

7792

int sched_rr_handler(struct ctl_table *table, int write,

7797

int sched_rr_handler(struct ctl_table *table, int write,

7793

void __user *buffer, size_t *lenp,

7798

void __user *buffer, size_t *lenp,

7794

loff_t *ppos)

7799

loff_t *ppos)

7795

{

7800

{

7796

int ret;

7801

int ret;

7797

static DEFINE_MUTEX(mutex);

7802

static DEFINE_MUTEX(mutex);

7798

7803

7799

mutex_lock(&mutex);

7804

mutex_lock(&mutex);

7800

ret = proc_dointvec(table, write, buffer, lenp, ppos);

7805

ret = proc_dointvec(table, write, buffer, lenp, ppos);

7801

/* make sure that internally we keep jiffies */

7806

/* make sure that internally we keep jiffies */

7802

/* also, writing zero resets timeslice to default */

7807

/* also, writing zero resets timeslice to default */

7803

if (!ret && write) {

7808

if (!ret && write) {

7804

sched_rr_timeslice = sched_rr_timeslice <= 0 ?

7809

sched_rr_timeslice = sched_rr_timeslice <= 0 ?

7805

RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);

7810

RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);

7806

}

7811

}

7807

mutex_unlock(&mutex);

7812

mutex_unlock(&mutex);

7808

return ret;

7813

return ret;

7809

}

7814

}

7810

7815

7811

#ifdef CONFIG_CGROUP_SCHED

7816

#ifdef CONFIG_CGROUP_SCHED

7812

7817

7813

static inline struct task_group *css_tg(struct cgroup_subsys_state *css)

7818

static inline struct task_group *css_tg(struct cgroup_subsys_state *css)

7814

{

7819

{

7815

return css ? container_of(css, struct task_group, css) : NULL;

7820

return css ? container_of(css, struct task_group, css) : NULL;

7816

}

7821

}

7817

7822

7818

static struct cgroup_subsys_state *

7823

static struct cgroup_subsys_state *

7819

cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)

7824

cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)

7820

{

7825

{

7821

struct task_group *parent = css_tg(parent_css);

7826

struct task_group *parent = css_tg(parent_css);

7822

struct task_group *tg;

7827

struct task_group *tg;

7823

7828

7824

if (!parent) {

7829

if (!parent) {

7825

/* This is early initialization for the top cgroup */

7830

/* This is early initialization for the top cgroup */

7826

return &root_task_group.css;

7831

return &root_task_group.css;

7827

}

7832

}

7828

7833

7829

tg = sched_create_group(parent);

7834

tg = sched_create_group(parent);

7830

if (IS_ERR(tg))

7835

if (IS_ERR(tg))

7831

return ERR_PTR(-ENOMEM);

7836

return ERR_PTR(-ENOMEM);

7832

7837

7833

return &tg->css;

7838

return &tg->css;

7834

}

7839

}

7835

7840

7836

static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)

7841

static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)

7837

{

7842

{

7838

struct task_group *tg = css_tg(css);

7843

struct task_group *tg = css_tg(css);

7839

struct task_group *parent = css_tg(css->parent);

7844

struct task_group *parent = css_tg(css->parent);

7840

7845

7841

if (parent)

7846

if (parent)

7842

sched_online_group(tg, parent);

7847

sched_online_group(tg, parent);

7843

return 0;

7848

return 0;

7844

}

7849

}

7845

7850

7846

/* Destroy the task group that embeds @css. */
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
	sched_destroy_group(css_tg(css));
}

7852

7857

7853

/* Take the task group that embeds @css offline. */
static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	sched_offline_group(css_tg(css));
}

7859

7864

7860

/* Fork-time hook (judging by the name): calls sched_move_task() on @task. */
static void cpu_cgroup_fork(struct task_struct *task)
{
	sched_move_task(task);
}

7864

7869

7865

static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,

7870

static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,

7866

struct cgroup_taskset *tset)

7871

struct cgroup_taskset *tset)

7867

{

7872

{

7868

struct task_struct *task;

7873

struct task_struct *task;

7869

7874

7870

cgroup_taskset_for_each(task, tset) {

7875

cgroup_taskset_for_each(task, tset) {

7871

#ifdef CONFIG_RT_GROUP_SCHED

7876

#ifdef CONFIG_RT_GROUP_SCHED

7872

if (!sched_rt_can_attach(css_tg(css), task))

7877

if (!sched_rt_can_attach(css_tg(css), task))

7873

return -EINVAL;

7878

return -EINVAL;

7874

#else

7879

#else

7875

/* We don't support RT-tasks being in separate groups */

7880

/* We don't support RT-tasks being in separate groups */

7876

if (task->sched_class != &fair_sched_class)

7881

if (task->sched_class != &fair_sched_class)

7877

return -EINVAL;

7882

return -EINVAL;

7878

#endif

7883

#endif

7879

}

7884

}

7880

return 0;

7885

return 0;

7881

}

7886

}

7882

7887

7883

static void cpu_cgroup_attach(struct cgroup_subsys_state *css,

7888

static void cpu_cgroup_attach(struct cgroup_subsys_state *css,

7884

struct cgroup_taskset *tset)

7889

struct cgroup_taskset *tset)

7885

{

7890

{

7886

struct task_struct *task;

7891

struct task_struct *task;

7887

7892

7888

cgroup_taskset_for_each(task, tset)

7893

cgroup_taskset_for_each(task, tset)

7889

sched_move_task(task);

7894

sched_move_task(task);

7890

}

7895

}

7891

7896

7892

static void cpu_cgroup_exit(struct cgroup_subsys_state *css,

7897

static void cpu_cgroup_exit(struct cgroup_subsys_state *css,

7893

struct cgroup_subsys_state *old_css,

7898

struct cgroup_subsys_state *old_css,

7894

struct task_struct *task)

7899

struct task_struct *task)

7895

{

7900

{

7896

/*

7901

/*

7897

* cgroup_exit() is called in the copy_process() failure path.

7902

* cgroup_exit() is called in the copy_process() failure path.

7898

* Ignore this case since the task hasn't ran yet, this avoids

7903

* Ignore this case since the task hasn't ran yet, this avoids

7899

* trying to poke a half freed task state from generic code.

7904

* trying to poke a half freed task state from generic code.

7900

*/

7905

*/

7901

if (!(task->flags & PF_EXITING))

7906

if (!(task->flags & PF_EXITING))

7902

return;

7907

return;

7903

7908

7904

sched_move_task(task);

7909

sched_move_task(task);

7905

}

7910

}

7906

7911

7907

#ifdef CONFIG_FAIR_GROUP_SCHED

7912

#ifdef CONFIG_FAIR_GROUP_SCHED

7908

/*
 * Write handler for the group's shares: passes scale_load(@shareval) to
 * sched_group_set_shares() and returns its result.
 */
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	struct task_group *tg = css_tg(css);

	return sched_group_set_shares(tg, scale_load(shareval));
}

7913

7918

7914

static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,

7919

static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,

7915

struct cftype *cft)

7920

struct cftype *cft)

7916

{

7921

{

7917

struct task_group *tg = css_tg(css);

7922

struct task_group *tg = css_tg(css);

7918

7923

7919

return (u64) scale_load_down(tg->shares);

7924

return (u64) scale_load_down(tg->shares);

7920

}

7925

}

7921

7926

7922

#ifdef CONFIG_CFS_BANDWIDTH

7927

#ifdef CONFIG_CFS_BANDWIDTH

7923

static DEFINE_MUTEX(cfs_constraints_mutex);

7928

static DEFINE_MUTEX(cfs_constraints_mutex);

7924

7929

7925

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */

7930

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */

7926

const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */

7931

const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */

7927

7932

7928

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

7933

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

7929

7934

7930

/*
 * Install a new CFS bandwidth setting (@period and @quota, both in
 * nanoseconds) for @tg.
 *
 * Returns -EINVAL for the root task group, for a quota or period below
 * min_cfs_quota_period, or for a period above max_cfs_quota_period;
 * otherwise the result of __cfs_schedulable(), which must be 0 for the
 * new values to be applied.
 */
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	int runtime_enabled, runtime_was_enabled;
	int cpu, ret = 0;

	if (tg == &root_task_group)
		return -EINVAL;

	/*
	 * Ensure we have at least some amount of bandwidth every period.
	 * This is to prevent reaching a state of large arrears when
	 * throttled via entity_tick() resulting in prolonged exit
	 * starvation.
	 */
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	/*
	 * Likewise, bound things on the other side by preventing insane
	 * quota periods.  This also allows us to normalize in computing
	 * quota feasibility.
	 */
	if (period > max_cfs_quota_period)
		return -EINVAL;

	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs().
	 */
	get_online_cpus();
	mutex_lock(&cfs_constraints_mutex);

	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		goto out_unlock;

	runtime_enabled = (quota != RUNTIME_INF);
	runtime_was_enabled = (cfs_b->quota != RUNTIME_INF);

	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards
	 */
	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();

	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;

	__refill_cfs_bandwidth_runtime(cfs_b);

	/*
	 * Restart the period timer (if active) to handle new period
	 * expiry; the 'true' argument forces a reprogram.
	 */
	if (runtime_enabled && cfs_b->timer_active)
		__start_cfs_bandwidth(cfs_b, true);
	raw_spin_unlock_irq(&cfs_b->lock);

	for_each_online_cpu(cpu) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
		struct rq *rq = cfs_rq->rq;

		raw_spin_lock_irq(&rq->lock);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
		raw_spin_unlock_irq(&rq->lock);
	}

	if (runtime_was_enabled && !runtime_enabled)
		cfs_bandwidth_usage_dec();

out_unlock:
	mutex_unlock(&cfs_constraints_mutex);
	put_online_cpus();

	return ret;
}

8004

8009

8005

int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)

8010

int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)

8006

{

8011

{

8007

u64 quota, period;

8012

u64 quota, period;

8008

8013

8009

period = ktime_to_ns(tg->cfs_bandwidth.period);

8014

period = ktime_to_ns(tg->cfs_bandwidth.period);

8010

if (cfs_quota_us < 0)

8015

if (cfs_quota_us < 0)

8011

quota = RUNTIME_INF;

8016

quota = RUNTIME_INF;

8012

else

8017

else

8013

quota = (u64)cfs_quota_us * NSEC_PER_USEC;

8018

quota = (u64)cfs_quota_us * NSEC_PER_USEC;

8014

8019

8015

return tg_set_cfs_bandwidth(tg, period, quota);

8020

return tg_set_cfs_bandwidth(tg, period, quota);

8016

}

8021

}

8017

8022

8018

long tg_get_cfs_quota(struct task_group *tg)

8023

long tg_get_cfs_quota(struct task_group *tg)

8019

{

8024

{

8020

u64 quota_us;

8025

u64 quota_us;

8021

8026

8022

if (tg->cfs_bandwidth.quota == RUNTIME_INF)

8027

if (tg->cfs_bandwidth.quota == RUNTIME_INF)

8023

return -1;

8028

return -1;

8024

8029

8025

quota_us = tg->cfs_bandwidth.quota;

8030

quota_us = tg->cfs_bandwidth.quota;

8026

do_div(quota_us, NSEC_PER_USEC);

8031

do_div(quota_us, NSEC_PER_USEC);

8027

8032

8028

return quota_us;

8033

return quota_us;

8029

}

8034

}

8030

8035

8031

int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)

8036

int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)

8032

{

8037

{

8033

u64 quota, period;

8038

u64 quota, period;

8034

8039

8035

period = (u64)cfs_period_us * NSEC_PER_USEC;

8040

period = (u64)cfs_period_us * NSEC_PER_USEC;

8036

quota = tg->cfs_bandwidth.quota;

8041

quota = tg->cfs_bandwidth.quota;

8037

8042

8038

return tg_set_cfs_bandwidth(tg, period, quota);

8043

return tg_set_cfs_bandwidth(tg, period, quota);

8039

}

8044

}

8040

8045

8041

long tg_get_cfs_period(struct task_group *tg)

8046

long tg_get_cfs_period(struct task_group *tg)

8042

{

8047

{

8043

u64 cfs_period_us;

8048

u64 cfs_period_us;

8044

8049

8045

cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);

8050

cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);

8046

do_div(cfs_period_us, NSEC_PER_USEC);

8051

do_div(cfs_period_us, NSEC_PER_USEC);

8047

8052

8048

return cfs_period_us;

8053

return cfs_period_us;

8049

}

8054

}

8050

8055

8051

/* Read handler: returns tg_get_cfs_quota() for the group behind @css. */
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
				  struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return tg_get_cfs_quota(tg);
}

8056

8061

8057

/*
 * Write handler: passes @cfs_quota_us to tg_set_cfs_quota() for the
 * group behind @css and returns its result.
 */
static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
				   struct cftype *cftype, s64 cfs_quota_us)
{
	struct task_group *tg = css_tg(css);

	return tg_set_cfs_quota(tg, cfs_quota_us);
}

8062

8067

8063

/* Read handler: returns tg_get_cfs_period() for the group behind @css. */
static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
				   struct cftype *cft)
{
	struct task_group *tg = css_tg(css);

	return tg_get_cfs_period(tg);
}

8068

8073

8069

/*
 * Write handler: passes @cfs_period_us to tg_set_cfs_period() for the
 * group behind @css and returns its result.
 */
static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
				    struct cftype *cftype, u64 cfs_period_us)
{
	struct task_group *tg = css_tg(css);

	return tg_set_cfs_period(tg, cfs_period_us);
}

8074

8079

8075

struct cfs_schedulable_data {

8080

struct cfs_schedulable_data {

8076

struct task_group *tg;

8081

struct task_group *tg;

8077

u64 period, quota;

8082

u64 period, quota;

8078

};

8083

};

8079

8084

8080

/*

8085

/*

8081

* normalize group quota/period to be quota/max_period

8086

* normalize group quota/period to be quota/max_period

8082

* note: units are usecs

8087

* note: units are usecs

8083

*/

8088

*/

8084

static u64 normalize_cfs_quota(struct task_group *tg,

8089

static u64 normalize_cfs_quota(struct task_group *tg,

8085

struct cfs_schedulable_data *d)

8090

struct cfs_schedulable_data *d)

8086

{

8091

{

8087

u64 quota, period;

8092

u64 quota, period;

8088

8093

8089

if (tg == d->tg) {

8094

if (tg == d->tg) {

8090

period = d->period;

8095

period = d->period;

8091

quota = d->quota;

8096

quota = d->quota;

8092

} else {

8097

} else {

8093

period = tg_get_cfs_period(tg);

8098

period = tg_get_cfs_period(tg);

8094

quota = tg_get_cfs_quota(tg);

8099

quota = tg_get_cfs_quota(tg);

8095

}

8100

}

8096

8101

8097

/* note: these should typically be equivalent */

8102

/* note: these should typically be equivalent */

8098

if (quota == RUNTIME_INF || quota == -1)

8103

if (quota == RUNTIME_INF || quota == -1)

8099

return RUNTIME_INF;

8104

return RUNTIME_INF;

8100

8105

8101

return to_ratio(period, quota);

8106

return to_ratio(period, quota);

8102

}

8107

}

8103

8108

8104

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)

8109

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)

8105

{

8110

{

8106

struct cfs_schedulable_data *d = data;

8111

struct cfs_schedulable_data *d = data;

8107

struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

8112

struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

8108

s64 quota = 0, parent_quota = -1;

8113

s64 quota = 0, parent_quota = -1;

8109

8114

8110

if (!tg->parent) {

8115

if (!tg->parent) {

8111

quota = RUNTIME_INF;

8116

quota = RUNTIME_INF;

8112

} else {

8117

} else {

8113

struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

8118

struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

8114

8119

8115

quota = normalize_cfs_quota(tg, d);

8120

quota = normalize_cfs_quota(tg, d);

8116

parent_quota = parent_b->hierarchical_quota;

8121

parent_quota = parent_b->hierarchical_quota;

8117

8122

8118

/*

8123

/*

8119

* ensure max(child_quota) <= parent_quota, inherit when no

8124

* ensure max(child_quota) <= parent_quota, inherit when no

8120

* limit is set

8125

* limit is set

8121

*/

8126

*/

8122

if (quota == RUNTIME_INF)

8127

if (quota == RUNTIME_INF)

8123

quota = parent_quota;

8128

quota = parent_quota;

8124

else if (parent_quota != RUNTIME_INF && quota > parent_quota)

8129

else if (parent_quota != RUNTIME_INF && quota > parent_quota)

8125

return -EINVAL;

8130

return -EINVAL;

8126

}

8131

}

8127

cfs_b->hierarchical_quota = quota;

8132

cfs_b->hierarchical_quota = quota;

8128

8133

8129

return 0;

8134

return 0;

8130

}

8135

}

8131

8136

8132

/*
 * Check whether giving @tg the proposed @period and @quota (both in
 * nanoseconds) keeps the group hierarchy consistent.
 *
 * Walks the task group tree with tg_cfs_schedulable_down() under
 * rcu_read_lock() and returns the walk's result.
 */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
	struct cfs_schedulable_data walk = {
		.tg = tg,
		.period = period,
		.quota = quota,
	};
	int ret;

	/* The walk works in usecs; RUNTIME_INF is passed through unscaled. */
	if (quota != RUNTIME_INF) {
		do_div(walk.period, NSEC_PER_USEC);
		do_div(walk.quota, NSEC_PER_USEC);
	}

	rcu_read_lock();
	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &walk);
	rcu_read_unlock();

	return ret;
}

8152

8157

8153

static int cpu_stats_show(struct seq_file *sf, void *v)

8158

static int cpu_stats_show(struct seq_file *sf, void *v)

8154

{

8159

{

8155

struct task_group *tg = css_tg(seq_css(sf));

8160

struct task_group *tg = css_tg(seq_css(sf));

8156

struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

8161

struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

8157

8162

8158

seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);

8163

seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);

8159

seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);

8164

seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);

8160

seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

8165

seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);

8161

8166

8162

return 0;

8167

return 0;

8163

}

8168

}

8164

#endif /* CONFIG_CFS_BANDWIDTH */

8169

#endif /* CONFIG_CFS_BANDWIDTH */

8165

#endif /* CONFIG_FAIR_GROUP_SCHED */

8170

#endif /* CONFIG_FAIR_GROUP_SCHED */

8166

8171

8167

#ifdef CONFIG_RT_GROUP_SCHED

8172

#ifdef CONFIG_RT_GROUP_SCHED

8168

static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,

8173

static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,

8169

struct cftype *cft, s64 val)

8174

struct cftype *cft, s64 val)

8170

{

8175

{

8171

return sched_group_set_rt_runtime(css_tg(css), val);

8176

return sched_group_set_rt_runtime(css_tg(css), val);

8172

}

8177

}

8173

8178

8174

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,

8179

static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,

8175

struct cftype *cft)

8180

struct cftype *cft)

8176

{

8181

{

8177

return sched_group_rt_runtime(css_tg(css));

8182

return sched_group_rt_runtime(css_tg(css));

8178

}

8183

}

8179

8184

8180

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,

8185

static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,

8181

struct cftype *cftype, u64 rt_period_us)

8186

struct cftype *cftype, u64 rt_period_us)

8182

{

8187

{

8183

return sched_group_set_rt_period(css_tg(css), rt_period_us);

8188

return sched_group_set_rt_period(css_tg(css), rt_period_us);

8184

}

8189

}

8185

8190

8186

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,

8191

static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,

8187

struct cftype *cft)

8192

struct cftype *cft)

8188

{

8193

{

8189

return sched_group_rt_period(css_tg(css));

8194

return sched_group_rt_period(css_tg(css));

8190

}

8195

}

8191

#endif /* CONFIG_RT_GROUP_SCHED */

8196

#endif /* CONFIG_RT_GROUP_SCHED */

8192

8197

8193

static struct cftype cpu_files[] = {

8198

static struct cftype cpu_files[] = {

8194

#ifdef CONFIG_FAIR_GROUP_SCHED

8199

#ifdef CONFIG_FAIR_GROUP_SCHED

8195

{

8200

{

8196

.name = "shares",

8201

.name = "shares",

8197

.read_u64 = cpu_shares_read_u64,

8202

.read_u64 = cpu_shares_read_u64,

8198

.write_u64 = cpu_shares_write_u64,

8203

.write_u64 = cpu_shares_write_u64,

8199

},

8204

},

8200

#endif

8205

#endif

8201

#ifdef CONFIG_CFS_BANDWIDTH

8206

#ifdef CONFIG_CFS_BANDWIDTH

8202

{

8207

{

8203

.name = "cfs_quota_us",

8208

.name = "cfs_quota_us",

8204

.read_s64 = cpu_cfs_quota_read_s64,

8209

.read_s64 = cpu_cfs_quota_read_s64,

8205

.write_s64 = cpu_cfs_quota_write_s64,

8210

.write_s64 = cpu_cfs_quota_write_s64,

8206

},

8211

},

8207

{

8212

{

8208

.name = "cfs_period_us",

8213

.name = "cfs_period_us",

8209

.read_u64 = cpu_cfs_period_read_u64,

8214

.read_u64 = cpu_cfs_period_read_u64,

8210

.write_u64 = cpu_cfs_period_write_u64,

8215

.write_u64 = cpu_cfs_period_write_u64,

8211

},

8216

},

8212

{

8217

{

8213

.name = "stat",

8218

.name = "stat",

8214

.seq_show = cpu_stats_show,

8219

.seq_show = cpu_stats_show,

8215

},

8220

},

8216

#endif

8221

#endif

8217

#ifdef CONFIG_RT_GROUP_SCHED

8222

#ifdef CONFIG_RT_GROUP_SCHED

8218

{

8223

{

8219

.name = "rt_runtime_us",

8224

.name = "rt_runtime_us",

8220

.read_s64 = cpu_rt_runtime_read,

8225

.read_s64 = cpu_rt_runtime_read,

8221

.write_s64 = cpu_rt_runtime_write,

8226

.write_s64 = cpu_rt_runtime_write,

8222

},

8227

},

8223

{

8228

{

8224

.name = "rt_period_us",

8229

.name = "rt_period_us",

8225

.read_u64 = cpu_rt_period_read_uint,

8230

.read_u64 = cpu_rt_period_read_uint,

8226

.write_u64 = cpu_rt_period_write_uint,

8231

.write_u64 = cpu_rt_period_write_uint,

8227

},

8232

},

8228

#endif

8233

#endif

8229

{ } /* terminate */

8234

{ } /* terminate */

8230

};

8235

};

8231

8236

8232

struct cgroup_subsys cpu_cgrp_subsys = {

8237

struct cgroup_subsys cpu_cgrp_subsys = {

8233

.css_alloc = cpu_cgroup_css_alloc,

8238

.css_alloc = cpu_cgroup_css_alloc,

8234

.css_free = cpu_cgroup_css_free,

8239

.css_free = cpu_cgroup_css_free,

8235

.css_online = cpu_cgroup_css_online,

8240

.css_online = cpu_cgroup_css_online,

8236

.css_offline = cpu_cgroup_css_offline,

8241

.css_offline = cpu_cgroup_css_offline,

8237

.fork = cpu_cgroup_fork,

8242

.fork = cpu_cgroup_fork,

8238

.can_attach = cpu_cgroup_can_attach,

8243

.can_attach = cpu_cgroup_can_attach,

8239

.attach = cpu_cgroup_attach,

8244

.attach = cpu_cgroup_attach,

8240

.exit = cpu_cgroup_exit,

8245

.exit = cpu_cgroup_exit,

8241

.legacy_cftypes = cpu_files,

8246

.legacy_cftypes = cpu_files,

8242

.early_init = 1,

8247

.early_init = 1,

8243

};

8248

};

8244

8249

8245

#endif /* CONFIG_CGROUP_SCHED */

8250

#endif /* CONFIG_CGROUP_SCHED */

8246

8251

8247

void dump_cpu_task(int cpu)

8252

void dump_cpu_task(int cpu)

8248

{

8253

{

8249

pr_info("Task dump for CPU %d:\n", cpu);

8254

pr_info("Task dump for CPU %d:\n", cpu);

8250

sched_show_task(cpu_curr(cpu));

8255

sched_show_task(cpu_curr(cpu));

8251

}

8256

}

GITLAB

sched: Add missing rcu protection to wake_up_all_idle_cpus

 /*
  *  kernel/sched/core.c
  *
  *  Kernel scheduler and related syscalls
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *		make semaphores SMP safe
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
  *		hybrid priority-list and round-robin design with
  *		an array-switch method of distributing timeslices
  *		and per-CPU runqueues.  Cleanups and useful suggestions
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  *              fair scheduling design by Con Kolivas.
  *  2007-05-05  Load balancing (smp-nice) and other improvements
  *              by Peter Williams
  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  *              Thomas Gleixner, Mike Kravetz
  */
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
 #include <linux/perf_event.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
 #include <linux/freezer.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/pid_namespace.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #endif
 #include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 /*
  * Start @period_timer unless it is already active.
  *
  * Each pass forwards the timer's expiry by @period relative to the
  * timer's own clock, then starts it pinned in absolute mode, using the
  * gap between its soft and hard expiry as the allowed range. The loop
  * ends once the timer is observed active.
  */
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	while (!hrtimer_active(period_timer)) {
 		ktime_t cur, soft_exp, hard_exp;
 		unsigned long range;

 		cur = hrtimer_cb_get_time(period_timer);
 		hrtimer_forward(period_timer, cur, period);

 		soft_exp = hrtimer_get_softexpires(period_timer);
 		hard_exp = hrtimer_get_expires(period_timer);
 		range = ktime_to_ns(ktime_sub(hard_exp, soft_exp));

 		__hrtimer_start_range_ns(period_timer, soft_exp, range,
 					 HRTIMER_MODE_ABS_PINNED, 0);
 	}
 }
 /* NOTE(review): users are outside this view; presumably serializes sched-domain updates - confirm. */
 DEFINE_MUTEX(sched_domains_mutex);
 /* One struct rq instance per CPU. */
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 /* Forward declaration: called by update_rq_clock() below, defined later in this file. */
 static void update_rq_clock_task(struct rq *rq, s64 delta);
 /*
  * Advance rq->clock to the current sched_clock_cpu() reading for this
  * rq's CPU and propagate the elapsed time to update_rq_clock_task().
  *
  * Nothing happens when rq->skip_clock_update is positive or when the
  * computed delta is negative.
  */
 void update_rq_clock(struct rq *rq)
 {
 	s64 elapsed;

 	if (rq->skip_clock_update <= 0) {
 		elapsed = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 		if (elapsed >= 0) {
 			rq->clock += elapsed;
 			update_rq_clock_task(rq, elapsed);
 		}
 	}
 }
 /*
  * Debugging: various feature bits
  *
  * features.h is included with SCHED_FEAT() expanding every entry to
  * "(1UL << __SCHED_FEAT_<name>) * enabled |", so the initializer is the
  * OR of each feature's bit multiplied by its enabled value; the trailing
  * 0 terminates the OR chain.
  */
 #define SCHED_FEAT(name, enabled)	\
 	(1UL << __SCHED_FEAT_##name) * enabled |
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * Second expansion of features.h: each entry becomes its stringified
  * name, giving a name table in features.h order.
  */
 #define SCHED_FEAT(name, enabled)	\
 	#name ,
 static const char * const sched_feat_names[] = {
 #include "features.h"
 };
 #undef SCHED_FEAT
 /*
  * seq_file show callback: print every feature name on one line,
  * space separated, prefixing features whose bit is clear in
  * sysctl_sched_features with "NO_". Always returns 0.
  */
 static int sched_feat_show(struct seq_file *m, void *v)
 {
 	int feat = 0;

 	while (feat < __SCHED_FEAT_NR) {
 		unsigned long mask = 1UL << feat;

 		if ((sysctl_sched_features & mask) == 0)
 			seq_puts(m, "NO_");
 		seq_printf(m, "%s ", sched_feat_names[feat]);
 		feat++;
 	}
 	seq_puts(m, "\n");

 	return 0;
 }
 #ifdef HAVE_JUMP_LABEL
 /*
  * Third expansion of features.h: the enabled token of each entry is
  * pasted onto "jump_label_key__", selecting STATIC_KEY_INIT_TRUE or
  * STATIC_KEY_INIT_FALSE as the initial state of that feature's static
  * key. This relies on features.h spelling the value as a literal
  * true/false token.
  */
 #define jump_label_key__true  STATIC_KEY_INIT_TRUE
 #define jump_label_key__false STATIC_KEY_INIT_FALSE
 #define SCHED_FEAT(name, enabled)	\
 	jump_label_key__##enabled ,
 struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
 #include "features.h"
 };
 #undef SCHED_FEAT
 /* Drop feature @i's static key, but only if it is currently enabled. */
 static void sched_feat_disable(int i)
 {
 	struct static_key *key = &sched_feat_keys[i];

 	if (!static_key_enabled(key))
 		return;

 	static_key_slow_dec(key);
 }
 /* Raise feature @i's static key, but only if it is currently disabled. */
 static void sched_feat_enable(int i)
 {
 	struct static_key *key = &sched_feat_keys[i];

 	if (static_key_enabled(key))
 		return;

 	static_key_slow_inc(key);
 }
 #else
 /*
  * Without jump labels there is no static key to flip; the bit that
  * sched_feat_set() updates in sysctl_sched_features is the only state.
  * (No ';' after the bodies: that would be an empty file-scope declaration.)
  */
 static void sched_feat_disable(int i) { }
 static void sched_feat_enable(int i) { }
 #endif /* HAVE_JUMP_LABEL */
 static int sched_feat_set(char *cmp)
 {
 	int i;
 	int neg = 0;
 	if (strncmp(cmp, "NO_", 3) == 0) {
 		neg = 1;
 		cmp += 3;
 	}
 	for (i = 0; i < __SCHED_FEAT_NR; i++) {
 		if (strcmp(cmp, sched_feat_names[i]) == 0) {
 			if (neg) {
 				sysctl_sched_features &= ~(1UL << i);
 				sched_feat_disable(i);
 			} else {
 				sysctl_sched_features |= (1UL << i);
 				sched_feat_enable(i);
 			}
 			break;
 		}
 	}
 	return i;
 }
 /*
  * Write handler for the sched_features file.
  *
  * Copies at most sizeof(buf) - 1 bytes from user space, strips
  * surrounding whitespace and hands the result to sched_feat_set()
  * while holding the inode's i_mutex.
  *
  * Returns the (possibly truncated) byte count on success, -EFAULT when
  * the user copy fails, -EINVAL when the feature name is unknown.
  */
 static ssize_t
 sched_feat_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
 	char buf[64];
 	char *name;
 	struct inode *inode;
 	int idx;

 	if (cnt >= sizeof(buf))
 		cnt = sizeof(buf) - 1;

 	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 	buf[cnt] = '\0';

 	name = strstrip(buf);

 	/* Ensure the static_key remains in a consistent state */
 	inode = file_inode(filp);
 	mutex_lock(&inode->i_mutex);
 	idx = sched_feat_set(name);
 	mutex_unlock(&inode->i_mutex);

 	if (idx == __SCHED_FEAT_NR)
 		return -EINVAL;

 	*ppos += cnt;

 	return cnt;
 }
 /* Open handler: bind the file to sched_feat_show() via single_open(), with no private data. */
 static int sched_feat_open(struct inode *inode, struct file *filp)
 {
 	return single_open(filp, sched_feat_show, NULL);
 }
 /*
  * File operations for the sched_features file: reads go through the
  * seq_file helpers (backed by sched_feat_show()), writes through
  * sched_feat_write().
  */
 static const struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
 	.write		= sched_feat_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
 /*
  * Create the "sched_features" debugfs file (mode 0644, NULL parent)
  * backed by sched_feat_fops. Registered as a late initcall; always
  * returns 0, the debugfs_create_file() result is not checked.
  */
 static __init int sched_init_debug(void)
 {
 	debugfs_create_file("sched_features", 0644, NULL, NULL,
 			&sched_feat_fops);
 	return 0;
 }
 late_initcall(sched_init_debug);
 #endif /* CONFIG_SCHED_DEBUG */
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 /*
  * period over which we average the RT time consumption, measured
  * in ms.
  *
  * default: 1s
  */
 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 /* NOTE(review): written and read outside this view; presumably set once scheduler init has completed - confirm. */
 __read_mostly int scheduler_running;
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
  */
 int sysctl_sched_rt_runtime = 950000;
 /*
  * __task_rq_lock - lock the rq @p resides on.
  *
  * The caller must already hold p->pi_lock (asserted below). Returns
  * the rq with rq->lock held.
  */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	lockdep_assert_held(&p->pi_lock);
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/* Re-check under the lock: task_rq(p) may have changed before we acquired it. */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 		/* Wait, with the lock dropped, while @p is flagged as migrating, then retry. */
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
 	}
 }
 /*
  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  *
  * Interrupt state is saved in *@flags. Returns the rq with both
  * p->pi_lock and rq->lock held; release with task_rq_unlock().
  */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	for (;;) {
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/* Re-check under both locks that @p still belongs to this rq and is not migrating. */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		/* Lost the race: drop both locks (restoring irqs) before waiting and retrying. */
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
 	}
 }
 /* Release rq->lock only; p->pi_lock, if held, stays held. */
 static void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
 	raw_spin_unlock(&rq->lock);
 }
 /*
  * Counterpart of task_rq_lock(): release rq->lock, then p->pi_lock,
  * restoring the interrupt state saved in *@flags.
  */
 static inline void
 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  *
  * Interrupts are disabled unconditionally (no state is saved) before
  * the current CPU's rq is looked up and locked. Returns that rq with
  * rq->lock held.
  */
 static struct rq *this_rq_lock(void)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	local_irq_disable();
 	rq = this_rq();
 	raw_spin_lock(&rq->lock);
 	return rq;
 }
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
  */
 /* Cancel @rq's hrtick timer if it is currently active. */
 static void hrtick_clear(struct rq *rq)
 {
 	struct hrtimer *hrt = &rq->hrtick_timer;

 	if (!hrtimer_active(hrt))
 		return;

 	hrtimer_cancel(hrt);
 }
 /*
  * High-resolution timer tick.
  * Runs from hardirq context with interrupts disabled.
  */
 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 {
 	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 	raw_spin_unlock(&rq->lock);
 	return HRTIMER_NORESTART;
 }
 #ifdef CONFIG_SMP
 /*
  * (Re)start @rq's hrtick timer at its current soft expiry, pinned, in
  * absolute mode with zero range. Returns what
  * __hrtimer_start_range_ns() returns.
  */
 static int __hrtick_restart(struct rq *rq)
 {
 	struct hrtimer *hrt = &rq->hrtick_timer;

 	return __hrtimer_start_range_ns(hrt, hrtimer_get_softexpires(hrt), 0,
 					HRTIMER_MODE_ABS_PINNED, 0);
 }
 /*
  * called from hardirq (IPI) context
  *
  * Installed as rq->hrtick_csd.func by init_rq_hrtick(); @arg is the rq
  * whose hrtick timer is to be restarted.
  */
 static void __hrtick_start(void *arg)
 {
 	struct rq *rq = arg;
 	raw_spin_lock(&rq->lock);
 	__hrtick_restart(rq);
 	/* Clear the flag hrtick_start() checks before queueing another cross-CPU call. */
 	rq->hrtick_csd_pending = 0;
 	raw_spin_unlock(&rq->lock);
 }
 /*
  * Called to set the hrtick timer state.
  *
  * called with rq->lock held and irqs disabled
  *
  * Programs the timer's expiry to "now + delay" and then either
  * restarts the timer directly (when @rq is this CPU's rq) or asks the
  * rq's CPU to do it through rq->hrtick_csd, unless such a request is
  * already pending.
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
 	struct hrtimer *hrt = &rq->hrtick_timer;
 	/*
 	 * Don't schedule slices shorter than 10000ns, that just
 	 * doesn't make sense and can cause timer DoS.
 	 */
 	s64 ns = max_t(s64, delay, 10000LL);
 	ktime_t expires = ktime_add_ns(hrt->base->get_time(), ns);

 	hrtimer_set_expires(hrt, expires);

 	if (rq == this_rq()) {
 		__hrtick_restart(rq);
 		return;
 	}

 	if (rq->hrtick_csd_pending)
 		return;

 	smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 	rq->hrtick_csd_pending = 1;
 }
 /*
  * CPU hotplug notifier: for the UP_CANCELED, DOWN_PREPARE and DEAD
  * actions (and their _FROZEN forms) cancel the hrtick timer of the CPU
  * encoded in @hcpu and return NOTIFY_OK; every other action yields
  * NOTIFY_DONE.
  */
 static int
 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	switch (action) {
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		break;
 	default:
 		return NOTIFY_DONE;
 	}

 	hrtick_clear(cpu_rq((int)(long)hcpu));

 	return NOTIFY_OK;
 }
 /* Register hotplug_hrtick() as a CPU hotplug notifier with priority 0. */
 static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
 #else
 /*
  * Called to set the hrtick timer state.
  *
  * called with rq->lock held and irqs disabled
  *
  * Uniprocessor variant: starts @rq's hrtick timer pinned, in relative
  * mode, @delay nanoseconds from now with zero range.
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
 	/*
 	 * Don't schedule slices shorter than 10000ns, that just
 	 * doesn't make sense and can cause timer DoS. The SMP variant
 	 * applies the same floor.
 	 */
 	delay = max_t(u64, delay, 10000LL);
 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
 			HRTIMER_MODE_REL_PINNED, 0);
 }
 /* !CONFIG_SMP: there is no hotplug notifier to register, so this is a no-op. */
 static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
 static void init_rq_hrtick(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 	rq->hrtick_csd_pending = 0;
 	rq->hrtick_csd.flags = 0;
 	rq->hrtick_csd.func = __hrtick_start;
 	rq->hrtick_csd.info = rq;
 #endif
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
 }
 #else	/* CONFIG_SCHED_HRTICK */
 /* !CONFIG_SCHED_HRTICK: the hrtick hooks below are no-ops. */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
 static inline void init_hrtick(void)
 {
 }
 #endif	/* CONFIG_SCHED_HRTICK */
 /*
  * cmpxchg based fetch_or, macro so it works for different integer types
  *
  * ORs @val into *@ptr by retrying cmpxchg() until it succeeds, and
  * evaluates to the value *@ptr held immediately before the successful
  * update. @ptr and @val are evaluated more than once, so they must be
  * free of side effects.
  */
 #define fetch_or(ptr, val)						\
 ({	typeof(*(ptr)) __old, __val = *(ptr);				\
  	for (;;) {							\
  		__old = cmpxchg((ptr), __val, __val | (val));		\
  		if (__old == __val)					\
  			break;						\
  		__val = __old;						\
  	}								\
  	__old;								\
 })
 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 /*
  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
  * this avoids any races wrt polling state changes and thereby avoids
  * spurious IPIs.
  */
 static bool set_nr_and_not_polling(struct task_struct *p)
 {
 	struct thread_info *ti = task_thread_info(p);
 	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
 }
 /*
  * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
  *
  * If this returns true, then the idle task promises to call
  * sched_ttwu_pending() and reschedule soon.
  *
  * Returns false, leaving the flags untouched, when @p is not polling.
  */
 static bool set_nr_if_polling(struct task_struct *p)
 {
 	struct thread_info *ti = task_thread_info(p);
 	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
 	for (;;) {
 		if (!(val & _TIF_POLLING_NRFLAG))
 			return false;
 		/* Already marked for resched: nothing left to set. */
 		if (val & _TIF_NEED_RESCHED)
 			return true;
 		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
 		if (old == val)
 			break;
 		/* Flags changed under us: re-evaluate with the value cmpxchg() observed. */
 		val = old;
 	}
 	return true;
 }
 #else
 /*
  * Fallback when polling-flag tracking is unavailable (!CONFIG_SMP or no
  * TIF_POLLING_NRFLAG): just set need_resched and always report "not
  * polling".
  */
 static bool set_nr_and_not_polling(struct task_struct *p)
 {
 	set_tsk_need_resched(p);
 	return true;
 }
 #ifdef CONFIG_SMP
 /* SMP without TIF_POLLING_NRFLAG: a task is never treated as polling, so nothing is set. */
 static bool set_nr_if_polling(struct task_struct *p)
 {
 	return false;
 }
 #endif
 #endif
 /*
  * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  *
  * Must be called with rq->lock held. Does nothing when the current
  * task already has need_resched set.
  */
 void resched_curr(struct rq *rq)
 {
 	struct task_struct *running = rq->curr;
 	int target;

 	lockdep_assert_held(&rq->lock);

 	if (test_tsk_need_resched(running))
 		return;

 	target = cpu_of(rq);
 	if (target != smp_processor_id()) {
 		/* Remote CPU: send an IPI unless the task was polling for the flag. */
 		if (set_nr_and_not_polling(running))
 			smp_send_reschedule(target);
 		else
 			trace_sched_wake_idle_without_ipi(target);
 		return;
 	}

 	set_tsk_need_resched(running);
 	set_preempt_need_resched();
 }
 /*
  * Best-effort resched of @cpu's current task: only done when the rq
  * lock can be taken without spinning; otherwise the request is
  * silently dropped.
  */
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;

 	if (raw_spin_trylock_irqsave(&rq->lock, flags)) {
 		resched_curr(rq);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 }
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
  * from an idle cpu.  This is good for power-savings.
  *
  * We don't do similar optimization for completely idle system, as
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  *
  * Returns the current CPU when @pinned is set, timer migration is
  * disabled, the current CPU is not idle, or no non-idle CPU is found
  * in any of its sched domains.
  */
 int get_nohz_timer_target(int pinned)
 {
 	int this_cpu = smp_processor_id();
 	int target = this_cpu;
 	struct sched_domain *sd;
 	int i;

 	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(this_cpu))
 		return target;

 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
 			if (idle_cpu(i))
 				continue;
 			target = i;
 			goto unlock;
 		}
 	}
 unlock:
 	rcu_read_unlock();

 	return target;
 }
 /*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
  * idle system the next event might even be infinite time into the
  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
  * leaves the inner idle loop so the newly added timer is taken into
  * account when the CPU goes back to idle and evaluates the timer
  * wheel for the next timer event.
  */
 static void wake_up_idle_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	if (cpu == smp_processor_id())
 		return;
 	if (set_nr_and_not_polling(rq->idle))
 		smp_send_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
 }
 /*
  * Kick @cpu if it is a nohz-full CPU.
  *
  * Returns true when @cpu is nohz-full (whether or not a kick was
  * actually sent), false otherwise.
  */
 static bool wake_up_full_nohz_cpu(int cpu)
 {
 	if (!tick_nohz_full_cpu(cpu))
 		return false;

 	/*
 	 * We just need the target to call irq_exit() and re-evaluate
 	 * the next tick. The nohz full kick at least implies that.
 	 * If needed we can still optimize that later with an
 	 * empty IRQ.
 	 */
 	if (cpu != smp_processor_id() || tick_nohz_tick_stopped())
 		tick_nohz_full_kick_cpu(cpu);

 	return true;
 }
 /*
  * Wake @cpu: try the nohz-full path first and fall back to the idle
  * wakeup when @cpu is not a nohz-full CPU.
  */
 void wake_up_nohz_cpu(int cpu)
 {
 	if (wake_up_full_nohz_cpu(cpu))
 		return;

 	wake_up_idle_cpu(cpu);
 }
 /*
  * Report whether this CPU has a pending NOHZ_BALANCE_KICK that it can
  * act on, i.e. the CPU is idle and no reschedule is needed. When the
  * kick is pending but cannot be honoured the flag is cleared.
  */
 static inline bool got_nohz_idle_kick(void)
 {
 	int self = smp_processor_id();

 	if (test_bit(NOHZ_BALANCE_KICK, nohz_flags(self))) {
 		if (idle_cpu(self) && !need_resched())
 			return true;

 		/*
 		 * We can't run Idle Load Balance on this CPU for this time so we
 		 * cancel it and clear NOHZ_BALANCE_KICK
 		 */
 		clear_bit(NOHZ_BALANCE_KICK, nohz_flags(self));
 	}

 	return false;
 }
 #else /* CONFIG_NO_HZ_COMMON */
 /* !CONFIG_NO_HZ_COMMON: there is never a nohz idle kick pending. */
 static inline bool got_nohz_idle_kick(void)
 {
 	return false;
 }
 #endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
 /*
  * The tick may be stopped only while this CPU's rq has at most one
  * running task.
  */
 bool sched_can_stop_tick(void)
 {
 	/*
 	 * More than one running task need preemption.
 	 * nr_running update is assumed to be visible
 	 * after IPI is sent from wakers.
 	 */
 	return this_rq()->nr_running <= 1;
 }
 #endif /* CONFIG_NO_HZ_FULL */
 /*
  * Age rq->rt_avg: for every full sched_avg_period() by which the rq
  * clock is ahead of rq->age_stamp, advance age_stamp by one period and
  * halve rt_avg.
  */
 void sched_avg_update(struct rq *rq)
 {
 	s64 period = sched_avg_period();
 	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
 		/*
 		 * Inline assembly required to prevent the compiler
 		 * optimising this loop into a divmod call.
 		 * See __iter_div_u64_rem() for another example of this.
 		 */
 		asm("" : "+rm" (rq->age_stamp));
 		rq->age_stamp += period;
 		rq->rt_avg /= 2;
 	}
 }
 #endif /* CONFIG_SMP */
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
 			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 /*
  * Iterate task_group tree rooted at *from, calling @down when first entering a
  * node and @up when leaving it for the final time.
  *
  * Caller must hold rcu_lock or sufficient equivalent.
  *
  * The walk is iterative: it descends by jumping back to the "down"
  * label and climbs by jumping to the "up" label inside the child loop,
  * which resumes that loop after the child just finished. A non-zero
  * return from @down or @up aborts the walk and is returned; otherwise
  * the return value is that of the last visitor call (0).
  */
 int walk_tg_tree_from(struct task_group *from,
 			     tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
 	int ret;
 	parent = from;
 down:
 	ret = (*down)(parent, data);
 	if (ret)
 		goto out;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		/* Descend into this child; we come back via the "up" label below. */
 		parent = child;
 		goto down;
 up:
 		/* Re-entered from below with @child restored: move on to its next sibling. */
 		continue;
 	}
 	ret = (*up)(parent, data);
 	if (ret || parent == from)
 		goto out;
 	/* Climb one level and resume the parent's child iteration after this node. */
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
 out:
 	return ret;
 }
 /* No-op tg_visitor for walkers that need only one of the down/up callbacks; always returns 0. */
 int tg_nop(struct task_group *tg, void *data)
 {
 	return 0;
 }
 #endif
 /*
  * Fill in p->se.load (weight and inverse weight).
  *
  * SCHED_IDLE tasks get the fixed idle-priority pair; every other task
  * gets the table entries indexed by its static priority relative to
  * MAX_RT_PRIO.
  */
 static void set_load_weight(struct task_struct *p)
 {
 	struct load_weight *lw = &p->se.load;

 	if (p->policy != SCHED_IDLE) {
 		int idx = p->static_prio - MAX_RT_PRIO;

 		lw->weight = scale_load(prio_to_weight[idx]);
 		lw->inv_weight = prio_to_wmult[idx];
 	} else {
 		/*
 		 * SCHED_IDLE tasks get minimal weight:
 		 */
 		lw->weight = scale_load(WEIGHT_IDLEPRIO);
 		lw->inv_weight = WMULT_IDLEPRIO;
 	}
 }
 /*
  * Put @p on @rq through its scheduling class: refresh the rq clock,
  * record the sched_info queue event, then call the class's
  * enqueue_task() hook with @flags.
  */
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
 	sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 /*
  * Take @p off @rq through its scheduling class: refresh the rq clock,
  * record the sched_info dequeue event, then call the class's
  * dequeue_task() hook with @flags.
  */
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
 	sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 /*
  * Enqueue @p on @rq. If @p was counted in rq->nr_uninterruptible
  * (task_contributes_to_load()), drop that count first.
  */
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 	enqueue_task(rq, p, flags);
 }
 /*
  * Dequeue @p from @rq. If @p contributes to load
  * (task_contributes_to_load()), count it in rq->nr_uninterruptible
  * first.
  */
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible++;
 	dequeue_task(rq, p, flags);
 }
 /*
  * Advance rq->clock_task by @delta minus whatever part of @delta is
  * attributed to irq time (CONFIG_IRQ_TIME_ACCOUNTING) and to paravirt
  * steal time (CONFIG_PARAVIRT_TIME_ACCOUNTING). The subtracted time is
  * fed to sched_rt_avg_update() when the NONTASK_CAPACITY feature is on.
  */
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 /*
  * In theory, the compiler should just see 0 here, and optimize out the call
  * to sched_rt_avg_update. But I don't trust it...
  */
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	s64 steal = 0, irq_delta = 0;
 #endif
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 	/*
 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
 	 * this case when a previous update_rq_clock() happened inside a
 	 * {soft,}irq region.
 	 *
 	 * When this happens, we stop ->clock_task and only update the
 	 * prev_irq_time stamp to account for the part that fit, so that a next
 	 * update will consume the rest. This ensures ->clock_task is
 	 * monotonic.
 	 *
 	 * It does however cause some slight miss-attribution of {soft,}irq
 	 * time, a more accurate solution would be to update the irq_time using
 	 * the current rq->clock timestamp, except that would require using
 	 * atomic ops.
 	 */
 	if (irq_delta > delta)
 		irq_delta = delta;
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
 		steal = paravirt_steal_clock(cpu_of(rq));
 		steal -= rq->prev_steal_time_rq;
 		/* Never account more steal time than what is left of this delta. */
 		if (unlikely(steal > delta))
 			steal = delta;
 		rq->prev_steal_time_rq += steal;
 		delta -= steal;
 	}
 #endif
 	rq->clock_task += delta;
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		sched_rt_avg_update(rq, irq_delta + steal);
 #endif
 }
 /*
  * Install @stop as the stop task of @cpu's rq (NULL removes it).
  *
  * A new stop task is first made SCHED_FIFO at priority MAX_RT_PRIO - 1
  * and then switched to stop_sched_class; a previously installed stop
  * task is handed back to rt_sched_class.
  */
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
 	if (stop) {
 		/*
 		 * Make it appear like a SCHED_FIFO task, its something
 		 * userspace knows about and won't get confused about.
 		 *
 		 * Also, it will make PI more or less work without too
 		 * much confusion -- but then, stop work should not
 		 * rely on PI working anyway.
 		 */
 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 		stop->sched_class = &stop_sched_class;
 	}
 	cpu_rq(cpu)->stop = stop;
 	if (old_stop) {
 		/*
 		 * Reset it back to a normal scheduling class so that
 		 * it can die in pieces.
 		 */
 		old_stop->sched_class = &rt_sched_class;
 	}
 }
 /*
  * __normal_prio - return the priority that is based on the static prio
  *
  * Currently this is p->static_prio unchanged.
  */
 static inline int __normal_prio(struct task_struct *p)
 {
 	return p->static_prio;
 }
 /*
  * Calculate the expected normal priority: i.e. priority
  * without taking RT-inheritance into account. Might be
  * boosted by interactivity modifiers. Changes upon fork,
  * setprio syscalls, and whenever the interactivity
  * estimator recalculates.
  *
  * Deadline tasks map to MAX_DL_PRIO-1, RT tasks to
  * MAX_RT_PRIO-1 - rt_priority, everything else to __normal_prio().
  */
 static inline int normal_prio(struct task_struct *p)
 {
 	if (task_has_dl_policy(p))
 		return MAX_DL_PRIO-1;

 	if (task_has_rt_policy(p))
 		return MAX_RT_PRIO-1 - p->rt_priority;

 	return __normal_prio(p);
 }
 /*
  * Calculate the current priority, i.e. the priority
  * taken into account by the scheduler. This value might
  * be boosted by RT tasks, or might be boosted by
  * interactivity modifiers. Will be RT if the task got
  * RT-boosted. If not then it returns p->normal_prio.
  *
  * Side effect: p->normal_prio is recomputed on every call.
  */
 static int effective_prio(struct task_struct *p)
 {
 	p->normal_prio = normal_prio(p);

 	/*
 	 * If we are RT tasks or we were boosted to RT priority,
 	 * keep the priority unchanged. Otherwise, update priority
 	 * to the normal priority:
 	 */
 	return rt_prio(p->prio) ? p->prio : p->normal_prio;
 }
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
  *
  * Compares @p against the current task of the CPU that @p is
  * assigned to.
  *
  * Return: 1 if the task is currently executing. 0 otherwise.
  */
 inline int task_curr(const struct task_struct *p)
 {
 	const struct task_struct *running = cpu_curr(task_cpu(p));

 	return running == p;
 }
 /*
  * Notify the scheduling classes after @p's class or priority may have
  * changed.
  *
  * Same class as before: call prio_changed() when the priority differs
  * from @oldprio or @p is a deadline task. Different class: call the
  * old class's switched_from() (if it has one), then the new class's
  * switched_to().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
 				       int oldprio)
 {
 	if (prev_class == p->sched_class) {
 		if (oldprio != p->prio || dl_task(p))
 			p->sched_class->prio_changed(rq, p, oldprio);
 		return;
 	}

 	if (prev_class->switched_from)
 		prev_class->switched_from(rq, p);
 	p->sched_class->switched_to(rq, p);
 }
 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
 	const struct sched_class *class;
 	if (p->sched_class == rq->curr->sched_class) {
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 	} else {
 		for_each_class(class) {
 			if (class == rq->curr->sched_class)
 				break;
 			if (class == p->sched_class) {
 				resched_curr(rq);
 				break;
 			}
 		}
 	}
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
 	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 #ifdef CONFIG_SMP
 /*
  * Move @p's CPU assignment to @new_cpu.
  *
  * Emits the sched_migrate_task tracepoint unconditionally. When the
  * CPU actually changes, the class's migrate_task_rq() hook (if any) is
  * called, p->se.nr_migrations is bumped and a CPU-migration perf
  * software event is recorded, before __set_task_cpu() stores the new
  * CPU.
  */
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
 	/*
 	 * We should never call set_task_cpu() on a blocked task,
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_preempt_count(p) & PREEMPT_ACTIVE));
 #ifdef CONFIG_LOCKDEP
 	/*
 	 * The caller should hold either p->pi_lock or rq->lock, when changing
 	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
 	 *
 	 * sched_move_task() holds both and thus holding either pins the cgroup,
 	 * see task_group().
 	 *
 	 * Furthermore, all task_rq users should acquire both locks, see
 	 * task_rq_lock().
 	 */
 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
 				      lockdep_is_held(&task_rq(p)->lock)));
 #endif
 #endif
 	trace_sched_migrate_task(p, new_cpu);
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
 	__set_task_cpu(p, new_cpu);
 }
 /*
  * Move one half of a task swap to @cpu.  A queued task is taken off its
  * current runqueue, retargeted with set_task_cpu() and put on @cpu's
  * runqueue; a task that is not queued only has ->wake_cpu updated.
  */
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
 	struct rq *from, *to;

 	if (!task_on_rq_queued(p)) {
 		/*
 		 * Task isn't running anymore; make it appear like we migrated
 		 * it before it went to sleep. This means on wakeup we make the
 		 * previous cpu our target instead of where it really is.
 		 */
 		p->wake_cpu = cpu;
 		return;
 	}

 	from = task_rq(p);
 	to = cpu_rq(cpu);

 	deactivate_task(from, p, 0);
 	set_task_cpu(p, cpu);
 	activate_task(to, p, 0);
 	check_preempt_curr(to, p, 0);
 }
 /* Argument bundle passed from migrate_swap() to migrate_swap_stop(). */
 struct migration_swap_arg {
 	struct task_struct *src_task;	/* task sampled on @src_cpu */
 	struct task_struct *dst_task;	/* task sampled on @dst_cpu */
 	int src_cpu;
 	int dst_cpu;
 };
 /*
  * Callback handed to stop_two_cpus() by migrate_swap().  With both tasks'
  * ->pi_lock and both runqueue locks held, re-validate the checks the caller
  * made locklessly and, if they still hold, swap the CPUs of the two tasks.
  *
  * @data: points at a struct migration_swap_arg.
  *
  * Returns 0 when the swap was carried out, -EAGAIN when either task is no
  * longer on its recorded CPU or is not allowed on the other task's CPU.
  */
 static int migrate_swap_stop(void *data)
 {
 	struct migration_swap_arg *arg = data;
 	struct rq *src_rq, *dst_rq;
 	int ret = -EAGAIN;
 	src_rq = cpu_rq(arg->src_cpu);
 	dst_rq = cpu_rq(arg->dst_cpu);
 	/* Task locks first, then runqueue locks; dropped in reverse below. */
 	double_raw_lock(&arg->src_task->pi_lock,
 			&arg->dst_task->pi_lock);
 	double_rq_lock(src_rq, dst_rq);
 	/* Either task may have moved since its CPU was sampled. */
 	if (task_cpu(arg->dst_task) != arg->dst_cpu)
 		goto unlock;
 	if (task_cpu(arg->src_task) != arg->src_cpu)
 		goto unlock;
 	/* Each task must be allowed to run on the CPU it is swapped to. */
 	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
 		goto unlock;
 	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
 		goto unlock;
 	__migrate_swap_task(arg->src_task, arg->dst_cpu);
 	__migrate_swap_task(arg->dst_task, arg->src_cpu);
 	ret = 0;
 unlock:
 	double_rq_unlock(src_rq, dst_rq);
 	raw_spin_unlock(&arg->dst_task->pi_lock);
 	raw_spin_unlock(&arg->src_task->pi_lock);
 	return ret;
 }
 /*
  * Cross migrate two tasks
  */
 int migrate_swap(struct task_struct *cur, struct task_struct *p)
 {
 	struct migration_swap_arg arg;
 	int ret = -EINVAL;
 	arg = (struct migration_swap_arg){
 		.src_task = cur,
 		.src_cpu = task_cpu(cur),
 		.dst_task = p,
 		.dst_cpu = task_cpu(p),
 	};
 	if (arg.src_cpu == arg.dst_cpu)
 		goto out;
 	/*
 	 * These three tests are all lockless; this is OK since all of them
 	 * will be re-checked with proper locks held further down the line.
 	 */
 	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
 		goto out;
 	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
 		goto out;
 	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
 		goto out;
 	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
 	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
 out:
 	return ret;
 }
 /* A task paired with the CPU it should be moved to. */
 struct migration_arg {
 	struct task_struct *task;	/* task to act on */
 	int dest_cpu;			/* destination CPU */
 };
 /* Forward declaration; the definition is not in this part of the file. */
 static int migration_cpu_stop(void *data);
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  * @p: the task to wait for
  * @match_state: if nonzero, the @p->state value the caller just observed
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
  * not expected to change.  If it changes, i.e. @p might have woken up,
  * then return zero.  When we succeed in waiting for @p to be off its CPU,
  * we return a positive number (its total switch count).  If a second call
  * a short while later returns the same number, the caller can be sure that
  * @p has remained unscheduled the whole time.
  *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  *
  * Return: 0 if @p->state stopped matching @match_state, otherwise
  * @p->nvcsw with the most significant bit set (so it is never zero).
  */
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, queued;
 	unsigned long ncsw;
 	struct rq *rq;
 	for (;;) {
 		/*
 		 * We do the initial early heuristics without holding
 		 * any task-queue locks at all. We'll only try to get
 		 * the runqueue lock when things look like they will
 		 * work out!
 		 */
 		rq = task_rq(p);
 		/*
 		 * If the task is actively running on another CPU
 		 * still, just relax and busy-wait without holding
 		 * any locks.
 		 *
 		 * NOTE! Since we don't hold any locks, it's not
 		 * even sure that "rq" stays as the right runqueue!
 		 * But we don't care, since "task_running()" will
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
 		while (task_running(rq, p)) {
 			if (match_state && unlikely(p->state != match_state))
 				return 0;
 			cpu_relax();
 		}
 		/*
 		 * Ok, time to look more closely! We need the rq
 		 * lock now, to be *sure*. If we're wrong, we'll
 		 * just go back and repeat.
 		 */
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		/* Snapshot everything we need while the lock is held. */
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &flags);
 		/*
 		 * If it changed from the expected state, bail out now.
 		 */
 		if (unlikely(!ncsw))
 			break;
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
 		 *
 		 * Oops. Go back and try again..
 		 */
 		if (unlikely(running)) {
 			cpu_relax();
 			continue;
 		}
 		/*
 		 * It's not enough that it's not actively running,
 		 * it must be off the runqueue _entirely_, and not
 		 * preempted!
 		 *
 		 * So if it was still runnable (but just not actively
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
 		if (unlikely(queued)) {
 			/* Sleep uninterruptibly for NSEC_PER_SEC/HZ ns, then retry. */
 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 		/*
 		 * Ahh, all good. It wasn't running, and it wasn't
 		 * runnable, which means that it will never become
 		 * running in the future either. We're all done!
 		 */
 		break;
 	}
 	return ncsw;
 }
 /**
  * kick_process - kick a running thread to enter/exit the kernel
  * @p: the to-be-kicked thread
  *
  * Cause a process which is running on another CPU to enter
  * kernel-mode, without any delay. (to get signals handled.)
  *
  * NOTE: the runqueue lock is deliberately not taken here; all this
  * function wants is for the remote task to enter the kernel.  If the
  * IPI races with a migration of the task to another CPU, no harm is
  * done and the purpose has been achieved as well.
  */
 void kick_process(struct task_struct *p)
 {
 	int target;

 	preempt_disable();
 	target = task_cpu(p);
 	/* Only a task currently running on some other CPU needs the IPI. */
 	if (target != smp_processor_id()) {
 		if (task_curr(p))
 			smp_send_reschedule(target);
 	}
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_SMP
 /*
  * select_fallback_rq - find an online, active CPU that @p may run on.
  * @cpu: CPU used to pick the preferred node, and named in the log message
  * @p: task needing a CPU
  *
  * Search order: allowed CPUs on @cpu's node, then any allowed CPU.  If none
  * is usable the allowed mask is widened, first with
  * cpuset_cpus_allowed_fallback() and then to cpu_possible_mask; if even
  * that yields nothing, BUG().
  *
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
 	int nid = cpu_to_node(cpu);
 	const struct cpumask *nodemask = NULL;
 	/* Tracks how far the allowed mask has been widened so far. */
 	enum { cpuset, possible, fail } state = cpuset;
 	int dest_cpu;
 	/*
 	 * If the node that the cpu is on has been offlined, cpu_to_node()
 	 * will return -1. There is no cpu on the node, and we should
 	 * select the cpu on the other node.
 	 */
 	if (nid != -1) {
 		nodemask = cpumask_of_node(nid);
 		/* Look for allowed, online CPU in same node. */
 		for_each_cpu(dest_cpu, nodemask) {
 			if (!cpu_online(dest_cpu))
 				continue;
 			if (!cpu_active(dest_cpu))
 				continue;
 			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 				return dest_cpu;
 		}
 	}
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
 			if (!cpu_online(dest_cpu))
 				continue;
 			if (!cpu_active(dest_cpu))
 				continue;
 			goto out;
 		}
 		/* Nothing usable: widen the mask one step and search again. */
 		switch (state) {
 		case cpuset:
 			/* No more Mr. Nice Guy. */
 			cpuset_cpus_allowed_fallback(p);
 			state = possible;
 			break;
 		case possible:
 			do_set_cpus_allowed(p, cpu_possible_mask);
 			state = fail;
 			break;
 		case fail:
 			BUG();
 			break;
 		}
 	}
 out:
 	/* Only report when the allowed mask had to be widened. */
 	if (state != cpuset) {
 		/*
 		 * Don't tell them about moving exiting tasks or
 		 * kernel threads (both mm NULL), since they never
 		 * leave kernel.
 		 */
 		if (p->mm && printk_ratelimit()) {
 			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
 					task_pid_nr(p), p->comm, cpu);
 		}
 	}
 	return dest_cpu;
 }
 /*
  * Ask @p's scheduling class for a CPU and validate the answer.
  *
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
 	int picked = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);

 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
 	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
 	 * cpu.
 	 *
 	 * Since this is common to all placement strategies, this lives here.
 	 *
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
 	if (likely(cpumask_test_cpu(picked, tsk_cpus_allowed(p)) &&
 		   cpu_online(picked)))
 		return picked;

 	/* The class chose a CPU that is disallowed or offline. */
 	return select_fallback_rq(task_cpu(p), p);
 }
 /*
  * Fold @sample into the running average stored at @avg: the average is
  * moved by the signed difference (sample - *avg) shifted right by 3.
  */
 static void update_avg(u64 *avg, u64 sample)
 {
 	s64 delta = sample - *avg;

 	*avg = *avg + (delta >> 3);
 }
 #endif
 /*
  * Account a wakeup of @p targeted at @cpu in the schedstat counters.
  * The whole body compiles away without CONFIG_SCHEDSTATS.
  */
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
 #ifdef CONFIG_SCHEDSTATS
 	struct rq *rq = this_rq();
 #ifdef CONFIG_SMP
 	int this_cpu = smp_processor_id();

 	if (cpu != this_cpu) {
 		struct sched_domain *sd;

 		schedstat_inc(p, se.statistics.nr_wakeups_remote);
 		/* Charge the first domain of this CPU whose span holds @cpu. */
 		rcu_read_lock();
 		for_each_domain(this_cpu, sd) {
 			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 				continue;
 			schedstat_inc(sd, ttwu_wake_remote);
 			break;
 		}
 		rcu_read_unlock();
 	} else {
 		schedstat_inc(rq, ttwu_local);
 		schedstat_inc(p, se.statistics.nr_wakeups_local);
 	}

 	if (wake_flags & WF_MIGRATED)
 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */

 	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);

 	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
 #endif /* CONFIG_SCHEDSTATS */
 }
 /*
  * Put @p on @rq with the given enqueue flags and mark it queued.  Workqueue
  * workers (PF_WQ_WORKER) additionally get wq_worker_waking_up() called with
  * the runqueue's CPU.
  */
 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	/* if a worker is waking up, notify workqueue */
 	if (p->flags & PF_WQ_WORKER)
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 /*
  * Mark the task runnable and perform wakeup-preemption.
  *
  * @rq: runqueue the wakeup is performed on
  * @p: task being woken
  * @wake_flags: forwarded to check_preempt_curr()
  *
  * On SMP this also runs the class's optional ->task_woken() hook and, when
  * rq->idle_stamp is set, folds the time elapsed since that stamp into
  * rq->avg_idle (capped at twice rq->max_idle_balance_cost) and clears it.
  */
 static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	check_preempt_curr(rq, p, wake_flags);
 	trace_sched_wakeup(p, true);
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
 		u64 max = 2*rq->max_idle_balance_cost;
 		update_avg(&rq->avg_idle, delta);
 		/* Keep the average from growing past the cap. */
 		if (rq->avg_idle > max)
 			rq->avg_idle = max;
 		rq->idle_stamp = 0;
 	}
 #endif
 }
 /*
  * Enqueue @p on @rq with ENQUEUE_WAKEUP | ENQUEUE_WAKING and finish the
  * wakeup via ttwu_do_wakeup().  On SMP, a task flagged as contributing to
  * load first has its share removed from rq->nr_uninterruptible.
  */
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
 #endif
 	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 	ttwu_do_wakeup(rq, p, wake_flags);
 }
 /*
  * Called in case the task @p isn't fully descheduled from its runqueue,
  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
  * since all we need to do is flip p->state to TASK_RUNNING, since
  * the task is still ->on_rq.
  *
  * Returns 1 if @p was found queued (under its runqueue lock) and the light
  * wakeup was done, 0 otherwise.
  */
 static int ttwu_remote(struct task_struct *p, int wake_flags)
 {
 	struct rq *rq;
 	int ret = 0;
 	rq = __task_rq_lock(p);
 	if (task_on_rq_queued(p)) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
 		ret = 1;
 	}
 	__task_rq_unlock(rq);
 	return ret;
 }
 #ifdef CONFIG_SMP
 /*
  * Detach everything queued on this CPU's rq->wake_list and activate each
  * task on this runqueue with rq->lock held and interrupts saved/disabled.
  */
 void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *node = llist_del_all(&rq->wake_list);
 	struct llist_node *next;
 	unsigned long flags;

 	if (!node)
 		return;

 	raw_spin_lock_irqsave(&rq->lock, flags);
 	for (; node; node = next) {
 		struct task_struct *p = llist_entry(node, struct task_struct,
 						    wake_entry);

 		/* Fetch the successor before the task is activated. */
 		next = llist_next(node);
 		ttwu_do_activate(rq, p, 0);
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
  * scheduler_ipi - handler body for the reschedule IPI.
  *
  * Folds a pending need-resched into the preempt count, then, only if there
  * is queued wakeup work or a nohz idle kick, processes the wake list and
  * raises SCHED_SOFTIRQ for the idle balance inside irq_enter()/irq_exit().
  */
 void scheduler_ipi(void)
 {
 	/*
 	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
 	 * this IPI.
 	 */
 	preempt_fold_need_resched();
 	/* Plain reschedule request: nothing more to do here. */
 	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
 		return;
 	/*
 	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
 	 * traditionally all their work was done from the interrupt return
 	 * path. Now that we actually do some work, we need to make sure
 	 * we do call them.
 	 *
 	 * Some archs already do call them, luckily irq_enter/exit nest
 	 * properly.
 	 *
 	 * Arguably we should visit all archs and update all handlers,
 	 * however a fair share of IPIs are still resched only so this would
 	 * somewhat pessimize the simple resched case.
 	 */
 	irq_enter();
 	sched_ttwu_pending();
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
 	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
 	irq_exit();
 }
 /*
  * Hand @p to @cpu by pushing it onto that CPU's rq->wake_list.  When
  * llist_add() returns nonzero the target is notified: without an IPI if
  * set_nr_if_polling() succeeds on its idle task, with smp_send_reschedule()
  * otherwise.
  */
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);

 	/* NOTE(review): a zero return presumably means the list was non-empty. */
 	if (!llist_add(&p->wake_entry, &rq->wake_list))
 		return;

 	if (set_nr_if_polling(rq->idle))
 		trace_sched_wake_idle_without_ipi(cpu);
 	else
 		smp_send_reschedule(cpu);
 }
 /*
  * wake_up_if_idle - kick @cpu if it is currently running its idle task.
  * @cpu: CPU to examine
  *
  * The first look at rq->curr is made without rq->lock, so it is done
  * through rcu_dereference() inside an RCU read-side critical section.  If
  * the idle task is polling, set_nr_if_polling() is enough and no IPI is
  * sent; otherwise the idle check is repeated under rq->lock before the
  * reschedule IPI goes out.
  */
 void wake_up_if_idle(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 	rcu_read_lock();
 	/* Lockless peek: bail out early when @cpu is busy. */
 	if (!is_idle_task(rcu_dereference(rq->curr)))
 		goto out;
 	if (set_nr_if_polling(rq->idle)) {
 		trace_sched_wake_idle_without_ipi(cpu);
 	} else {
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (is_idle_task(rq->curr))
 			smp_send_reschedule(cpu);
 		/* Else cpu is not in idle, do nothing here */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 out:
 	rcu_read_unlock();
 }
 /*
  * Returns true when @this_cpu and @that_cpu have the same per-CPU
  * sd_llc_id value.
  */
 bool cpus_share_cache(int this_cpu, int that_cpu)
 {
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 #endif /* CONFIG_SMP */
 /*
  * Queue the wakeup of @p on @cpu.  On SMP, with the TTWU_QUEUE feature
  * enabled and when the waking CPU does not share a cache with @cpu (per
  * cpus_share_cache()), the task is handed over through
  * ttwu_queue_remote().  Otherwise it is activated directly under the
  * target runqueue's lock.
  */
 static void ttwu_queue(struct task_struct *p, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
 	}
 #endif
 	raw_spin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
 	raw_spin_unlock(&rq->lock);
 }
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
  *
  * Put it on the run-queue if it's not already there. The "current"
  * thread is always on the run-queue (except when the actual
  * re-schedule is in progress), and as such you're allowed to do
  * the simpler "current->state = TASK_RUNNING" to mark yourself
  * runnable without the overhead of this.
  *
  * Return: %true if @p was woken up, %false if it was already running
  * or @state didn't match @p's state.
  */
 static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
 	unsigned long flags;
 	int cpu, success = 0;
 	/*
 	 * If we are going to wake up a thread waiting for CONDITION we
 	 * need to ensure that CONDITION=1 done by the caller can not be
 	 * reordered with p->state check below. This pairs with mb() in
 	 * set_current_state() the waiting thread does.
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	if (!(p->state & state))
 		goto out;
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 	/* Still on a runqueue: the light wakeup in ttwu_remote() suffices. */
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 #ifdef CONFIG_SMP
 	/*
 	 * If the owning (remote) cpu is still in the middle of schedule() with
 	 * this task as prev, wait until its done referencing the task.
 	 */
 	while (p->on_cpu)
 		cpu_relax();
 	/*
 	 * Pairs with the smp_wmb() in finish_lock_switch().
 	 */
 	smp_rmb();
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 	if (p->sched_class->task_waking)
 		p->sched_class->task_waking(p);
 	/* Pick the wakeup CPU, starting the search from p->wake_cpu. */
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
 	}
 #endif /* CONFIG_SMP */
 	ttwu_queue(p, cpu);
 stat:
 	ttwu_stat(p, cpu, wake_flags);
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 	return success;
 }
 /**
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
  * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task.
  *
  * Note that rq->lock may be dropped and re-taken here in order to acquire
  * @p->pi_lock first; it is held again on return.
  */
 static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 	if (WARN_ON_ONCE(rq != this_rq()) ||
 	    WARN_ON_ONCE(p == current))
 		return;
 	lockdep_assert_held(&rq->lock);
 	/*
 	 * Try the cheap path first; on contention release rq->lock and take
 	 * the two locks in pi_lock -> rq->lock order.
 	 */
 	if (!raw_spin_trylock(&p->pi_lock)) {
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
 	}
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 	if (!task_on_rq_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 	ttwu_do_wakeup(rq, p, 0);
 	ttwu_stat(p, smp_processor_id(), 0);
 out:
 	raw_spin_unlock(&p->pi_lock);
 }
 /**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
  * Attempt to wake up the nominated process and move it to the set of runnable
  * processes.  Only tasks in a TASK_NORMAL state are woken; a warning is
  * emitted if @p is stopped or traced.
  *
  * Return: 1 if the process was woken up, 0 if it was already running.
  *
  * It may be assumed that this function implies a write memory barrier before
  * changing the task state if and only if any tasks are woken up.
  */
 int wake_up_process(struct task_struct *p)
 {
 	WARN_ON(task_is_stopped_or_traced(p));
 	return try_to_wake_up(p, TASK_NORMAL, 0);
 }
 EXPORT_SYMBOL(wake_up_process);
 /*
  * Wake @p if its state matches the @state mask.  Returns the result of
  * try_to_wake_up() called with no wake flags.
  */
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
 	return try_to_wake_up(p, state, 0);
 }
 /*
  * Reset the static sched_dl_entity parameters of @p (runtime, deadline,
  * period, flags and bandwidth) to zero.
  */
 void __dl_clear_params(struct task_struct *p)
 {
 	p->dl.dl_runtime = 0;
 	p->dl.dl_deadline = 0;
 	p->dl.dl_period = 0;
 	p->dl.flags = 0;
 	p->dl.dl_bw = 0;
 }
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  *
  * @clone_flags: clone flags; only CLONE_VM is examined here, and only with
  *               CONFIG_NUMA_BALANCING
  * @p: the new task
  *
  * __sched_fork() is basic setup used by init_idle() too:
  */
 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	/* Fair-class entity: not queued, no accumulated runtime. */
 	p->on_rq			= 0;
 	p->se.on_rq			= 0;
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 	/* Deadline-class entity: fresh rb node, timer and cleared params. */
 	RB_CLEAR_NODE(&p->dl.rb_node);
 	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	__dl_clear_params(p);
 	INIT_LIST_HEAD(&p->rt.run_list);
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 #ifdef CONFIG_NUMA_BALANCING
 	/* Sole user of the mm: restart its NUMA scan bookkeeping. */
 	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
 		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 		p->mm->numa_scan_seq = 0;
 	}
 	/* A CLONE_VM child inherits the parent's preferred node. */
 	if (clone_flags & CLONE_VM)
 		p->numa_preferred_nid = current->numa_preferred_nid;
 	else
 		p->numa_preferred_nid = -1;
 	p->node_stamp = 0ULL;
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	/* numa_work.next points back at numa_work itself. */
 	p->numa_work.next = &p->numa_work;
 	p->numa_faults_memory = NULL;
 	p->numa_faults_buffer_memory = NULL;
 	p->last_task_numa_placement = 0;
 	p->last_sum_exec_runtime = 0;
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 #ifdef CONFIG_NUMA_BALANCING
 #ifdef CONFIG_SCHED_DEBUG
 /*
  * CONFIG_SCHED_DEBUG variant: switch NUMA balancing on or off by setting
  * the "NUMA" or "NO_NUMA" scheduler feature string.
  */
 void set_numabalancing_state(bool enabled)
 {
 	sched_feat_set(enabled ? "NUMA" : "NO_NUMA");
 }
 #else
 /* !CONFIG_SCHED_DEBUG: NUMA balancing state is kept in a plain flag. */
 __read_mostly bool numabalancing_enabled;
 /* Store the requested NUMA balancing state in numabalancing_enabled. */
 void set_numabalancing_state(bool enabled)
 {
 	numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
 #ifdef CONFIG_PROC_SYSCTL
 /*
  * sysctl handler for the NUMA balancing switch.
  *
  * Works on a private copy of @table whose data pointer is redirected to a
  * local integer seeded from numabalancing_enabled.  Writes need
  * CAP_SYS_ADMIN (-EPERM otherwise); a successful write is applied with
  * set_numabalancing_state().  Returns what proc_dointvec_minmax() returned.
  */
 int sysctl_numa_balancing(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int state = numabalancing_enabled;
 	struct ctl_table t;
 	int err;

 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;

 	t = *table;
 	t.data = &state;
 	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);

 	/* Only a successful write changes the balancing state. */
 	if (err >= 0 && write)
 		set_numabalancing_state(state);

 	return err;
 }
 #endif
 #endif
 /*
  * fork()/clone()-time setup:
  *
  * @clone_flags: clone flags, passed on to __sched_fork()
  * @p: the new task
  *
  * Returns 0 on success, or -EAGAIN when the child would end up with a
  * deadline priority.
  */
 int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long flags;
 	int cpu = get_cpu();
 	__sched_fork(clone_flags, p);
 	/*
 	 * We mark the process as running here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
 	 */
 	p->prio = current->normal_prio;
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
 		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
 		} else if (PRIO_TO_NICE(p->static_prio) < 0)
 			p->static_prio = NICE_TO_PRIO(0);
 		p->prio = p->normal_prio = __normal_prio(p);
 		set_load_weight(p);
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
 		 * fulfilled its duty:
 		 */
 		p->sched_reset_on_fork = 0;
 	}
 	/* Choose the scheduling class from the (possibly reset) priority. */
 	if (dl_prio(p->prio)) {
 		put_cpu();
 		return -EAGAIN;
 	} else if (rt_prio(p->prio)) {
 		p->sched_class = &rt_sched_class;
 	} else {
 		p->sched_class = &fair_sched_class;
 	}
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
 	/*
 	 * The child is not yet in the pid-hash so no cgroup attach races,
 	 * and the cgroup is pinned to this child because cgroup_fork()
 	 * runs before sched_fork().
 	 *
 	 * Silence PROVE_RCU.
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
 	put_cpu();
 	return 0;
 }
 /*
  * to_ratio - express @runtime / @period as a fixed-point value scaled by
  * 2^20.
  *
  * RUNTIME_INF maps to exactly 1 << 20 and a zero @period maps to 0; the
  * RUNTIME_INF test takes precedence when both apply.
  */
 unsigned long to_ratio(u64 period, u64 runtime)
 {
 	unsigned long ratio;

 	if (runtime == RUNTIME_INF) {
 		ratio = 1ULL << 20;
 	} else if (period == 0) {
 		/*
 		 * Doing this here saves a lot of checks in all
 		 * the calling paths, and returning zero seems
 		 * safe for them anyway.
 		 */
 		ratio = 0;
 	} else {
 		ratio = div64_u64(runtime << 20, period);
 	}

 	return ratio;
 }
 #ifdef CONFIG_SMP
 /*
  * SMP: the deadline bandwidth of CPU @i lives in that CPU's root domain.
  * A lockdep assertion checks that the caller is in a sched-RCU read-side
  * section.
  */
 inline struct dl_bw *dl_bw_of(int i)
 {
 	rcu_lockdep_assert(rcu_read_lock_sched_held(),
 			   "sched RCU must be held");
 	return &cpu_rq(i)->rd->dl_bw;
 }
 /*
  * SMP: count the CPUs that are both in the span of CPU @i's root domain
  * and in cpu_active_mask.  A lockdep assertion checks that the caller is
  * in a sched-RCU read-side section.
  */
 static inline int dl_bw_cpus(int i)
 {
 	struct root_domain *rd = cpu_rq(i)->rd;
 	int cpus = 0;
 	rcu_lockdep_assert(rcu_read_lock_sched_held(),
 			   "sched RCU must be held");
 	/* The parameter @i is reused as the loop cursor from here on. */
 	for_each_cpu_and(i, rd->span, cpu_active_mask)
 		cpus++;
 	return cpus;
 }
 #else
 /* !SMP: the deadline bandwidth is kept in the runqueue's dl structure. */
 inline struct dl_bw *dl_bw_of(int i)
 {
 	return &cpu_rq(i)->dl.dl_bw;
 }
 /* !SMP: there is only one CPU to account for. */
 static inline int dl_bw_cpus(int i)
 {
 	return 1;
 }
 #endif
 /* Give @tsk_bw back: subtract it from the total allocated in @dl_b. */
 static inline
 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
 {
 	dl_b->total_bw = dl_b->total_bw - tsk_bw;
 }
 /* Account @tsk_bw: add it to the total allocated in @dl_b. */
 static inline
 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 {
 	dl_b->total_bw = dl_b->total_bw + tsk_bw;
 }
 /*
  * Would replacing @old_bw with @new_bw push the total allocated bandwidth
  * past dl_b->bw * @cpus?  A dl_b->bw of -1 never overflows.
  */
 static inline
 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 {
 	if (dl_b->bw == -1)
 		return false;

 	return dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 }
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
  * constraints. If yes, this function also accordingly updates the currently
  * allocated bandwidth to reflect the new situation.
  *
  * @p: task whose policy/parameters are about to change
  * @policy: the policy being requested
  * @attr: requested parameters; sched_deadline is used as the period when
  *        sched_period is zero
  *
  * Returns 0 if the change is admitted (or the bandwidth is unchanged),
  * -1 otherwise.
  *
  * This function is called while holding p's rq->lock.
  */
 static int dl_overflow(struct task_struct *p, int policy,
 		       const struct sched_attr *attr)
 {
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 	u64 period = attr->sched_period ?: attr->sched_deadline;
 	u64 runtime = attr->sched_runtime;
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;
 	/* Nothing to account when the bandwidth stays the same. */
 	if (new_bw == p->dl.dl_bw)
 		return 0;
 	/*
 	 * Whether a task enters, leaves, or stays -deadline but changes
 	 * its parameters, we may need to update accordingly the total
 	 * allocated bandwidth of the container.
 	 */
 	raw_spin_lock(&dl_b->lock);
 	cpus = dl_bw_cpus(task_cpu(p));
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
 	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
 		/* Entering -deadline. */
 		__dl_add(dl_b, new_bw);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
 		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
 		/* Staying -deadline with new parameters. */
 		__dl_clear(dl_b, p->dl.dl_bw);
 		__dl_add(dl_b, new_bw);
 		err = 0;
 	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
 		/* Leaving -deadline. */
 		__dl_clear(dl_b, p->dl.dl_bw);
 		err = 0;
 	}
 	raw_spin_unlock(&dl_b->lock);
 	return err;
 }
 extern void init_dl_bw(struct dl_bw *dl_b);
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  * @p: the newly created task
  *
  * This function will do some initial scheduler statistics housekeeping
  * that must be done for every newly created context, then puts the task
  * on the runqueue and wakes it.
  */
 void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 *  - cpus_allowed can change in the fork path
 	 *  - any previously selected cpu might disappear through hotplug
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 	/* Initialize new task's runnable average */
 	init_task_runnable_average(p);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 #endif
 	/* NOTE(review): presumably drops both rq->lock and p->pi_lock - confirm. */
 	task_rq_unlock(rq, p, &flags);
 }
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  *
  * The notifier is added at the head of current's preempt_notifiers list.
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
 /**
  * preempt_notifier_unregister - no longer interested in preemption notifications
  * @notifier: notifier struct to unregister
  *
  * Unlinks @notifier from whatever list it is on.
  *
  * This is safe to call from within a preemption notifier.
  */
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
 	hlist_del(&notifier->link);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 /*
  * Call ->sched_in() on every notifier registered on @curr, passing the
  * CPU number obtained from raw_smp_processor_id().
  */
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 	struct preempt_notifier *notifier;
 	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 /*
  * Call ->sched_out() on every notifier registered on @curr, passing the
  * task @next that is about to run.
  */
 static void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
 	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_out(notifier, next);
 }
 #else /* !CONFIG_PREEMPT_NOTIFIERS */
 /* No-op: preempt notifiers are compiled out. */
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 }
 /* No-op: preempt notifiers are compiled out. */
 static void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
 }
 #endif /* CONFIG_PREEMPT_NOTIFIERS */
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
  * @prev: the current task that is being switched out
  * @next: the task we are going to switch to.
  *
  * This is called with the rq lock held and interrupts off. It must
  * be paired with a subsequent finish_task_switch after the context
  * switch.
  *
  * prepare_task_switch sets up locking and calls architecture specific
  * hooks.  Before that it emits the sched_switch tracepoint and notifies
  * sched_info, perf and the preempt notifiers of @prev.
  */
 static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
 	trace_sched_switch(prev, next);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
 /**
  * finish_task_switch - clean up after a task-switch
  * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
  * with a prepare_task_switch call before the context switch.
  * finish_task_switch will reconcile locking set up by prepare_task_switch,
  * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock. (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	/* Claim the mm that context_switch() may have parked on the rq. */
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 	rq->prev_mm = NULL;
 	/*
 	 * A task struct has one reference for the use as "current".
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
 	 * The test for TASK_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	finish_arch_switch(prev);
 	perf_event_task_sched_in(prev, current);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 		mmdrop(mm);
 	/* Uses the state sampled above, before finish_lock_switch(). */
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
 		 */
 		kprobe_flush_task(prev);
 		put_task_struct(prev);
 	}
 	tick_nohz_task_switch(current);
 }
 #ifdef CONFIG_SMP
 /*
  * Run the current task's class ->post_schedule() hook, if one was requested
  * through rq->post_schedule, and clear the request.
  *
  * rq->lock is NOT held, but preemption is disabled
  */
 static inline void post_schedule(struct rq *rq)
 {
 	unsigned long flags;

 	if (!rq->post_schedule)
 		return;

 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (rq->curr->sched_class->post_schedule)
 		rq->curr->sched_class->post_schedule(rq);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);

 	rq->post_schedule = 0;
 }
 #else
 /* !SMP: there is no post-schedule work to do. */
 static inline void post_schedule(struct rq *rq)
 {
 }
 #endif
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  *
  * Completes the task switch on this CPU's runqueue, runs post_schedule()
  * and, if current->set_child_tid is set, stores the task's pid there with
  * put_user().
  */
 asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
 	finish_task_switch(rq, prev);
 	/*
 	 * FIXME: do we need to worry about rq being invalidated by the
 	 * task_switch?
 	 */
 	post_schedule(rq);
 	/* The put_user() result is not checked. */
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 /*
  * context_switch - switch to the new MM and the new
  * thread's register state.
  *
  * @rq: runqueue the switch happens on
  * @prev: task being switched out
  * @next: task being switched in
  */
 static inline void
 context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next)
 {
 	struct mm_struct *mm, *oldmm;
 	prepare_task_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
 	 * For paravirt, this is coupled with an exit in switch_to to
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
 	arch_start_context_switch(prev);
 	/* A task without its own mm borrows prev's active mm. */
 	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 	/* prev was itself borrowing: park the mm for finish_task_switch(). */
 	if (!prev->mm) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
 	/*
 	 * Since the runqueue lock will be released by the next
 	 * task (which is an invalid locking op but in the case
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
 	/*
 	 * this_rq must be evaluated again because prev may have moved
 	 * CPUs since it called schedule(), thus the 'rq' on its stack
 	 * frame will be invalid.
 	 */
 	finish_task_switch(this_rq(), prev);
 }
 /*
  * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
  * threads, total number of context switches performed since bootup.
  */
 unsigned long nr_running(void)
 {
 	unsigned long i, sum = 0;
 	for_each_online_cpu(i)
 		sum += cpu_rq(i)->nr_running;
 	return sum;
 }
 /*
  * single_task_running - is exactly one task runnable on this CPU?
  *
  * Return: true when the runqueue of the executing CPU has nr_running == 1,
  * false otherwise.
  */
 bool single_task_running(void)
 {
 	return cpu_rq(smp_processor_id())->nr_running == 1;
 }
 EXPORT_SYMBOL(single_task_running);
 unsigned long long nr_context_switches(void)
 {
 	int i;
 	unsigned long long sum = 0;
 	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_switches;
 	return sum;
 }
 unsigned long nr_iowait(void)
 {
 	unsigned long i, sum = 0;
 	for_each_possible_cpu(i)
 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
 	return sum;
 }
 unsigned long nr_iowait_cpu(int cpu)
 {
 	struct rq *this = cpu_rq(cpu);
 	return atomic_read(&this->nr_iowait);
 }
 /*
  * get_iowait_load - report iowait count and load of the local runqueue.
  * @nr_waiters: out, receives this runqueue's nr_iowait counter
  * @load: out, receives this runqueue's cpu_load[0]
  */
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
 	struct rq *rq = this_rq();

 	*nr_waiters = atomic_read(&rq->nr_iowait);
 	*load = rq->cpu_load[0];
 }
 #ifdef CONFIG_SMP
 /*
  * sched_exec - execve() is a valuable balancing opportunity, because at
  * this point the task has the smallest effective memory and cache footprint.
  *
  * Under current's pi_lock the scheduling class picks a destination CPU
  * (SD_BALANCE_EXEC). If that CPU is not the one we are running on and it
  * is active, the lock is dropped and the task is migrated through
  * stop_one_cpu()/migration_cpu_stop(). In every other case only the lock
  * is released.
  */
 void sched_exec(void)
 {
 	struct task_struct *p = current;
 	unsigned long flags;
 	int dest_cpu;

 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);

 	if (dest_cpu != smp_processor_id() && likely(cpu_active(dest_cpu))) {
 		struct migration_arg arg = { p, dest_cpu };

 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 		return;
 	}

 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 #endif
 /*
  * Per-CPU instances of struct kernel_stat and struct kernel_cpustat,
  * defined here and exported for use outside this file.
  */
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 /*
  * task_sched_runtime - accounted runtime of a task.
  * @p: the task to query
  *
  * If the task is currently running, the result also covers the pending
  * runtime of the current task that has not been accounted yet.
  *
  * Return: p->se.sum_exec_runtime, refreshed through the class's
  * update_curr() when @p is both the runqueue's current task and queued.
  */
 unsigned long long task_sched_runtime(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
 	u64 runtime;

 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
 	/*
 	 * On 64-bit a 64bit value can be read atomically without a lock,
 	 * which allows a shortcut whenever the task's delta_exec is 0.
 	 * The ->on_cpu read is racy, but harmless:
 	 *
 	 * - racing with the task leaving the cpu: we fall through and take
 	 *   the lock, so the result is correct;
 	 * - racing with the task entering the cpu: unaccounted time is 0,
 	 *   which cannot be told apart from having read a few cycles
 	 *   earlier;
 	 * - seeing ->on_cpu without ->on_rq: the task is leaving and has
 	 *   already been accounted, so the result is correct as well.
 	 */
 	if (!(p->on_cpu && task_on_rq_queued(p)))
 		return p->se.sum_exec_runtime;
 #endif

 	rq = task_rq_lock(p, &flags);

 	/*
 	 * Both conditions are required: ->curr _and_ ->on_rq. For a
 	 * dequeued task we would project cycles that may never be accounted
 	 * to this thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
 		update_rq_clock(rq);
 		p->sched_class->update_curr(rq);
 	}

 	runtime = p->se.sum_exec_runtime;
 	task_rq_unlock(rq, p, &flags);

 	return runtime;
 }
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  *
  * With rq->lock held it updates the runqueue clock, calls the current
  * task's sched_class->task_tick() and update_cpu_load_active(). After
  * dropping the lock it calls perf_event_task_tick(); on SMP it stores
  * idle_cpu() in rq->idle_balance and calls trigger_load_balance(); finally
  * it calls rq_last_tick_reset().
  */
 void scheduler_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 	/* rq->curr is sampled here, before rq->lock is taken. */
 	struct task_struct *curr = rq->curr;
 	sched_clock_tick();
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	/* Last argument (0) is passed through to the class's task_tick(). */
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
 	raw_spin_unlock(&rq->lock);
 	perf_event_task_tick();
 #ifdef CONFIG_SMP
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq);
 #endif
 	rq_last_tick_reset(rq);
 }
 #ifdef CONFIG_NO_HZ_FULL
 /**
  * scheduler_tick_max_deferment
  *
  * Keep at least one tick per second when a single
  * active task is running because the scheduler doesn't
  * yet completely support full dynticks environment.
  *
  * This makes sure that uptime, CFS vruntime, load
  * balancing, etc... continue to move forward, even
  * with a very low granularity.
  *
  * The next required tick is rq->last_sched_tick + HZ; if that point is
  * already at or before the current jiffies value, no deferment is allowed.
  *
  * Return: Maximum deferment in nanoseconds.
  */
 u64 scheduler_tick_max_deferment(void)
 {
 	struct rq *rq = this_rq();
 	unsigned long now = ACCESS_ONCE(jiffies);
 	unsigned long next_tick = rq->last_sched_tick + HZ;

 	if (!time_before_eq(next_tick, now))
 		return jiffies_to_nsecs(next_tick - now);

 	return 0;
 }
 #endif
 /*
  * get_parent_ip - pick a caller address that is outside the lock functions.
  * @addr: candidate address
  *
  * Return: @addr when it is not inside the lock functions; otherwise
  * CALLER_ADDR2, or CALLER_ADDR3 if CALLER_ADDR2 is inside them as well.
  */
 notrace unsigned long get_parent_ip(unsigned long addr)
 {
 	if (!in_lock_functions(addr))
 		return addr;

 	addr = CALLER_ADDR2;
 	if (in_lock_functions(addr))
 		addr = CALLER_ADDR3;

 	return addr;
 }
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
 /*
  * preempt_count_add - add @val to the preempt count, with debug checks
  * and tracing.
  * @val: amount to add
  *
  * With CONFIG_DEBUG_PREEMPT: warns and returns without adding if the count
  * is already negative, and warns when the PREEMPT_MASK portion gets within
  * 10 of PREEMPT_MASK. If the count equals @val after the addition, the
  * caller's address is recorded in current->preempt_disable_ip (debug only)
  * and trace_preempt_off() is fired.
  */
 void preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
 #endif
 	__preempt_count_add(val);
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
 	/* Count equals @val only if it was zero before this call. */
 	if (preempt_count() == val) {
 		unsigned long ip = get_parent_ip(CALLER_ADDR1);
 #ifdef CONFIG_DEBUG_PREEMPT
 		current->preempt_disable_ip = ip;
 #endif
 		trace_preempt_off(CALLER_ADDR0, ip);
 	}
 }
 EXPORT_SYMBOL(preempt_count_add);
 NOKPROBE_SYMBOL(preempt_count_add);
 /*
  * preempt_count_sub - subtract @val from the preempt count, with debug
  * checks and tracing.
  * @val: amount to subtract
  *
  * With CONFIG_DEBUG_PREEMPT: warns and returns without subtracting if @val
  * exceeds the current count, or if @val is below PREEMPT_MASK while the
  * PREEMPT_MASK portion of the count is zero. If the count equals @val
  * before the subtraction, trace_preempt_on() is fired first.
  */
 void preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
 #endif
 	/* The count is about to drop to zero: emit the trace event first. */
 	if (preempt_count() == val)
 		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	__preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
 NOKPROBE_SYMBOL(preempt_count_sub);
 #endif
 /*
  * Print scheduling while atomic bug:
  * @prev: the task that called into the scheduler
  *
  * Does nothing while an oops is in progress. Otherwise prints the task's
  * comm, pid and the preempt count, the held locks, the module list, the
  * irq trace events (only if interrupts are disabled), where preemption was
  * disabled (CONFIG_DEBUG_PREEMPT only), and a stack dump, then taints the
  * kernel with TAINT_WARN.
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
 	if (oops_in_progress)
 		return;
 	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
 		prev->comm, prev->pid, preempt_count());
 	debug_show_held_locks(prev);
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 #ifdef CONFIG_DEBUG_PREEMPT
 	if (in_atomic_preempt_off()) {
 		pr_err("Preemption disabled at:");
 		print_ip_sym(current->preempt_disable_ip);
 		pr_cont("\n");
 	}
 #endif
 	dump_stack();
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 /*
  * Various schedule()-time debugging checks and statistics:
  * @prev: the task entering the scheduler
  *
  * Checks the stack end for corruption (CONFIG_SCHED_STACK_END_CHECK),
  * reports scheduling while atomic, runs rcu_sleep_check(), records a
  * SCHED_PROFILING hit and bumps the runqueue's sched_count schedstat.
  */
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
 	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
 #endif
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
 	 * schedule() atomically, we ignore that path. Otherwise whine
 	 * if we are scheduling when we should not.
 	 */
 	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 		__schedule_bug(prev);
 	rcu_sleep_check();
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 	schedstat_inc(this_rq(), sched_count);
 }
 /*
  * Pick up the highest-prio task:
  * @rq: runqueue to pick from
  * @prev: the task that is currently being switched away from
  *
  * Return: the task chosen by the first scheduling class that offers one.
  * A class may return RETRY_TASK, in which case the walk over all classes
  * is restarted. Never returns NULL: falling off the end of the class list
  * is a BUG().
  */
 static inline struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class = &fair_sched_class;
 	struct task_struct *p;
 	/*
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(prev->sched_class == class &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq, prev);
 		/* Fast path gave up: fall back to the full class walk. */
 		if (unlikely(p == RETRY_TASK))
 			goto again;
 		/* assumes fair_sched_class->next == idle_sched_class */
 		if (unlikely(!p))
 			p = idle_sched_class.pick_next_task(rq, prev);
 		return p;
 	}
 again:
 	/* Slow path: ask each class in for_each_class() order. */
 	for_each_class(class) {
 		p = class->pick_next_task(rq, prev);
 		if (p) {
 			if (unlikely(p == RETRY_TASK))
 				goto again;
 			return p;
 		}
 	}
 	BUG(); /* the idle class will always have a runnable task */
 }
 /*
  * __schedule() is the main scheduler function.
  *
  * The main means of driving the scheduler and thus entering this function are:
  *
  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
  *
  *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
  *      paths. For example, see arch/x86/entry_64.S.
  *
  *      To drive preemption between tasks, the scheduler sets the flag in timer
  *      interrupt handler scheduler_tick().
  *
  *   3. Wakeups don't really cause entry into schedule(). They add a
  *      task to the run-queue and that's it.
  *
  *      Now, if the new task added to the run-queue preempts the current
  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
  *      called on the nearest possible occasion:
  *
  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
  *
  *         - in syscall or exception context, at the next outmost
  *           preempt_enable(). (this might be as soon as the wake_up()'s
  *           spin_unlock()!)
  *
  *         - in IRQ context, return from interrupt-handler to
  *           preemptible context
  *
  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
  *         then at the next:
  *
  *          - cond_resched() call
  *          - explicit schedule() call
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  *
  * The function loops (via the need_resched label) for as long as
  * need_resched() is still true after a pass.
  */
 static void __sched __schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 	schedule_debug(prev);
 	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 	/*
 	 * Count this switch in prev->nivcsw unless the block below
 	 * redirects it to prev->nvcsw.
 	 */
 	switch_count = &prev->nivcsw;
 	/*
 	 * prev is in a non-running state and this is not a PREEMPT_ACTIVE
 	 * entry: either a pending signal puts it back to TASK_RUNNING, or
 	 * it is taken off the runqueue.
 	 */
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
 			/*
 			 * If a worker went to sleep, notify and ask workqueue
 			 * whether it wants to wake up a task to maintain
 			 * concurrency.
 			 */
 			if (prev->flags & PF_WQ_WORKER) {
 				struct task_struct *to_wakeup;
 				to_wakeup = wq_worker_sleeping(prev, cpu);
 				if (to_wakeup)
 					try_to_wake_up_local(to_wakeup);
 			}
 		}
 		switch_count = &prev->nvcsw;
 	}
 	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 	next = pick_next_task(rq, prev);
 	/* The reschedule request is being served now: clear both flags. */
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
 		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
 		 * The context switch has flipped the stack from under us
 		 * and restored the local variables which were saved when
 		 * this task called schedule() in the past. prev == current
 		 * is still correct, but it can be moved to another cpu/rq.
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 	post_schedule(rq);
 	sched_preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
 }
 /*
  * sched_submit_work - flush plugged block IO before a task goes to sleep.
  * @tsk: the task about to call into the scheduler
  *
  * Nothing happens when @tsk->state is zero or when the task is blocked on
  * a PI lock. Otherwise, if we are going to sleep and we have plugged IO
  * queued, it is submitted here to avoid deadlocks.
  */
 static inline void sched_submit_work(struct task_struct *tsk)
 {
 	if (tsk->state && !tsk_is_pi_blocked(tsk) &&
 	    blk_needs_flush_plug(tsk))
 		blk_schedule_flush_plug(tsk);
 }
 /*
  * schedule - the exported scheduler entry point.
  *
  * Gives sched_submit_work() a chance to run for the current task, then
  * enters __schedule().
  */
 asmlinkage __visible void __sched schedule(void)
 {
 	sched_submit_work(current);
 	__schedule();
 }
 EXPORT_SYMBOL(schedule);
 #ifdef CONFIG_CONTEXT_TRACKING
 /*
  * schedule_user - schedule() bracketed by exception_enter()/exception_exit().
  */
 asmlinkage __visible void __sched schedule_user(void)
 {
 	enum ctx_state saved_ctx;

 	/*
 	 * We may get here after a random call to set_need_resched(), or
 	 * after a remote wakeup whose IPI has not arrived yet. In both
 	 * cases RCU idle mode has not been exited, so do it manually here
 	 * until we find a better solution.
 	 *
 	 * NB: There are buggy callers of this function.  Ideally we
 	 * should warn if the saved state != IN_USER, but that will trigger
 	 * too frequently to make sense yet.
 	 */
 	saved_ctx = exception_enter();
 	schedule();
 	exception_exit(saved_ctx);
 }
 #endif
 /**
  * schedule_preempt_disabled - called with preemption disabled
  *
  * Re-enables preemption without triggering a reschedule, calls schedule(),
  * then disables preemption again before returning.
  *
  * Returns with preemption disabled. Note: preempt_count must be 1
  */
 void __sched schedule_preempt_disabled(void)
 {
 	sched_preempt_enable_no_resched();
 	schedule();
 	preempt_disable();
 }
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  *
  * Each pass sets PREEMPT_ACTIVE in the preempt count around the
  * __schedule() call and repeats while need_resched() remains true.
  */
 asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
 	if (likely(!preemptible()))
 		return;
 	do {
 		__preempt_count_add(PREEMPT_ACTIVE);
 		__schedule();
 		__preempt_count_sub(PREEMPT_ACTIVE);
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
 		barrier();
 	} while (need_resched());
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 #ifdef CONFIG_CONTEXT_TRACKING
 /**
  * preempt_schedule_context - preempt_schedule called by tracing
  *
  * The tracing infrastructure uses preempt_enable_notrace to prevent
  * recursion and tracing preempt enabling caused by the tracing
  * infrastructure itself. But as tracing can happen in areas coming
  * from userspace or just about to enter userspace, a preempt enable
  * can occur before user_exit() is called. This will cause the scheduler
  * to be called when the system is still in usermode.
  *
  * To prevent this, the preempt_enable_notrace will use this function
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  *
  * Returns immediately when the context is not preemptible. Otherwise it
  * loops while need_resched() is true, wrapping __schedule() in
  * exception_enter()/exception_exit() with PREEMPT_ACTIVE set.
  */
 asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 {
 	enum ctx_state prev_ctx;
 	if (likely(!preemptible()))
 		return;
 	do {
 		__preempt_count_add(PREEMPT_ACTIVE);
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
 		__schedule();
 		exception_exit(prev_ctx);
 		__preempt_count_sub(PREEMPT_ACTIVE);
 		/* Compiler barrier before re-reading need_resched(). */
 		barrier();
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_context);
 #endif /* CONFIG_CONTEXT_TRACKING */
 #endif /* CONFIG_PREEMPT */
 /*
  * this is the entry point to schedule() from kernel preemption
  * off of irq context.
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
  *
  * It is a BUG() to call this with a non-zero preempt count or with
  * interrupts enabled. Interrupts are enabled only around the __schedule()
  * call inside the loop, with PREEMPT_ACTIVE set for its duration.
  */
 asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
 	/* Catch callers which need to be fixed */
 	BUG_ON(preempt_count() || !irqs_disabled());
 	prev_state = exception_enter();
 	do {
 		__preempt_count_add(PREEMPT_ACTIVE);
 		local_irq_enable();
 		__schedule();
 		local_irq_disable();
 		__preempt_count_sub(PREEMPT_ACTIVE);
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
 		barrier();
 	} while (need_resched());
 	exception_exit(prev_state);
 }
 /*
  * default_wake_function - wake the task stored in a wait queue entry.
  * @curr: wait queue entry; its ->private field is the task to wake
  * @mode: passed through to try_to_wake_up()
  * @wake_flags: passed through to try_to_wake_up()
  * @key: unused here
  *
  * Return: whatever try_to_wake_up() returns.
  */
 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
 	struct task_struct *waiter = curr->private;

 	return try_to_wake_up(waiter, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 #ifdef CONFIG_RT_MUTEXES
 /*
  * rt_mutex_setprio - set the current priority of a task
  * @p: task
  * @prio: prio value (kernel-internal form)
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
  *
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  *
  * The task is dequeued (and put as prev if it is running) before its
  * priority and scheduling class are changed, and is enqueued / set as
  * current again afterwards, all under the task's runqueue lock.
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
 	int oldprio, queued, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 	BUG_ON(prio > MAX_PRIO);
 	rq = __task_rq_lock(p);
 	/*
 	 * Idle task boosting is a nono in general. There is one
 	 * exception, when PREEMPT_RT and NOHZ is active:
 	 *
 	 * The idle task calls get_next_timer_interrupt() and holds
 	 * the timer wheel base->lock on the CPU and another CPU wants
 	 * to access the timer (probably to cancel it). We can safely
 	 * ignore the boosting request, as the idle CPU runs this code
 	 * with interrupts disabled and will complete the lock
 	 * protected section without being interrupted. So there is no
 	 * real need to boost.
 	 */
 	if (unlikely(p == rq->idle)) {
 		WARN_ON(p != rq->curr);
 		WARN_ON(p->pi_blocked_on);
 		goto out_unlock;
 	}
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	/* Take the task off the runqueue while prio/class are modified. */
 	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		put_prev_task(rq, p);
 	/*
 	 * Boosting condition are:
 	 * 1. -rt task is running and holds mutex A
 	 *      --> -dl task blocks on mutex A
 	 *
 	 * 2. -dl task is running and holds mutex A
 	 *      --> -dl task blocks on mutex A and could preempt the
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
 		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
 			p->dl.dl_throttled = 0;
 			enqueue_flag = ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
 	} else if (rt_prio(prio)) {
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		/* Numerically larger prio than before: requeue at the head. */
 		if (oldprio < prio)
 			enqueue_flag = ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		p->sched_class = &fair_sched_class;
 	}
 	p->prio = prio;
 	/* Undo the put_prev/dequeue above, now under the new class. */
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, p, enqueue_flag);
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
 /*
  * set_user_nice - change the nice value of a task.
  * @p: task whose nice value is changed
  * @nice: new nice value
  *
  * Returns silently without doing anything if @nice equals the task's
  * current nice value or lies outside [MIN_NICE, MAX_NICE]. For tasks with
  * a deadline or RT policy only ->static_prio is updated. Otherwise the
  * task is requeued with its new static priority, load weight and effective
  * priority, and its CPU is asked to reschedule when needed.
  */
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, queued;
 	unsigned long flags;
 	struct rq *rq;
 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
 		return;
 	/*
 	 * We have to be careful, if called from sys_setpriority(),
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
 	rq = task_rq_lock(p, &flags);
 	/*
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
 	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
 	 */
 	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
 	queued = task_on_rq_queued(p);
 	/* Dequeue while static_prio, load weight and prio are changed. */
 	if (queued)
 		dequeue_task(rq, p, 0);
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
 	old_prio = p->prio;
 	p->prio = effective_prio(p);
 	/* Negative delta: numerically smaller prio than before. */
 	delta = p->prio - old_prio;
 	if (queued) {
 		enqueue_task(rq, p, 0);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
 		 */
 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_curr(rq);
 	}
 out_unlock:
 	task_rq_unlock(rq, p, &flags);
 }
 EXPORT_SYMBOL(set_user_nice);
 /*
  * can_nice - check if a task can reduce its nice value
  * @p: task
  * @nice: nice value
  *
  * Return: non-zero when the rlimit-style form of @nice does not exceed the
  * task's RLIMIT_NICE, or when the caller has CAP_SYS_NICE; 0 otherwise.
  */
 int can_nice(const struct task_struct *p, const int nice)
 {
 	/* nice value [19,-20] expressed as rlimit style value [1,40] */
 	int nice_rlim = nice_to_rlimit(nice);

 	if (nice_rlim <= task_rlimit(p, RLIMIT_NICE))
 		return 1;

 	return capable(CAP_SYS_NICE) ? 1 : 0;
 }
 #ifdef __ARCH_WANT_SYS_NICE
 /*
  * sys_nice - change the priority of the current process.
  * @increment: priority increment
  *
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  *
  * Return: 0 on success, -EPERM when a negative increment is not permitted
  * by can_nice(), or the error reported by security_task_setnice().
  */
 SYSCALL_DEFINE1(nice, int, increment)
 {
 	long new_nice, err;

 	/*
 	 * A concurrent setpriority may change our priority at the same
 	 * moment. That is fine: conceptually one of the two calls happens
 	 * first and there is a single winner.
 	 */
 	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
 	new_nice = task_nice(current) + increment;
 	new_nice = clamp_val(new_nice, MIN_NICE, MAX_NICE);

 	if (increment < 0 && !can_nice(current, new_nice))
 		return -EPERM;

 	err = security_task_setnice(current, new_nice);
 	if (err)
 		return err;

 	set_user_nice(current, new_nice);
 	return 0;
 }
 #endif
 /**
  * task_prio - return the priority value of a given task.
  * @p: the task in question.
  *
  * Return: The priority value as seen by users in /proc, computed as
  * p->prio - MAX_RT_PRIO.
  * RT tasks are offset by -200. Normal tasks are centered
  * around 0, value goes from -16 to +15.
  *
  * NOTE(review): the numeric ranges quoted above cannot be confirmed from
  * this function alone; verify them against MAX_RT_PRIO and the prio ranges
  * in use before relying on them.
  */
 int task_prio(const struct task_struct *p)
 {
 	return p->prio - MAX_RT_PRIO;
 }
 /**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
  *
  * A CPU counts as idle only if its runqueue is running the idle task, has
  * no runnable tasks and (on SMP) has an empty wake_list.
  *
  * Return: 1 if the CPU is currently idle. 0 otherwise.
  */
 int idle_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);

 	if (rq->curr != rq->idle || rq->nr_running)
 		return 0;

 #ifdef CONFIG_SMP
 	if (!llist_empty(&rq->wake_list))
 		return 0;
 #endif

 	return 1;
 }
 /**
  * idle_task - return the idle task for a given cpu.
  * @cpu: the processor in question.
  *
  * Return: The idle task for the cpu @cpu.
  */
 struct task_struct *idle_task(int cpu)
 {
 	return cpu_rq(cpu)->idle;
 }
 /**
  * find_process_by_pid - find a process with a matching PID value.
  * @pid: the pid in question; 0 selects the calling task.
  *
  * Return: current when @pid is 0, otherwise the result of
  * find_task_by_vpid(@pid): the task of @pid, if found, %NULL otherwise.
  */
 static struct task_struct *find_process_by_pid(pid_t pid)
 {
 	if (!pid)
 		return current;

 	return find_task_by_vpid(pid);
 }
 /*
  * __setparam_dl - set up the sched_dl_entity of a task that is becoming
  * SCHED_DEADLINE.
  * @p: the task
  * @attr: requested deadline parameters
  *
  * Only the static values are filled in here. The actual runtime and the
  * absolute deadline get properly calculated when the task is enqueued for
  * the first time with its new policy. A zero sched_period means "use the
  * relative deadline as the period".
  */
 static void
 __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl = &p->dl;

 	init_dl_task_timer(dl);

 	dl->dl_runtime = attr->sched_runtime;
 	dl->dl_deadline = attr->sched_deadline;
 	if (attr->sched_period)
 		dl->dl_period = attr->sched_period;
 	else
 		dl->dl_period = dl->dl_deadline;
 	dl->flags = attr->sched_flags;

 	/* Bandwidth is derived from the period and runtime set above. */
 	dl->dl_bw = to_ratio(dl->dl_period, dl->dl_runtime);

 	dl->dl_throttled = 0;
 	dl->dl_new = 1;
 	dl->dl_yielded = 0;
 }
 /*
  * sched_setparam() passes in -1 for its policy, to let the functions
  * it calls know not to change it.
  */
 #define SETPARAM_POLICY	-1
 /*
  * __setscheduler_params - store new policy and parameters in a task.
  * @p: the task to update
  * @attr: requested policy and parameters; a sched_policy of
  *        SETPARAM_POLICY keeps the task's current policy
  *
  * Updates ->policy, the deadline parameters or ->static_prio depending on
  * the policy, ->rt_priority, ->normal_prio and the load weight. It does
  * not touch ->prio or ->sched_class.
  */
 static void __setscheduler_params(struct task_struct *p,
 		const struct sched_attr *attr)
 {
 	int policy;

 	if (attr->sched_policy == SETPARAM_POLICY)
 		policy = p->policy;
 	else
 		policy = attr->sched_policy;
 	p->policy = policy;

 	if (dl_policy(policy)) {
 		__setparam_dl(p, attr);
 	} else if (fair_policy(policy)) {
 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
 	}

 	/*
 	 * __sched_setscheduler() guarantees attr->sched_priority == 0 for
 	 * !rt_policy, so storing it unconditionally keeps getparam()/
 	 * getattr() from reporting silly values for !rt tasks.
 	 */
 	p->rt_priority = attr->sched_priority;
 	p->normal_prio = normal_prio(p);
 	set_load_weight(p);
 }
 /* Actually do priority change: must hold pi & rq lock. */
 static void __setscheduler(struct rq *rq, struct task_struct *p,
 			   const struct sched_attr *attr)
 {
 	__setscheduler_params(p, attr);
 	/*
 	 * If we get here, there was no pi waiters boosting the
 	 * task. It is safe to use the normal prio.
 	 */
 	p->prio = normal_prio(p);
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
 	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
 }
 static void
 __getparam_dl(struct task_struct *p, struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 	attr->sched_priority = p->rt_priority;
 	attr->sched_runtime = dl_se->dl_runtime;
 	attr->sched_deadline = dl_se->dl_deadline;
 	attr->sched_period = dl_se->dl_period;
 	attr->sched_flags = dl_se->flags;
 }
 /*
  * __checkparam_dl - validate the new parameters of a -deadline task.
  * @attr: the requested parameters
  *
  * The deadline must be non-zero and greater than or equal to the runtime;
  * the period must be either zero or greater than or equal to the deadline.
  * User parameters must also be above the internal resolution of 1us (only
  * sched_runtime is checked, since it is always the smallest one) and below
  * 2^63 ns (both sched_deadline and sched_period are checked, as the latter
  * can be zero).
  *
  * Return: true when all checks pass, false otherwise.
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
 	/*
 	 * The MSB is used for wrap-around and sign issues, so it must not
 	 * be set in either value (mind that period can be equal to zero).
 	 */
 	const unsigned long long msb = 1ULL << 63;

 	/* deadline != 0 */
 	if (!attr->sched_deadline)
 		return false;

 	/* DL_SCALE bits get truncated, so runtime must be at least that big. */
 	if (attr->sched_runtime < (1ULL << DL_SCALE))
 		return false;

 	if ((attr->sched_deadline & msb) || (attr->sched_period & msb))
 		return false;

 	/* runtime <= deadline */
 	if (attr->sched_deadline < attr->sched_runtime)
 		return false;

 	/* deadline <= period (if period != 0) */
 	if (attr->sched_period != 0 &&
 	    attr->sched_period < attr->sched_deadline)
 		return false;

 	return true;
 }
 /*
  * check_same_owner - does the target task belong to the calling user?
  * @p: the target task
  *
  * Return: true when the caller's euid equals either the euid or the uid of
  * @p's credentials, which are read under rcu_read_lock().
  */
 static bool check_same_owner(struct task_struct *p)
 {
 	const struct cred *caller = current_cred();
 	const struct cred *target;
 	bool same;

 	rcu_read_lock();
 	target = __task_cred(p);
 	same = uid_eq(caller->euid, target->euid);
 	if (!same)
 		same = uid_eq(caller->euid, target->uid);
 	rcu_read_unlock();

 	return same;
 }
 /*
  * __sched_setscheduler - validate and apply a policy/parameter change.
  * @p: the task to change
  * @attr: requested policy, priority or nice value, deadline parameters
  *        and flags; a negative sched_policy means "keep the current policy"
  * @user: when true, the permission checks (CAP_SYS_NICE, rlimits, task
  *        ownership, group/bandwidth restrictions) and
  *        security_task_setscheduler() are applied
  *
  * Must not be called from interrupt context (BUG_ON(in_interrupt())).
  *
  * Return: 0 on success; -EINVAL for an invalid policy, flag or priority or
  * when @p is the runqueue's stop task; -EPERM when a permission check
  * fails; -EBUSY when dl_overflow() rejects the request; or the error
  * returned by security_task_setscheduler().
  */
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
 	int reset_on_fork;
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
 recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0) {
 		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
 	} else {
 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 		if (policy != SCHED_DEADLINE &&
 				policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 				policy != SCHED_IDLE)
 			return -EINVAL;
 	}
 	/* SCHED_FLAG_RESET_ON_FORK is the only flag accepted here. */
 	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
 	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
 	/*
 	 * Deadline parameters must validate, and a non-zero sched_priority
 	 * is required for, and only allowed with, an RT policy.
 	 */
 	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
 	    (rt_policy(policy) != (attr->sched_priority != 0)))
 		return -EINVAL;
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (fair_policy(policy)) {
 			if (attr->sched_nice < task_nice(p) &&
 			    !can_nice(p, attr->sched_nice))
 				return -EPERM;
 		}
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
 			/* can't set/change the rt policy */
 			if (policy != p->policy && !rlim_rtprio)
 				return -EPERM;
 			/* can't increase priority */
 			if (attr->sched_priority > p->rt_priority &&
 			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 		 /*
 		  * Can't set/change SCHED_DEADLINE policy at all for now
 		  * (safest behavior); in the future we would like to allow
 		  * unprivileged DL tasks to increase their relative deadline
 		  * or reduce their runtime (both ways reducing utilization)
 		  */
 		if (dl_policy(policy))
 			return -EPERM;
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
 			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
 			return -EPERM;
 		/* Normal users shall not reset the sched_reset_on_fork flag */
 		if (p->sched_reset_on_fork && !reset_on_fork)
 			return -EPERM;
 	}
 	if (user) {
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
 	}
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
 	 *
 	 * To be able to change p->policy safely, the appropriate
 	 * runqueue lock must be held.
 	 */
 	rq = task_rq_lock(p, &flags);
 	/*
 	 * Changing the policy of the stop threads is a very bad idea
 	 */
 	if (p == rq->stop) {
 		task_rq_unlock(rq, p, &flags);
 		return -EINVAL;
 	}
 	/*
 	 * If not changing anything there's no need to proceed further,
 	 * but store a possible modification of reset_on_fork.
 	 */
 	if (unlikely(policy == p->policy)) {
 		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
 		if (dl_policy(policy))
 			goto change;
 		p->sched_reset_on_fork = reset_on_fork;
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
 change:
 	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		/*
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
 		 */
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
 				!task_group_is_autogroup(task_group(p))) {
 			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
 #endif
 #ifdef CONFIG_SMP
 		if (dl_bandwidth_enabled() && dl_policy(policy)) {
 			cpumask_t *span = rq->rd->span;
 			/*
 			 * Don't allow tasks with an affinity mask smaller than
 			 * the entire root_domain to become SCHED_DEADLINE. We
 			 * will also fail if there's no bandwidth available.
 			 */
 			if (!cpumask_subset(span, &p->cpus_allowed) ||
 			    rq->rd->dl_bw.bw == 0) {
 				task_rq_unlock(rq, p, &flags);
 				return -EPERM;
 			}
 		}
 #endif
 	}
 	/*
 	 * recheck policy now with rq lock held: if the caller asked to keep
 	 * the policy and it changed before the lock was taken, drop the
 	 * lock and redo all checks from the top.
 	 */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
 		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
 	/*
 	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
 	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
 	 * is available.
 	 */
 	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
 		task_rq_unlock(rq, p, &flags);
 		return -EBUSY;
 	}
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 	/*
 	 * Special case for priority boosted tasks.
 	 *
 	 * If the new priority is lower or equal (user space view)
 	 * than the current (boosted) priority, we just store the new
 	 * normal parameters and do not touch the scheduler class and
 	 * the runqueue. This will be done when the task deboosts
 	 * itself.
 	 */
 	if (rt_mutex_check_prio(p, newprio)) {
 		__setscheduler_params(p, attr);
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	/* Take the task off the runqueue while policy/class are changed. */
 	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		put_prev_task(rq, p);
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, attr);
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
 		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
 	}
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 	/* Called after the runqueue lock has been dropped. */
 	rt_mutex_adjust_pi(p);
 	return 0;
 }
 /*
  * Common backend for the legacy (policy, sched_param) interfaces: build a
  * sched_attr from the arguments and pass it to __sched_setscheduler().
  *
  * The nice value is seeded from the task's current static priority. When
  * @policy is not SETPARAM_POLICY and carries the legacy SCHED_RESET_ON_FORK
  * bit, that bit is removed from the policy and expressed through
  * SCHED_FLAG_RESET_ON_FORK in the attr flags instead. @check is forwarded
  * unchanged as the last argument of __sched_setscheduler().
  */
 static int _sched_setscheduler(struct task_struct *p, int policy,
 			       const struct sched_param *param, bool check)
 {
 	struct sched_attr attr = {
 		.sched_policy   = policy,
 		.sched_priority = param->sched_priority,
 		.sched_nice	= PRIO_TO_NICE(p->static_prio),
 	};
 	bool legacy_reset = policy != SETPARAM_POLICY &&
 			    (policy & SCHED_RESET_ON_FORK);
 	/* Translate the legacy SCHED_RESET_ON_FORK policy bit into a flag. */
 	if (legacy_reset) {
 		attr.sched_policy = policy & ~SCHED_RESET_ON_FORK;
 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 	}
 	return __sched_setscheduler(p, &attr, check);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
  * Thin wrapper that calls _sched_setscheduler() with check set to true.
  *
  * Return: 0 on success. An error code otherwise.
  *
  * NOTE that the task may be already dead.
  */
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
 	return _sched_setscheduler(p, policy, param, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 /*
  * sched_setattr - apply the scheduling parameters in @attr to task @p.
  *
  * Forwards to __sched_setscheduler() with its last argument set to true and
  * returns whatever that call returns.
  */
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
 	return __sched_setscheduler(p, attr, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
  * Just like sched_setscheduler, only don't bother checking if the
  * current context has permission.  For example, this is needed in
  * stop_machine(): we create temporary high priority worker threads,
  * but our caller might not have that capability.
  *
  * Implemented by calling _sched_setscheduler() with check set to false.
  *
  * Return: 0 on success. An error code otherwise.
  */
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
 	return _sched_setscheduler(p, policy, param, false);
 }
 /*
  * Shared body of the sched_setscheduler and sched_setparam syscalls: copy the
  * user's sched_param into the kernel, look the task up by @pid under RCU and
  * apply @policy via sched_setscheduler().
  *
  * Returns -EINVAL for a NULL @param or negative @pid, -EFAULT if the user
  * copy fails, -ESRCH if no task matches, else sched_setscheduler()'s result.
  */
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
 	struct sched_param kparam;
 	struct task_struct *task;
 	int err = -ESRCH;
 	if (pid < 0 || !param)
 		return -EINVAL;
 	if (copy_from_user(&kparam, param, sizeof(kparam)))
 		return -EFAULT;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	if (task)
 		err = sched_setscheduler(task, policy, &kparam);
 	rcu_read_unlock();
 	return err;
 }
 /*
  * Copy a user-supplied sched_attr into @attr, honouring the size the caller
  * stored in uattr->size (modelled on perf_copy_attr() in
  * kernel/events/core.c).
  *
  * A size of 0 is treated as SCHED_ATTR_SIZE_VER0. Sizes above PAGE_SIZE or
  * below SCHED_ATTR_SIZE_VER0 are rejected. If user space hands in a larger
  * structure than the kernel knows, every byte past sizeof(*attr) must be
  * zero. On a size error the kernel's sizeof(*attr) is written back to
  * uattr->size and -E2BIG is returned.
  *
  * Returns 0 on success, -EFAULT on a failed user access, -E2BIG on a size
  * mismatch, or the error from get_user().
  */
 static int sched_copy_attr(struct sched_attr __user *uattr,
 			   struct sched_attr *attr)
 {
 	u32 usize;
 	int err;
 	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
 		return -EFAULT;
 	/* Start from all-zero so a short copy leaves the tail well defined. */
 	memset(attr, 0, sizeof(*attr));
 	err = get_user(usize, &uattr->size);
 	if (err)
 		return err;
 	if (usize > PAGE_SIZE)		/* silly large */
 		goto err_size;
 	if (usize == 0)			/* abi compat */
 		usize = SCHED_ATTR_SIZE_VER0;
 	else if (usize < SCHED_ATTR_SIZE_VER0)
 		goto err_size;
 	/*
 	 * A bigger struct than we know of is only acceptable when all the
 	 * bytes we do not understand are zero, i.e. newer user space is not
 	 * relying on an extension this kernel lacks.
 	 */
 	if (usize > sizeof(*attr)) {
 		unsigned char __user *cur = (void __user *)uattr + sizeof(*attr);
 		unsigned char __user *stop = (void __user *)uattr + usize;
 		unsigned char byte;
 		while (cur < stop) {
 			err = get_user(byte, cur);
 			if (err)
 				return err;
 			if (byte)
 				goto err_size;
 			cur++;
 		}
 		usize = sizeof(*attr);
 	}
 	if (copy_from_user(attr, uattr, usize))
 		return -EFAULT;
 	/*
 	 * XXX: out-of-range nice values are clamped (lenient, like the
 	 * existing syscalls) rather than rejected.
 	 */
 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 	return 0;
 err_size:
 	put_user(sizeof(*attr), &uattr->size);
 	return -E2BIG;
 }
 /**
  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
  * @pid: the pid in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  *
  * Return: 0 on success. -EINVAL if @policy is negative, otherwise the
  * result of do_sched_setscheduler().
  */
 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
 		struct sched_param __user *, param)
 {
 	/* a negative policy is never valid through this entry point */
 	return policy < 0 ? -EINVAL : do_sched_setscheduler(pid, policy, param);
 }
 /**
  * sys_sched_setparam - set/change the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  *
  * Implemented via do_sched_setscheduler() with SETPARAM_POLICY passed as
  * the policy argument.
  *
  * Return: 0 on success. An error code otherwise.
  */
 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
 	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
 }
 /**
  * sys_sched_setattr - set scheduling parameters from an extended sched_attr
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  * @flags: for future extension; must currently be 0.
  *
  * Return: 0 on success. -EINVAL for a NULL @uattr, negative @pid, non-zero
  * @flags or a negative policy in the copied attr; -ESRCH if no task matches;
  * otherwise the error from sched_copy_attr() or sched_setattr().
  */
 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 			       unsigned int, flags)
 {
 	struct sched_attr kattr;
 	struct task_struct *task;
 	int err;
 	if (flags || pid < 0 || !uattr)
 		return -EINVAL;
 	err = sched_copy_attr(uattr, &kattr);
 	if (err)
 		return err;
 	/* sched_policy is unsigned in the attr; reject what would be negative */
 	if ((int)kattr.sched_policy < 0)
 		return -EINVAL;
 	err = -ESRCH;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	if (task)
 		err = sched_setattr(task, &kattr);
 	rcu_read_unlock();
 	return err;
 }
 /**
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  *
  * Return: On success, the policy of the thread, with SCHED_RESET_ON_FORK
  * or'ed in when the task has sched_reset_on_fork set. Otherwise, a negative
  * error code (-EINVAL for a negative @pid, -ESRCH if no task matches, or the
  * error from security_task_getscheduler()).
  */
 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
 	struct task_struct *task;
 	int ret = -ESRCH;
 	if (pid < 0)
 		return -EINVAL;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	if (task) {
 		ret = security_task_getscheduler(task);
 		if (ret == 0) {
 			ret = task->policy;
 			if (task->sched_reset_on_fork)
 				ret |= SCHED_RESET_ON_FORK;
 		}
 	}
 	rcu_read_unlock();
 	return ret;
 }
 /**
  * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  *
  * The reported priority is the task's rt_priority when it has an RT policy
  * and 0 otherwise.
  *
  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
  * code.
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
 	struct sched_param kparam = { .sched_priority = 0 };
 	struct task_struct *task;
 	int err;
 	if (pid < 0 || !param)
 		return -EINVAL;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	if (task) {
 		err = security_task_getscheduler(task);
 		if (!err && task_has_rt_policy(task))
 			kparam.sched_priority = task->rt_priority;
 	} else {
 		err = -ESRCH;
 	}
 	rcu_read_unlock();
 	if (err)
 		return err;
 	/*
 	 * The user copy might sleep, so it is done only after the RCU
 	 * read-side section has been left.
 	 */
 	if (copy_to_user(param, &kparam, sizeof(*param)))
 		return -EFAULT;
 	return 0;
 }
 /*
  * Copy the kernel's @attr out to the user buffer @uattr of @usize bytes.
  *
  * When user space supplies a smaller structure than the kernel's, the part
  * of @attr that would not fit must be all zero, otherwise -EFBIG is returned
  * so old user space never receives incomplete information. In that case
  * attr->size is lowered to @usize. The number of bytes copied is always
  * attr->size.
  *
  * Returns 0 on success, -EFAULT on a failed user access, -EFBIG as above.
  */
 static int sched_read_attr(struct sched_attr __user *uattr,
 			   struct sched_attr *attr,
 			   unsigned int usize)
 {
 	if (!access_ok(VERIFY_WRITE, uattr, usize))
 		return -EFAULT;
 	if (usize < sizeof(*attr)) {
 		const unsigned char *tail = (void *)attr + usize;
 		const unsigned char *limit = (void *)attr + sizeof(*attr);
 		while (tail < limit) {
 			if (*tail++)
 				return -EFBIG;
 		}
 		attr->size = usize;
 	}
 	return copy_to_user(uattr, attr, attr->size) ? -EFAULT : 0;
 }
 /**
  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
  * @flags: for future extension; must currently be 0.
  *
  * Fills in policy and flags, then the deadline parameters for a -deadline
  * task, the RT priority for an RT task, or the nice value otherwise.
  *
  * Return: 0 on success. -EINVAL for bad arguments (NULL @uattr, negative
  * @pid, @size outside [SCHED_ATTR_SIZE_VER0, PAGE_SIZE], non-zero @flags),
  * -ESRCH if no task matches, or the error from security_task_getscheduler()
  * or sched_read_attr().
  */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		unsigned int, size, unsigned int, flags)
 {
 	struct sched_attr kattr = {
 		.size = sizeof(struct sched_attr),
 	};
 	struct task_struct *task;
 	int err;
 	if (flags || !uattr || pid < 0 ||
 	    size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
 		return -EINVAL;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	err = task ? security_task_getscheduler(task) : -ESRCH;
 	if (err) {
 		rcu_read_unlock();
 		return err;
 	}
 	kattr.sched_policy = task->policy;
 	if (task->sched_reset_on_fork)
 		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 	if (task_has_dl_policy(task))
 		__getparam_dl(task, &kattr);
 	else if (task_has_rt_policy(task))
 		kattr.sched_priority = task->rt_priority;
 	else
 		kattr.sched_nice = task_nice(task);
 	rcu_read_unlock();
 	/* The user copy happens outside the RCU read-side section. */
 	return sched_read_attr(uattr, &kattr, size);
 }
 /*
  * sched_setaffinity - restrict the task identified by @pid to the CPUs in
  * @in_mask, further limited to what cpuset_cpus_allowed() reports for it.
  *
  * Return: 0 on success; -ESRCH if no task is found; -EINVAL for a
  * PF_NO_SETAFFINITY task; -ENOMEM if a temporary mask cannot be allocated;
  * -EPERM if the caller neither passes check_same_owner() nor holds
  * CAP_SYS_NICE in the task's user namespace; -EBUSY (CONFIG_SMP) if a
  * -deadline task's new mask would not cover its root_domain span; or the
  * error from security_task_setscheduler() / set_cpus_allowed_ptr().
  */
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
 	int retval;
 	rcu_read_lock();
 	p = find_process_by_pid(pid);
 	if (!p) {
 		rcu_read_unlock();
 		return -ESRCH;
 	}
 	/* Prevent p going away */
 	get_task_struct(p);
 	rcu_read_unlock();
 	if (p->flags & PF_NO_SETAFFINITY) {
 		retval = -EINVAL;
 		goto out_put_task;
 	}
 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto out_put_task;
 	}
 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto out_free_cpus_allowed;
 	}
 	/* Different owner: only allowed with CAP_SYS_NICE in p's user ns. */
 	retval = -EPERM;
 	if (!check_same_owner(p)) {
 		rcu_read_lock();
 		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
 			rcu_read_unlock();
 			goto out_free_new_mask;
 		}
 		rcu_read_unlock();
 	}
 	retval = security_task_setscheduler(p);
 	if (retval)
 		goto out_free_new_mask;
 	/* new_mask = requested mask restricted to the cpuset's allowed CPUs */
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
 	/*
 	 * Since bandwidth control happens on root_domain basis,
 	 * if admission test is enabled, we only admit -deadline
 	 * tasks allowed to run on all the CPUs in the task's
 	 * root_domain.
 	 */
 #ifdef CONFIG_SMP
 	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
 		rcu_read_lock();
 		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
 			retval = -EBUSY;
 			rcu_read_unlock();
 			goto out_free_new_mask;
 		}
 		rcu_read_unlock();
 	}
 #endif
 again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 	if (!retval) {
 		/* Re-read the cpuset mask to detect a concurrent change. */
 		cpuset_cpus_allowed(p, cpus_allowed);
 		if (!cpumask_subset(new_mask, cpus_allowed)) {
 			/*
 			 * We must have raced with a concurrent cpuset
 			 * update. Just reset the cpus_allowed to the
 			 * cpuset's cpus_allowed
 			 */
 			cpumask_copy(new_mask, cpus_allowed);
 			goto again;
 		}
 	}
 	/* Cleanup labels undo the acquisitions above in reverse order. */
 out_free_new_mask:
 	free_cpumask_var(new_mask);
 out_free_cpus_allowed:
 	free_cpumask_var(cpus_allowed);
 out_put_task:
 	put_task_struct(p);
 	return retval;
 }
 /*
  * Copy a user-space CPU bitmask of @len bytes into @new_mask.
  *
  * A user mask shorter than cpumask_size() is zero-extended by clearing
  * @new_mask first; a longer one is truncated to cpumask_size() bytes.
  *
  * Returns 0 on success or -EFAULT if the user copy fails.
  */
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 			     struct cpumask *new_mask)
 {
 	const size_t kmask_bytes = cpumask_size();
 	if (len < kmask_bytes)
 		cpumask_clear(new_mask);
 	else if (len > kmask_bytes)
 		len = kmask_bytes;
 	if (copy_from_user(new_mask, user_mask_ptr, len))
 		return -EFAULT;
 	return 0;
 }
 /**
  * sys_sched_setaffinity - set the cpu affinity of a process
  * @pid: pid of the process
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  *
  * Return: 0 on success. -ENOMEM if the temporary mask cannot be allocated,
  * otherwise the error from get_user_cpu_mask() or sched_setaffinity().
  */
 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t requested;
 	int err;
 	if (!alloc_cpumask_var(&requested, GFP_KERNEL))
 		return -ENOMEM;
 	err = get_user_cpu_mask(user_mask_ptr, len, requested);
 	if (!err)
 		err = sched_setaffinity(pid, requested);
 	free_cpumask_var(requested);
 	return err;
 }
 /*
  * sched_getaffinity - store in @mask the CPUs the task identified by @pid
  * is allowed on, restricted to cpu_active_mask.
  *
  * The task's cpus_allowed is read with its pi_lock held.
  *
  * Returns 0 on success, -ESRCH if no task matches, or the error from
  * security_task_getscheduler().
  */
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *task;
 	unsigned long irqflags;
 	int err;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	err = task ? security_task_getscheduler(task) : -ESRCH;
 	if (!err) {
 		raw_spin_lock_irqsave(&task->pi_lock, irqflags);
 		cpumask_and(mask, &task->cpus_allowed, cpu_active_mask);
 		raw_spin_unlock_irqrestore(&task->pi_lock, irqflags);
 	}
 	rcu_read_unlock();
 	return err;
 }
 /**
  * sys_sched_getaffinity - get the cpu affinity of a process
  * @pid: pid of the process
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  *
  * @len must be a multiple of sizeof(unsigned long) and large enough to hold
  * one bit for each of the nr_cpu_ids CPU ids; otherwise -EINVAL is returned.
  *
  * Return: On success, the number of bytes copied to @user_mask_ptr, i.e.
  * the smaller of @len and cpumask_size(). Otherwise an error code: -EINVAL
  * as described above, -ENOMEM if the temporary mask cannot be allocated,
  * -EFAULT if the user copy fails, or the error from sched_getaffinity().
  */
 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 		unsigned long __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
 	/*
 	 * Compare in bytes, not bits: "len * BITS_PER_BYTE" is evaluated in
 	 * unsigned int arithmetic and wraps for len >= 2^29, which would
 	 * reject a perfectly large enough buffer with a spurious -EINVAL.
 	 */
 	if (len < (nr_cpu_ids + BITS_PER_BYTE - 1) / BITS_PER_BYTE)
 		return -EINVAL;
 	if (len & (sizeof(unsigned long)-1))
 		return -EINVAL;
 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 		return -ENOMEM;
 	ret = sched_getaffinity(pid, mask);
 	if (ret == 0) {
 		/* Never copy more than the kernel's own mask size. */
 		size_t retlen = min_t(size_t, len, cpumask_size());
 		if (copy_to_user(user_mask_ptr, mask, retlen))
 			ret = -EFAULT;
 		else
 			ret = retlen;
 	}
 	free_cpumask_var(mask);
 	return ret;
 }
 /**
  * sys_sched_yield - yield the current processor to other threads.
  *
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
  *
  * The current task's scheduling class yield_task() hook is invoked with the
  * local runqueue locked, then schedule() is called.
  *
  * Return: 0.
  */
 SYSCALL_DEFINE0(sched_yield)
 {
 	struct rq *rq = this_rq_lock();
 	schedstat_inc(rq, yld_count);
 	current->sched_class->yield_task(rq);
 	/*
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
 	 */
 	/*
 	 * The lock is therefore dropped by hand (annotation, lockdep release,
 	 * raw unlock) and preemption is re-enabled without a resched check.
 	 */
 	__release(rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 	do_raw_spin_unlock(&rq->lock);
 	sched_preempt_enable_no_resched();
 	schedule();
 	return 0;
 }
 /*
  * Enter the scheduler via __schedule() with PREEMPT_ACTIVE added to the
  * preempt count for the duration of the call. Used by the cond_resched
  * family once should_resched() has reported a pending reschedule.
  */
 static void __cond_resched(void)
 {
 	__preempt_count_add(PREEMPT_ACTIVE);
 	__schedule();
 	__preempt_count_sub(PREEMPT_ACTIVE);
 }
 /*
  * _cond_resched - reschedule if should_resched() says one is pending.
  *
  * Returns 1 if __cond_resched() was called, 0 otherwise.
  */
 int __sched _cond_resched(void)
 {
 	if (!should_resched())
 		return 0;
 	__cond_resched();
 	return 1;
 }
 EXPORT_SYMBOL(_cond_resched);
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * The lock is also dropped and retaken (with a cpu_relax() in between) when
  * spin_needbreak() reports that the lock should be broken, even if no
  * reschedule is pending. Returns 1 if the lock was dropped, 0 otherwise.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
 	int resched = should_resched();
 	lockdep_assert_held(lock);
 	if (!spin_needbreak(lock) && !resched)
 		return 0;
 	spin_unlock(lock);
 	if (resched)
 		__cond_resched();
 	else
 		cpu_relax();
 	spin_lock(lock);
 	return 1;
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 /*
  * __cond_resched_softirq - cond_resched variant for softirq context.
  *
  * Must be called in softirq context (BUG otherwise). If a reschedule is
  * pending, bottom halves are enabled around the call to __cond_resched()
  * and disabled again afterwards. Returns 1 if it rescheduled, 0 otherwise.
  */
 int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 	if (!should_resched())
 		return 0;
 	local_bh_enable();
 	__cond_resched();
 	local_bh_disable();
 	return 1;
 }
 EXPORT_SYMBOL(__cond_resched_softirq);
 /**
  * yield - yield the current processor to other threads.
  *
  * Do not ever use this function, there's a 99% chance you're doing it wrong.
  *
  * The scheduler is at all times free to pick the calling task as the most
  * eligible task to run, if removing the yield() call from your code breaks
  * it, it's already broken.
  *
  * Typical broken usage is:
  *
  * while (!event)
  * 	yield();
  *
  * where one assumes that yield() will let 'the other' process run that will
  * make event true. If the current task is a SCHED_FIFO task that will never
  * happen. Never use yield() as a progress guarantee!!
  *
  * If you want to use yield() to wait for something, use wait_event().
  * If you want to use yield() to be 'nice' for others, use cond_resched().
  * If you still want to use yield(), do not!
  *
  * Implementation: sets the current task state to TASK_RUNNING and then
  * calls sys_sched_yield().
  */
 void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
 }
 EXPORT_SYMBOL(yield);
 /**
  * yield_to - yield the current processor to another thread in
  * your thread group, or accelerate that thread toward the
  * processor it's on.
  * @p: target task
  * @preempt: whether task preemption is allowed or not
  *
  * It's the caller's job to ensure that the target task struct
  * can't go away on us before we can do any checks.
  *
  * Return:
  *	true (>0) if we indeed boosted the target task.
  *	false (0) if we failed to boost the target.
  *	-ESRCH if there's no task to yield to.
  */
 int __sched yield_to(struct task_struct *p, bool preempt)
 {
 	struct task_struct *curr = current;
 	struct rq *rq, *p_rq;
 	unsigned long flags;
 	int yielded = 0;
 	local_irq_save(flags);
 	rq = this_rq();
 again:
 	p_rq = task_rq(p);
 	/*
 	 * If we're the only runnable task on the rq and target rq also
 	 * has only one task, there's absolutely no point in yielding.
 	 */
 	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
 		yielded = -ESRCH;
 		goto out_irq;
 	}
 	double_rq_lock(rq, p_rq);
 	/* p may have changed runqueue before both locks were held: retry. */
 	if (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
 		goto again;
 	}
 	/* The current class must implement yield_to_task ... */
 	if (!curr->sched_class->yield_to_task)
 		goto out_unlock;
 	/* ... and both tasks must be in the same scheduling class. */
 	if (curr->sched_class != p->sched_class)
 		goto out_unlock;
 	/* Nothing to do if p is already running or has a non-zero state. */
 	if (task_running(p_rq, p) || p->state)
 		goto out_unlock;
 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
 		schedstat_inc(rq, yld_count);
 		/*
 		 * Make p's CPU reschedule; pick_next_entity takes care of
 		 * fairness.
 		 */
 		if (preempt && rq != p_rq)
 			resched_curr(p_rq);
 	}
 out_unlock:
 	double_rq_unlock(rq, p_rq);
 out_irq:
 	local_irq_restore(flags);
 	/* Only give up the CPU when the class reported a successful yield. */
 	if (yielded > 0)
 		schedule();
 	return yielded;
 }
 EXPORT_SYMBOL_GPL(yield_to);
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
  *
  * The runqueue is sampled once via raw_rq() before sleeping, and that same
  * runqueue's nr_iowait is decremented after schedule() returns. The sleep
  * is bracketed by delayacct_blkio_start()/end() and current->in_iowait is
  * set for its duration.
  */
 void __sched io_schedule(void)
 {
 	struct rq *rq = raw_rq();
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 }
 EXPORT_SYMBOL(io_schedule);
 /*
  * io_schedule_timeout - like io_schedule(), but sleeps through
  * schedule_timeout(@timeout) and returns that function's return value.
  *
  * The IO-wait accounting (rq->nr_iowait, current->in_iowait, delayacct
  * start/end) brackets the sleep in the same way as in io_schedule().
  */
 long __sched io_schedule_timeout(long timeout)
 {
 	struct rq *rq = raw_rq();
 	long ret;
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 	return ret;
 }
 /**
  * sys_sched_get_priority_max - return maximum RT priority.
  * @policy: scheduling class.
  *
  * Return: On success, this syscall returns the maximum
  * rt_priority that can be used by a given scheduling class:
  * MAX_USER_RT_PRIO-1 for SCHED_FIFO and SCHED_RR, 0 for SCHED_DEADLINE,
  * SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE.
  * On failure (unknown @policy), -EINVAL is returned.
  */
 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
 		return MAX_USER_RT_PRIO-1;
 	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
 		return 0;
 	default:
 		return -EINVAL;
 	}
 }
 /**
  * sys_sched_get_priority_min - return minimum RT priority.
  * @policy: scheduling class.
  *
  * Return: On success, this syscall returns the minimum
  * rt_priority that can be used by a given scheduling class:
  * 1 for SCHED_FIFO and SCHED_RR, 0 for SCHED_DEADLINE, SCHED_NORMAL,
  * SCHED_BATCH and SCHED_IDLE.
  * On failure (unknown @policy), -EINVAL is returned.
  */
 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
 	switch (policy) {
 	case SCHED_FIFO:
 	case SCHED_RR:
 		return 1;
 	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
 		return 0;
 	default:
 		return -EINVAL;
 	}
 }
 /**
  * sys_sched_rr_get_interval - return the default timeslice of a process.
  * @pid: pid of the process.
  * @interval: userspace pointer to the timeslice value.
  *
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  * The value comes from the task's scheduling class get_rr_interval() hook
  * and is 0 when the class does not provide one.
  *
  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
  * an error code.
  */
 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		struct timespec __user *, interval)
 {
 	struct task_struct *task;
 	unsigned int slice = 0;
 	unsigned long flags;
 	struct timespec ts;
 	struct rq *rq;
 	int err;
 	if (pid < 0)
 		return -EINVAL;
 	rcu_read_lock();
 	task = find_process_by_pid(pid);
 	err = task ? security_task_getscheduler(task) : -ESRCH;
 	if (err) {
 		rcu_read_unlock();
 		return err;
 	}
 	/* Query the class hook with the task's runqueue locked. */
 	rq = task_rq_lock(task, &flags);
 	if (task->sched_class->get_rr_interval)
 		slice = task->sched_class->get_rr_interval(rq, task);
 	task_rq_unlock(rq, task, &flags);
 	rcu_read_unlock();
 	jiffies_to_timespec(slice, &ts);
 	return copy_to_user(interval, &ts, sizeof(ts)) ? -EFAULT : 0;
 }
 /* One character per task state, indexed by (lowest set state bit + 1). */
 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 /*
  * sched_show_task - print a one-line summary of task @p followed by its
  * worker info and stack trace.
  *
  * The line holds the command name, a state character ('?' if the state
  * index is outside stat_nam), either "running" or the saved PC, the unused
  * stack size (0 unless CONFIG_DEBUG_STACK_USAGE), the pid, the parent pid
  * and the thread_info flags.
  */
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	unsigned state = 0;
 	char state_char = '?';
 	int ppid;
 	if (p->state)
 		state = __ffs(p->state) + 1;
 	if (state < sizeof(stat_nam) - 1)
 		state_char = stat_nam[state];
 	printk(KERN_INFO "%-15.15s %c", p->comm, state_char);
 #if BITS_PER_LONG == 32
 	if (state != TASK_RUNNING)
 		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
 	else
 		printk(KERN_CONT " running  ");
 #else
 	if (state != TASK_RUNNING)
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 	else
 		printk(KERN_CONT "  running task    ");
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
 	/* real_parent is RCU protected; only its pid is needed. */
 	rcu_read_lock();
 	ppid = task_pid_nr(rcu_dereference(p->real_parent));
 	rcu_read_unlock();
 	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
 		task_pid_nr(p), ppid,
 		(unsigned long)task_thread_info(p)->flags);
 	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
 }
 /*
  * show_state_filter - dump every thread whose state matches @state_filter.
  *
  * Prints a column header, then calls sched_show_task() for each thread of
  * each process whose p->state has a bit in common with @state_filter. A
  * @state_filter of 0 selects all threads and additionally dumps all held
  * locks via debug_show_all_locks().
  */
 void show_state_filter(unsigned long state_filter)
 {
 	struct task_struct *g, *p;
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
 		"  task                PC stack   pid father\n");
 #else
 	printk(KERN_INFO
 		"  task                        PC stack   pid father\n");
 #endif
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
 		 */
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 	touch_all_softlockup_watchdogs();
 #ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
 #endif
 	rcu_read_unlock();
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
 	if (!state_filter)
 		debug_show_all_locks();
 }
 /* Put the boot-time idle task @idle into the idle scheduling class. */
 void init_idle_bootup_task(struct task_struct *idle)
 {
 	idle->sched_class = &idle_sched_class;
 }
 /**
  * init_idle - set up an idle thread for a given CPU
  * @idle: task in question
  * @cpu: cpu the idle task belongs to
  *
  * Under the CPU's rq->lock the task is reset via __sched_fork(), marked
  * TASK_RUNNING, pinned to @cpu and installed as both rq->curr and rq->idle.
  * After the lock is dropped its preempt count and scheduling class are set.
  *
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
 void init_idle(struct task_struct *idle, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 	/* The idle task may only ever run on its own CPU. */
 	do_set_cpus_allowed(idle, cpumask_of(cpu));
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
 	 * lockdep check in task_group() will fail.
 	 *
 	 * Similar case to sched_fork(). / Alternatively we could
 	 * use task_rq_lock() here and obtain the other rq->lock.
 	 *
 	 * Silence PROVE_RCU
 	 */
 	rcu_read_lock();
 	__set_task_cpu(idle, cpu);
 	rcu_read_unlock();
 	rq->curr = rq->idle = idle;
 	idle->on_rq = TASK_ON_RQ_QUEUED;
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
 #if defined(CONFIG_SMP)
 	/* Name the task "<INIT_TASK_COMM>/<cpu>". */
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
 #ifdef CONFIG_SMP
 /*
  * move_queued_task - move a queued task to new rq.
  *
  * Must be called with the task's current rq->lock held (asserted via
  * lockdep). The task is dequeued and marked TASK_ON_RQ_MIGRATING while it
  * is between runqueues, then enqueued on @new_cpu's rq and marked
  * TASK_ON_RQ_QUEUED again, followed by a preemption check on the new rq.
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
 static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
 {
 	struct rq *rq = task_rq(p);
 	lockdep_assert_held(&rq->lock);
 	dequeue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_MIGRATING;
 	set_task_cpu(p, new_cpu);
 	raw_spin_unlock(&rq->lock);
 	/* From here on rq refers to the destination runqueue. */
 	rq = cpu_rq(new_cpu);
 	raw_spin_lock(&rq->lock);
 	BUG_ON(task_cpu(p) != new_cpu);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	enqueue_task(rq, p, 0);
 	check_preempt_curr(rq, p, 0);
 	return rq;
 }
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	if (p->sched_class && p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 /*
  * This is how migration works:
  *
  * 1) we invoke migration_cpu_stop() on the target CPU using
  *    stop_one_cpu().
  * 2) stopper starts to run (implicitly forcing the migrated thread
  *    off the CPU)
  * 3) it checks whether the migrated task is still in the wrong runqueue.
  * 4) if it's in the wrong runqueue then the migration thread removes
  *    it and puts it into the right queue.
  * 5) stopper completes and stop_one_cpu() returns and the migration
  *    is done.
  */
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
  * is removed from the allowed bitmask.
  *
  * Returns 0 on success (including when @new_mask equals the current mask)
  * and -EINVAL if @new_mask contains no active CPU.
  *
  * NOTE: the caller must have a valid reference to the task, the
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	unsigned long flags;
 	struct rq *rq;
 	unsigned int dest_cpu;
 	int ret = 0;
 	rq = task_rq_lock(p, &flags);
 	/* Nothing to do if the mask is unchanged. */
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
 	do_set_cpus_allowed(p, new_mask);
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, p, &flags);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
 	} else if (task_on_rq_queued(p))
 		/* Queued but not running: move it directly; rq becomes the new rq. */
 		rq = move_queued_task(p, dest_cpu);
 out:
 	task_rq_unlock(rq, p, &flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 /*
  * Move (not current) task off this cpu, onto dest cpu. We're doing
  * this because either it can't run here any more (set_cpus_allowed()
  * away from this CPU, or CPU going down), or because we're
  * attempting to rebalance this task on exec (sched_exec).
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  *
  * Locking: takes p->pi_lock, then the source rq->lock. If the task is
  * moved, move_queued_task() hands back the (locked) destination rq, which
  * is the lock released on the way out.
  *
  * Returns non-zero if task was successfully migrated.
  */
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	struct rq *rq;
 	int ret = 0;
 	/* Refuse to migrate towards a CPU that is not active. */
 	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 	rq = cpu_rq(src_cpu);
 	raw_spin_lock(&p->pi_lock);
 	raw_spin_lock(&rq->lock);
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
 		goto done;
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
 	/*
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
 	if (task_on_rq_queued(p))
 		rq = move_queued_task(p, dest_cpu);
 done:
 	ret = 1;
 fail:
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock(&p->pi_lock);
 	return ret;
 }
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
 int migrate_task_to(struct task_struct *p, int target_cpu)
 {
 	struct migration_arg arg = { p, target_cpu };
 	int curr_cpu = task_cpu(p);
 	if (curr_cpu == target_cpu)
 		return 0;
 	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
 		return -EINVAL;
 	/* TODO: This is not properly updating schedstats */
 	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
 /*
  * Requeue a task on a given node and accurately track the number of NUMA
  * tasks on the runqueues
  *
  * With the task's rq locked, the task is taken off the runqueue (and put
  * as prev if it is the current task), numa_preferred_nid is set to @nid,
  * and the task is then restored as current / re-enqueued.
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
 	struct rq *rq;
 	unsigned long flags;
 	bool queued, running;
 	rq = task_rq_lock(p, &flags);
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		put_prev_task(rq, p);
 	p->numa_preferred_nid = nid;
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
  * 'pushing' onto another runqueue.
  *
  * @data points to a struct migration_arg naming the task and destination
  * CPU. Runs with interrupts disabled around the migration and always
  * returns 0; the result of __migrate_task() is not propagated.
  */
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
 	/*
 	 * The original target cpu might have gone down and we might
 	 * be on another cpu but it doesn't matter.
 	 */
 	local_irq_disable();
 	/*
 	 * We need to explicitly wake pending tasks before running
 	 * __migrate_task() such that we will not miss enforcing cpus_allowed
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
 	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
 	local_irq_enable();
 	return 0;
 }
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
  *
  * Must run on a CPU that is no longer online (BUG otherwise). If the
  * current active_mm is not init_mm, switch to init_mm; in either case drop
  * a reference on the previous active_mm via mmdrop().
  */
 void idle_task_exit(void)
 {
 	struct mm_struct *mm = current->active_mm;
 	BUG_ON(cpu_online(smp_processor_id()));
 	if (mm != &init_mm) {
 		switch_mm(mm, &init_mm, current);
 		finish_arch_post_lock_switch();
 	}
 	mmdrop(mm);
 }
 /*
  * This CPU is about to disappear for a while, so push whatever nr_active
  * delta it still holds into the global calc_load_tasks counter. Expected
  * to run after migrate_tasks(), when the nr_active count no longer moves.
  *
  * Also see the comment "Global load-average calculations".
  */
 static void calc_load_migrate(struct rq *rq)
 {
 	long folded = calc_load_fold_active(rq);
 	if (!folded)
 		return;
 	atomic_long_add(folded, &calc_load_tasks);
 }
 /* No-op put_prev_task hook for the placeholder class below. */
 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
 {
 }
 /* Scheduling class that only provides the no-op put_prev_task hook. */
 static const struct sched_class fake_sched_class = {
 	.put_prev_task = put_prev_task_fake,
 };
 /*
  * Placeholder "previous task" handed to pick_next_task() by
  * migrate_tasks().
  */
 static struct task_struct fake_task = {
 	/*
 	 * Avoid pull_{rt,dl}_task()
 	 */
 	.prio = MAX_PRIO + 1,
 	.sched_class = &fake_sched_class,
 };
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
  *
  * Called with rq->lock held even though we're in stop_machine() and
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  *
  * @dead_cpu: CPU whose runqueue is drained; on return only one runnable
  *            task is left on it.
  */
 static void migrate_tasks(unsigned int dead_cpu)
 {
 	struct rq *rq = cpu_rq(dead_cpu);
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 	/*
 	 * Fudge the rq selection such that the below task selection loop
 	 * doesn't get stuck on the currently eligible stop task.
 	 *
 	 * We're currently inside stop_machine() and the rq is either stuck
 	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
 	 * either way we should never end up calling schedule() until we're
 	 * done here.
 	 */
 	rq->stop = NULL;
 	/*
 	 * put_prev_task() and pick_next_task() sched
 	 * class method both need to have an up-to-date
 	 * value of rq->clock[_task]
 	 */
 	update_rq_clock(rq);
 	for ( ; ; ) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
 		 */
 		if (rq->nr_running == 1)
 			break;
 		/* fake_task stands in as the "previous" task for the pick. */
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_cpu, next);
 		/* rq->lock is dropped around __migrate_task() and retaken after. */
 		raw_spin_unlock(&rq->lock);
 		__migrate_task(next, dead_cpu, dest_cpu);
 		raw_spin_lock(&rq->lock);
 	}
 	/* Put back the stop task pointer that was hidden above. */
 	rq->stop = stop;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
 /*
  * "sched_domain" directory entry; its ->child is filled in by
  * register_sched_domain_sysctl() and released again by
  * unregister_sched_domain_sysctl().
  */
 static struct ctl_table sd_ctl_dir[] = {
 	{
 		.procname	= "sched_domain",
 		.mode		= 0555,
 	},
 	{}
 };
 /* "kernel" directory entry that parents sd_ctl_dir; this is what gets registered. */
 static struct ctl_table sd_ctl_root[] = {
 	{
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= sd_ctl_dir,
 	},
 	{}
 };
 /*
  * Allocate a zeroed array of @n ctl_table slots.
  * Returns NULL when the allocation fails.
  */
 static struct ctl_table *sd_alloc_ctl_entry(int n)
 {
 	return kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
 }
 /*
  * Recursively release the table *tablep points to and reset it to NULL.
  */
 static void sd_free_ctl_entry(struct ctl_table **tablep)
 {
 	struct ctl_table *cur = *tablep;
 	/*
 	 * In the intermediate directories, both the child directory and
 	 * procname are dynamically allocated and could fail but the mode
 	 * will always be set. In the lowest directory the names are
 	 * static strings and all have proc handlers.
 	 */
 	while (cur->mode) {
 		if (cur->child)
 			sd_free_ctl_entry(&cur->child);
 		/* Only handler-less (directory) entries own their name. */
 		if (!cur->proc_handler)
 			kfree(cur->procname);
 		cur++;
 	}
 	kfree(*tablep);
 	*tablep = NULL;
 }
 /* Bounds wired into extra1/extra2 of the *_idx sysctl entries. */
 static int min_load_idx = 0;
 static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 /*
  * Fill in one leaf sysctl entry. When @load_idx is true the entry also
  * gets the [min_load_idx, max_load_idx] bounds attached.
  */
 static void
 set_table_entry(struct ctl_table *entry,
 		const char *procname, void *data, int maxlen,
 		umode_t mode, proc_handler *proc_handler,
 		bool load_idx)
 {
 	entry->procname = procname;
 	entry->mode = mode;
 	entry->data = data;
 	entry->maxlen = maxlen;
 	entry->proc_handler = proc_handler;
 	if (!load_idx)
 		return;
 	entry->extra1 = &min_load_idx;
 	entry->extra2 = &max_load_idx;
 }
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
 	struct ctl_table *table = sd_alloc_ctl_entry(14);
 	if (table == NULL)
 		return NULL;
 	set_table_entry(&table[0], "min_interval", &sd->min_interval,
 		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[1], "max_interval", &sd->max_interval,
 		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
 		sizeof(int), 0644, proc_dointvec_minmax, true);
 	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[9], "cache_nice_tries",
 		&sd->cache_nice_tries,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[11], "max_newidle_lb_cost",
 		&sd->max_newidle_lb_cost,
 		sizeof(long), 0644, proc_doulongvec_minmax, false);
 	set_table_entry(&table[12], "name", sd->name,
 		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
 	/* &table[13] is terminator */
 	return table;
 }
 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 {
 	struct ctl_table *entry, *table;
 	struct sched_domain *sd;
 	int domain_num = 0, i;
 	char buf[32];
 	for_each_domain(cpu, sd)
 		domain_num++;
 	entry = table = sd_alloc_ctl_entry(domain_num + 1);
 	if (table == NULL)
 		return NULL;
 	i = 0;
 	for_each_domain(cpu, sd) {
 		snprintf(buf, 32, "domain%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
 		entry->mode = 0555;
 		entry->child = sd_alloc_ctl_domain_table(sd);
 		entry++;
 		i++;
 	}
 	return table;
 }
 /* Handle returned by register_sysctl_table(); NULL while unregistered. */
 static struct ctl_table_header *sd_sysctl_header;
 /*
  * Build one "cpuN" directory per possible CPU under sd_ctl_dir and
  * register the whole tree. Silently does nothing if the top-level table
  * cannot be allocated.
  */
 static void register_sched_domain_sysctl(void)
 {
 	struct ctl_table *cpu_dir = sd_alloc_ctl_entry(num_possible_cpus() + 1);
 	char name[32];
 	int cpu;
 	WARN_ON(sd_ctl_dir[0].child);
 	sd_ctl_dir[0].child = cpu_dir;
 	if (!cpu_dir)
 		return;
 	for_each_possible_cpu(cpu) {
 		snprintf(name, sizeof(name), "cpu%d", cpu);
 		cpu_dir->procname = kstrdup(name, GFP_KERNEL);
 		cpu_dir->mode = 0555;
 		cpu_dir->child = sd_alloc_ctl_cpu_table(cpu);
 		cpu_dir++;
 	}
 	WARN_ON(sd_sysctl_header);
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 /*
  * Tear down what register_sched_domain_sysctl() built.
  * May be called multiple times per register; repeated calls are no-ops.
  */
 static void unregister_sched_domain_sysctl(void)
 {
 	if (sd_sysctl_header) {
 		unregister_sysctl_table(sd_sysctl_header);
 		sd_sysctl_header = NULL;
 	}
 	if (sd_ctl_dir[0].child)
 		sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
 #else
 /* No-op stub used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both set. */
 static void register_sched_domain_sysctl(void)
 {
 }
 /* No-op stub used when CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are not both set. */
 static void unregister_sched_domain_sysctl(void)
 {
 }
 #endif
 /*
  * Mark @rq online in its root domain and notify every scheduling class
  * that provides an rq_online hook. Does nothing if @rq is already online.
  */
 static void set_rq_online(struct rq *rq)
 {
 	const struct sched_class *class;
 	if (rq->online)
 		return;
 	cpumask_set_cpu(rq->cpu, rq->rd->online);
 	rq->online = 1;
 	for_each_class(class) {
 		if (!class->rq_online)
 			continue;
 		class->rq_online(rq);
 	}
 }
 /*
  * Notify every scheduling class that provides an rq_offline hook, then
  * mark @rq offline in its root domain. Does nothing if @rq is already
  * offline.
  */
 static void set_rq_offline(struct rq *rq)
 {
 	const struct sched_class *class;
 	if (!rq->online)
 		return;
 	for_each_class(class) {
 		if (!class->rq_offline)
 			continue;
 		class->rq_offline(rq);
 	}
 	cpumask_clear_cpu(rq->cpu, rq->rd->online);
 	rq->online = 0;
 }
 /*
  * migration_call - CPU hotplug notifier callback for the scheduler.
  * @nfb:    notifier block (unused)
  * @action: hotplug event; the CPU_TASKS_FROZEN bit is masked off
  * @hcpu:   the affected CPU number, cast to a pointer
  *
  * Handles CPU_UP_PREPARE, CPU_ONLINE and, with CONFIG_HOTPLUG_CPU,
  * CPU_DYING and CPU_DEAD. Always returns NOTIFY_OK.
  */
 static int
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int cpu = (long)hcpu;
 	unsigned long flags;
 	struct rq *rq = cpu_rq(cpu);
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
 		/* Seed the per-rq stamp from the global calc_load_update. */
 		rq->calc_load_update = calc_load_update;
 		break;
 	case CPU_ONLINE:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_online(rq);
 		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DYING:
 		sched_ttwu_pending();
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
 		/* Push every remaining runnable task off this runqueue. */
 		migrate_tasks(cpu);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 	case CPU_DEAD:
 		calc_load_migrate(rq);
 		break;
 #endif
 	}
 	/* Runs for every event, including ones not handled above. */
 	update_max_interval();
 	return NOTIFY_OK;
 }
 /*
  * Register at high priority so that task migration (migrate_tasks(),
  * called from migration_call() on CPU_DYING) happens before everything
  * else.  This has to be lower priority than the notifier in the
  * perf_event subsystem, though.
  */
 static struct notifier_block migration_notifier = {
 	.notifier_call = migration_call,
 	.priority = CPU_PRI_MIGRATION,
 };
 /* Stamp the current CPU's runqueue with its own scheduler clock reading. */
 static void __cpuinit set_cpu_rq_start_time(void)
 {
 	int this_cpu = smp_processor_id();
 	cpu_rq(this_cpu)->age_stamp = sched_clock_cpu(this_cpu);
 }
 /*
  * Hotplug notifier: records the rq start time on CPU_STARTING and marks
  * the CPU active again when a CPU-down attempt failed. Every other event
  * is reported as NOTIFY_DONE.
  */
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	const unsigned long event = action & ~CPU_TASKS_FROZEN;
 	if (event == CPU_STARTING) {
 		set_cpu_rq_start_time();
 		return NOTIFY_OK;
 	}
 	if (event == CPU_DOWN_FAILED) {
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
 	}
 	return NOTIFY_DONE;
 }
 /*
  * Hotplug notifier for CPU_DOWN_PREPARE: marks the CPU inactive and,
  * unless tasks are frozen, refuses the operation with -EBUSY when
  * __dl_overflow() reports a deadline bandwidth overflow. Every other
  * event is reported as NOTIFY_DONE.
  */
 static int sched_cpu_inactive(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 	unsigned long flags;
 	struct dl_bw *dl_b;
 	bool overflow;
 	int cpus;
 	if ((action & ~CPU_TASKS_FROZEN) != CPU_DOWN_PREPARE)
 		return NOTIFY_DONE;
 	set_cpu_active(cpu, false);
 	/* explicitly allow suspend */
 	if (action & CPU_TASKS_FROZEN)
 		return NOTIFY_OK;
 	rcu_read_lock_sched();
 	dl_b = dl_bw_of(cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
 	cpus = dl_bw_cpus(cpu);
 	overflow = __dl_overflow(dl_b, cpus, 0, 0);
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
 	return overflow ? notifier_from_errno(-EBUSY) : NOTIFY_OK;
 }
 /*
  * Early init: run the boot CPU through the UP_PREPARE and ONLINE steps by
  * hand, then install the scheduler's hotplug notifiers.
  */
 static int __init migration_init(void)
 {
 	void *boot_cpu = (void *)(long)smp_processor_id();
 	int rc;
 	/* Initialize migration for the boot CPU */
 	rc = migration_call(&migration_notifier, CPU_UP_PREPARE, boot_cpu);
 	BUG_ON(rc == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, boot_cpu);
 	/* From here on hotplug events are delivered through the notifier. */
 	register_cpu_notifier(&migration_notifier);
 	/* Register cpu active notifiers */
 	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
 	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
 	return 0;
 }
 early_initcall(migration_init);
 #endif
 #ifdef CONFIG_SMP
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 #ifdef CONFIG_SCHED_DEBUG
 /* Set to 1 by the "sched_debug" early parameter; gates sched_domain_debug(). */
 static __read_mostly int sched_debug_enabled;
 /* early_param handler: the mere presence of "sched_debug" enables it; @str is ignored. */
 static int __init sched_debug_setup(char *str)
 {
 	sched_debug_enabled = 1;
 	return 0;
 }
 early_param("sched_debug", sched_debug_setup);
 /* Returns true when sched_debug_enabled is set. */
 static inline bool sched_debug(void)
 {
 	return sched_debug_enabled;
 }
 /*
  * Print and sanity-check one level of a sched_domain hierarchy.
  *
  * @sd:        domain to dump
  * @cpu:       CPU the hierarchy is being attached to
  * @level:     depth of @sd, used for indentation and in the output
  * @groupmask: scratch mask, cleared here and used to accumulate the CPUs
  *             of all groups
  *
  * Returns -1 when @sd does not load-balance (the caller stops walking
  * upwards), 0 otherwise.
  */
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
 	char str[256];
 	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
 	cpumask_clear(groupmask);
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
 	if (!(sd->flags & SD_LOAD_BALANCE)) {
 		/* Continues the "domain %d: " line started above. */
 		printk(KERN_CONT "does not load-balance\n");
 		if (sd->parent)
 			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
 					" has parent\n");
 		return -1;
 	}
 	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
 				"CPU%d\n", cpu);
 	}
 	/*
 	 * A NULL group list is reported by the loop below; do not look
 	 * at its cpumask here.
 	 */
 	if (group && !cpumask_test_cpu(cpu, sched_group_cpus(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain"
 				" CPU%d\n", cpu);
 	}
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
 	do {
 		if (!group) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: group is NULL\n");
 			break;
 		}
 		/*
 		 * Even though we initialize ->capacity to something semi-sane,
 		 * we leave capacity_orig unset. This allows us to detect if
 		 * domain iteration is still funny without causing /0 traps.
 		 */
 		if (!group->sgc->capacity_orig) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
 			break;
 		}
 		if (!cpumask_weight(sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: empty group\n");
 			break;
 		}
 		/* Only overlapping domains may list a CPU in several groups. */
 		if (!(sd->flags & SD_OVERLAP) &&
 		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
 		}
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 		printk(KERN_CONT " %s", str);
 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
 			printk(KERN_CONT " (cpu_capacity = %d)",
 				group->sgc->capacity);
 		}
 		group = group->next;
 	} while (group != sd->groups);
 	printk(KERN_CONT "\n");
 	if (!cpumask_equal(sched_domain_span(sd), groupmask))
 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 	if (sd->parent &&
 	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
 		printk(KERN_ERR "ERROR: parent span is not a superset "
 			"of domain->span\n");
 	return 0;
 }
 /*
  * Dump the sched_domain hierarchy being attached to @cpu, from @sd
  * upwards, when the "sched_debug" boot parameter was given. The walk
  * stops at the top of the hierarchy or at the first level that
  * sched_domain_debug_one() reports as non-zero.
  */
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int depth;
 	if (!sched_debug_enabled)
 		return;
 	if (!sd) {
 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
 		return;
 	}
 	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 	for (depth = 0; sd; sd = sd->parent, depth++) {
 		if (sched_domain_debug_one(sd, cpu, depth, sched_domains_tmpmask))
 			break;
 	}
 }
 #else /* !CONFIG_SCHED_DEBUG */
 /* Without CONFIG_SCHED_DEBUG the domain dump compiles away to nothing. */
 # define sched_domain_debug(sd, cpu) do { } while (0)
 /* Without CONFIG_SCHED_DEBUG scheduler debugging is never enabled. */
 static inline bool sched_debug(void)
 {
 	return false;
 }
 #endif /* CONFIG_SCHED_DEBUG */
 /*
  * Returns 1 when @sd contributes nothing to scheduling: it spans a single
  * CPU, or none of its flags is usable with the groups it has.
  */
 static int sd_degenerate(struct sched_domain *sd)
 {
 	/* Flags that only make sense with at least 2 groups. */
 	const int multi_group_flags = SD_LOAD_BALANCE |
 				      SD_BALANCE_NEWIDLE |
 				      SD_BALANCE_FORK |
 				      SD_BALANCE_EXEC |
 				      SD_SHARE_CPUCAPACITY |
 				      SD_SHARE_PKG_RESOURCES |
 				      SD_SHARE_POWERDOMAIN;
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 	if ((sd->flags & multi_group_flags) &&
 	    sd->groups != sd->groups->next)
 		return 0;
 	/* Following flags don't use groups */
 	if (sd->flags & (SD_WAKE_AFFINE))
 		return 0;
 	return 1;
 }
 /*
  * Returns 1 when @parent adds nothing on top of @sd: it is degenerate by
  * itself, or it has the same span as @sd and no relevant flag that @sd
  * lacks.
  */
 static int
 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 {
 	unsigned long child_flags = sd->flags;
 	unsigned long parent_flags = parent->flags;
 	if (sd_degenerate(parent))
 		return 1;
 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next) {
 		parent_flags &= ~(SD_LOAD_BALANCE |
 				  SD_BALANCE_NEWIDLE |
 				  SD_BALANCE_FORK |
 				  SD_BALANCE_EXEC |
 				  SD_SHARE_CPUCAPACITY |
 				  SD_SHARE_PKG_RESOURCES |
 				  SD_PREFER_SIBLING |
 				  SD_SHARE_POWERDOMAIN);
 		if (nr_node_ids == 1)
 			parent_flags &= ~SD_SERIALIZE;
 	}
 	/* Any flag left in the parent that the child lacks keeps the parent. */
 	return (~child_flags & parent_flags) ? 0 : 1;
 }
 /*
  * RCU callback that releases a root_domain: its cpupri and cpudl state,
  * its four cpumasks and finally the structure itself.
  * @rcu: the rcu_head embedded in the root_domain to free
  */
 static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 	cpupri_cleanup(&rd->cpupri);
 	cpudl_cleanup(&rd->cpudl);
 	free_cpumask_var(rd->dlo_mask);
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
 	kfree(rd);
 }
 /*
  * Move @rq from its current root domain (if any) to @rd.
  *
  * Under rq->lock the runqueue is taken offline in and removed from the
  * old domain's span, a reference on @rd is taken, and the runqueue is
  * brought online again if its CPU is in cpu_active_mask. If the old
  * domain lost its last reference it is freed through call_rcu_sched()
  * after the lock has been dropped.
  */
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
 	struct root_domain *old_rd = NULL;
 	unsigned long flags;
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (rq->rd) {
 		old_rd = rq->rd;
 		if (cpumask_test_cpu(rq->cpu, old_rd->online))
 			set_rq_offline(rq);
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 		/*
 		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
 		 * in this function:
 		 */
 		if (!atomic_dec_and_test(&old_rd->refcount))
 			old_rd = NULL;
 	}
 	atomic_inc(&rd->refcount);
 	rq->rd = rd;
 	cpumask_set_cpu(rq->cpu, rd->span);
 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
 		set_rq_online(rq);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	/* Non-NULL only when the last reference was dropped above. */
 	if (old_rd)
 		call_rcu_sched(&old_rd->rcu, free_rootdomain);
 }
 /*
  * Zero @rd and set up its cpumasks, deadline bandwidth, cpudl and cpupri
  * state.
  *
  * Returns 0 on success or -ENOMEM on failure. On failure everything that
  * was set up before the failing step is undone again, in reverse order;
  * each error label releases exactly the resource named in it and then
  * falls through to the labels for the earlier resources.
  */
 static int init_rootdomain(struct root_domain *rd)
 {
 	memset(rd, 0, sizeof(*rd));
 	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 		goto out;
 	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
 	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
 		goto free_online;
 	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_dlo_mask;
 	init_dl_bw(&rd->dl_bw);
 	/* rto_mask is already allocated here, so unwind from it. */
 	if (cpudl_init(&rd->cpudl) != 0)
 		goto free_rto_mask;
 	/* cpudl is initialised here and must be cleaned up on failure. */
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
 	return 0;
 free_cpudl:
 	cpudl_cleanup(&rd->cpudl);
 free_rto_mask:
 	free_cpumask_var(rd->rto_mask);
 free_dlo_mask:
 	free_cpumask_var(rd->dlo_mask);
 free_online:
 	free_cpumask_var(rd->online);
 free_span:
 	free_cpumask_var(rd->span);
 out:
 	return -ENOMEM;
 }
 /*
  * By default the system creates a single root-domain with all cpus as
  * members (mimicking the global state we have today).
  */
 struct root_domain def_root_domain;
 /*
  * Initialise def_root_domain and give it an initial reference.
  * NOTE(review): the return value of init_rootdomain() is not checked here.
  */
 static void init_defrootdomain(void)
 {
 	init_rootdomain(&def_root_domain);
 	atomic_set(&def_root_domain.refcount, 1);
 }
 /*
  * Allocate and initialise a new root_domain.
  * Returns NULL if either the allocation or init_rootdomain() fails.
  */
 static struct root_domain *alloc_rootdomain(void)
 {
 	struct root_domain *rd = kmalloc(sizeof(*rd), GFP_KERNEL);
 	if (rd && init_rootdomain(rd) != 0) {
 		kfree(rd);
 		rd = NULL;
 	}
 	return rd;
 }
 /*
  * Free every group on the circular list starting at @sg. With @free_sgc
  * set, each group also drops its reference on ->sgc and frees it when
  * that was the last one. A NULL @sg is accepted and ignored.
  */
 static void free_sched_groups(struct sched_group *sg, int free_sgc)
 {
 	struct sched_group *head = sg;
 	struct sched_group *following;
 	if (!head)
 		return;
 	do {
 		/* Remember the successor before this group goes away. */
 		following = sg->next;
 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
 			kfree(sg->sgc);
 		kfree(sg);
 		sg = following;
 	} while (sg != head);
 }
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
 	/*
 	 * If its an overlapping domain it has private groups, iterate and
 	 * nuke them all.
 	 */
 	if (sd->flags & SD_OVERLAP) {
 		free_sched_groups(sd->groups, 1);
 	} else if (atomic_dec_and_test(&sd->groups->ref)) {
 		kfree(sd->groups->sgc);
 		kfree(sd->groups);
 	}
 	kfree(sd);
 }
 /* Queue @sd for freeing via call_rcu(); @cpu is unused. */
 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
 {
 	call_rcu(&sd->rcu, free_sched_domain);
 }
 /* Queue @sd and all of its ancestors for freeing. */
 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 {
 	while (sd) {
 		destroy_sched_domain(sd, cpu);
 		sd = sd->parent;
 	}
 }
 /*
  * Keep a special pointer to the highest sched_domain that has
  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
  * allows us to avoid some pointer chasing in select_idle_sibling().
  *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
  *
  * All of these are written by update_top_cache_domain().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain *sd;
 	struct sched_domain *busy_sd = NULL;
 	int id = cpu;
 	int size = 1;
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		busy_sd = sd->parent; /* sd_busy */
 	}
 	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
 }
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  *
  * Degenerate levels are unlinked from the hierarchy first; the runqueue
  * is then attached to @rd, rq->sd is switched over to the new hierarchy
  * and the previous one is queued for destruction.
  */
 static void
 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 	/* Remove the sched domains which do not contribute to scheduling. */
 	for (tmp = sd; tmp; ) {
 		struct sched_domain *parent = tmp->parent;
 		if (!parent)
 			break;
 		if (sd_parent_degenerate(tmp, parent)) {
 			/* Splice the parent out; tmp stays put and is re-examined. */
 			tmp->parent = parent->parent;
 			if (parent->parent)
 				parent->parent->child = tmp;
 			/*
 			 * Transfer SD_PREFER_SIBLING down in case of a
 			 * degenerate parent; the spans match for this
 			 * so the property transfers.
 			 */
 			if (parent->flags & SD_PREFER_SIBLING)
 				tmp->flags |= SD_PREFER_SIBLING;
 			destroy_sched_domain(parent, cpu);
 		} else
 			tmp = tmp->parent;
 	}
 	/* The base domain itself may be degenerate; drop it as well. */
 	if (sd && sd_degenerate(sd)) {
 		tmp = sd;
 		sd = sd->parent;
 		destroy_sched_domain(tmp, cpu);
 		if (sd)
 			sd->child = NULL;
 	}
 	sched_domain_debug(sd, cpu);
 	rq_attach_root(rq, rd);
 	/* Publish the new hierarchy, then retire the old one. */
 	tmp = rq->sd;
 	rcu_assign_pointer(rq->sd, sd);
 	destroy_sched_domains(tmp, cpu);
 	update_top_cache_domain(cpu);
 }
 /* cpus with isolated domains */
 static cpumask_var_t cpu_isolated_map;
 /*
  * Setup the mask of cpus configured for isolated domains.
  *
  * @str: the cpu list given to "isolcpus=" on the kernel command line
  *
  * A list that cpulist_parse() rejects is reported instead of being
  * silently ignored. Always returns 1 so the parameter counts as handled.
  */
 static int __init isolated_cpu_setup(char *str)
 {
 	int ret;
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	ret = cpulist_parse(str, cpu_isolated_map);
 	if (ret)
 		pr_warn("sched: invalid isolcpus= cpu list \"%s\" (error %d)\n",
 			str, ret);
 	return 1;
 }
 __setup("isolcpus=", isolated_cpu_setup);
 /* Temporary allocations made by __visit_domain_allocation_hell(). */
 struct s_data {
 	struct sched_domain ** __percpu sd;	/* from alloc_percpu() */
 	struct root_domain	*rd;		/* from alloc_rootdomain() */
 };
 /*
  * How far the allocation sequence got; tells __free_domain_allocs() where
  * to start unwinding.
  */
 enum s_alloc {
 	sa_rootdomain,
 	sa_sd,
 	sa_sd_storage,
 	sa_none,
 };
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
  *
  * Asymmetric node setups can result in situations where the domain tree is of
  * unequal depth, make sure to skip domains that already cover the entire
  * range.
  *
  * In that case build_sched_domains() will have terminated the iteration early
  * and our sibling sd spans will be empty. Domains should always include the
  * cpu they're built on, so check that.
  *
  */
 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 {
 	struct sd_data *sdd = sd->private;
 	const struct cpumask *sd_span = sched_domain_span(sd);
 	int cpu;
 	for_each_cpu(cpu, sd_span) {
 		struct sched_domain *peer = *per_cpu_ptr(sdd->sd, cpu);
 		/* Only CPUs whose own domain span contains them take part. */
 		if (cpumask_test_cpu(cpu, sched_domain_span(peer)))
 			cpumask_set_cpu(cpu, sched_group_mask(sg));
 	}
 }
 /*
  * Return the canonical balance cpu for this group, this is the first cpu
  * of this group that's also in the iteration mask.
  */
 int group_balance_cpu(struct sched_group *sg)
 {
 	const struct cpumask *members = sched_group_cpus(sg);
 	const struct cpumask *iter_mask = sched_group_mask(sg);
 	return cpumask_first_and(members, iter_mask);
 }
 /*
  * Build the private, circular group list of an overlapping domain @sd
  * on behalf of @cpu. Uses sched_domains_tmpmask as scratch space.
  *
  * Returns 0 on success or -ENOMEM if a group cannot be allocated, in
  * which case the groups allocated so far are freed again.
  */
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
 	const struct cpumask *span = sched_domain_span(sd);
 	struct cpumask *covered = sched_domains_tmpmask;
 	struct sd_data *sdd = sd->private;
 	struct sched_domain *sibling;
 	int i;
 	cpumask_clear(covered);
 	for_each_cpu(i, span) {
 		struct cpumask *sg_span;
 		/* Already part of a group created in an earlier iteration. */
 		if (cpumask_test_cpu(i, covered))
 			continue;
 		sibling = *per_cpu_ptr(sdd->sd, i);
 		/* See the comment near build_group_mask(). */
 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
 			continue;
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(cpu));
 		if (!sg)
 			goto fail;
 		/* The group spans the sibling's child domain, or just CPU i. */
 		sg_span = sched_group_cpus(sg);
 		if (sibling->child)
 			cpumask_copy(sg_span, sched_domain_span(sibling->child));
 		else
 			cpumask_set_cpu(i, sg_span);
 		cpumask_or(covered, covered, sg_span);
 		sg->sgc = *per_cpu_ptr(sdd->sgc, i);
 		/* Only the first user of this sgc builds the iteration mask. */
 		if (atomic_inc_return(&sg->sgc->ref) == 1)
 			build_group_mask(sd, sg);
 		/*
 		 * Initialize sgc->capacity such that even if we mess up the
 		 * domains and no possible iteration will get us here, we won't
 		 * die on a /0 trap.
 		 */
 		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
 		sg->sgc->capacity_orig = sg->sgc->capacity;
 		/*
 		 * Make sure the first group of this domain contains the
 		 * canonical balance cpu. Otherwise the sched_domain iteration
 		 * breaks. See update_sg_lb_stats().
 		 */
 		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
 		    group_balance_cpu(sg) == cpu)
 			groups = sg;
 		/* Append to the list and keep it circular at every step. */
 		if (!first)
 			first = sg;
 		if (last)
 			last->next = sg;
 		last = sg;
 		last->next = first;
 	}
 	sd->groups = groups;
 	return 0;
 fail:
 	free_sched_groups(first, 0);
 	return -ENOMEM;
 }
 /*
  * Return the cpu that identifies the group @cpu belongs to at this
  * topology level: the first cpu of the child domain's span, or @cpu
  * itself when there is no child. If @sg is non-NULL the matching group
  * is stored in *@sg, wired to its sgc, and the sgc reference count is
  * set to 1.
  */
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *child = (*per_cpu_ptr(sdd->sd, cpu))->child;
 	struct sched_group *grp;
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 	if (!sg)
 		return cpu;
 	grp = *per_cpu_ptr(sdd->sg, cpu);
 	*sg = grp;
 	grp->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 	atomic_set(&grp->sgc->ref, 1); /* for claim_allocations */
 	return cpu;
 }
 /*
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_capacity to 0.
  *
  * Assumes the sched_domain tree is fully constructed
  *
  * Every cpu takes a reference on its own group, but only the first cpu
  * of the span builds the list. Always returns 0.
  */
 static int
 build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
 	const struct cpumask *span = sched_domain_span(sd);
 	struct cpumask *covered;
 	int i;
 	get_group(cpu, sdd, &sd->groups);
 	atomic_inc(&sd->groups->ref);
 	/* The list is built once, by the first cpu of the span. */
 	if (cpu != cpumask_first(span))
 		return 0;
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 	cpumask_clear(covered);
 	for_each_cpu(i, span) {
 		struct sched_group *sg;
 		int group, j;
 		if (cpumask_test_cpu(i, covered))
 			continue;
 		group = get_group(i, sdd, &sg);
 		cpumask_setall(sched_group_mask(sg));
 		/* Collect every cpu of the span that maps to the same group. */
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
 				continue;
 			cpumask_set_cpu(j, covered);
 			cpumask_set_cpu(j, sched_group_cpus(sg));
 		}
 		if (!first)
 			first = sg;
 		if (last)
 			last->next = sg;
 		last = sg;
 	}
 	/* Close the ring. */
 	last->next = first;
 	return 0;
 }
 /*
  * Initialize sched groups cpu_capacity.
  *
  * cpu_capacity indicates the capacity of sched group, which is used while
  * distributing the load between different sched groups in a sched domain.
  * Typically cpu_capacity for all the groups in a sched domain will be same
  * unless there are asymmetries in the topology. If there are asymmetries,
  * group having more cpu_capacity will pickup more load compared to the
  * group having less cpu_capacity.
  */
 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
 	WARN_ON(!sg);
 	/* Record the number of cpus in every group of the ring. */
 	do {
 		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
 		sg = sg->next;
 	} while (sg != sd->groups);
 	/* sg is back at sd->groups here; only its balance cpu continues. */
 	if (cpu != group_balance_cpu(sg))
 		return;
 	update_group_capacity(sd, cpu);
 	atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
 }
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 /* Level used by set_domain_attribute() when no attr overrides it; negative means unset. */
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 /*
  * Parse "relax_domain_level=" from the command line. A value that
  * kstrtoint() rejects only produces a warning. Always returns 1.
  */
 static int __init setup_relax_domain_level(char *str)
 {
 	if (kstrtoint(str, 0, &default_relax_domain_level))
 		pr_warn("Unable to set relax_domain_level\n");
 	return 1;
 }
 __setup("relax_domain_level=", setup_relax_domain_level);
 /*
  * Switch wake/newidle balancing on @sd on or off depending on the
  * requested relax level: the level from @attr if it carries a
  * non-negative one, else default_relax_domain_level. When neither is set
  * the flags are left untouched.
  */
 static void set_domain_attribute(struct sched_domain *sd,
 				 struct sched_domain_attr *attr)
 {
 	int request;
 	if (attr && attr->relax_domain_level >= 0)
 		request = attr->relax_domain_level;
 	else if (default_relax_domain_level >= 0)
 		request = default_relax_domain_level;
 	else
 		return;
 	if (request >= sd->level) {
 		/* turn on idle balance on this domain */
 		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	} else {
 		/* turn off idle balance on this domain */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 static void __sdt_free(const struct cpumask *cpu_map);
 static int __sdt_alloc(const struct cpumask *cpu_map);
 /*
  * Undo the allocations recorded in @d, starting at stage @what and
  * deliberately falling through all earlier stages. The root domain is
  * only freed here while nothing holds a reference on it.
  */
 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 				 const struct cpumask *cpu_map)
 {
 	switch (what) {
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu); /* fall through */
 	case sa_sd:
 		free_percpu(d->sd); /* fall through */
 	case sa_sd_storage:
 		__sdt_free(cpu_map); /* fall through */
 	case sa_none:
 		break;
 	}
 }
 /*
  * Perform the allocations needed to build sched domains for @cpu_map and
  * record them in @d.
  *
  * Returns sa_rootdomain when everything succeeded; otherwise the stage
  * from which __free_domain_allocs() has to start unwinding.
  */
 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 						   const struct cpumask *cpu_map)
 {
 	memset(d, 0, sizeof(*d));
 	if (__sdt_alloc(cpu_map))
 		return sa_sd_storage;
 	d->sd = alloc_percpu(struct sched_domain *);
 	if (!d->sd)
 		return sa_sd_storage;
 	d->rd = alloc_rootdomain();
 	if (!d->rd)
 		return sa_sd;
 	return sa_rootdomain;
 }
 /*
  * NULL the sd_data elements we've used to build the sched_domain and
  * sched_group structure so that the subsequent __free_domain_allocs()
  * will not free the data we're using.
  */
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 	/* A group or sgc with a non-zero refcount is in use; keep it. */
 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
 	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
 		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
 }
 #ifdef CONFIG_NUMA
 /* NOTE(review): presumably the number of NUMA topology levels -- not set in the visible code. */
 static int sched_domains_numa_levels;
 /* Indexed by numa_level in sd_init() and compared against RECLAIM_DISTANCE. */
 static int *sched_domains_numa_distance;
 /* Indexed as [level][node] by sd_numa_mask(). */
 static struct cpumask ***sched_domains_numa_masks;
 /* Level handed from sd_init() to sd_numa_mask(). */
 static int sched_domains_curr_level;
 #endif
 /*
  * SD_flags allowed in topology descriptions.
  *
  * SD_SHARE_CPUCAPACITY   - describes SMT topologies
  * SD_SHARE_PKG_RESOURCES - describes shared caches
  * SD_NUMA                - describes NUMA topologies
  * SD_SHARE_POWERDOMAIN   - describes shared power domain
  *
  * Odd one out:
  * SD_ASYM_PACKING        - describes SMT quirks
  *
  * sd_init() warns when a topology level returns any flag outside this set.
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY |		\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA |			\
 	 SD_ASYM_PACKING |		\
 	 SD_SHARE_POWERDOMAIN)
 /*
  * Initialise the pre-allocated sched_domain of @cpu at topology level @tl.
  *
  * The domain starts from a common set of defaults plus the flags the
  * topology level reports, and is then tuned according to what kind of
  * level those flags describe (SMT, shared cache, NUMA or plain).
  *
  * Flags outside TOPOLOGY_SD_FLAGS are not allowed in a topology
  * description: they trigger a one-time warning and are stripped, while
  * the valid ones are kept.
  *
  * Returns the initialised domain.
  */
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
 	int sd_weight, sd_flags = 0;
 #ifdef CONFIG_NUMA
 	/*
 	 * Ugly hack to pass state to sd_numa_mask()...
 	 */
 	sched_domains_curr_level = tl->numa_level;
 #endif
 	sd_weight = cpumask_weight(tl->mask(cpu));
 	if (tl->sd_flags)
 		sd_flags = (*tl->sd_flags)();
 	/* Keep only the permitted bits; masking with the complement would keep the bad ones. */
 	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
 			"wrong sd_flags in topology description\n"))
 		sd_flags &= TOPOLOGY_SD_FLAGS;
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
 		.busy_factor		= 32,
 		.imbalance_pct		= 125,
 		.cache_nice_tries	= 0,
 		.busy_idx		= 0,
 		.idle_idx		= 0,
 		.newidle_idx		= 0,
 		.wake_idx		= 0,
 		.forkexec_idx		= 0,
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
 					| 1*SD_BALANCE_EXEC
 					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
 					| 1*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUCAPACITY
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 0*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
 					| 0*SD_NUMA
 					| sd_flags
 					,
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
 		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 #ifdef CONFIG_SCHED_DEBUG
 		.name			= tl->name,
 #endif
 	};
 	/*
 	 * Convert topological properties into behaviour.
 	 */
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
 		sd->smt_gain = 1178; /* ~15% */
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
 		sd->busy_idx = 3;
 		sd->idle_idx = 2;
 		sd->flags |= SD_SERIALIZE;
 		/* Beyond RECLAIM_DISTANCE, stop exec/fork balancing and affine wakeups. */
 		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
 		}
 #endif
 	} else {
 		sd->flags |= SD_PREFER_SIBLING;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
 		sd->idle_idx = 1;
 	}
 	sd->private = &tl->data;
 	return sd;
 }
 /*
  * Default topology table, ordered bottom-up (smallest span first).
  * The entry with a NULL mask terminates the list.
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{
 		.mask = cpu_smt_mask,
 		.sd_flags = cpu_smt_flags,
 		SD_INIT_NAME(SMT)
 	},
 #endif
 #ifdef CONFIG_SCHED_MC
 	{
 		.mask = cpu_coregroup_mask,
 		.sd_flags = cpu_core_flags,
 		SD_INIT_NAME(MC)
 	},
 #endif
 	{
 		.mask = cpu_cpu_mask,
 		SD_INIT_NAME(DIE)
 	},
 	{
 		.mask = NULL,
 	},
 };
 /*
  * Topology table currently in use. Starts as default_topology and may be
  * replaced through set_sched_topology() or by sched_init_numa().
  */
 struct sched_domain_topology_level *sched_domain_topology = default_topology;
 /* Walk the active topology table until the entry whose ->mask is NULL. */
 #define for_each_sd_topology(tl)			\
 	for (tl = sched_domain_topology; tl->mask; tl++)
 /*
  * set_sched_topology - install @tl as the active topology table.
  *
  * Only the pointer is stored; the table is not copied. for_each_sd_topology()
  * expects it to end with an entry whose ->mask is NULL.
  */
 void set_sched_topology(struct sched_domain_topology_level *tl)
 {
 	sched_domain_topology = tl;
 }
 #ifdef CONFIG_NUMA
 /*
  * Mask callback for NUMA topology levels: returns the cpumask of @cpu's
  * node at the level most recently recorded in sched_domains_curr_level.
  */
 static const struct cpumask *sd_numa_mask(int cpu)
 {
 	struct cpumask **level_masks =
 		sched_domains_numa_masks[sched_domains_curr_level];

 	return level_masks[cpu_to_node(cpu)];
 }
 /*
  * Print @str followed by the full node_distance() table.
  * Only the first call prints anything; later calls return immediately.
  */
 static void sched_numa_warn(const char *str)
 {
 	static int done = false;
 	int from, to;

 	if (!done) {
 		done = true;

 		printk(KERN_WARNING "ERROR: %s\n\n", str);

 		from = 0;
 		while (from < nr_node_ids) {
 			printk(KERN_WARNING "  ");
 			to = 0;
 			while (to < nr_node_ids) {
 				printk(KERN_CONT "%02d ", node_distance(from, to));
 				to++;
 			}
 			printk(KERN_CONT "\n");
 			from++;
 		}
 		printk(KERN_WARNING "\n");
 	}
 }
 /*
  * Return true when @distance is the node_distance(0, 0) identity distance
  * or is already recorded in sched_domains_numa_distance[].
  */
 static bool find_numa_distance(int distance)
 {
 	bool found = (distance == node_distance(0, 0));
 	int level;

 	for (level = 0; !found && level < sched_domains_numa_levels; level++)
 		found = (sched_domains_numa_distance[level] == distance);

 	return found;
 }
 /*
  * sched_init_numa - extend the topology table with NUMA levels.
  *
  * Collects the unique node distances, builds one cpumask per (level, node),
  * and installs a new topology table consisting of the current table followed
  * by one SDTL_OVERLAP level per unique distance.
  *
  * NOTE(review): every allocation-failure path below simply returns; buffers
  * allocated earlier in the function are not freed on those paths.
  */
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
 	int level = 0;
 	int i, j, k;
 	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
 	if (!sched_domains_numa_distance)
 		return;
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
 	 *
 	 * Assumes node_distance(0,j) includes all distances in
 	 * node_distance(i,j) in order to avoid cubic time.
 	 */
 	next_distance = curr_distance;
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
 			for (k = 0; k < nr_node_ids; k++) {
 				int distance = node_distance(i, k);
 				/* Track the smallest distance above curr_distance. */
 				if (distance > curr_distance &&
 				    (distance < next_distance ||
 				     next_distance == curr_distance))
 					next_distance = distance;
 				/*
 				 * While not a strong assumption it would be nice to know
 				 * about cases where if node A is connected to B, B is not
 				 * equally connected to A.
 				 */
 				if (sched_debug() && node_distance(k, i) != distance)
 					sched_numa_warn("Node-distance not symmetric");
 				if (sched_debug() && i && !find_numa_distance(distance))
 					sched_numa_warn("Node-0 not representative");
 			}
 			/* Record the new distance, or stop when none is left. */
 			if (next_distance != curr_distance) {
 				sched_domains_numa_distance[level++] = next_distance;
 				sched_domains_numa_levels = level;
 				curr_distance = next_distance;
 			} else break;
 		}
 		/*
 		 * In case of sched_debug() we verify the above assumption.
 		 */
 		if (!sched_debug())
 			break;
 	}
 	/* Only the identity distance exists: nothing to add. */
 	if (!level)
 		return;
 	/*
 	 * 'level' contains the number of unique distances, excluding the
 	 * identity distance node_distance(i,i).
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
 	 */
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
 	 * the array will contain fewer than 'level' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
 	 * We reset it to 'level' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 	/*
 	 * Now for each level, construct a mask per node which contains all
 	 * cpus of nodes that are that many hops away from us.
 	 */
 	for (i = 0; i < level; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
 			return;
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
 			if (!mask)
 				return;
 			sched_domains_numa_masks[i][j] = mask;
 			for (k = 0; k < nr_node_ids; k++) {
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 				cpumask_or(mask, mask, cpumask_of_node(k));
 			}
 		}
 	}
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 	/* Room for the existing levels, 'level' NUMA levels and a terminator. */
 	tl = kzalloc((i + level + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
 	/*
 	 * Copy the default topology bits..
 	 */
 	for (i = 0; sched_domain_topology[i].mask; i++)
 		tl[i] = sched_domain_topology[i];
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 0; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
 			.flags = SDTL_OVERLAP,
 			.numa_level = j,
 			SD_INIT_NAME(NUMA)
 		};
 	}
 	/* The final entry stays zeroed by kzalloc() and terminates the table. */
 	sched_domain_topology = tl;
 	sched_domains_numa_levels = level;
 }
 /*
  * Set @cpu in every sched_domains_numa_masks[level][node] whose node lies
  * within that level's distance of @cpu's own node.
  */
 static void sched_domains_numa_masks_set(int cpu)
 {
 	const int home = cpu_to_node(cpu);
 	int level, node;

 	for (level = 0; level < sched_domains_numa_levels; level++) {
 		for (node = 0; node < nr_node_ids; node++) {
 			if (node_distance(node, home) > sched_domains_numa_distance[level])
 				continue;

 			cpumask_set_cpu(cpu, sched_domains_numa_masks[level][node]);
 		}
 	}
 }
 /* Remove @cpu from every sched_domains_numa_masks[level][node] mask. */
 static void sched_domains_numa_masks_clear(int cpu)
 {
 	int level;

 	for (level = 0; level < sched_domains_numa_levels; level++) {
 		struct cpumask **node_masks = sched_domains_numa_masks[level];
 		int node;

 		for (node = 0; node < nr_node_ids; node++) {
 			cpumask_clear_cpu(cpu, node_masks[node]);
 		}
 	}
 }
 /*
  * Hotplug notifier keeping sched_domains_numa_masks[level][node] in sync:
  * a CPU is added on CPU_ONLINE and removed on CPU_DEAD. The
  * CPU_TASKS_FROZEN bit is ignored; all other events yield NOTIFY_DONE.
  */
 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
 					   unsigned long action,
 					   void *hcpu)
 {
 	const unsigned long event = action & ~CPU_TASKS_FROZEN;
 	const int cpu = (long)hcpu;

 	if (event == CPU_ONLINE)
 		sched_domains_numa_masks_set(cpu);
 	else if (event == CPU_DEAD)
 		sched_domains_numa_masks_clear(cpu);
 	else
 		return NOTIFY_DONE;

 	return NOTIFY_OK;
 }
 #else
 /* !CONFIG_NUMA: there are no NUMA levels to add, so this does nothing. */
 static inline void sched_init_numa(void)
 {
 }
 /* !CONFIG_NUMA: no NUMA masks to maintain; always returns 0. */
 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
 					   unsigned long action,
 					   void *hcpu)
 {
 	return 0;
 }
 #endif /* CONFIG_NUMA */
 /*
  * __sdt_alloc - allocate the per-level, per-cpu domain data.
  * @cpu_map: CPUs that get a sched_domain, sched_group and
  *           sched_group_capacity at every topology level
  *
  * Each object is allocated on the CPU's own node with room for a trailing
  * cpumask. Nothing is rolled back here on failure.
  *
  * Return: 0 on success, -ENOMEM as soon as any allocation fails.
  */
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
 	int cpu;

 	for_each_sd_topology(tl) {
 		struct sd_data *sdd = &tl->data;

 		sdd->sd = alloc_percpu(struct sched_domain *);
 		if (sdd->sd == NULL)
 			return -ENOMEM;

 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (sdd->sg == NULL)
 			return -ENOMEM;

 		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
 		if (sdd->sgc == NULL)
 			return -ENOMEM;

 		for_each_cpu(cpu, cpu_map) {
 			const int node = cpu_to_node(cpu);
 			struct sched_group_capacity *sgc;
 			struct sched_domain *sd;
 			struct sched_group *sg;

 			sd = kzalloc_node(sizeof(*sd) + cpumask_size(),
 					  GFP_KERNEL, node);
 			if (sd == NULL)
 				return -ENOMEM;
 			*per_cpu_ptr(sdd->sd, cpu) = sd;

 			sg = kzalloc_node(sizeof(*sg) + cpumask_size(),
 					  GFP_KERNEL, node);
 			if (sg == NULL)
 				return -ENOMEM;
 			/* A fresh group starts out as a one-element ring. */
 			sg->next = sg;
 			*per_cpu_ptr(sdd->sg, cpu) = sg;

 			sgc = kzalloc_node(sizeof(*sgc) + cpumask_size(),
 					   GFP_KERNEL, node);
 			if (sgc == NULL)
 				return -ENOMEM;
 			*per_cpu_ptr(sdd->sgc, cpu) = sgc;
 		}
 	}

 	return 0;
 }
 /*
  * __sdt_free - release everything __sdt_alloc() set up for @cpu_map.
  *
  * Tolerates a partially completed allocation: each per-cpu array is
  * checked for NULL before it is dereferenced, and the array pointers
  * are reset to NULL afterwards.
  */
 static void __sdt_free(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
 	int cpu;

 	for_each_sd_topology(tl) {
 		struct sd_data *sdd = &tl->data;

 		for_each_cpu(cpu, cpu_map) {
 			if (sdd->sd != NULL) {
 				struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);

 				/* Overlapping domains own their group list. */
 				if (sd != NULL && (sd->flags & SD_OVERLAP))
 					free_sched_groups(sd->groups, 0);
 				kfree(sd);
 			}

 			if (sdd->sg != NULL)
 				kfree(*per_cpu_ptr(sdd->sg, cpu));

 			if (sdd->sgc != NULL)
 				kfree(*per_cpu_ptr(sdd->sgc, cpu));
 		}

 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
 		free_percpu(sdd->sgc);
 		sdd->sd = NULL;
 		sdd->sg = NULL;
 		sdd->sgc = NULL;
 	}
 }
 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
 	struct sched_domain *sd = sd_init(tl, cpu);
 	if (!sd)
 		return child;
 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
 		sd->child = child;
 		if (!cpumask_subset(sched_domain_span(child),
 				    sched_domain_span(sd))) {
 			pr_err("BUG: arch topology borken\n");
 #ifdef CONFIG_SCHED_DEBUG
 			pr_err("     the %s domain not a subset of the %s domain\n",
 					child->name, sd->name);
 #endif
 			/* Fixup, ensure @sd has at least @child cpus. */
 			cpumask_or(sched_domain_span(sd),
 				   sched_domain_span(sd),
 				   sched_domain_span(child));
 		}
 	}
 	set_domain_attribute(sd, attr);
 	return sd;
 }
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  *
  * Returns 0 on success and -ENOMEM when allocation or group construction
  * fails. __free_domain_allocs() runs on both the success and error paths.
  */
 static int build_sched_domains(const struct cpumask *cpu_map,
 			       struct sched_domain_attr *attr)
 {
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
 	int i, ret = -ENOMEM;
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
 		goto error;
 	/* Set up domains for cpus specified by the cpu_map. */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
 		sd = NULL;
 		for_each_sd_topology(tl) {
 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
 			/* Remember the lowest-level domain as this cpu's base. */
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
 				sd->flags |= SD_OVERLAP;
 			/* Once a level spans the whole map, higher levels add nothing. */
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
 				break;
 		}
 	}
 	/* Build the groups for the domains */
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
 			if (sd->flags & SD_OVERLAP) {
 				if (build_overlap_sched_groups(sd, i))
 					goto error;
 			} else {
 				if (build_sched_groups(sd, i))
 					goto error;
 			}
 		}
 	}
 	/* Calculate CPU capacity for physical packages and nodes */
 	/* Note: this pass walks the cpus from highest to lowest index. */
 	for (i = nr_cpumask_bits-1; i >= 0; i--) {
 		if (!cpumask_test_cpu(i, cpu_map))
 			continue;
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			claim_allocations(i, sd);
 			init_sched_groups_capacity(i, sd);
 		}
 	}
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
 		sd = *per_cpu_ptr(d.sd, i);
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 	ret = 0;
 error:
 	__free_domain_allocs(&d, alloc_state, cpu_map);
 	return ret;
 }
 static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attributes of custom domains in 'doms_cur' */
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
  * cpumask) fails, then fall back to a single sched domain,
  * as determined by the single cpumask fallback_doms.
  */
 static cpumask_var_t fallback_doms;
 /*
  * arch_update_cpu_topology lets virtualized architectures update the
  * cpu core maps. It is supposed to return 1 if the topology changed
  * or 0 if it stayed the same.
  *
  * This weak default reports that nothing changed.
  */
 int __weak arch_update_cpu_topology(void)
 {
 	return 0;
 }
 /*
  * alloc_sched_domains - allocate an array of @ndoms cpumasks.
  * @ndoms: number of cpumask_var_t entries wanted
  *
  * Uses kmalloc_array() so that an overflowing @ndoms * element-size
  * product fails the allocation instead of producing an undersized buffer.
  * The loop index is unsigned to match @ndoms.
  *
  * Return: the array (release with free_sched_domains()), or NULL when any
  * allocation fails; masks allocated before the failure are released.
  */
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
 {
 	unsigned int i;
 	cpumask_var_t *doms;

 	doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
 	if (!doms)
 		return NULL;
 	for (i = 0; i < ndoms; i++) {
 		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
 			/* Only the first i masks were allocated. */
 			free_sched_domains(doms, i);
 			return NULL;
 		}
 	}
 	return doms;
 }
 /*
  * free_sched_domains - free the first @ndoms cpumasks of @doms, then the
  * array itself.
  */
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
 {
 	unsigned int idx = 0;

 	while (idx < ndoms) {
 		free_cpumask_var(doms[idx]);
 		idx++;
 	}

 	kfree(doms);
 }
 /*
  * Build the initial, single sched-domain partition covering @cpu_map minus
  * the isolated cpus. Callers must hold the hotplug lock.
  *
  * When the partition array cannot be allocated, the static fallback_doms
  * mask is used instead. Returns the result of build_sched_domains().
  */
 static int init_sched_domains(const struct cpumask *cpu_map)
 {
 	int ret;

 	arch_update_cpu_topology();

 	ndoms_cur = 1;
 	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (doms_cur == NULL)
 		doms_cur = &fallback_doms;

 	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	ret = build_sched_domains(doms_cur[0], NULL);
 	register_sched_domain_sysctl();

 	return ret;
 }
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
 static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
 	int i;
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	rcu_read_unlock();
 }
 /* handle null as "default" */
 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 			struct sched_domain_attr *new, int idx_new)
 {
 	struct sched_domain_attr tmp;
 	/* fast path */
 	if (!new && !cur)
 		return 1;
 	tmp = SD_ATTR_INIT;
 	return !memcmp(cur ? (cur + idx_cur) : &tmp,
 			new ? (new + idx_new) : &tmp,
 			sizeof(struct sched_domain_attr));
 }
 /*
  * Partition sched domains as specified by the 'ndoms_new'
  * cpumasks in the array doms_new[] of cpumasks. This compares
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
  * The passed in 'doms_new' should be allocated using
  * alloc_sched_domains.  This routine takes ownership of it and will
  * free_sched_domains it when done with it. If the caller failed the
  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
  * and partition_sched_domains() will fall back to the single partition
  * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
  * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
 	int new_topology;
 	mutex_lock(&sched_domains_mutex);
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 	/* Let architecture update cpu core mappings. */
 	new_topology = arch_update_cpu_topology();
 	/* With no new array there is nothing to match against. */
 	n = doms_new ? ndoms_new : 0;
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		/* A topology change disables matching, so everything is rebuilt. */
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 	n = ndoms_cur;
 	if (doms_new == NULL) {
 		/* n = 0 forces every new domain below to be built. */
 		n = 0;
 		doms_new = &fallback_doms;
 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
 		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 	/* Remember the new sched domains */
 	/* fallback_doms is static storage and must not be freed. */
 	if (doms_cur != &fallback_doms)
 		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
 	ndoms_cur = ndoms_new;
 	register_sched_domain_sysctl();
 	mutex_unlock(&sched_domains_mutex);
 }
 /*
  * Incremented in cpuset_cpu_inactive() on CPU_DOWN_PREPARE_FROZEN and
  * decremented in cpuset_cpu_active() on the matching *_FROZEN events.
  */
 static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
 /*
  * Update cpusets according to cpu_active mask.  If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
  *
  * If we come here as part of a suspend/resume, don't touch cpusets because we
  * want to restore it back to its original state upon resume anyway.
  *
  * Returns NOTIFY_OK for the handled events and NOTIFY_DONE otherwise.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	switch (action) {
 	case CPU_ONLINE_FROZEN:
 	case CPU_DOWN_FAILED_FROZEN:
 		/*
 		 * num_cpus_frozen tracks how many CPUs are involved in suspend
 		 * resume sequence. As long as this is not the last online
 		 * operation in the resume sequence, just build a single sched
 		 * domain, ignoring cpusets.
 		 */
 		num_cpus_frozen--;
 		if (likely(num_cpus_frozen)) {
 			partition_sched_domains(1, NULL, NULL);
 			break;
 		}
 		/*
 		 * This is the last CPU online operation. So fall through and
 		 * restore the original sched domains by considering the
 		 * cpuset configurations.
 		 */
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
 		cpuset_update_active_cpus(true);
 		break;
 	default:
 		return NOTIFY_DONE;
 	}
 	return NOTIFY_OK;
 }
 /*
  * Hotplug notifier for CPUs going down. A regular CPU_DOWN_PREPARE updates
  * the cpusets; the frozen (suspend) variant bumps num_cpus_frozen and
  * collapses to a single sched domain. Other events yield NOTIFY_DONE.
  */
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 			       void *hcpu)
 {
 	if (action == CPU_DOWN_PREPARE) {
 		cpuset_update_active_cpus(false);
 		return NOTIFY_OK;
 	}

 	if (action == CPU_DOWN_PREPARE_FROZEN) {
 		num_cpus_frozen++;
 		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;
 	}

 	return NOTIFY_DONE;
 }
 /*
  * sched_init_smp - SMP part of scheduler bring-up.
  *
  * Sets up the NUMA topology levels, builds the initial sched domains,
  * registers the hotplug notifiers and moves the calling task onto the
  * non-isolated cpus before initialising the rt and dl classes.
  *
  * NOTE(review): the return values of both alloc_cpumask_var() calls are
  * not checked here.
  */
 void __init sched_init_smp(void)
 {
 	cpumask_var_t non_isolated_cpus;
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 	sched_init_numa();
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * cpu masks are stable and all blatant races in the below code cannot
 	 * happen.
 	 */
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);
 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 	/* If every cpu is isolated, fall back to the one we are running on. */
 	if (cpumask_empty(non_isolated_cpus))
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 	init_hrtick();
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
 	free_cpumask_var(non_isolated_cpus);
 	init_sched_rt_class();
 	init_sched_dl_class();
 }
 #else
 /* !CONFIG_SMP: only the scheduling granularity needs initialising. */
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
 }
 #endif /* CONFIG_SMP */
 /* Timer-migration tunable; initial value 1. NOTE(review): users are outside this view. */
 const_debug unsigned int sysctl_timer_migration = 1;
 /*
  * Return non-zero when @addr lies in a lock function or inside the
  * [__sched_text_start, __sched_text_end) text range.
  */
 int in_sched_functions(unsigned long addr)
 {
 	if (in_lock_functions(addr))
 		return 1;

 	if (addr < (unsigned long)__sched_text_start)
 		return 0;

 	return addr < (unsigned long)__sched_text_end;
 }
 #ifdef CONFIG_CGROUP_SCHED
 /*
  * Default task group.
  * Every task in system belongs to this group at bootup.
  */
 struct task_group root_task_group;
 /* List of all task groups; root_task_group is added in sched_init(). */
 LIST_HEAD(task_groups);
 #endif
 /* With CONFIG_CPUMASK_OFFSTACK the backing storage is set up in sched_init(). */
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 /*
  * sched_init - early scheduler initialisation.
  *
  * Carves the root task group's per-cpu pointer arrays (and, with
  * CONFIG_CPUMASK_OFFSTACK, the load_balance_mask storage) out of a single
  * allocation, initialises every possible cpu's runqueue, turns the current
  * task into the boot cpu's idle thread and finally sets scheduler_running.
  *
  * NOTE(review): the kzalloc() result below is used without a NULL check.
  */
 void __init sched_init(void)
 {
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 	/* Add up the size of everything carved out of the one allocation. */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 		/* Hand out consecutive slices, in the order they were sized above. */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.se = (struct sched_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 		root_task_group.rt_rq = (struct rt_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 		for_each_possible_cpu(i) {
 			per_cpu(load_balance_mask, i) = (void *)ptr;
 			ptr += cpumask_size();
 		}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 	/* Both default bandwidths start from the global RT period/runtime. */
 	init_rt_bandwidth(&def_rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
 	init_dl_bandwidth(&def_dl_bandwidth,
 			global_rt_period(), global_rt_runtime());
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CGROUP_SCHED
 	list_add(&root_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&root_task_group.children);
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 	/* Per-cpu runqueue setup. */
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 		rq = cpu_rq(i);
 		raw_spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
 		init_dl_rq(&rq->dl, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
 		/*
 		 * How much cpu bandwidth does root_task_group get?
 		 *
 		 * In case of task-groups formed thr' the cgroup filesystem, it
 		 * gets 100% of the cpu resources in the system. This overall
 		 * system cpu resource is divided among the tasks of
 		 * root_task_group and its child task-groups in a fair manner,
 		 * based on each entity's (task or task-group's) weight
 		 * (se->load.weight).
 		 *
 		 * In other words, if root_task_group has 10 tasks (of weight
 		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 		 * then A0's share of the cpu resource is:
 		 *
 		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 		 *
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
 		rq->last_load_update_tick = jiffies;
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
 		rq->post_schedule = 0;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
 		rq->cpu = i;
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
 		rq->last_sched_tick = 0;
 #endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
 	set_load_weight(&init_task);
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current);
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
 	 * but because we are the idle thread, we just pick up running again
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
 	calc_load_update = jiffies + LOAD_FREQ;
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
 #ifdef CONFIG_SMP
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
 	set_cpu_rq_start_time();
 #endif
 	init_sched_fair_class();
 	scheduler_running = 1;
 }
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 /*
  * Return 1 when the preempt count (ignoring PREEMPT_ACTIVE) plus the
  * RCU preempt depth equals @preempt_offset, 0 otherwise.
  */
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();

 	if (nested != preempt_offset)
 		return 0;

 	return 1;
 }
 /*
  * __might_sleep - complain when a sleeping function is called from a
  * context that must not sleep.
  * @file:           source file of the call site, printed in the report
  * @line:           source line of the call site
  * @preempt_offset: preempt depth that is acceptable at the call site
  *
  * Reports are limited to one per HZ jiffies via prev_jiffy.
  */
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
 	static unsigned long prev_jiffy;	/* ratelimiting */
 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
 	/*
 	 * Nothing to report when the context is fine, the system is not in
 	 * SYSTEM_RUNNING state, or an oops is already in progress.
 	 */
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
 	     !is_idle_task(current)) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;
 	prev_jiffy = jiffies;
 	printk(KERN_ERR
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
 	printk(KERN_ERR
 		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
 			in_atomic(), irqs_disabled(),
 			current->pid, current->comm);
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
 #ifdef CONFIG_DEBUG_PREEMPT
 	if (!preempt_count_equals(preempt_offset)) {
 		pr_err("Preemption disabled at:");
 		print_ip_sym(current->preempt_disable_ip);
 		pr_cont("\n");
 	}
 #endif
 	dump_stack();
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
 /*
  * normalize_task - switch @p to SCHED_NORMAL.
  *
  * A queued task is dequeued before the policy change and re-enqueued
  * afterwards, followed by a reschedule of @rq's current task.
  * NOTE(review): the caller in this file holds the task's rq lock.
  */
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	const struct sched_class *prev_class = p->sched_class;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
 	int old_prio = p->prio;
 	int queued;
 	queued = task_on_rq_queued(p);
 	if (queued)
 		dequeue_task(rq, p, 0);
 	__setscheduler(rq, p, &attr);
 	if (queued) {
 		enqueue_task(rq, p, 0);
 		resched_curr(rq);
 	}
 	/* Uses the class and priority captured before the switch. */
 	check_class_changed(rq, p, prev_class, old_prio);
 }
 /*
  * normalize_rt_tasks - demote every user task to normal scheduling.
  *
  * Kernel threads are skipped. Deadline and realtime tasks are switched to
  * SCHED_NORMAL; other tasks with a negative nice value are reniced to 0.
  * Runs under tasklist_lock held for reading.
  */
 void normalize_rt_tasks(void)
 {
 	struct task_struct *g, *p;
 	unsigned long flags;
 	struct rq *rq;

 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
 		/* Kernel threads are left untouched. */
 		if (p->flags & PF_KTHREAD)
 			continue;

 		p->se.exec_start		= 0;
 #ifdef CONFIG_SCHEDSTATS
 		p->se.statistics.wait_start	= 0;
 		p->se.statistics.sleep_start	= 0;
 		p->se.statistics.block_start	= 0;
 #endif

 		if (dl_task(p) || rt_task(p)) {
 			rq = task_rq_lock(p, &flags);
 			normalize_task(rq, p);
 			task_rq_unlock(rq, p, &flags);
 		} else if (task_nice(p) < 0) {
 			/* Negative nice levels go back to 0. */
 			set_user_nice(p, 0);
 		}
 	}
 	read_unlock(&tasklist_lock);
 }
 #endif /* CONFIG_MAGIC_SYSRQ */
 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
 /*
  * These functions are only useful for the IA64 MCA handling, or kdb.
  *
  * They can only be called when the whole system has been
  * stopped - every CPU needs to be quiescent, and no scheduling
  * activity can take place. Using them for anything else would
  * be a serious bug, and as a result, they aren't even visible
  * under any other configuration.
  */
 /**
  * curr_task - return the current task for a given cpu.
  * @cpu: the processor in question.
  *
  * Thin wrapper around cpu_curr(); no locking is performed here.
  *
  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
  *
  * Return: The current task for @cpu.
  */
 struct task_struct *curr_task(int cpu)
 {
 	return cpu_curr(cpu);
 }
 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
 #ifdef CONFIG_IA64
 /**
  * set_curr_task - set the current task for a given cpu.
  * @cpu: the processor in question.
  * @p: the task pointer to set.
  *
  * Description: This function must only be used when non-maskable interrupts
  * are serviced on a separate stack. It allows the architecture to switch the
  * notion of the current task on a cpu in a non-blocking manner. This function
  * must be called with all CPU's synchronized, and interrupts disabled, and
  * the caller must save the original value of the current task (see
  * curr_task() above) and restore that value before reenabling interrupts and
  * re-starting the system.
  *
  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
  */
 void set_curr_task(int cpu, struct task_struct *p)
 {
 	cpu_curr(cpu) = p;
 }
 #endif
 #ifdef CONFIG_CGROUP_SCHED
 /*
  * task_group_lock serializes the addition/removal of task groups.
  * Taken with interrupts disabled by sched_online_group() and
  * sched_offline_group().
  */
 static DEFINE_SPINLOCK(task_group_lock);
 /*
  * Release the fair, rt and autogroup parts of @tg, then @tg itself.
  * Also used as the error path of sched_create_group().
  */
 static void free_sched_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
 	kfree(tg);
 }
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(struct task_group *parent)
 {
 	struct task_group *tg;
 	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
 	if (!tg)
 		return ERR_PTR(-ENOMEM);
 	if (!alloc_fair_sched_group(tg, parent))
 		goto err;
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 	return tg;
 err:
 	free_sched_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 /*
  * sched_online_group - publish @tg on the task_groups list and link it
  * as a child of @parent, all under task_group_lock.
  */
 void sched_online_group(struct task_group *tg, struct task_group *parent)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_add_rcu(&tg->list, &task_groups);
 	WARN_ON(!parent); /* root should already exist */
 	tg->parent = parent;
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 /*
  * RCU callback: recover the task group embedding @rhp and free it,
  * including its cfs_rqs.
  */
 static void free_sched_group_rcu(struct rcu_head *rhp)
 {
 	struct task_group *tg = container_of(rhp, struct task_group, rcu);

 	free_sched_group(tg);
 }
 /*
  * Destroy runqueue etc associated with a task group.
  * The actual freeing is deferred to free_sched_group_rcu() via call_rcu().
  */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
 	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 void sched_offline_group(struct task_group *tg)
 {
 	unsigned long flags;
 	int i;
 	/* end participation in shares distribution */
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 }
 /* change task's runqueue when it moves between groups.
  *	The caller of this function should have put the task in its new group
  *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
  *	reflect its new group.
  *
  *	The task is taken off its runqueue (and put as prev when it is the
  *	running task) for the duration of the update and restored afterwards.
  */
 void sched_move_task(struct task_struct *tsk)
 {
 	struct task_group *tg;
 	int queued, running;
 	unsigned long flags;
 	struct rq *rq;
 	rq = task_rq_lock(tsk, &flags);
 	running = task_current(rq, tsk);
 	queued = task_on_rq_queued(tsk);
 	if (queued)
 		dequeue_task(rq, tsk, 0);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 	/*
 	 * All callers are synchronized by task_rq_lock(); we do not use RCU
 	 * which is pointless here. Thus, we pass "true" to task_css_check()
 	 * to prevent lockdep warnings.
 	 */
 	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
 	tsk->sched_task_group = tg;
 	/*
 	 * Without CONFIG_FAIR_GROUP_SCHED, or when the class has no
 	 * task_move_group hook, set_task_rq() is called directly.
 	 */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
 		tsk->sched_class->task_move_group(tsk, queued);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, tsk, 0);
 	task_rq_unlock(rq, tsk, &flags);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * Ensure that the real time constraints are schedulable.
  *
  * rt_constraints_mutex is held around the __rt_schedulable() checks in
  * this file, so that those checks (and the updates that follow them) do
  * not run concurrently.
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 /*
  * Report whether any thread that satisfies rt_task() belongs to @tg.
  * Must be called with tasklist_lock held.
  * Returns 1 if such a thread exists, 0 otherwise.
  */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *leader, *thread;
 	for_each_process_thread(leader, thread) {
 		if (!rt_task(thread))
 			continue;
 		if (task_group(thread) == tg)
 			return 1;
 	}
 	return 0;
 }
 /*
  * Argument block handed to tg_rt_schedulable() through walk_tg_tree():
  * the group whose settings are being changed (may be NULL) and the
  * period/runtime proposed for it.
  */
 struct rt_schedulable_data {
 	struct task_group *tg;	/* group under modification */
 	u64 rt_period;		/* proposed period for @tg */
 	u64 rt_runtime;		/* proposed runtime for @tg */
 };
 /*
  * walk_tg_tree() callback: check that the RT bandwidth of @tg is
  * acceptable. For the group named in @data the proposed period/runtime
  * are used in place of the currently configured values.
  * Returns 0 when acceptable, -EINVAL or -EBUSY otherwise.
  */
 static int tg_rt_schedulable(struct task_group *tg, void *data)
 {
 	struct rt_schedulable_data *req = data;
 	struct task_group *kid;
 	unsigned long total, children_sum = 0;
 	u64 period, runtime;
 	if (tg == req->tg) {
 		period = req->rt_period;
 		runtime = req->rt_runtime;
 	} else {
 		period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 		runtime = tg->rt_bandwidth.rt_runtime;
 	}
 	/* A finite runtime can never be larger than its period. */
 	if (runtime != RUNTIME_INF && runtime > period)
 		return -EINVAL;
 	/* Refuse a zero runtime that would starve RT tasks already here. */
 	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 	total = to_ratio(period, runtime);
 	/* No group may get more than the global setting allows. */
 	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
 		return -EINVAL;
 	/* The children together must fit inside this group's own ratio. */
 	list_for_each_entry_rcu(kid, &tg->children, siblings) {
 		if (kid == req->tg) {
 			period = req->rt_period;
 			runtime = req->rt_runtime;
 		} else {
 			period = ktime_to_ns(kid->rt_bandwidth.rt_period);
 			runtime = kid->rt_bandwidth.rt_runtime;
 		}
 		children_sum += to_ratio(period, runtime);
 	}
 	return children_sum > total ? -EINVAL : 0;
 }
 /*
  * Run tg_rt_schedulable() over the task group tree, with @period and
  * @runtime standing in as the settings of @tg. The walk is done inside
  * an RCU read-side section. Returns the result of walk_tg_tree().
  */
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
 	struct rt_schedulable_data request = {
 		.tg = tg,
 		.rt_period = period,
 		.rt_runtime = runtime,
 	};
 	int err;
 	rcu_read_lock();
 	err = walk_tg_tree(tg_rt_schedulable, tg_nop, &request);
 	rcu_read_unlock();
 	return err;
 }
 /*
  * Apply a new RT period/runtime (both in nanoseconds) to @tg.
  *
  * The request is first validated against the whole group tree with
  * __rt_schedulable(); only then are the group's rt_bandwidth and the
  * rt_runtime of each of its per-CPU rt_rq updated.
  *
  * Returns 0 on success, -EINVAL for an invalid request, or the error
  * reported by __rt_schedulable().
  */
 static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
 	/*
 	 * Taking all RT runtime away from the root group would leave no
 	 * bandwidth for RT threads that live in it, so refuse that.
 	 */
 	if (tg == &root_task_group && rt_runtime == 0)
 		return -EINVAL;
 	/* A zero period makes no sense, whoever the caller is. */
 	if (rt_period == 0)
 		return -EINVAL;
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
 		goto unlock;
 	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
 	tg->rt_bandwidth.rt_runtime = rt_runtime;
 	/* Propagate the new runtime to every per-CPU rt_rq of the group. */
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = tg->rt_rq[i];
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
 		rt_rq->rt_runtime = rt_runtime;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 	return err;
 }
 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 /*
  * Return the RT runtime of @tg in microseconds, or -1 when the runtime
  * is RUNTIME_INF.
  */
 static long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 usecs = tg->rt_bandwidth.rt_runtime;
 	if (usecs == RUNTIME_INF)
 		return -1;
 	/* stored in nanoseconds; do_div() divides in place */
 	do_div(usecs, NSEC_PER_USEC);
 	return usecs;
 }
 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 {
 	u64 rt_runtime, rt_period;
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 	if (rt_period == 0)
 		return -EINVAL;
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 /* Return the RT period of @tg converted from nanoseconds to microseconds. */
 static long sched_group_rt_period(struct task_group *tg)
 {
 	u64 usecs = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	do_div(usecs, NSEC_PER_USEC);
 	return usecs;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
  * RT group scheduling variant: re-run the schedulability check over the
  * whole group tree without changing any group (tg == NULL), holding
  * rt_constraints_mutex and tasklist_lock. Returns the check's result.
  */
 static int sched_rt_global_constraints(void)
 {
 	int err;
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(NULL, 0, 0);
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 	return err;
 }
 /*
  * Decide whether @tsk may be attached to @tg.
  * Returns 0 for a realtime task when the group has zero RT runtime
  * (there would be no way for it to run), 1 in every other case.
  */
 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
 	return !(rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0);
 }
 #else /* !CONFIG_RT_GROUP_SCHED */
 /*
  * Variant without RT group scheduling: copy the global RT runtime into
  * the rt_rq of every possible CPU. Nothing is checked here, so the
  * function always returns 0.
  */
 static int sched_rt_global_constraints(void)
 {
 	unsigned long irq_flags;
 	int cpu;
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, irq_flags);
 	for_each_possible_cpu(cpu) {
 		struct rt_rq *rt_rq = &cpu_rq(cpu)->rt;
 		raw_spin_lock(&rt_rq->rt_runtime_lock);
 		rt_rq->rt_runtime = global_rt_runtime();
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, irq_flags);
 	return 0;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 static int sched_dl_global_constraints(void)
 {
 	u64 runtime = global_rt_runtime();
 	u64 period = global_rt_period();
 	u64 new_bw = to_ratio(period, runtime);
 	struct dl_bw *dl_b;
 	int cpu, ret = 0;
 	unsigned long flags;
 	/*
 	 * Here we want to check the bandwidth not being set to some
 	 * value smaller than the currently allocated bandwidth in
 	 * any of the root_domains.
 	 *
 	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
 	 * cycling on root_domains... Discussion on different/better
 	 * solutions is welcome!
 	 */
 	for_each_possible_cpu(cpu) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		if (new_bw < dl_b->total_bw)
 			ret = -EBUSY;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
 		if (ret)
 			break;
 	}
 	return ret;
 }
 /*
  * Publish the global RT period/runtime as the default deadline bandwidth
  * and store the resulting ratio in the dl_bw of every possible CPU.
  * With an infinite runtime the stored bandwidth is (u64)-1.
  */
 static void sched_dl_do_global(void)
 {
 	struct dl_bw *dl_b;
 	unsigned long irq_flags;
 	u64 new_bw;
 	int cpu;
 	def_dl_bandwidth.dl_period = global_rt_period();
 	def_dl_bandwidth.dl_runtime = global_rt_runtime();
 	if (global_rt_runtime() == RUNTIME_INF)
 		new_bw = -1;
 	else
 		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
 	/*
 	 * FIXME: As above...
 	 */
 	for_each_possible_cpu(cpu) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 		raw_spin_lock_irqsave(&dl_b->lock, irq_flags);
 		dl_b->bw = new_bw;
 		raw_spin_unlock_irqrestore(&dl_b->lock, irq_flags);
 		rcu_read_unlock_sched();
 	}
 }
 /*
  * Sanity-check the RT sysctl values: the period must be positive and a
  * finite runtime must not exceed the period.
  * Returns 0 when valid, -EINVAL otherwise.
  */
 static int sched_rt_global_validate(void)
 {
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 	/* an unlimited runtime is always acceptable */
 	if (sysctl_sched_rt_runtime == RUNTIME_INF)
 		return 0;
 	if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
 		return -EINVAL;
 	return 0;
 }
 /*
  * Copy the global RT runtime and period into def_rt_bandwidth. The period
  * is converted with ns_to_ktime() before it is stored.
  */
 static void sched_rt_do_global(void)
 {
 	def_rt_bandwidth.rt_runtime = global_rt_runtime();
 	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
 }
 /*
  * sysctl handler for the global RT period/runtime knobs.
  *
  * The values are read/written with proc_dointvec(). After a successful
  * write the new values are validated and, if accepted, propagated to the
  * RT and deadline bandwidth state; if any check fails both sysctl values
  * are restored to what they were on entry.
  *
  * All checks that have no side effects are run before
  * sched_rt_global_constraints(): without RT group scheduling that
  * function already writes the new runtime into every rt_rq, which the
  * undo path below cannot roll back. Running the deadline check first
  * ensures a rejected write leaves no partially applied state behind.
  *
  * Calls are serialized by a function-local mutex.
  * Returns 0 on success or a negative errno.
  */
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	int old_period, old_runtime;
 	static DEFINE_MUTEX(mutex);
 	int ret;
 	mutex_lock(&mutex);
 	old_period = sysctl_sched_rt_period;
 	old_runtime = sysctl_sched_rt_runtime;
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		ret = sched_rt_global_validate();
 		if (ret)
 			goto undo;
 		/* pure check, must precede anything that modifies state */
 		ret = sched_dl_global_constraints();
 		if (ret)
 			goto undo;
 		ret = sched_rt_global_constraints();
 		if (ret)
 			goto undo;
 		sched_rt_do_global();
 		sched_dl_do_global();
 	}
 	/* only entered through the undo label */
 	if (0) {
 undo:
 		sysctl_sched_rt_period = old_period;
 		sysctl_sched_rt_runtime = old_runtime;
 	}
 	mutex_unlock(&mutex);
 	return ret;
 }
 /*
  * sysctl handler for the round-robin timeslice. After a successful write
  * the value is converted so that sched_rr_timeslice holds jiffies; a
  * value of zero or less selects the default RR_TIMESLICE.
  * Calls are serialized by a function-local mutex.
  * Returns the result of proc_dointvec().
  */
 int sched_rr_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
 	static DEFINE_MUTEX(mutex);
 	int ret;
 	mutex_lock(&mutex);
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		/* writing zero (or less) resets the timeslice to the default */
 		if (sched_rr_timeslice <= 0)
 			sched_rr_timeslice = RR_TIMESLICE;
 		else
 			/* internally the timeslice is kept in jiffies */
 			sched_rr_timeslice = msecs_to_jiffies(sched_rr_timeslice);
 	}
 	mutex_unlock(&mutex);
 	return ret;
 }
 #ifdef CONFIG_CGROUP_SCHED
 /* Map a cgroup subsystem state to the task_group embedding it; NULL maps to NULL. */
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
 	if (!css)
 		return NULL;
 	return container_of(css, struct task_group, css);
 }
 /*
  * css_alloc callback: create the task group backing a new cpu cgroup.
  * Without a parent the statically defined root_task_group is returned.
  * Returns the css of the group, or ERR_PTR(-ENOMEM) on failure.
  */
 static struct cgroup_subsys_state *
 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct task_group *parent_tg = css_tg(parent_css);
 	struct task_group *new_tg;
 	/* This is early initialization for the top cgroup */
 	if (!parent_tg)
 		return &root_task_group.css;
 	new_tg = sched_create_group(parent_tg);
 	if (IS_ERR(new_tg))
 		return ERR_PTR(-ENOMEM);
 	return &new_tg->css;
 }
 /*
  * css_online callback: bring the task group of @css online under its
  * parent's group. A css without a parent is left alone.
  * Always returns 0.
  */
 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct task_group *parent = css_tg(css->parent);
 	if (!parent)
 		return 0;
 	sched_online_group(css_tg(css), parent);
 	return 0;
 }
 /* css_free callback: hand the task group of @css to sched_destroy_group(). */
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	sched_destroy_group(css_tg(css));
 }
 /* css_offline callback: take the task group of @css offline. */
 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
 	sched_offline_group(css_tg(css));
 }
 /*
  * fork callback of the cpu cgroup subsystem: run sched_move_task() on the
  * new task so that its scheduler group state matches its cgroup.
  */
 static void cpu_cgroup_fork(struct task_struct *task)
 {
 	sched_move_task(task);
 }
 /*
  * can_attach callback: decide whether every task in @tset may be moved
  * into the cgroup of @css.
  *
  * With RT group scheduling the decision is delegated to
  * sched_rt_can_attach(); without it, only tasks of the fair scheduling
  * class are accepted.
  *
  * Returns 0 if all tasks are acceptable, -EINVAL at the first one that
  * is not.
  */
 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	cgroup_taskset_for_each(task, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
 		if (!sched_rt_can_attach(css_tg(css), task))
 			return -EINVAL;
 #else
 		/* We don't support RT-tasks being in separate groups */
 		if (task->sched_class != &fair_sched_class)
 			return -EINVAL;
 #endif
 	}
 	return 0;
 }
 static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 			      struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	cgroup_taskset_for_each(task, tset)
 		sched_move_task(task);
 }
 /*
  * exit callback: move an exiting task with sched_move_task().
  * Tasks without PF_EXITING set are ignored.
  */
 static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys_state *old_css,
 			    struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is also called in the copy_process() failure path.
 	 * Such a task hasn't run yet and does not have PF_EXITING set;
 	 * skipping it avoids poking a half freed task state from generic
 	 * code.
 	 */
 	if (task->flags & PF_EXITING)
 		sched_move_task(task);
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Write handler for the "shares" file: pass the value through
  * scale_load() and set it as the shares of the css's task group.
  */
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_set_shares(tg, scale_load(shareval));
 }
 /*
  * Read handler for the "shares" file: report the task group's shares
  * after scale_load_down().
  */
 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
 	return (u64) scale_load_down(css_tg(css)->shares);
 }
 #ifdef CONFIG_CFS_BANDWIDTH
 /* Serializes CFS bandwidth changes; taken in tg_set_cfs_bandwidth(). */
 static DEFINE_MUTEX(cfs_constraints_mutex);
 /* Bounds, in nanoseconds, enforced by tg_set_cfs_bandwidth(). */
 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 /* Defined further down; needed by tg_set_cfs_bandwidth(). */
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 /*
  * Apply a new CFS bandwidth @period and @quota (both in nanoseconds) to
  * @tg. The root task group cannot be limited.
  *
  * After range checks the request is validated against the hierarchy with
  * __cfs_schedulable(); then the group's cfs_bandwidth is updated under
  * its lock and every online CPU's cfs_rq of the group is refreshed (and
  * unthrottled if it was throttled) under that CPU's rq lock.
  *
  * Returns 0 on success, -EINVAL for out-of-range values or the root
  * group, or the error reported by __cfs_schedulable().
  */
 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 	if (tg == &root_task_group)
 		return -EINVAL;
 	/*
 	 * Ensure we have at least some amount of bandwidth every period.
 	 * This is to prevent reaching a state of large arrears when throttled
 	 * via entity_tick() resulting in prolonged exit starvation.
 	 */
 	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
 		return -EINVAL;
 	/*
 	 * Likewise, bound things on the other side by preventing insane quota
 	 * periods.  This also allows us to normalize in computing quota
 	 * feasibility.
 	 */
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 	/*
 	 * Prevent race between setting of cfs_rq->runtime_enabled and
 	 * unthrottle_offline_cfs_rqs().
 	 */
 	get_online_cpus();
 	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
 		goto out_unlock;
 	/* "enabled" means a finite quota, before and after this change */
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
 	/*
 	 * If we need to toggle cfs_bandwidth_used, off->on must occur
 	 * before making related changes, and on->off must occur afterwards
 	 */
 	if (runtime_enabled && !runtime_was_enabled)
 		cfs_bandwidth_usage_inc();
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
 	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
 		__start_cfs_bandwidth(cfs_b, true);
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 	/* Push the new setting to the group's cfs_rq on each online CPU. */
 	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
 		raw_spin_lock_irq(&rq->lock);
 		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
 out_unlock:
 	mutex_unlock(&cfs_constraints_mutex);
 	put_online_cpus();
 	return ret;
 }
 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
 {
 	u64 quota, period;
 	period = ktime_to_ns(tg->cfs_bandwidth.period);
 	if (cfs_quota_us < 0)
 		quota = RUNTIME_INF;
 	else
 		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
 	return tg_set_cfs_bandwidth(tg, period, quota);
 }
 /*
  * Return the CFS bandwidth quota of @tg in microseconds, or -1 when the
  * quota is RUNTIME_INF.
  */
 long tg_get_cfs_quota(struct task_group *tg)
 {
 	u64 usecs = tg->cfs_bandwidth.quota;
 	if (usecs == RUNTIME_INF)
 		return -1;
 	/* stored in nanoseconds; do_div() divides in place */
 	do_div(usecs, NSEC_PER_USEC);
 	return usecs;
 }
 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
 {
 	u64 quota, period;
 	period = (u64)cfs_period_us * NSEC_PER_USEC;
 	quota = tg->cfs_bandwidth.quota;
 	return tg_set_cfs_bandwidth(tg, period, quota);
 }
 /* Return the CFS bandwidth period of @tg converted from nanoseconds to microseconds. */
 long tg_get_cfs_period(struct task_group *tg)
 {
 	u64 usecs = ktime_to_ns(tg->cfs_bandwidth.period);
 	do_div(usecs, NSEC_PER_USEC);
 	return usecs;
 }
 /* Read handler for "cfs_quota_us": report the group's quota via tg_get_cfs_quota(). */
 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
 				  struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_get_cfs_quota(tg);
 }
 /* Write handler for "cfs_quota_us": forward the value to tg_set_cfs_quota(). */
 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
 				   struct cftype *cftype, s64 cfs_quota_us)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_set_cfs_quota(tg, cfs_quota_us);
 }
 /* Read handler for "cfs_period_us": report the group's period via tg_get_cfs_period(). */
 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
 				   struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_get_cfs_period(tg);
 }
 /* Write handler for "cfs_period_us": forward the value to tg_set_cfs_period(). */
 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
 				    struct cftype *cftype, u64 cfs_period_us)
 {
 	struct task_group *tg = css_tg(css);
 	return tg_set_cfs_period(tg, cfs_period_us);
 }
 /*
  * Argument block handed to tg_cfs_schedulable_down() through
  * walk_tg_tree(): the group being changed and the period/quota proposed
  * for it.
  */
 struct cfs_schedulable_data {
 	struct task_group *tg;	/* group under modification */
 	u64 period, quota;	/* proposed values for @tg */
 };
 /*
  * Normalize a group's quota/period into a ratio via to_ratio().
  * For the group being changed the proposed values in @d are used, for
  * any other group its current settings.
  * note: units are usecs
  * Returns RUNTIME_INF when the group has no limit.
  */
 static u64 normalize_cfs_quota(struct task_group *tg,
 			       struct cfs_schedulable_data *d)
 {
 	u64 period = d->period;
 	u64 quota = d->quota;
 	if (tg != d->tg) {
 		period = tg_get_cfs_period(tg);
 		quota = tg_get_cfs_quota(tg);
 	}
 	/* note: these should typically be equivalent */
 	if (quota == RUNTIME_INF || quota == -1)
 		return RUNTIME_INF;
 	return to_ratio(period, quota);
 }
 /*
  * walk_tg_tree() callback: compute and store the hierarchical quota of
  * @tg. A group without a parent gets RUNTIME_INF; an unlimited group
  * inherits its parent's hierarchical quota.
  * Returns 0, or -EINVAL when a limited group asks for more than its
  * limited parent has.
  */
 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 {
 	struct cfs_schedulable_data *d = data;
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 	s64 quota, parent_quota;
 	if (!tg->parent) {
 		cfs_b->hierarchical_quota = RUNTIME_INF;
 		return 0;
 	}
 	quota = normalize_cfs_quota(tg, d);
 	parent_quota = tg->parent->cfs_bandwidth.hierarchical_quota;
 	/*
 	 * ensure max(child_quota) <= parent_quota, inherit when no
 	 * limit is set
 	 */
 	if (quota == RUNTIME_INF)
 		quota = parent_quota;
 	else if (parent_quota != RUNTIME_INF && quota > parent_quota)
 		return -EINVAL;
 	cfs_b->hierarchical_quota = quota;
 	return 0;
 }
 /*
  * Run tg_cfs_schedulable_down() over the task group tree, with @period
  * and @quota standing in as the settings of @tg. The walk is done inside
  * an RCU read-side section. Returns the result of walk_tg_tree().
  */
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 {
 	struct cfs_schedulable_data request = {
 		.tg = tg,
 		.period = period,
 		.quota = quota,
 	};
 	int err;
 	/* scale both values down by NSEC_PER_USEC unless the quota is unlimited */
 	if (quota != RUNTIME_INF) {
 		do_div(request.period, NSEC_PER_USEC);
 		do_div(request.quota, NSEC_PER_USEC);
 	}
 	rcu_read_lock();
 	err = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &request);
 	rcu_read_unlock();
 	return err;
 }
 static int cpu_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
 	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
 	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
 	return 0;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 /* Write handler for "rt_runtime_us": forward the value to sched_group_set_rt_runtime(). */
 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
 				struct cftype *cft, s64 val)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_set_rt_runtime(tg, val);
 }
 /* Read handler for "rt_runtime_us": report the group's RT runtime via sched_group_rt_runtime(). */
 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
 			       struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_rt_runtime(tg);
 }
 /* Write handler for "rt_period_us": forward the value to sched_group_set_rt_period(). */
 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
 				    struct cftype *cftype, u64 rt_period_us)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_set_rt_period(tg, rt_period_us);
 }
 /* Read handler for "rt_period_us": report the group's RT period via sched_group_rt_period(). */
 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 				   struct cftype *cft)
 {
 	struct task_group *tg = css_tg(css);
 	return sched_group_rt_period(tg);
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 /*
  * Control files of the cpu cgroup subsystem (installed below as its
  * legacy_cftypes). Which entries are present depends on the scheduler
  * options the kernel was built with. The array ends with an empty entry.
  */
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* group weight */
 	{
 		.name = "shares",
 		.read_u64 = cpu_shares_read_u64,
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	/* CFS bandwidth quota in microseconds; reads back -1 when unlimited */
 	{
 		.name = "cfs_quota_us",
 		.read_s64 = cpu_cfs_quota_read_s64,
 		.write_s64 = cpu_cfs_quota_write_s64,
 	},
 	/* CFS bandwidth period in microseconds */
 	{
 		.name = "cfs_period_us",
 		.read_u64 = cpu_cfs_period_read_u64,
 		.write_u64 = cpu_cfs_period_write_u64,
 	},
 	/* read-only throttling statistics */
 	{
 		.name = "stat",
 		.seq_show = cpu_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	/* RT runtime in microseconds; reads back -1 when unlimited */
 	{
 		.name = "rt_runtime_us",
 		.read_s64 = cpu_rt_runtime_read,
 		.write_s64 = cpu_rt_runtime_write,
 	},
 	/* RT period in microseconds */
 	{
 		.name = "rt_period_us",
 		.read_u64 = cpu_rt_period_read_uint,
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
 	{ }	/* terminate */
 };
 /*
  * The cpu cgroup subsystem: wires the cgroup lifecycle and task
  * callbacks defined above to the scheduler's task group handling and
  * exposes cpu_files as its control files.
  */
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_free	= cpu_cgroup_css_free,
 	.css_online	= cpu_cgroup_css_online,
 	.css_offline	= cpu_cgroup_css_offline,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
 	.legacy_cftypes	= cpu_files,
 	.early_init	= 1,
 };
 #endif	/* CONFIG_CGROUP_SCHED */
 /*
  * Log a "Task dump for CPU N" header and then pass the task returned by
  * cpu_curr(@cpu) to sched_show_task().
  */
 void dump_cpu_task(int cpu)
 {
 	pr_info("Task dump for CPU %d:\n", cpu);
 	sched_show_task(cpu_curr(cpu));
 }