Commit 56992309ccbe71f4321ddd50ee2f76f91b412c1a

Authored by Eric W. Biederman
1 parent ab09203e30

sysctl kernel: Remove binary sysctl logic

Now that sys_sysctl is a generic wrapper around /proc/sys, the .ctl_name and
.strategy members of sysctl tables are dead code.  Remove them.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
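
For context, a sketch of the kind of cleanup this commit performs on a sysctl
table entry. The entry below is illustrative and not taken from the files
changed here; the field names follow the pre-2.6.33 struct ctl_table:

        /* before: binary-sysctl plumbing still present (illustrative entry) */
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example_knob",
                .data           = &example_knob,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
                .strategy       = sysctl_intvec,
        },

        /* after: only the /proc/sys path remains */
        {
                .procname       = "example_knob",
                .data           = &example_knob,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },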

Showing 3 changed files with 3 additions and 38 deletions

kernel/sched.c

/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues.  Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD             SCHED_LOAD_SCALE
#define NICE_0_SHIFT            SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE           (100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF     ((u64)~0ULL)

static inline int rt_policy(int policy)
{
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
                return 1;
        return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
        return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
        /* nests inside the rq lock: */
        spinlock_t              rt_runtime_lock;
        ktime_t                 rt_period;
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
        ktime_t now;
        int overrun;
        int idle = 0;

        for (;;) {
                now = hrtimer_cb_get_time(timer);
                overrun = hrtimer_forward(timer, now, rt_b->rt_period);

                if (!overrun)
                        break;

                idle = do_sched_rt_period_timer(rt_b, overrun);
        }

        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
        rt_b->rt_period = ns_to_ktime(period);
        rt_b->rt_runtime = runtime;

        spin_lock_init(&rt_b->rt_runtime_lock);

        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
        return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        ktime_t now;

        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;

        if (hrtimer_active(&rt_b->rt_period_timer))
                return;

        spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
                unsigned long delta;
                ktime_t soft, hard;

                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;

                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

                soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                delta = ktime_to_ns(ktime_sub(hard, soft));
                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
                                HRTIMER_MODE_ABS_PINNED, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_GROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
        struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_USER_SCHED
        uid_t uid;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity **rt_se;
        struct rt_rq **rt_rq;

        struct rt_bandwidth rt_bandwidth;
#endif

        struct rcu_head rcu;
        struct list_head list;

        struct task_group *parent;
        struct list_head siblings;
        struct list_head children;
};

#ifdef CONFIG_USER_SCHED

/* Helper function to pass uid information to create_sched_user() */
void set_tg_uid(struct user_struct *user)
{
        user->tg->uid = user->uid;
}

/*
 * Root task group.
 *      Every UID task group (including init_task_group aka UID-0) will
 *      be a child to this group.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
#endif /* CONFIG_USER_SCHED */

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
        return list_empty(&root_task_group.children);
}
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
#endif /* CONFIG_USER_SCHED */

/*
 * A weight of 0 or 1 can cause arithmetics problems.
 * A weight of a cfs_rq is the sum of weights of which entities
 * are queued on this cfs_rq, so a weight of a entity should not be
 * too large, so as the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES      2
#define MAX_SHARES      (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 *      Every task in system belong to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
        struct task_group *tg;

#ifdef CONFIG_USER_SCHED
        rcu_read_lock();
        tg = __task_cred(p)->user->tg;
        rcu_read_unlock();
#elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
#else
        tg = &init_task_group;
#endif
        return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
        p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
        p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
        return NULL;
}

#endif  /* CONFIG_GROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
        struct load_weight load;
        unsigned long nr_running;

        u64 exec_clock;
        u64 min_vruntime;

        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;

        struct list_head tasks;
        struct list_head *balance_iterator;

        /*
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
        struct sched_entity *curr, *next, *last;

        unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */

        /*
         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
         * (like users, containers etc.)
         *
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */

#ifdef CONFIG_SMP
        /*
         * the part of load.weight contributed by tasks
         */
        unsigned long task_weight;

        /*
         *   h_load = weight * f(tg)
         *
         * Where f(tg) is the recursive weight fraction assigned to
         * this group.
         */
        unsigned long h_load;

        /*
         * this cpu's part of tg->shares
         */
        unsigned long shares;

        /*
         * load.weight at the time we set shares
         */
        unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
        struct rt_prio_array active;
        unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        struct {
                int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
                int next; /* next highest */
#endif
        } highest_prio;
#endif
#ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
#endif
        int rt_throttled;
        u64 rt_time;
        u64 rt_runtime;
        /* Nests inside the rq lock: */
        spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
        unsigned long rt_nr_boosted;

        struct rq *rq;
        struct list_head leaf_rt_rq_list;
        struct task_group *tg;
        struct sched_rt_entity *rt_se;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
        atomic_t refcount;
        cpumask_var_t span;
        cpumask_var_t online;

        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
        cpumask_var_t rto_mask;
        atomic_t rto_count;
#ifdef CONFIG_SMP
        struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
        /* runqueue lock: */
        spinlock_t lock;

        /*
         * nr_running and cpu_load should be in the same cacheline because
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
#endif
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
        u64 nr_migrations_in;

        struct cfs_rq cfs;
        struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* list of leaf cfs_rq on this cpu: */
        struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
        struct list_head leaf_rt_rq_list;
#endif

        /*
         * This is part of a global counter where only the total sum
         * over all CPUs matters. A task can increase this counter on
         * one CPU and if it got migrated afterwards it may decrease
         * it on another CPU. Always updated under the runqueue lock:
         */
        unsigned long nr_uninterruptible;

        struct task_struct *curr, *idle;
        unsigned long next_balance;
        struct mm_struct *prev_mm;

        u64 clock;

        atomic_t nr_iowait;

#ifdef CONFIG_SMP
        struct root_domain *rd;
        struct sched_domain *sd;

        unsigned char idle_at_tick;
        /* For active balancing */
        int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
        int cpu;
        int online;

        unsigned long avg_load_per_task;

        struct task_struct *migration_thread;
        struct list_head migration_queue;

        u64 rt_avg;
        u64 age_stamp;
#endif

        /* calc_load related fields */
        unsigned long calc_load_update;
        long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
        int hrtick_csd_pending;
        struct call_single_data hrtick_csd;
#endif
        struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
        unsigned long long rq_cpu_time;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

        /* sys_sched_yield() stats */
        unsigned int yld_count;

        /* schedule() stats */
        unsigned int sched_switch;
        unsigned int sched_count;
        unsigned int sched_goidle;

        /* try_to_wake_up() stats */
        unsigned int ttwu_count;
        unsigned int ttwu_local;

        /* BKL stats */
        unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
        return rq->cpu;
#else
        return 0;
#endif
}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
        for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
#define this_rq()               (&__get_cpu_var(runqueues))
#define task_rq(p)              cpu_rq(task_cpu(p))
#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
#define raw_rq()                (&__raw_get_cpu_var(runqueues))

inline void update_rq_clock(struct rq *rq)
{
        rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
        return spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)       \
        __SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)       \
        (1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
        0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)       \
        #name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
        NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
        int i;

        for (i = 0; sched_feat_names[i]; i++) {
                if (!(sysctl_sched_features & (1UL << i)))
                        seq_puts(m, "NO_");
                seq_printf(m, "%s ", sched_feat_names[i]);
        }
        seq_puts(m, "\n");

        return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
{
        char buf[64];
        char *cmp = buf;
        int neg = 0;
        int i;

        if (cnt > 63)
                cnt = 63;

        if (copy_from_user(&buf, ubuf, cnt))
                return -EFAULT;

        buf[cnt] = 0;

        if (strncmp(buf, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }

        for (i = 0; sched_feat_names[i]; i++) {
                int len = strlen(sched_feat_names[i]);

                if (strncmp(cmp, sched_feat_names[i], len) == 0) {
                        if (neg)
                                sysctl_sched_features &= ~(1UL << i);
                        else
                                sysctl_sched_features |= (1UL << i);
                        break;
                }
        }

        if (!sched_feat_names[i])
                return -EINVAL;

        filp->f_pos += cnt;

        return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
        .open           = sched_feat_open,
        .write          = sched_feat_write,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static __init int sched_init_debug(void)
{
        debugfs_create_file("sched_features", 0644, NULL, NULL,
                        &sched_feat_fops);

        return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
        if (sysctl_sched_rt_runtime < 0)
                return RUNTIME_INF;

        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)      do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)       do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
        return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
        return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
#endif
        /*
         * If we are tracking spinlock dependencies then we have to
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

        spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
        return p->oncpu;
#else
        return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
        /*
         * We can optimise this out completely for !SMP, because the
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
        next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        spin_unlock_irq(&rq->lock);
#else
        spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->oncpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
        prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
{
        for (;;) {
                struct rq *rq = task_rq(p);
                spin_lock(&rq->lock);
                if (likely(rq == task_rq(p)))
                        return rq;
                spin_unlock(&rq->lock);
        }
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        __acquires(rq->lock)
{
        struct rq *rq;

        for (;;) {
                local_irq_save(*flags);
                rq = task_rq(p);
                spin_lock(&rq->lock);
                if (likely(rq == task_rq(p)))
                        return rq;
                spin_unlock_irqrestore(&rq->lock, *flags);
        }
}

void task_rq_unlock_wait(struct task_struct *p)
{
        struct rq *rq = task_rq(p);

        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
        spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
        __releases(rq->lock)
{
        spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
        __releases(rq->lock)
{
        spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
        __acquires(rq->lock)
{
        struct rq *rq;

        local_irq_disable();
        rq = this_rq();
        spin_lock(&rq->lock);

        return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * Its all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
        if (!sched_feat(HRTICK))
                return 0;
        if (!cpu_active(cpu_of(rq)))
                return 0;
        return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
        if (hrtimer_active(&rq->hrtick_timer))
                hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);

        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

        spin_lock(&rq->lock);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
        spin_unlock(&rq->lock);

        return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
        struct rq *rq = arg;

        spin_lock(&rq->lock);
        hrtimer_restart(&rq->hrtick_timer);
        rq->hrtick_csd_pending = 0;
        spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

        hrtimer_set_expires(timer, time);

        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        int cpu = (int)(long)hcpu;

        switch (action) {
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                hrtick_clear(cpu_rq(cpu));
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
        hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                        HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
1134 #endif /* CONFIG_SMP */ 1134 #endif /* CONFIG_SMP */
1135 1135
1136 static void init_rq_hrtick(struct rq *rq) 1136 static void init_rq_hrtick(struct rq *rq)
1137 { 1137 {
1138 #ifdef CONFIG_SMP 1138 #ifdef CONFIG_SMP
1139 rq->hrtick_csd_pending = 0; 1139 rq->hrtick_csd_pending = 0;
1140 1140
1141 rq->hrtick_csd.flags = 0; 1141 rq->hrtick_csd.flags = 0;
1142 rq->hrtick_csd.func = __hrtick_start; 1142 rq->hrtick_csd.func = __hrtick_start;
1143 rq->hrtick_csd.info = rq; 1143 rq->hrtick_csd.info = rq;
1144 #endif 1144 #endif
1145 1145
1146 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1146 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1147 rq->hrtick_timer.function = hrtick; 1147 rq->hrtick_timer.function = hrtick;
1148 } 1148 }
1149 #else /* CONFIG_SCHED_HRTICK */ 1149 #else /* CONFIG_SCHED_HRTICK */
1150 static inline void hrtick_clear(struct rq *rq) 1150 static inline void hrtick_clear(struct rq *rq)
1151 { 1151 {
1152 } 1152 }
1153 1153
1154 static inline void init_rq_hrtick(struct rq *rq) 1154 static inline void init_rq_hrtick(struct rq *rq)
1155 { 1155 {
1156 } 1156 }
1157 1157
1158 static inline void init_hrtick(void) 1158 static inline void init_hrtick(void)
1159 { 1159 {
1160 } 1160 }
1161 #endif /* CONFIG_SCHED_HRTICK */ 1161 #endif /* CONFIG_SCHED_HRTICK */
1162 1162
1163 /* 1163 /*
1164 * resched_task - mark a task 'to be rescheduled now'. 1164 * resched_task - mark a task 'to be rescheduled now'.
1165 * 1165 *
1166 * On UP this means the setting of the need_resched flag, on SMP it 1166 * On UP this means the setting of the need_resched flag, on SMP it
1167 * might also involve a cross-CPU call to trigger the scheduler on 1167 * might also involve a cross-CPU call to trigger the scheduler on
1168 * the target CPU. 1168 * the target CPU.
1169 */ 1169 */
1170 #ifdef CONFIG_SMP 1170 #ifdef CONFIG_SMP
1171 1171
1172 #ifndef tsk_is_polling 1172 #ifndef tsk_is_polling
1173 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1173 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1174 #endif 1174 #endif
1175 1175
1176 static void resched_task(struct task_struct *p) 1176 static void resched_task(struct task_struct *p)
1177 { 1177 {
1178 int cpu; 1178 int cpu;
1179 1179
1180 assert_spin_locked(&task_rq(p)->lock); 1180 assert_spin_locked(&task_rq(p)->lock);
1181 1181
1182 if (test_tsk_need_resched(p)) 1182 if (test_tsk_need_resched(p))
1183 return; 1183 return;
1184 1184
1185 set_tsk_need_resched(p); 1185 set_tsk_need_resched(p);
1186 1186
1187 cpu = task_cpu(p); 1187 cpu = task_cpu(p);
1188 if (cpu == smp_processor_id()) 1188 if (cpu == smp_processor_id())
1189 return; 1189 return;
1190 1190
1191 /* NEED_RESCHED must be visible before we test polling */ 1191 /* NEED_RESCHED must be visible before we test polling */
1192 smp_mb(); 1192 smp_mb();
1193 if (!tsk_is_polling(p)) 1193 if (!tsk_is_polling(p))
1194 smp_send_reschedule(cpu); 1194 smp_send_reschedule(cpu);
1195 } 1195 }
1196 1196
1197 static void resched_cpu(int cpu) 1197 static void resched_cpu(int cpu)
1198 { 1198 {
1199 struct rq *rq = cpu_rq(cpu); 1199 struct rq *rq = cpu_rq(cpu);
1200 unsigned long flags; 1200 unsigned long flags;
1201 1201
1202 if (!spin_trylock_irqsave(&rq->lock, flags)) 1202 if (!spin_trylock_irqsave(&rq->lock, flags))
1203 return; 1203 return;
1204 resched_task(cpu_curr(cpu)); 1204 resched_task(cpu_curr(cpu));
1205 spin_unlock_irqrestore(&rq->lock, flags); 1205 spin_unlock_irqrestore(&rq->lock, flags);
1206 } 1206 }
1207 1207
1208 #ifdef CONFIG_NO_HZ 1208 #ifdef CONFIG_NO_HZ
1209 /* 1209 /*
1210 * When add_timer_on() enqueues a timer into the timer wheel of an 1210 * When add_timer_on() enqueues a timer into the timer wheel of an
1211 * idle CPU then this timer might expire before the next timer event 1211 * idle CPU then this timer might expire before the next timer event
1212 * which is scheduled to wake up that CPU. In case of a completely 1212 * which is scheduled to wake up that CPU. In case of a completely
1213 * idle system the next event might even be infinite time into the 1213 * idle system the next event might even be infinite time into the
1214 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1214 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1215 * leaves the inner idle loop so the newly added timer is taken into 1215 * leaves the inner idle loop so the newly added timer is taken into
1216 * account when the CPU goes back to idle and evaluates the timer 1216 * account when the CPU goes back to idle and evaluates the timer
1217 * wheel for the next timer event. 1217 * wheel for the next timer event.
1218 */ 1218 */
1219 void wake_up_idle_cpu(int cpu) 1219 void wake_up_idle_cpu(int cpu)
1220 { 1220 {
1221 struct rq *rq = cpu_rq(cpu); 1221 struct rq *rq = cpu_rq(cpu);
1222 1222
1223 if (cpu == smp_processor_id()) 1223 if (cpu == smp_processor_id())
1224 return; 1224 return;
1225 1225
1226 /* 1226 /*
1227 * This is safe, as this function is called with the timer 1227 * This is safe, as this function is called with the timer
1228 * wheel base lock of (cpu) held. When the CPU is on the way 1228 * wheel base lock of (cpu) held. When the CPU is on the way
1229 * to idle and has not yet set rq->curr to idle then it will 1229 * to idle and has not yet set rq->curr to idle then it will
1230 * be serialized on the timer wheel base lock and take the new 1230 * be serialized on the timer wheel base lock and take the new
1231 * timer into account automatically. 1231 * timer into account automatically.
1232 */ 1232 */
1233 if (rq->curr != rq->idle) 1233 if (rq->curr != rq->idle)
1234 return; 1234 return;
1235 1235
1236 /* 1236 /*
1237 * We can set TIF_RESCHED on the idle task of the other CPU 1237 * We can set TIF_RESCHED on the idle task of the other CPU
1238 * lockless. The worst case is that the other CPU runs the 1238 * lockless. The worst case is that the other CPU runs the
1239 * idle task through an additional NOOP schedule() 1239 * idle task through an additional NOOP schedule()
1240 */ 1240 */
1241 set_tsk_need_resched(rq->idle); 1241 set_tsk_need_resched(rq->idle);
1242 1242
1243 /* NEED_RESCHED must be visible before we test polling */ 1243 /* NEED_RESCHED must be visible before we test polling */
1244 smp_mb(); 1244 smp_mb();
1245 if (!tsk_is_polling(rq->idle)) 1245 if (!tsk_is_polling(rq->idle))
1246 smp_send_reschedule(cpu); 1246 smp_send_reschedule(cpu);
1247 } 1247 }
1248 #endif /* CONFIG_NO_HZ */ 1248 #endif /* CONFIG_NO_HZ */
1249 1249
1250 static u64 sched_avg_period(void) 1250 static u64 sched_avg_period(void)
1251 { 1251 {
1252 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1252 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1253 } 1253 }
1254 1254
1255 static void sched_avg_update(struct rq *rq) 1255 static void sched_avg_update(struct rq *rq)
1256 { 1256 {
1257 s64 period = sched_avg_period(); 1257 s64 period = sched_avg_period();
1258 1258
1259 while ((s64)(rq->clock - rq->age_stamp) > period) { 1259 while ((s64)(rq->clock - rq->age_stamp) > period) {
1260 rq->age_stamp += period; 1260 rq->age_stamp += period;
1261 rq->rt_avg /= 2; 1261 rq->rt_avg /= 2;
1262 } 1262 }
1263 } 1263 }
1264 1264
1265 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1265 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1266 { 1266 {
1267 rq->rt_avg += rt_delta; 1267 rq->rt_avg += rt_delta;
1268 sched_avg_update(rq); 1268 sched_avg_update(rq);
1269 } 1269 }
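
[Editor's illustration, not part of this diff.] sched_avg_update() halves rq->rt_avg once per elapsed averaging period, so RT time observed long ago contributes geometrically less. A small userspace model of that decay; the period and the initial numbers are made up, only the halving loop mirrors the kernel logic.

#include <stdio.h>

int main(void)
{
        /* Illustrative values, not the kernel defaults. */
        long long period    = 500000000LL;   /* 500 ms in ns          */
        long long clock     = 2600000000LL;  /* "now"                 */
        long long age_stamp = 1000000000LL;  /* time of last update   */
        long long rt_avg    = 400000000LL;   /* accumulated RT time   */

        /* Same loop shape as sched_avg_update(). */
        while (clock - age_stamp > period) {
                age_stamp += period;
                rt_avg /= 2;
        }

        /* Three full periods elapsed, so rt_avg was halved three times. */
        printf("rt_avg after aging: %lld\n", rt_avg);  /* 400000000/8 = 50000000 */
        return 0;
}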
1270 1270
1271 #else /* !CONFIG_SMP */ 1271 #else /* !CONFIG_SMP */
1272 static void resched_task(struct task_struct *p) 1272 static void resched_task(struct task_struct *p)
1273 { 1273 {
1274 assert_spin_locked(&task_rq(p)->lock); 1274 assert_spin_locked(&task_rq(p)->lock);
1275 set_tsk_need_resched(p); 1275 set_tsk_need_resched(p);
1276 } 1276 }
1277 1277
1278 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1278 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1279 { 1279 {
1280 } 1280 }
1281 #endif /* CONFIG_SMP */ 1281 #endif /* CONFIG_SMP */
1282 1282
1283 #if BITS_PER_LONG == 32 1283 #if BITS_PER_LONG == 32
1284 # define WMULT_CONST (~0UL) 1284 # define WMULT_CONST (~0UL)
1285 #else 1285 #else
1286 # define WMULT_CONST (1UL << 32) 1286 # define WMULT_CONST (1UL << 32)
1287 #endif 1287 #endif
1288 1288
1289 #define WMULT_SHIFT 32 1289 #define WMULT_SHIFT 32
1290 1290
1291 /* 1291 /*
1292 * Shift right and round: 1292 * Shift right and round:
1293 */ 1293 */
1294 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1294 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1295 1295
1296 /* 1296 /*
1297 * delta *= weight / lw 1297 * delta *= weight / lw
1298 */ 1298 */
1299 static unsigned long 1299 static unsigned long
1300 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1300 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1301 struct load_weight *lw) 1301 struct load_weight *lw)
1302 { 1302 {
1303 u64 tmp; 1303 u64 tmp;
1304 1304
1305 if (!lw->inv_weight) { 1305 if (!lw->inv_weight) {
1306 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1306 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1307 lw->inv_weight = 1; 1307 lw->inv_weight = 1;
1308 else 1308 else
1309 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1309 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1310 / (lw->weight+1); 1310 / (lw->weight+1);
1311 } 1311 }
1312 1312
1313 tmp = (u64)delta_exec * weight; 1313 tmp = (u64)delta_exec * weight;
1314 /* 1314 /*
1315 * Check whether we'd overflow the 64-bit multiplication: 1315 * Check whether we'd overflow the 64-bit multiplication:
1316 */ 1316 */
1317 if (unlikely(tmp > WMULT_CONST)) 1317 if (unlikely(tmp > WMULT_CONST))
1318 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1318 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1319 WMULT_SHIFT/2); 1319 WMULT_SHIFT/2);
1320 else 1320 else
1321 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1321 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1322 1322
1323 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1323 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1324 } 1324 }
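
[Editor's illustration, not part of this diff.] For readers new to the fixed-point trick: lw->inv_weight approximates 2^32 / lw->weight, so the division delta * weight / lw->weight becomes a multiply plus a rounded right shift (SRR). A standalone sketch with invented inputs that mirrors only the arithmetic of calc_delta_mine(); it prints the exact quotient next to the fixed-point result so the two can be compared.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32
/* Shift right and round, same as the SRR() macro above. */
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

int main(void)
{
        /* Illustrative inputs: 3 ms of runtime for a nice-0 task on a
         * queue whose total load weight happens to be 3071 (made up). */
        uint64_t delta_exec = 3000000;
        uint64_t weight     = 1024;
        uint64_t lw_weight  = 3071;

        /* inv_weight ~= 2^32 / lw_weight, computed the way calc_delta_mine() does. */
        uint64_t inv_weight = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

        uint64_t tmp = delta_exec * weight;
        uint64_t fixed_point;

        if (tmp > WMULT_CONST)
                fixed_point = SRR(SRR(tmp, WMULT_SHIFT / 2) * inv_weight,
                                  WMULT_SHIFT / 2);
        else
                fixed_point = SRR(tmp * inv_weight, WMULT_SHIFT);

        printf("exact  : %" PRIu64 "\n", delta_exec * weight / lw_weight);
        printf("approx : %" PRIu64 "\n", fixed_point);
        return 0;
}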
1325 1325
1326 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1326 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1327 { 1327 {
1328 lw->weight += inc; 1328 lw->weight += inc;
1329 lw->inv_weight = 0; 1329 lw->inv_weight = 0;
1330 } 1330 }
1331 1331
1332 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1332 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1333 { 1333 {
1334 lw->weight -= dec; 1334 lw->weight -= dec;
1335 lw->inv_weight = 0; 1335 lw->inv_weight = 0;
1336 } 1336 }
1337 1337
1338 /* 1338 /*
1339 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1339 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1340 * of tasks with abnormal "nice" values across CPUs the contribution that 1340 * of tasks with abnormal "nice" values across CPUs the contribution that
1341 * each task makes to its run queue's load is weighted according to its 1341 * each task makes to its run queue's load is weighted according to its
1342 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1342 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1343 * scaled version of the new time slice allocation that they receive on time 1343 * scaled version of the new time slice allocation that they receive on time
1344 * slice expiry etc. 1344 * slice expiry etc.
1345 */ 1345 */
1346 1346
1347 #define WEIGHT_IDLEPRIO 3 1347 #define WEIGHT_IDLEPRIO 3
1348 #define WMULT_IDLEPRIO 1431655765 1348 #define WMULT_IDLEPRIO 1431655765
1349 1349
1350 /* 1350 /*
1351 * Nice levels are multiplicative, with a gentle 10% change for every 1351 * Nice levels are multiplicative, with a gentle 10% change for every
1352 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1352 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1353 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1353 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1354 * that remained on nice 0. 1354 * that remained on nice 0.
1355 * 1355 *
1356 * The "10% effect" is relative and cumulative: from _any_ nice level, 1356 * The "10% effect" is relative and cumulative: from _any_ nice level,
1357 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1357 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1358 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1358 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1359 * If a task goes up by ~10% and another task goes down by ~10% then 1359 * If a task goes up by ~10% and another task goes down by ~10% then
1360 * the relative distance between them is ~25%.) 1360 * the relative distance between them is ~25%.)
1361 */ 1361 */
1362 static const int prio_to_weight[40] = { 1362 static const int prio_to_weight[40] = {
1363 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1363 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1364 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1364 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1365 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1365 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1366 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1366 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1367 /* 0 */ 1024, 820, 655, 526, 423, 1367 /* 0 */ 1024, 820, 655, 526, 423,
1368 /* 5 */ 335, 272, 215, 172, 137, 1368 /* 5 */ 335, 272, 215, 172, 137,
1369 /* 10 */ 110, 87, 70, 56, 45, 1369 /* 10 */ 110, 87, 70, 56, 45,
1370 /* 15 */ 36, 29, 23, 18, 15, 1370 /* 15 */ 36, 29, 23, 18, 15,
1371 }; 1371 };
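
[Editor's illustration, not part of this diff.] To make the "10% effect" concrete: adjacent entries in prio_to_weight[] differ by roughly a factor of 1.25, so a nice-0 task (weight 1024) sharing a CPU with a nice-1 task (weight 820) gets about 1024 / (1024 + 820), i.e. ~55% of the CPU. A quick userspace check; the table values are copied from above, the rest is illustrative.

#include <stdio.h>

static const int prio_to_weight[40] = {
 /* -20 */ 88761, 71755, 56483, 46273, 36291,
 /* -15 */ 29154, 23254, 18705, 14949, 11916,
 /* -10 */  9548,  7620,  6100,  4904,  3906,
 /*  -5 */  3121,  2501,  1991,  1586,  1277,
 /*   0 */  1024,   820,   655,   526,   423,
 /*   5 */   335,   272,   215,   172,   137,
 /*  10 */   110,    87,    70,    56,    45,
 /*  15 */    36,    29,    23,    18,    15,
};

int main(void)
{
        int nice0 = prio_to_weight[20];   /* nice  0 -> 1024 */
        int nice1 = prio_to_weight[21];   /* nice +1 ->  820 */

        printf("weight ratio nice0/nice1    : %.3f\n", (double)nice0 / nice1);
        printf("CPU share of the nice-0 task: %.1f%%\n",
               100.0 * nice0 / (nice0 + nice1));
        return 0;
}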
1372 1372
1373 /* 1373 /*
1374 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1374 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1375 * 1375 *
1376 * In cases where the weight does not change often, we can use the 1376 * In cases where the weight does not change often, we can use the
1377 * precalculated inverse to speed up arithmetics by turning divisions 1377 * precalculated inverse to speed up arithmetics by turning divisions
1378 * into multiplications: 1378 * into multiplications:
1379 */ 1379 */
1380 static const u32 prio_to_wmult[40] = { 1380 static const u32 prio_to_wmult[40] = {
1381 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1381 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1382 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1382 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1383 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1383 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1384 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1384 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1385 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1385 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1386 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1386 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1387 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1387 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1388 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1388 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1389 }; 1389 };
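
[Editor's illustration, not part of this diff.] These entries are simply 2^32 / prio_to_weight[i]; for example 2^32 / 1024 = 4194304, exactly the nice-0 entry. A throwaway check for a few pairs copied from the two tables above.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
        /* (weight, wmult) pairs copied from prio_to_weight[] / prio_to_wmult[]. */
        struct { uint32_t weight, wmult; } pairs[] = {
                { 88761,     48388 },   /* nice -20 */
                {  1024,   4194304 },   /* nice   0 */
                {    15, 286331153 },   /* nice +19 */
        };

        for (unsigned i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++)
                printf("weight %6u: 2^32/weight = %10" PRIu64 ", table = %10u\n",
                       pairs[i].weight,
                       ((uint64_t)1 << 32) / pairs[i].weight,
                       pairs[i].wmult);
        return 0;
}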
1390 1390
1391 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 1391 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1392 1392
1393 /* 1393 /*
1394 * runqueue iterator, to support SMP load-balancing between different 1394 * runqueue iterator, to support SMP load-balancing between different
1395 * scheduling classes, without having to expose their internal data 1395 * scheduling classes, without having to expose their internal data
1396 * structures to the load-balancing proper: 1396 * structures to the load-balancing proper:
1397 */ 1397 */
1398 struct rq_iterator { 1398 struct rq_iterator {
1399 void *arg; 1399 void *arg;
1400 struct task_struct *(*start)(void *); 1400 struct task_struct *(*start)(void *);
1401 struct task_struct *(*next)(void *); 1401 struct task_struct *(*next)(void *);
1402 }; 1402 };
1403 1403
1404 #ifdef CONFIG_SMP 1404 #ifdef CONFIG_SMP
1405 static unsigned long 1405 static unsigned long
1406 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 1406 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1407 unsigned long max_load_move, struct sched_domain *sd, 1407 unsigned long max_load_move, struct sched_domain *sd,
1408 enum cpu_idle_type idle, int *all_pinned, 1408 enum cpu_idle_type idle, int *all_pinned,
1409 int *this_best_prio, struct rq_iterator *iterator); 1409 int *this_best_prio, struct rq_iterator *iterator);
1410 1410
1411 static int 1411 static int
1412 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 1412 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1413 struct sched_domain *sd, enum cpu_idle_type idle, 1413 struct sched_domain *sd, enum cpu_idle_type idle,
1414 struct rq_iterator *iterator); 1414 struct rq_iterator *iterator);
1415 #endif 1415 #endif
1416 1416
1417 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1417 /* Time spent by the tasks of the cpu accounting group executing in ... */
1418 enum cpuacct_stat_index { 1418 enum cpuacct_stat_index {
1419 CPUACCT_STAT_USER, /* ... user mode */ 1419 CPUACCT_STAT_USER, /* ... user mode */
1420 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1420 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1421 1421
1422 CPUACCT_STAT_NSTATS, 1422 CPUACCT_STAT_NSTATS,
1423 }; 1423 };
1424 1424
1425 #ifdef CONFIG_CGROUP_CPUACCT 1425 #ifdef CONFIG_CGROUP_CPUACCT
1426 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1426 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1427 static void cpuacct_update_stats(struct task_struct *tsk, 1427 static void cpuacct_update_stats(struct task_struct *tsk,
1428 enum cpuacct_stat_index idx, cputime_t val); 1428 enum cpuacct_stat_index idx, cputime_t val);
1429 #else 1429 #else
1430 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1430 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1431 static inline void cpuacct_update_stats(struct task_struct *tsk, 1431 static inline void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val) {} 1432 enum cpuacct_stat_index idx, cputime_t val) {}
1433 #endif 1433 #endif
1434 1434
1435 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1435 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1436 { 1436 {
1437 update_load_add(&rq->load, load); 1437 update_load_add(&rq->load, load);
1438 } 1438 }
1439 1439
1440 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1440 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1441 { 1441 {
1442 update_load_sub(&rq->load, load); 1442 update_load_sub(&rq->load, load);
1443 } 1443 }
1444 1444
1445 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1445 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1446 typedef int (*tg_visitor)(struct task_group *, void *); 1446 typedef int (*tg_visitor)(struct task_group *, void *);
1447 1447
1448 /* 1448 /*
1449 * Iterate the full tree, calling @down when first entering a node and @up when 1449 * Iterate the full tree, calling @down when first entering a node and @up when
1450 * leaving it for the final time. 1450 * leaving it for the final time.
1451 */ 1451 */
1452 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1452 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1453 { 1453 {
1454 struct task_group *parent, *child; 1454 struct task_group *parent, *child;
1455 int ret; 1455 int ret;
1456 1456
1457 rcu_read_lock(); 1457 rcu_read_lock();
1458 parent = &root_task_group; 1458 parent = &root_task_group;
1459 down: 1459 down:
1460 ret = (*down)(parent, data); 1460 ret = (*down)(parent, data);
1461 if (ret) 1461 if (ret)
1462 goto out_unlock; 1462 goto out_unlock;
1463 list_for_each_entry_rcu(child, &parent->children, siblings) { 1463 list_for_each_entry_rcu(child, &parent->children, siblings) {
1464 parent = child; 1464 parent = child;
1465 goto down; 1465 goto down;
1466 1466
1467 up: 1467 up:
1468 continue; 1468 continue;
1469 } 1469 }
1470 ret = (*up)(parent, data); 1470 ret = (*up)(parent, data);
1471 if (ret) 1471 if (ret)
1472 goto out_unlock; 1472 goto out_unlock;
1473 1473
1474 child = parent; 1474 child = parent;
1475 parent = parent->parent; 1475 parent = parent->parent;
1476 if (parent) 1476 if (parent)
1477 goto up; 1477 goto up;
1478 out_unlock: 1478 out_unlock:
1479 rcu_read_unlock(); 1479 rcu_read_unlock();
1480 1480
1481 return ret; 1481 return ret;
1482 } 1482 }
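
[Editor's illustration, not part of this diff.] walk_tg_tree() is an iterative depth-first traversal: @down fires when a group is first entered, @up when it is left for the last time, and the goto-based loop avoids recursion on a kernel stack. A recursive userspace equivalent over a toy tree; struct node and the sample tree are invented for illustration only.

#include <stdio.h>
#include <stddef.h>

struct node {
        const char *name;
        struct node *children[4];   /* NULL-terminated list of children */
};

typedef int (*visitor)(struct node *n, void *data);

/* Recursive equivalent of walk_tg_tree(): @down on entry, @up on final exit. */
static int walk_tree(struct node *n, visitor down, visitor up, void *data)
{
        int ret = down(n, data);

        if (ret)
                return ret;

        for (int i = 0; n->children[i]; i++) {
                ret = walk_tree(n->children[i], down, up, data);
                if (ret)
                        return ret;
        }
        return up(n, data);
}

static int print_down(struct node *n, void *data) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *data)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
        struct node leaf1 = { "leaf1", { NULL } };
        struct node leaf2 = { "leaf2", { NULL } };
        struct node mid   = { "mid",   { &leaf1, NULL } };
        struct node root  = { "root",  { &mid, &leaf2, NULL } };

        return walk_tree(&root, print_down, print_up, NULL);
}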
1483 1483
1484 static int tg_nop(struct task_group *tg, void *data) 1484 static int tg_nop(struct task_group *tg, void *data)
1485 { 1485 {
1486 return 0; 1486 return 0;
1487 } 1487 }
1488 #endif 1488 #endif
1489 1489
1490 #ifdef CONFIG_SMP 1490 #ifdef CONFIG_SMP
1491 /* Used instead of source_load when we know the type == 0 */ 1491 /* Used instead of source_load when we know the type == 0 */
1492 static unsigned long weighted_cpuload(const int cpu) 1492 static unsigned long weighted_cpuload(const int cpu)
1493 { 1493 {
1494 return cpu_rq(cpu)->load.weight; 1494 return cpu_rq(cpu)->load.weight;
1495 } 1495 }
1496 1496
1497 /* 1497 /*
1498 * Return a low guess at the load of a migration-source cpu weighted 1498 * Return a low guess at the load of a migration-source cpu weighted
1499 * according to the scheduling class and "nice" value. 1499 * according to the scheduling class and "nice" value.
1500 * 1500 *
1501 * We want to under-estimate the load of migration sources, to 1501 * We want to under-estimate the load of migration sources, to
1502 * balance conservatively. 1502 * balance conservatively.
1503 */ 1503 */
1504 static unsigned long source_load(int cpu, int type) 1504 static unsigned long source_load(int cpu, int type)
1505 { 1505 {
1506 struct rq *rq = cpu_rq(cpu); 1506 struct rq *rq = cpu_rq(cpu);
1507 unsigned long total = weighted_cpuload(cpu); 1507 unsigned long total = weighted_cpuload(cpu);
1508 1508
1509 if (type == 0 || !sched_feat(LB_BIAS)) 1509 if (type == 0 || !sched_feat(LB_BIAS))
1510 return total; 1510 return total;
1511 1511
1512 return min(rq->cpu_load[type-1], total); 1512 return min(rq->cpu_load[type-1], total);
1513 } 1513 }
1514 1514
1515 /* 1515 /*
1516 * Return a high guess at the load of a migration-target cpu weighted 1516 * Return a high guess at the load of a migration-target cpu weighted
1517 * according to the scheduling class and "nice" value. 1517 * according to the scheduling class and "nice" value.
1518 */ 1518 */
1519 static unsigned long target_load(int cpu, int type) 1519 static unsigned long target_load(int cpu, int type)
1520 { 1520 {
1521 struct rq *rq = cpu_rq(cpu); 1521 struct rq *rq = cpu_rq(cpu);
1522 unsigned long total = weighted_cpuload(cpu); 1522 unsigned long total = weighted_cpuload(cpu);
1523 1523
1524 if (type == 0 || !sched_feat(LB_BIAS)) 1524 if (type == 0 || !sched_feat(LB_BIAS))
1525 return total; 1525 return total;
1526 1526
1527 return max(rq->cpu_load[type-1], total); 1527 return max(rq->cpu_load[type-1], total);
1528 } 1528 }
1529 1529
1530 static struct sched_group *group_of(int cpu) 1530 static struct sched_group *group_of(int cpu)
1531 { 1531 {
1532 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1532 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1533 1533
1534 if (!sd) 1534 if (!sd)
1535 return NULL; 1535 return NULL;
1536 1536
1537 return sd->groups; 1537 return sd->groups;
1538 } 1538 }
1539 1539
1540 static unsigned long power_of(int cpu) 1540 static unsigned long power_of(int cpu)
1541 { 1541 {
1542 struct sched_group *group = group_of(cpu); 1542 struct sched_group *group = group_of(cpu);
1543 1543
1544 if (!group) 1544 if (!group)
1545 return SCHED_LOAD_SCALE; 1545 return SCHED_LOAD_SCALE;
1546 1546
1547 return group->cpu_power; 1547 return group->cpu_power;
1548 } 1548 }
1549 1549
1550 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1550 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1551 1551
1552 static unsigned long cpu_avg_load_per_task(int cpu) 1552 static unsigned long cpu_avg_load_per_task(int cpu)
1553 { 1553 {
1554 struct rq *rq = cpu_rq(cpu); 1554 struct rq *rq = cpu_rq(cpu);
1555 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1555 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1556 1556
1557 if (nr_running) 1557 if (nr_running)
1558 rq->avg_load_per_task = rq->load.weight / nr_running; 1558 rq->avg_load_per_task = rq->load.weight / nr_running;
1559 else 1559 else
1560 rq->avg_load_per_task = 0; 1560 rq->avg_load_per_task = 0;
1561 1561
1562 return rq->avg_load_per_task; 1562 return rq->avg_load_per_task;
1563 } 1563 }
1564 1564
1565 #ifdef CONFIG_FAIR_GROUP_SCHED 1565 #ifdef CONFIG_FAIR_GROUP_SCHED
1566 1566
1567 static __read_mostly unsigned long *update_shares_data; 1567 static __read_mostly unsigned long *update_shares_data;
1568 1568
1569 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1569 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1570 1570
1571 /* 1571 /*
1572 * Calculate and set the cpu's group shares. 1572 * Calculate and set the cpu's group shares.
1573 */ 1573 */
1574 static void update_group_shares_cpu(struct task_group *tg, int cpu, 1574 static void update_group_shares_cpu(struct task_group *tg, int cpu,
1575 unsigned long sd_shares, 1575 unsigned long sd_shares,
1576 unsigned long sd_rq_weight, 1576 unsigned long sd_rq_weight,
1577 unsigned long *usd_rq_weight) 1577 unsigned long *usd_rq_weight)
1578 { 1578 {
1579 unsigned long shares, rq_weight; 1579 unsigned long shares, rq_weight;
1580 int boost = 0; 1580 int boost = 0;
1581 1581
1582 rq_weight = usd_rq_weight[cpu]; 1582 rq_weight = usd_rq_weight[cpu];
1583 if (!rq_weight) { 1583 if (!rq_weight) {
1584 boost = 1; 1584 boost = 1;
1585 rq_weight = NICE_0_LOAD; 1585 rq_weight = NICE_0_LOAD;
1586 } 1586 }
1587 1587
1588 /* 1588 /*
1589 * \Sum_j shares_j * rq_weight_i 1589 * \Sum_j shares_j * rq_weight_i
1590 * shares_i = ----------------------------- 1590 * shares_i = -----------------------------
1591 * \Sum_j rq_weight_j 1591 * \Sum_j rq_weight_j
1592 */ 1592 */
1593 shares = (sd_shares * rq_weight) / sd_rq_weight; 1593 shares = (sd_shares * rq_weight) / sd_rq_weight;
1594 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1594 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1595 1595
1596 if (abs(shares - tg->se[cpu]->load.weight) > 1596 if (abs(shares - tg->se[cpu]->load.weight) >
1597 sysctl_sched_shares_thresh) { 1597 sysctl_sched_shares_thresh) {
1598 struct rq *rq = cpu_rq(cpu); 1598 struct rq *rq = cpu_rq(cpu);
1599 unsigned long flags; 1599 unsigned long flags;
1600 1600
1601 spin_lock_irqsave(&rq->lock, flags); 1601 spin_lock_irqsave(&rq->lock, flags);
1602 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1602 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1603 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1603 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1604 __set_se_shares(tg->se[cpu], shares); 1604 __set_se_shares(tg->se[cpu], shares);
1605 spin_unlock_irqrestore(&rq->lock, flags); 1605 spin_unlock_irqrestore(&rq->lock, flags);
1606 } 1606 }
1607 } 1607 }
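
[Editor's illustration, not part of this diff.] A numeric example of the shares formula in the comment above: with a group entitled to sd_shares = 1024 and per-cpu rq weights of 3072 and 1024 (made-up values), the busier cpu receives three quarters of the group's shares. The clamp bounds below are placeholders, not the kernel's MIN_SHARES/MAX_SHARES.

#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long sd_shares    = 1024;               /* group's total shares     */
        unsigned long rq_weight[2] = { 3072, 1024 };     /* per-cpu runqueue weights */
        unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];

        for (int cpu = 0; cpu < 2; cpu++) {
                /* shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j */
                unsigned long shares = sd_shares * rq_weight[cpu] / sd_rq_weight;

                shares = clamp_ul(shares, 2, 1UL << 18);  /* placeholder bounds */
                printf("cpu%d: shares = %lu\n", cpu, shares);  /* 768 and 256 */
        }
        return 0;
}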
1608 1608
1609 /* 1609 /*
1610 * Re-compute the task group's per-cpu shares over the given domain. 1610 * Re-compute the task group's per-cpu shares over the given domain.
1611 * This needs to be done in a bottom-up fashion because the rq weight of a 1611 * This needs to be done in a bottom-up fashion because the rq weight of a
1612 * parent group depends on the shares of its child groups. 1612 * parent group depends on the shares of its child groups.
1613 */ 1613 */
1614 static int tg_shares_up(struct task_group *tg, void *data) 1614 static int tg_shares_up(struct task_group *tg, void *data)
1615 { 1615 {
1616 unsigned long weight, rq_weight = 0, shares = 0; 1616 unsigned long weight, rq_weight = 0, shares = 0;
1617 unsigned long *usd_rq_weight; 1617 unsigned long *usd_rq_weight;
1618 struct sched_domain *sd = data; 1618 struct sched_domain *sd = data;
1619 unsigned long flags; 1619 unsigned long flags;
1620 int i; 1620 int i;
1621 1621
1622 if (!tg->se[0]) 1622 if (!tg->se[0])
1623 return 0; 1623 return 0;
1624 1624
1625 local_irq_save(flags); 1625 local_irq_save(flags);
1626 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1626 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1627 1627
1628 for_each_cpu(i, sched_domain_span(sd)) { 1628 for_each_cpu(i, sched_domain_span(sd)) {
1629 weight = tg->cfs_rq[i]->load.weight; 1629 weight = tg->cfs_rq[i]->load.weight;
1630 usd_rq_weight[i] = weight; 1630 usd_rq_weight[i] = weight;
1631 1631
1632 /* 1632 /*
1633 * If there are currently no tasks on the cpu pretend there 1633 * If there are currently no tasks on the cpu pretend there
1634 * is one of average load so that when a new task gets to 1634 * is one of average load so that when a new task gets to
1635 * run here it will not get delayed by group starvation. 1635 * run here it will not get delayed by group starvation.
1636 */ 1636 */
1637 if (!weight) 1637 if (!weight)
1638 weight = NICE_0_LOAD; 1638 weight = NICE_0_LOAD;
1639 1639
1640 rq_weight += weight; 1640 rq_weight += weight;
1641 shares += tg->cfs_rq[i]->shares; 1641 shares += tg->cfs_rq[i]->shares;
1642 } 1642 }
1643 1643
1644 if ((!shares && rq_weight) || shares > tg->shares) 1644 if ((!shares && rq_weight) || shares > tg->shares)
1645 shares = tg->shares; 1645 shares = tg->shares;
1646 1646
1647 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1647 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1648 shares = tg->shares; 1648 shares = tg->shares;
1649 1649
1650 for_each_cpu(i, sched_domain_span(sd)) 1650 for_each_cpu(i, sched_domain_span(sd))
1651 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1651 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1652 1652
1653 local_irq_restore(flags); 1653 local_irq_restore(flags);
1654 1654
1655 return 0; 1655 return 0;
1656 } 1656 }
1657 1657
1658 /* 1658 /*
1659 * Compute the cpu's hierarchical load factor for each task group. 1659 * Compute the cpu's hierarchical load factor for each task group.
1660 * This needs to be done in a top-down fashion because the load of a child 1660 * This needs to be done in a top-down fashion because the load of a child
1661 * group is a fraction of its parent's load. 1661 * group is a fraction of its parent's load.
1662 */ 1662 */
1663 static int tg_load_down(struct task_group *tg, void *data) 1663 static int tg_load_down(struct task_group *tg, void *data)
1664 { 1664 {
1665 unsigned long load; 1665 unsigned long load;
1666 long cpu = (long)data; 1666 long cpu = (long)data;
1667 1667
1668 if (!tg->parent) { 1668 if (!tg->parent) {
1669 load = cpu_rq(cpu)->load.weight; 1669 load = cpu_rq(cpu)->load.weight;
1670 } else { 1670 } else {
1671 load = tg->parent->cfs_rq[cpu]->h_load; 1671 load = tg->parent->cfs_rq[cpu]->h_load;
1672 load *= tg->cfs_rq[cpu]->shares; 1672 load *= tg->cfs_rq[cpu]->shares;
1673 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1673 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1674 } 1674 }
1675 1675
1676 tg->cfs_rq[cpu]->h_load = load; 1676 tg->cfs_rq[cpu]->h_load = load;
1677 1677
1678 return 0; 1678 return 0;
1679 } 1679 }
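
[Editor's illustration, not part of this diff.] A worked example for the top-down pass above, with all numbers invented: if the root runqueue weight on a cpu is 3072, a child group's shares on that cpu are 512, and the parent cfs_rq carries a load weight of 2048, the child's hierarchical load comes out to 3072 * 512 / 2049 = 767.

#include <stdio.h>

int main(void)
{
        unsigned long root_weight   = 3072;          /* cpu_rq(cpu)->load.weight             */
        unsigned long parent_h_load = root_weight;   /* root group: h_load = rq weight       */
        unsigned long child_shares  = 512;           /* tg->cfs_rq[cpu]->shares              */
        unsigned long parent_weight = 2048;          /* tg->parent->cfs_rq[cpu]->load.weight */

        /* Same arithmetic as tg_load_down() for a non-root group. */
        unsigned long h_load = parent_h_load * child_shares / (parent_weight + 1);

        printf("child h_load = %lu\n", h_load);      /* 3072*512/2049 = 767 */
        return 0;
}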
1680 1680
1681 static void update_shares(struct sched_domain *sd) 1681 static void update_shares(struct sched_domain *sd)
1682 { 1682 {
1683 s64 elapsed; 1683 s64 elapsed;
1684 u64 now; 1684 u64 now;
1685 1685
1686 if (root_task_group_empty()) 1686 if (root_task_group_empty())
1687 return; 1687 return;
1688 1688
1689 now = cpu_clock(raw_smp_processor_id()); 1689 now = cpu_clock(raw_smp_processor_id());
1690 elapsed = now - sd->last_update; 1690 elapsed = now - sd->last_update;
1691 1691
1692 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1692 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1693 sd->last_update = now; 1693 sd->last_update = now;
1694 walk_tg_tree(tg_nop, tg_shares_up, sd); 1694 walk_tg_tree(tg_nop, tg_shares_up, sd);
1695 } 1695 }
1696 } 1696 }
1697 1697
1698 static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1698 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1699 { 1699 {
1700 if (root_task_group_empty()) 1700 if (root_task_group_empty())
1701 return; 1701 return;
1702 1702
1703 spin_unlock(&rq->lock); 1703 spin_unlock(&rq->lock);
1704 update_shares(sd); 1704 update_shares(sd);
1705 spin_lock(&rq->lock); 1705 spin_lock(&rq->lock);
1706 } 1706 }
1707 1707
1708 static void update_h_load(long cpu) 1708 static void update_h_load(long cpu)
1709 { 1709 {
1710 if (root_task_group_empty()) 1710 if (root_task_group_empty())
1711 return; 1711 return;
1712 1712
1713 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1713 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1714 } 1714 }
1715 1715
1716 #else 1716 #else
1717 1717
1718 static inline void update_shares(struct sched_domain *sd) 1718 static inline void update_shares(struct sched_domain *sd)
1719 { 1719 {
1720 } 1720 }
1721 1721
1722 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1722 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1723 { 1723 {
1724 } 1724 }
1725 1725
1726 #endif 1726 #endif
1727 1727
1728 #ifdef CONFIG_PREEMPT 1728 #ifdef CONFIG_PREEMPT
1729 1729
1730 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1730 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1731 1731
1732 /* 1732 /*
1733 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1733 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1734 * way at the expense of forcing extra atomic operations in all 1734 * way at the expense of forcing extra atomic operations in all
1735 * invocations. This assures that the double_lock is acquired using the 1735 * invocations. This assures that the double_lock is acquired using the
1736 * same underlying policy as the spinlock_t on this architecture, which 1736 * same underlying policy as the spinlock_t on this architecture, which
1737 * reduces latency compared to the unfair variant below. However, it 1737 * reduces latency compared to the unfair variant below. However, it
1738 * also adds more overhead and therefore may reduce throughput. 1738 * also adds more overhead and therefore may reduce throughput.
1739 */ 1739 */
1740 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1740 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1741 __releases(this_rq->lock) 1741 __releases(this_rq->lock)
1742 __acquires(busiest->lock) 1742 __acquires(busiest->lock)
1743 __acquires(this_rq->lock) 1743 __acquires(this_rq->lock)
1744 { 1744 {
1745 spin_unlock(&this_rq->lock); 1745 spin_unlock(&this_rq->lock);
1746 double_rq_lock(this_rq, busiest); 1746 double_rq_lock(this_rq, busiest);
1747 1747
1748 return 1; 1748 return 1;
1749 } 1749 }
1750 1750
1751 #else 1751 #else
1752 /* 1752 /*
1753 * Unfair double_lock_balance: Optimizes throughput at the expense of 1753 * Unfair double_lock_balance: Optimizes throughput at the expense of
1754 * latency by eliminating extra atomic operations when the locks are 1754 * latency by eliminating extra atomic operations when the locks are
1755 * already in proper order on entry. This favors lower cpu-ids and will 1755 * already in proper order on entry. This favors lower cpu-ids and will
1756 * grant the double lock to lower cpus over higher ids under contention, 1756 * grant the double lock to lower cpus over higher ids under contention,
1757 * regardless of entry order into the function. 1757 * regardless of entry order into the function.
1758 */ 1758 */
1759 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1759 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1760 __releases(this_rq->lock) 1760 __releases(this_rq->lock)
1761 __acquires(busiest->lock) 1761 __acquires(busiest->lock)
1762 __acquires(this_rq->lock) 1762 __acquires(this_rq->lock)
1763 { 1763 {
1764 int ret = 0; 1764 int ret = 0;
1765 1765
1766 if (unlikely(!spin_trylock(&busiest->lock))) { 1766 if (unlikely(!spin_trylock(&busiest->lock))) {
1767 if (busiest < this_rq) { 1767 if (busiest < this_rq) {
1768 spin_unlock(&this_rq->lock); 1768 spin_unlock(&this_rq->lock);
1769 spin_lock(&busiest->lock); 1769 spin_lock(&busiest->lock);
1770 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1770 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1771 ret = 1; 1771 ret = 1;
1772 } else 1772 } else
1773 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1773 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1774 } 1774 }
1775 return ret; 1775 return ret;
1776 } 1776 }
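
[Editor's illustration, not part of this diff.] The unfair variant avoids AB/BA deadlock by using the runqueue's address as a global lock order: if the trylock fails and busiest sorts below this_rq, the already-held lock is dropped and both are retaken lowest-address first. A compact pthread sketch of the same address-ordering idea; the names and the two static locks are invented, and the pointer comparison mirrors the kernel's, not strictly portable C.

#include <pthread.h>
#include <stdio.h>

/* Two locks that different threads may need to hold together. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Acquire @wanted while already holding @held_first, always ending up with
 * the lower-address lock taken first so every thread uses the same order. */
static void lock_pair(pthread_mutex_t *held_first, pthread_mutex_t *wanted)
{
        if (pthread_mutex_trylock(wanted) == 0)
                return;                         /* fast path, no reordering needed */

        if (wanted < held_first) {
                pthread_mutex_unlock(held_first);
                pthread_mutex_lock(wanted);
                pthread_mutex_lock(held_first);
        } else {
                pthread_mutex_lock(wanted);
        }
}

int main(void)
{
        pthread_mutex_lock(&lock_a);
        lock_pair(&lock_a, &lock_b);
        printf("holding both locks\n");
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return 0;
}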
1777 1777
1778 #endif /* CONFIG_PREEMPT */ 1778 #endif /* CONFIG_PREEMPT */
1779 1779
1780 /* 1780 /*
1781 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1781 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1782 */ 1782 */
1783 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1783 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1784 { 1784 {
1785 if (unlikely(!irqs_disabled())) { 1785 if (unlikely(!irqs_disabled())) {
1786 /* printk() doesn't work well under rq->lock */ 1786 /* printk() doesn't work well under rq->lock */
1787 spin_unlock(&this_rq->lock); 1787 spin_unlock(&this_rq->lock);
1788 BUG_ON(1); 1788 BUG_ON(1);
1789 } 1789 }
1790 1790
1791 return _double_lock_balance(this_rq, busiest); 1791 return _double_lock_balance(this_rq, busiest);
1792 } 1792 }
1793 1793
1794 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1794 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1795 __releases(busiest->lock) 1795 __releases(busiest->lock)
1796 { 1796 {
1797 spin_unlock(&busiest->lock); 1797 spin_unlock(&busiest->lock);
1798 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1798 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1799 } 1799 }
1800 #endif 1800 #endif
1801 1801
1802 #ifdef CONFIG_FAIR_GROUP_SCHED 1802 #ifdef CONFIG_FAIR_GROUP_SCHED
1803 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1803 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1804 { 1804 {
1805 #ifdef CONFIG_SMP 1805 #ifdef CONFIG_SMP
1806 cfs_rq->shares = shares; 1806 cfs_rq->shares = shares;
1807 #endif 1807 #endif
1808 } 1808 }
1809 #endif 1809 #endif
1810 1810
1811 static void calc_load_account_active(struct rq *this_rq); 1811 static void calc_load_account_active(struct rq *this_rq);
1812 1812
1813 #include "sched_stats.h" 1813 #include "sched_stats.h"
1814 #include "sched_idletask.c" 1814 #include "sched_idletask.c"
1815 #include "sched_fair.c" 1815 #include "sched_fair.c"
1816 #include "sched_rt.c" 1816 #include "sched_rt.c"
1817 #ifdef CONFIG_SCHED_DEBUG 1817 #ifdef CONFIG_SCHED_DEBUG
1818 # include "sched_debug.c" 1818 # include "sched_debug.c"
1819 #endif 1819 #endif
1820 1820
1821 #define sched_class_highest (&rt_sched_class) 1821 #define sched_class_highest (&rt_sched_class)
1822 #define for_each_class(class) \ 1822 #define for_each_class(class) \
1823 for (class = sched_class_highest; class; class = class->next) 1823 for (class = sched_class_highest; class; class = class->next)
1824 1824
1825 static void inc_nr_running(struct rq *rq) 1825 static void inc_nr_running(struct rq *rq)
1826 { 1826 {
1827 rq->nr_running++; 1827 rq->nr_running++;
1828 } 1828 }
1829 1829
1830 static void dec_nr_running(struct rq *rq) 1830 static void dec_nr_running(struct rq *rq)
1831 { 1831 {
1832 rq->nr_running--; 1832 rq->nr_running--;
1833 } 1833 }
1834 1834
1835 static void set_load_weight(struct task_struct *p) 1835 static void set_load_weight(struct task_struct *p)
1836 { 1836 {
1837 if (task_has_rt_policy(p)) { 1837 if (task_has_rt_policy(p)) {
1838 p->se.load.weight = prio_to_weight[0] * 2; 1838 p->se.load.weight = prio_to_weight[0] * 2;
1839 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1839 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1840 return; 1840 return;
1841 } 1841 }
1842 1842
1843 /* 1843 /*
1844 * SCHED_IDLE tasks get minimal weight: 1844 * SCHED_IDLE tasks get minimal weight:
1845 */ 1845 */
1846 if (p->policy == SCHED_IDLE) { 1846 if (p->policy == SCHED_IDLE) {
1847 p->se.load.weight = WEIGHT_IDLEPRIO; 1847 p->se.load.weight = WEIGHT_IDLEPRIO;
1848 p->se.load.inv_weight = WMULT_IDLEPRIO; 1848 p->se.load.inv_weight = WMULT_IDLEPRIO;
1849 return; 1849 return;
1850 } 1850 }
1851 1851
1852 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1852 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1853 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1853 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1854 } 1854 }
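
[Editor's illustration, not part of this diff.] The index static_prio - MAX_RT_PRIO selects an entry in the tables above: assuming the usual priority layout of the time (MAX_RT_PRIO = 100, nice 0 at static_prio 120), nice -20 lands on index 0, nice 0 on index 20 (weight 1024), and nice +19 on index 39. A tiny sketch; treat the constants as assumptions rather than guaranteed values.

#include <stdio.h>

#define MAX_RT_PRIO 100                          /* assumption: kernel value at the time */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)

int main(void)
{
        for (int nice = -20; nice <= 19; nice += 13) {
                int static_prio = NICE_TO_PRIO(nice);
                printf("nice %+3d -> static_prio %3d -> prio_to_weight[%d]\n",
                       nice, static_prio, static_prio - MAX_RT_PRIO);
        }
        return 0;
}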
1855 1855
1856 static void update_avg(u64 *avg, u64 sample) 1856 static void update_avg(u64 *avg, u64 sample)
1857 { 1857 {
1858 s64 diff = sample - *avg; 1858 s64 diff = sample - *avg;
1859 *avg += diff >> 3; 1859 *avg += diff >> 3;
1860 } 1860 }
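
[Editor's illustration, not part of this diff.] update_avg() is an exponential moving average with weight 1/8: each new sample moves the average one eighth of the way toward it. A quick numeric illustration with invented samples.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = (int64_t)sample - (int64_t)*avg;
        *avg += diff >> 3;              /* move 1/8 of the way toward the sample */
}

int main(void)
{
        uint64_t avg = 0;
        uint64_t samples[] = { 800, 800, 800, 100 };  /* made-up wakeup latencies */

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                update_avg(&avg, samples[i]);
                printf("sample %4" PRIu64 " -> avg %4" PRIu64 "\n", samples[i], avg);
        }
        return 0;
}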
1861 1861
1862 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1862 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1863 { 1863 {
1864 if (wakeup) 1864 if (wakeup)
1865 p->se.start_runtime = p->se.sum_exec_runtime; 1865 p->se.start_runtime = p->se.sum_exec_runtime;
1866 1866
1867 sched_info_queued(p); 1867 sched_info_queued(p);
1868 p->sched_class->enqueue_task(rq, p, wakeup); 1868 p->sched_class->enqueue_task(rq, p, wakeup);
1869 p->se.on_rq = 1; 1869 p->se.on_rq = 1;
1870 } 1870 }
1871 1871
1872 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1872 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1873 { 1873 {
1874 if (sleep) { 1874 if (sleep) {
1875 if (p->se.last_wakeup) { 1875 if (p->se.last_wakeup) {
1876 update_avg(&p->se.avg_overlap, 1876 update_avg(&p->se.avg_overlap,
1877 p->se.sum_exec_runtime - p->se.last_wakeup); 1877 p->se.sum_exec_runtime - p->se.last_wakeup);
1878 p->se.last_wakeup = 0; 1878 p->se.last_wakeup = 0;
1879 } else { 1879 } else {
1880 update_avg(&p->se.avg_wakeup, 1880 update_avg(&p->se.avg_wakeup,
1881 sysctl_sched_wakeup_granularity); 1881 sysctl_sched_wakeup_granularity);
1882 } 1882 }
1883 } 1883 }
1884 1884
1885 sched_info_dequeued(p); 1885 sched_info_dequeued(p);
1886 p->sched_class->dequeue_task(rq, p, sleep); 1886 p->sched_class->dequeue_task(rq, p, sleep);
1887 p->se.on_rq = 0; 1887 p->se.on_rq = 0;
1888 } 1888 }
1889 1889
1890 /* 1890 /*
1891 * __normal_prio - return the priority that is based on the static prio 1891 * __normal_prio - return the priority that is based on the static prio
1892 */ 1892 */
1893 static inline int __normal_prio(struct task_struct *p) 1893 static inline int __normal_prio(struct task_struct *p)
1894 { 1894 {
1895 return p->static_prio; 1895 return p->static_prio;
1896 } 1896 }
1897 1897
1898 /* 1898 /*
1899 * Calculate the expected normal priority: i.e. priority 1899 * Calculate the expected normal priority: i.e. priority
1900 * without taking RT-inheritance into account. Might be 1900 * without taking RT-inheritance into account. Might be
1901 * boosted by interactivity modifiers. Changes upon fork, 1901 * boosted by interactivity modifiers. Changes upon fork,
1902 * setprio syscalls, and whenever the interactivity 1902 * setprio syscalls, and whenever the interactivity
1903 * estimator recalculates. 1903 * estimator recalculates.
1904 */ 1904 */
1905 static inline int normal_prio(struct task_struct *p) 1905 static inline int normal_prio(struct task_struct *p)
1906 { 1906 {
1907 int prio; 1907 int prio;
1908 1908
1909 if (task_has_rt_policy(p)) 1909 if (task_has_rt_policy(p))
1910 prio = MAX_RT_PRIO-1 - p->rt_priority; 1910 prio = MAX_RT_PRIO-1 - p->rt_priority;
1911 else 1911 else
1912 prio = __normal_prio(p); 1912 prio = __normal_prio(p);
1913 return prio; 1913 return prio;
1914 } 1914 }
1915 1915
1916 /* 1916 /*
1917 * Calculate the current priority, i.e. the priority 1917 * Calculate the current priority, i.e. the priority
1918 * taken into account by the scheduler. This value might 1918 * taken into account by the scheduler. This value might
1919 * be boosted by RT tasks, or might be boosted by 1919 * be boosted by RT tasks, or might be boosted by
1920 * interactivity modifiers. Will be RT if the task got 1920 * interactivity modifiers. Will be RT if the task got
1921 * RT-boosted. If not then it returns p->normal_prio. 1921 * RT-boosted. If not then it returns p->normal_prio.
1922 */ 1922 */
1923 static int effective_prio(struct task_struct *p) 1923 static int effective_prio(struct task_struct *p)
1924 { 1924 {
1925 p->normal_prio = normal_prio(p); 1925 p->normal_prio = normal_prio(p);
1926 /* 1926 /*
1927 * If we are RT tasks or we were boosted to RT priority, 1927 * If we are RT tasks or we were boosted to RT priority,
1928 * keep the priority unchanged. Otherwise, update priority 1928 * keep the priority unchanged. Otherwise, update priority
1929 * to the normal priority: 1929 * to the normal priority:
1930 */ 1930 */
1931 if (!rt_prio(p->prio)) 1931 if (!rt_prio(p->prio))
1932 return p->normal_prio; 1932 return p->normal_prio;
1933 return p->prio; 1933 return p->prio;
1934 } 1934 }
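
[Editor's illustration, not part of this diff.] Putting the two helpers together, and assuming the usual MAX_RT_PRIO = 100 layout: an RT task with rt_priority 99 ends up with prio 0, rt_priority 1 with prio 98, and a non-boosted SCHED_NORMAL task simply keeps its static_prio in the 100..139 range (lower prio numbers mean higher priority). A minimal sketch with a stripped-down stand-in for task_struct; the struct and its fields are illustrative only.

#include <stdio.h>

#define MAX_RT_PRIO 100   /* assumption: kernel value at the time */

struct toy_task {
        int has_rt_policy;  /* SCHED_FIFO / SCHED_RR?           */
        int rt_priority;    /* 1..99, higher = more important   */
        int static_prio;    /* 100..139 for SCHED_NORMAL/IDLE   */
};

/* Mirrors normal_prio(): a lower returned number is a higher priority. */
static int toy_normal_prio(const struct toy_task *p)
{
        if (p->has_rt_policy)
                return MAX_RT_PRIO - 1 - p->rt_priority;
        return p->static_prio;
}

int main(void)
{
        struct toy_task rt_hi  = { 1, 99, 0 };
        struct toy_task rt_lo  = { 1,  1, 0 };
        struct toy_task normal = { 0,  0, 120 };  /* nice 0 */

        printf("rt_priority 99 -> prio %d\n", toy_normal_prio(&rt_hi));   /* 0   */
        printf("rt_priority  1 -> prio %d\n", toy_normal_prio(&rt_lo));   /* 98  */
        printf("nice 0         -> prio %d\n", toy_normal_prio(&normal));  /* 120 */
        return 0;
}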
1935 1935
1936 /* 1936 /*
1937 * activate_task - move a task to the runqueue. 1937 * activate_task - move a task to the runqueue.
1938 */ 1938 */
1939 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1939 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1940 { 1940 {
1941 if (task_contributes_to_load(p)) 1941 if (task_contributes_to_load(p))
1942 rq->nr_uninterruptible--; 1942 rq->nr_uninterruptible--;
1943 1943
1944 enqueue_task(rq, p, wakeup); 1944 enqueue_task(rq, p, wakeup);
1945 inc_nr_running(rq); 1945 inc_nr_running(rq);
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * deactivate_task - remove a task from the runqueue. 1949 * deactivate_task - remove a task from the runqueue.
1950 */ 1950 */
1951 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1951 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1952 { 1952 {
1953 if (task_contributes_to_load(p)) 1953 if (task_contributes_to_load(p))
1954 rq->nr_uninterruptible++; 1954 rq->nr_uninterruptible++;
1955 1955
1956 dequeue_task(rq, p, sleep); 1956 dequeue_task(rq, p, sleep);
1957 dec_nr_running(rq); 1957 dec_nr_running(rq);
1958 } 1958 }
1959 1959
1960 /** 1960 /**
1961 * task_curr - is this task currently executing on a CPU? 1961 * task_curr - is this task currently executing on a CPU?
1962 * @p: the task in question. 1962 * @p: the task in question.
1963 */ 1963 */
1964 inline int task_curr(const struct task_struct *p) 1964 inline int task_curr(const struct task_struct *p)
1965 { 1965 {
1966 return cpu_curr(task_cpu(p)) == p; 1966 return cpu_curr(task_cpu(p)) == p;
1967 } 1967 }
1968 1968
1969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1970 { 1970 {
1971 set_task_rq(p, cpu); 1971 set_task_rq(p, cpu);
1972 #ifdef CONFIG_SMP 1972 #ifdef CONFIG_SMP
1973 /* 1973 /*
1974 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1974 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1975 * successfully executed on another CPU. We must ensure that updates of 1975 * successfully executed on another CPU. We must ensure that updates of
1976 * per-task data have been completed by this moment. 1976 * per-task data have been completed by this moment.
1977 */ 1977 */
1978 smp_wmb(); 1978 smp_wmb();
1979 task_thread_info(p)->cpu = cpu; 1979 task_thread_info(p)->cpu = cpu;
1980 #endif 1980 #endif
1981 } 1981 }
1982 1982
1983 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1983 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1984 const struct sched_class *prev_class, 1984 const struct sched_class *prev_class,
1985 int oldprio, int running) 1985 int oldprio, int running)
1986 { 1986 {
1987 if (prev_class != p->sched_class) { 1987 if (prev_class != p->sched_class) {
1988 if (prev_class->switched_from) 1988 if (prev_class->switched_from)
1989 prev_class->switched_from(rq, p, running); 1989 prev_class->switched_from(rq, p, running);
1990 p->sched_class->switched_to(rq, p, running); 1990 p->sched_class->switched_to(rq, p, running);
1991 } else 1991 } else
1992 p->sched_class->prio_changed(rq, p, oldprio, running); 1992 p->sched_class->prio_changed(rq, p, oldprio, running);
1993 } 1993 }
1994 1994
1995 #ifdef CONFIG_SMP 1995 #ifdef CONFIG_SMP
1996 /* 1996 /*
1997 * Is this task likely cache-hot: 1997 * Is this task likely cache-hot:
1998 */ 1998 */
1999 static int 1999 static int
2000 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2000 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2001 { 2001 {
2002 s64 delta; 2002 s64 delta;
2003 2003
2004 /* 2004 /*
2005 * Buddy candidates are cache hot: 2005 * Buddy candidates are cache hot:
2006 */ 2006 */
2007 if (sched_feat(CACHE_HOT_BUDDY) && 2007 if (sched_feat(CACHE_HOT_BUDDY) &&
2008 (&p->se == cfs_rq_of(&p->se)->next || 2008 (&p->se == cfs_rq_of(&p->se)->next ||
2009 &p->se == cfs_rq_of(&p->se)->last)) 2009 &p->se == cfs_rq_of(&p->se)->last))
2010 return 1; 2010 return 1;
2011 2011
2012 if (p->sched_class != &fair_sched_class) 2012 if (p->sched_class != &fair_sched_class)
2013 return 0; 2013 return 0;
2014 2014
2015 if (sysctl_sched_migration_cost == -1) 2015 if (sysctl_sched_migration_cost == -1)
2016 return 1; 2016 return 1;
2017 if (sysctl_sched_migration_cost == 0) 2017 if (sysctl_sched_migration_cost == 0)
2018 return 0; 2018 return 0;
2019 2019
2020 delta = now - p->se.exec_start; 2020 delta = now - p->se.exec_start;
2021 2021
2022 return delta < (s64)sysctl_sched_migration_cost; 2022 return delta < (s64)sysctl_sched_migration_cost;
2023 } 2023 }
2024 2024
2025 2025
2026 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2026 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2027 { 2027 {
2028 int old_cpu = task_cpu(p); 2028 int old_cpu = task_cpu(p);
2029 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2029 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2030 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2030 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2031 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2031 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2032 u64 clock_offset; 2032 u64 clock_offset;
2033 2033
2034 clock_offset = old_rq->clock - new_rq->clock; 2034 clock_offset = old_rq->clock - new_rq->clock;
2035 2035
2036 trace_sched_migrate_task(p, new_cpu); 2036 trace_sched_migrate_task(p, new_cpu);
2037 2037
2038 #ifdef CONFIG_SCHEDSTATS 2038 #ifdef CONFIG_SCHEDSTATS
2039 if (p->se.wait_start) 2039 if (p->se.wait_start)
2040 p->se.wait_start -= clock_offset; 2040 p->se.wait_start -= clock_offset;
2041 if (p->se.sleep_start) 2041 if (p->se.sleep_start)
2042 p->se.sleep_start -= clock_offset; 2042 p->se.sleep_start -= clock_offset;
2043 if (p->se.block_start) 2043 if (p->se.block_start)
2044 p->se.block_start -= clock_offset; 2044 p->se.block_start -= clock_offset;
2045 #endif 2045 #endif
2046 if (old_cpu != new_cpu) { 2046 if (old_cpu != new_cpu) {
2047 p->se.nr_migrations++; 2047 p->se.nr_migrations++;
2048 new_rq->nr_migrations_in++; 2048 new_rq->nr_migrations_in++;
2049 #ifdef CONFIG_SCHEDSTATS 2049 #ifdef CONFIG_SCHEDSTATS
2050 if (task_hot(p, old_rq->clock, NULL)) 2050 if (task_hot(p, old_rq->clock, NULL))
2051 schedstat_inc(p, se.nr_forced2_migrations); 2051 schedstat_inc(p, se.nr_forced2_migrations);
2052 #endif 2052 #endif
2053 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2053 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2054 1, 1, NULL, 0); 2054 1, 1, NULL, 0);
2055 } 2055 }
2056 p->se.vruntime -= old_cfsrq->min_vruntime - 2056 p->se.vruntime -= old_cfsrq->min_vruntime -
2057 new_cfsrq->min_vruntime; 2057 new_cfsrq->min_vruntime;
2058 2058
2059 __set_task_cpu(p, new_cpu); 2059 __set_task_cpu(p, new_cpu);
2060 } 2060 }
2061 2061
2062 struct migration_req { 2062 struct migration_req {
2063 struct list_head list; 2063 struct list_head list;
2064 2064
2065 struct task_struct *task; 2065 struct task_struct *task;
2066 int dest_cpu; 2066 int dest_cpu;
2067 2067
2068 struct completion done; 2068 struct completion done;
2069 }; 2069 };
2070 2070
2071 /* 2071 /*
2072 * The task's runqueue lock must be held. 2072 * The task's runqueue lock must be held.
2073 * Returns true if you have to wait for the migration thread. 2073 * Returns true if you have to wait for the migration thread.
2074 */ 2074 */
2075 static int 2075 static int
2076 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 2076 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2077 { 2077 {
2078 struct rq *rq = task_rq(p); 2078 struct rq *rq = task_rq(p);
2079 2079
2080 /* 2080 /*
2081 * If the task is not on a runqueue (and not running), then 2081 * If the task is not on a runqueue (and not running), then
2082 * it is sufficient to simply update the task's cpu field. 2082 * it is sufficient to simply update the task's cpu field.
2083 */ 2083 */
2084 if (!p->se.on_rq && !task_running(rq, p)) { 2084 if (!p->se.on_rq && !task_running(rq, p)) {
2085 set_task_cpu(p, dest_cpu); 2085 set_task_cpu(p, dest_cpu);
2086 return 0; 2086 return 0;
2087 } 2087 }
2088 2088
2089 init_completion(&req->done); 2089 init_completion(&req->done);
2090 req->task = p; 2090 req->task = p;
2091 req->dest_cpu = dest_cpu; 2091 req->dest_cpu = dest_cpu;
2092 list_add(&req->list, &rq->migration_queue); 2092 list_add(&req->list, &rq->migration_queue);
2093 2093
2094 return 1; 2094 return 1;
2095 } 2095 }
2096 2096
2097 /* 2097 /*
2098 * wait_task_context_switch - wait for a thread to complete at least one 2098 * wait_task_context_switch - wait for a thread to complete at least one
2099 * context switch. 2099 * context switch.
2100 * 2100 *
2101 * @p must not be current. 2101 * @p must not be current.
2102 */ 2102 */
2103 void wait_task_context_switch(struct task_struct *p) 2103 void wait_task_context_switch(struct task_struct *p)
2104 { 2104 {
2105 unsigned long nvcsw, nivcsw, flags; 2105 unsigned long nvcsw, nivcsw, flags;
2106 int running; 2106 int running;
2107 struct rq *rq; 2107 struct rq *rq;
2108 2108
2109 nvcsw = p->nvcsw; 2109 nvcsw = p->nvcsw;
2110 nivcsw = p->nivcsw; 2110 nivcsw = p->nivcsw;
2111 for (;;) { 2111 for (;;) {
2112 /* 2112 /*
2113 * The runqueue is assigned before the actual context 2113 * The runqueue is assigned before the actual context
2114 * switch. We need to take the runqueue lock. 2114 * switch. We need to take the runqueue lock.
2115 * 2115 *
2116 * We could check initially without the lock but it is 2116 * We could check initially without the lock but it is
2117 * very likely that we need to take the lock in every 2117 * very likely that we need to take the lock in every
2118 * iteration. 2118 * iteration.
2119 */ 2119 */
2120 rq = task_rq_lock(p, &flags); 2120 rq = task_rq_lock(p, &flags);
2121 running = task_running(rq, p); 2121 running = task_running(rq, p);
2122 task_rq_unlock(rq, &flags); 2122 task_rq_unlock(rq, &flags);
2123 2123
2124 if (likely(!running)) 2124 if (likely(!running))
2125 break; 2125 break;
2126 /* 2126 /*
2127 * The switch count is incremented before the actual 2127 * The switch count is incremented before the actual
2128 * context switch. We thus wait for two switches to be 2128 * context switch. We thus wait for two switches to be
2129 * sure at least one completed. 2129 * sure at least one completed.
2130 */ 2130 */
2131 if ((p->nvcsw - nvcsw) > 1) 2131 if ((p->nvcsw - nvcsw) > 1)
2132 break; 2132 break;
2133 if ((p->nivcsw - nivcsw) > 1) 2133 if ((p->nivcsw - nivcsw) > 1)
2134 break; 2134 break;
2135 2135
2136 cpu_relax(); 2136 cpu_relax();
2137 } 2137 }
2138 } 2138 }
2139 2139
2140 /* 2140 /*
2141 * wait_task_inactive - wait for a thread to unschedule. 2141 * wait_task_inactive - wait for a thread to unschedule.
2142 * 2142 *
2143 * If @match_state is nonzero, it's the @p->state value just checked and 2143 * If @match_state is nonzero, it's the @p->state value just checked and
2144 * not expected to change. If it changes, i.e. @p might have woken up, 2144 * not expected to change. If it changes, i.e. @p might have woken up,
2145 * then return zero. When we succeed in waiting for @p to be off its CPU, 2145 * then return zero. When we succeed in waiting for @p to be off its CPU,
2146 * we return a positive number (its total switch count). If a second call 2146 * we return a positive number (its total switch count). If a second call
2147 * a short while later returns the same number, the caller can be sure that 2147 * a short while later returns the same number, the caller can be sure that
2148 * @p has remained unscheduled the whole time. 2148 * @p has remained unscheduled the whole time.
2149 * 2149 *
2150 * The caller must ensure that the task *will* unschedule sometime soon, 2150 * The caller must ensure that the task *will* unschedule sometime soon,
2151 * else this function might spin for a *long* time. This function can't 2151 * else this function might spin for a *long* time. This function can't
2152 * be called with interrupts off, or it may introduce deadlock with 2152 * be called with interrupts off, or it may introduce deadlock with
2153 * smp_call_function() if an IPI is sent by the same process we are 2153 * smp_call_function() if an IPI is sent by the same process we are
2154 * waiting to become inactive. 2154 * waiting to become inactive.
2155 */ 2155 */
2156 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2156 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2157 { 2157 {
2158 unsigned long flags; 2158 unsigned long flags;
2159 int running, on_rq; 2159 int running, on_rq;
2160 unsigned long ncsw; 2160 unsigned long ncsw;
2161 struct rq *rq; 2161 struct rq *rq;
2162 2162
2163 for (;;) { 2163 for (;;) {
2164 /* 2164 /*
2165 * We do the initial early heuristics without holding 2165 * We do the initial early heuristics without holding
2166 * any task-queue locks at all. We'll only try to get 2166 * any task-queue locks at all. We'll only try to get
2167 * the runqueue lock when things look like they will 2167 * the runqueue lock when things look like they will
2168 * work out! 2168 * work out!
2169 */ 2169 */
2170 rq = task_rq(p); 2170 rq = task_rq(p);
2171 2171
2172 /* 2172 /*
2173 * If the task is actively running on another CPU 2173 * If the task is actively running on another CPU
2174 * still, just relax and busy-wait without holding 2174 * still, just relax and busy-wait without holding
2175 * any locks. 2175 * any locks.
2176 * 2176 *
2177 * NOTE! Since we don't hold any locks, it's not 2177 * NOTE! Since we don't hold any locks, it's not
2178 * even sure that "rq" stays as the right runqueue! 2178 * even sure that "rq" stays as the right runqueue!
2179 * But we don't care, since "task_running()" will 2179 * But we don't care, since "task_running()" will
2180 * return false if the runqueue has changed and p 2180 * return false if the runqueue has changed and p
2181 * is actually now running somewhere else! 2181 * is actually now running somewhere else!
2182 */ 2182 */
2183 while (task_running(rq, p)) { 2183 while (task_running(rq, p)) {
2184 if (match_state && unlikely(p->state != match_state)) 2184 if (match_state && unlikely(p->state != match_state))
2185 return 0; 2185 return 0;
2186 cpu_relax(); 2186 cpu_relax();
2187 } 2187 }
2188 2188
2189 /* 2189 /*
2190 * Ok, time to look more closely! We need the rq 2190 * Ok, time to look more closely! We need the rq
2191 * lock now, to be *sure*. If we're wrong, we'll 2191 * lock now, to be *sure*. If we're wrong, we'll
2192 * just go back and repeat. 2192 * just go back and repeat.
2193 */ 2193 */
2194 rq = task_rq_lock(p, &flags); 2194 rq = task_rq_lock(p, &flags);
2195 trace_sched_wait_task(rq, p); 2195 trace_sched_wait_task(rq, p);
2196 running = task_running(rq, p); 2196 running = task_running(rq, p);
2197 on_rq = p->se.on_rq; 2197 on_rq = p->se.on_rq;
2198 ncsw = 0; 2198 ncsw = 0;
2199 if (!match_state || p->state == match_state) 2199 if (!match_state || p->state == match_state)
2200 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2200 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2201 task_rq_unlock(rq, &flags); 2201 task_rq_unlock(rq, &flags);
2202 2202
2203 /* 2203 /*
2204 * If it changed from the expected state, bail out now. 2204 * If it changed from the expected state, bail out now.
2205 */ 2205 */
2206 if (unlikely(!ncsw)) 2206 if (unlikely(!ncsw))
2207 break; 2207 break;
2208 2208
2209 /* 2209 /*
2210 * Was it really running after all now that we 2210 * Was it really running after all now that we
2211 * checked with the proper locks actually held? 2211 * checked with the proper locks actually held?
2212 * 2212 *
2213 * Oops. Go back and try again... 2213 * Oops. Go back and try again...
2214 */ 2214 */
2215 if (unlikely(running)) { 2215 if (unlikely(running)) {
2216 cpu_relax(); 2216 cpu_relax();
2217 continue; 2217 continue;
2218 } 2218 }
2219 2219
2220 /* 2220 /*
2221 * It's not enough that it's not actively running, 2221 * It's not enough that it's not actively running,
2222 * it must be off the runqueue _entirely_, and not 2222 * it must be off the runqueue _entirely_, and not
2223 * preempted! 2223 * preempted!
2224 * 2224 *
2225 * So if it was still runnable (but just not actively 2225 * So if it was still runnable (but just not actively
2226 * running right now), it's preempted, and we should 2226 * running right now), it's preempted, and we should
2227 * yield - it could be a while. 2227 * yield - it could be a while.
2228 */ 2228 */
2229 if (unlikely(on_rq)) { 2229 if (unlikely(on_rq)) {
2230 schedule_timeout_uninterruptible(1); 2230 schedule_timeout_uninterruptible(1);
2231 continue; 2231 continue;
2232 } 2232 }
2233 2233
2234 /* 2234 /*
2235 * Ahh, all good. It wasn't running, and it wasn't 2235 * Ahh, all good. It wasn't running, and it wasn't
2236 * runnable, which means that it will never become 2236 * runnable, which means that it will never become
2237 * running in the future either. We're all done! 2237 * running in the future either. We're all done!
2238 */ 2238 */
2239 break; 2239 break;
2240 } 2240 }
2241 2241
2242 return ncsw; 2242 return ncsw;
2243 } 2243 }
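The double-call protocol described in the comment can be wrapped as in the sketch below; task_settled() is a hypothetical helper, not part of this file, and a real caller must still guarantee that @p will actually deschedule soon:

#include <linux/types.h>
#include <linux/sched.h>

/* Hypothetical helper illustrating the documented usage pattern. */
static bool task_settled(struct task_struct *p, long match_state)
{
	unsigned long ncsw;

	ncsw = wait_task_inactive(p, match_state);
	if (!ncsw)				/* state changed, @p may have woken */
		return false;

	schedule_timeout_uninterruptible(1);	/* "a second call a short while later" */

	/* same switch count -> @p stayed unscheduled the whole time */
	return wait_task_inactive(p, match_state) == ncsw;
}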
2244 2244
2245 /*** 2245 /***
2246 * kick_process - kick a running thread to enter/exit the kernel 2246 * kick_process - kick a running thread to enter/exit the kernel
2247 * @p: the to-be-kicked thread 2247 * @p: the to-be-kicked thread
2248 * 2248 *
2249 * Cause a process which is running on another CPU to enter 2249 * Cause a process which is running on another CPU to enter
2250 * kernel-mode, without any delay. (to get signals handled.) 2250 * kernel-mode, without any delay. (to get signals handled.)
2251 * 2251 *
2252 * NOTE: this function doesn't have to take the runqueue lock, 2252 * NOTE: this function doesn't have to take the runqueue lock,
2253 * because all it wants to ensure is that the remote task enters 2253 * because all it wants to ensure is that the remote task enters
2254 * the kernel. If the IPI races and the task has been migrated 2254 * the kernel. If the IPI races and the task has been migrated
2255 * to another CPU then no harm is done and the purpose has been 2255 * to another CPU then no harm is done and the purpose has been
2256 * achieved as well. 2256 * achieved as well.
2257 */ 2257 */
2258 void kick_process(struct task_struct *p) 2258 void kick_process(struct task_struct *p)
2259 { 2259 {
2260 int cpu; 2260 int cpu;
2261 2261
2262 preempt_disable(); 2262 preempt_disable();
2263 cpu = task_cpu(p); 2263 cpu = task_cpu(p);
2264 if ((cpu != smp_processor_id()) && task_curr(p)) 2264 if ((cpu != smp_processor_id()) && task_curr(p))
2265 smp_send_reschedule(cpu); 2265 smp_send_reschedule(cpu);
2266 preempt_enable(); 2266 preempt_enable();
2267 } 2267 }
2268 EXPORT_SYMBOL_GPL(kick_process); 2268 EXPORT_SYMBOL_GPL(kick_process);
2269 #endif /* CONFIG_SMP */ 2269 #endif /* CONFIG_SMP */
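A minimal sketch of the intended use, loosely modelled on the signal-delivery path (poke_task_sketch() is hypothetical; real signal code queues the signal first and also handles the non-running cases):

#include <linux/sched.h>

/* Hypothetical helper, loosely modelled on signal_wake_up(). */
static void poke_task_sketch(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_SIGPENDING);	/* publish the pending work */
	kick_process(t);	/* if @t is running on another CPU, make it re-enter the kernel */
}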
2270 2270
2271 /** 2271 /**
2272 * task_oncpu_function_call - call a function on the cpu on which a task runs 2272 * task_oncpu_function_call - call a function on the cpu on which a task runs
2273 * @p: the task to evaluate 2273 * @p: the task to evaluate
2274 * @func: the function to be called 2274 * @func: the function to be called
2275 * @info: the function call argument 2275 * @info: the function call argument
2276 * 2276 *
2277 * Calls the function @func when the task is currently running. This might 2277 * Calls the function @func when the task is currently running. This might
2278 * be on the current CPU, in which case the function is called directly. 2278 * be on the current CPU, in which case the function is called directly.
2279 */ 2279 */
2280 void task_oncpu_function_call(struct task_struct *p, 2280 void task_oncpu_function_call(struct task_struct *p,
2281 void (*func) (void *info), void *info) 2281 void (*func) (void *info), void *info)
2282 { 2282 {
2283 int cpu; 2283 int cpu;
2284 2284
2285 preempt_disable(); 2285 preempt_disable();
2286 cpu = task_cpu(p); 2286 cpu = task_cpu(p);
2287 if (task_curr(p)) 2287 if (task_curr(p))
2288 smp_call_function_single(cpu, func, info, 1); 2288 smp_call_function_single(cpu, func, info, 1);
2289 preempt_enable(); 2289 preempt_enable();
2290 } 2290 }
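For illustration, a hedged sketch of a caller; report_cpu() and where_is_it_running() are hypothetical names, not part of the scheduler:

#include <linux/sched.h>
#include <linux/smp.h>

/* Hypothetical cross-call payload: record which CPU it ran on. */
static void report_cpu(void *info)
{
	*(int *)info = smp_processor_id();
}

static int where_is_it_running(struct task_struct *p)
{
	int cpu = -1;

	/* report_cpu() runs on p's CPU only if @p is currently on a CPU. */
	task_oncpu_function_call(p, report_cpu, &cpu);

	return cpu;	/* still -1 if @p was not running at the time */
}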
2291 2291
2292 /*** 2292 /***
2293 * try_to_wake_up - wake up a thread 2293 * try_to_wake_up - wake up a thread
2294 * @p: the to-be-woken-up thread 2294 * @p: the to-be-woken-up thread
2295 * @state: the mask of task states that can be woken 2295 * @state: the mask of task states that can be woken
2296 * @sync: do a synchronous wakeup? 2296 * @sync: do a synchronous wakeup?
2297 * 2297 *
2298 * Put it on the run-queue if it's not already there. The "current" 2298 * Put it on the run-queue if it's not already there. The "current"
2299 * thread is always on the run-queue (except when the actual 2299 * thread is always on the run-queue (except when the actual
2300 * re-schedule is in progress), and as such you're allowed to do 2300 * re-schedule is in progress), and as such you're allowed to do
2301 * the simpler "current->state = TASK_RUNNING" to mark yourself 2301 * the simpler "current->state = TASK_RUNNING" to mark yourself
2302 * runnable without the overhead of this. 2302 * runnable without the overhead of this.
2303 * 2303 *
2304 * returns failure only if the task is already active. 2304 * returns failure only if the task is already active.
2305 */ 2305 */
2306 static int try_to_wake_up(struct task_struct *p, unsigned int state, 2306 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2307 int wake_flags) 2307 int wake_flags)
2308 { 2308 {
2309 int cpu, orig_cpu, this_cpu, success = 0; 2309 int cpu, orig_cpu, this_cpu, success = 0;
2310 unsigned long flags; 2310 unsigned long flags;
2311 struct rq *rq, *orig_rq; 2311 struct rq *rq, *orig_rq;
2312 2312
2313 if (!sched_feat(SYNC_WAKEUPS)) 2313 if (!sched_feat(SYNC_WAKEUPS))
2314 wake_flags &= ~WF_SYNC; 2314 wake_flags &= ~WF_SYNC;
2315 2315
2316 this_cpu = get_cpu(); 2316 this_cpu = get_cpu();
2317 2317
2318 smp_wmb(); 2318 smp_wmb();
2319 rq = orig_rq = task_rq_lock(p, &flags); 2319 rq = orig_rq = task_rq_lock(p, &flags);
2320 update_rq_clock(rq); 2320 update_rq_clock(rq);
2321 if (!(p->state & state)) 2321 if (!(p->state & state))
2322 goto out; 2322 goto out;
2323 2323
2324 if (p->se.on_rq) 2324 if (p->se.on_rq)
2325 goto out_running; 2325 goto out_running;
2326 2326
2327 cpu = task_cpu(p); 2327 cpu = task_cpu(p);
2328 orig_cpu = cpu; 2328 orig_cpu = cpu;
2329 2329
2330 #ifdef CONFIG_SMP 2330 #ifdef CONFIG_SMP
2331 if (unlikely(task_running(rq, p))) 2331 if (unlikely(task_running(rq, p)))
2332 goto out_activate; 2332 goto out_activate;
2333 2333
2334 /* 2334 /*
2335 * In order to handle concurrent wakeups and release the rq->lock 2335 * In order to handle concurrent wakeups and release the rq->lock
2336 * we put the task in TASK_WAKING state. 2336 * we put the task in TASK_WAKING state.
2337 * 2337 *
2338 * First fix up the nr_uninterruptible count: 2338 * First fix up the nr_uninterruptible count:
2339 */ 2339 */
2340 if (task_contributes_to_load(p)) 2340 if (task_contributes_to_load(p))
2341 rq->nr_uninterruptible--; 2341 rq->nr_uninterruptible--;
2342 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2343 task_rq_unlock(rq, &flags); 2343 task_rq_unlock(rq, &flags);
2344 2344
2345 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2345 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2346 if (cpu != orig_cpu) 2346 if (cpu != orig_cpu)
2347 set_task_cpu(p, cpu); 2347 set_task_cpu(p, cpu);
2348 2348
2349 rq = task_rq_lock(p, &flags); 2349 rq = task_rq_lock(p, &flags);
2350 2350
2351 if (rq != orig_rq) 2351 if (rq != orig_rq)
2352 update_rq_clock(rq); 2352 update_rq_clock(rq);
2353 2353
2354 WARN_ON(p->state != TASK_WAKING); 2354 WARN_ON(p->state != TASK_WAKING);
2355 cpu = task_cpu(p); 2355 cpu = task_cpu(p);
2356 2356
2357 #ifdef CONFIG_SCHEDSTATS 2357 #ifdef CONFIG_SCHEDSTATS
2358 schedstat_inc(rq, ttwu_count); 2358 schedstat_inc(rq, ttwu_count);
2359 if (cpu == this_cpu) 2359 if (cpu == this_cpu)
2360 schedstat_inc(rq, ttwu_local); 2360 schedstat_inc(rq, ttwu_local);
2361 else { 2361 else {
2362 struct sched_domain *sd; 2362 struct sched_domain *sd;
2363 for_each_domain(this_cpu, sd) { 2363 for_each_domain(this_cpu, sd) {
2364 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2364 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2365 schedstat_inc(sd, ttwu_wake_remote); 2365 schedstat_inc(sd, ttwu_wake_remote);
2366 break; 2366 break;
2367 } 2367 }
2368 } 2368 }
2369 } 2369 }
2370 #endif /* CONFIG_SCHEDSTATS */ 2370 #endif /* CONFIG_SCHEDSTATS */
2371 2371
2372 out_activate: 2372 out_activate:
2373 #endif /* CONFIG_SMP */ 2373 #endif /* CONFIG_SMP */
2374 schedstat_inc(p, se.nr_wakeups); 2374 schedstat_inc(p, se.nr_wakeups);
2375 if (wake_flags & WF_SYNC) 2375 if (wake_flags & WF_SYNC)
2376 schedstat_inc(p, se.nr_wakeups_sync); 2376 schedstat_inc(p, se.nr_wakeups_sync);
2377 if (orig_cpu != cpu) 2377 if (orig_cpu != cpu)
2378 schedstat_inc(p, se.nr_wakeups_migrate); 2378 schedstat_inc(p, se.nr_wakeups_migrate);
2379 if (cpu == this_cpu) 2379 if (cpu == this_cpu)
2380 schedstat_inc(p, se.nr_wakeups_local); 2380 schedstat_inc(p, se.nr_wakeups_local);
2381 else 2381 else
2382 schedstat_inc(p, se.nr_wakeups_remote); 2382 schedstat_inc(p, se.nr_wakeups_remote);
2383 activate_task(rq, p, 1); 2383 activate_task(rq, p, 1);
2384 success = 1; 2384 success = 1;
2385 2385
2386 /* 2386 /*
2387 * Only attribute actual wakeups done by this task. 2387 * Only attribute actual wakeups done by this task.
2388 */ 2388 */
2389 if (!in_interrupt()) { 2389 if (!in_interrupt()) {
2390 struct sched_entity *se = &current->se; 2390 struct sched_entity *se = &current->se;
2391 u64 sample = se->sum_exec_runtime; 2391 u64 sample = se->sum_exec_runtime;
2392 2392
2393 if (se->last_wakeup) 2393 if (se->last_wakeup)
2394 sample -= se->last_wakeup; 2394 sample -= se->last_wakeup;
2395 else 2395 else
2396 sample -= se->start_runtime; 2396 sample -= se->start_runtime;
2397 update_avg(&se->avg_wakeup, sample); 2397 update_avg(&se->avg_wakeup, sample);
2398 2398
2399 se->last_wakeup = se->sum_exec_runtime; 2399 se->last_wakeup = se->sum_exec_runtime;
2400 } 2400 }
2401 2401
2402 out_running: 2402 out_running:
2403 trace_sched_wakeup(rq, p, success); 2403 trace_sched_wakeup(rq, p, success);
2404 check_preempt_curr(rq, p, wake_flags); 2404 check_preempt_curr(rq, p, wake_flags);
2405 2405
2406 p->state = TASK_RUNNING; 2406 p->state = TASK_RUNNING;
2407 #ifdef CONFIG_SMP 2407 #ifdef CONFIG_SMP
2408 if (p->sched_class->task_wake_up) 2408 if (p->sched_class->task_wake_up)
2409 p->sched_class->task_wake_up(rq, p); 2409 p->sched_class->task_wake_up(rq, p);
2410 #endif 2410 #endif
2411 out: 2411 out:
2412 task_rq_unlock(rq, &flags); 2412 task_rq_unlock(rq, &flags);
2413 put_cpu(); 2413 put_cpu();
2414 2414
2415 return success; 2415 return success;
2416 } 2416 }
2417 2417
2418 /** 2418 /**
2419 * wake_up_process - Wake up a specific process 2419 * wake_up_process - Wake up a specific process
2420 * @p: The process to be woken up. 2420 * @p: The process to be woken up.
2421 * 2421 *
2422 * Attempt to wake up the nominated process and move it to the set of runnable 2422 * Attempt to wake up the nominated process and move it to the set of runnable
2423 * processes. Returns 1 if the process was woken up, 0 if it was already 2423 * processes. Returns 1 if the process was woken up, 0 if it was already
2424 * running. 2424 * running.
2425 * 2425 *
2426 * It may be assumed that this function implies a write memory barrier before 2426 * It may be assumed that this function implies a write memory barrier before
2427 * changing the task state if and only if any tasks are woken up. 2427 * changing the task state if and only if any tasks are woken up.
2428 */ 2428 */
2429 int wake_up_process(struct task_struct *p) 2429 int wake_up_process(struct task_struct *p)
2430 { 2430 {
2431 return try_to_wake_up(p, TASK_ALL, 0); 2431 return try_to_wake_up(p, TASK_ALL, 0);
2432 } 2432 }
2433 EXPORT_SYMBOL(wake_up_process); 2433 EXPORT_SYMBOL(wake_up_process);
2434 2434
2435 int wake_up_state(struct task_struct *p, unsigned int state) 2435 int wake_up_state(struct task_struct *p, unsigned int state)
2436 { 2436 {
2437 return try_to_wake_up(p, state, 0); 2437 return try_to_wake_up(p, state, 0);
2438 } 2438 }
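These wakeup primitives pair with the usual prepare-to-wait pattern on the sleeping side. A minimal sketch, assuming the classic condition/flag idiom (wait_for_flag() is hypothetical; production code would normally use wait queues or completions instead):

#include <linux/sched.h>
#include <linux/errno.h>

/* Sleeper: publish our state *before* re-checking the condition. */
static int wait_for_flag(int *flag)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (*flag)
			break;
		if (signal_pending(current)) {
			__set_current_state(TASK_RUNNING);
			return -ERESTARTSYS;
		}
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

/*
 * Waker: set the condition first, then call wake_up_process(sleeper).
 * The state/condition ordering in try_to_wake_up() then guarantees the
 * wakeup cannot be missed.
 */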
2439 2439
2440 /* 2440 /*
2441 * Perform scheduler related setup for a newly forked process p. 2441 * Perform scheduler related setup for a newly forked process p.
2442 * p is forked by current. 2442 * p is forked by current.
2443 * 2443 *
2444 * __sched_fork() is basic setup used by init_idle() too: 2444 * __sched_fork() is basic setup used by init_idle() too:
2445 */ 2445 */
2446 static void __sched_fork(struct task_struct *p) 2446 static void __sched_fork(struct task_struct *p)
2447 { 2447 {
2448 p->se.exec_start = 0; 2448 p->se.exec_start = 0;
2449 p->se.sum_exec_runtime = 0; 2449 p->se.sum_exec_runtime = 0;
2450 p->se.prev_sum_exec_runtime = 0; 2450 p->se.prev_sum_exec_runtime = 0;
2451 p->se.nr_migrations = 0; 2451 p->se.nr_migrations = 0;
2452 p->se.last_wakeup = 0; 2452 p->se.last_wakeup = 0;
2453 p->se.avg_overlap = 0; 2453 p->se.avg_overlap = 0;
2454 p->se.start_runtime = 0; 2454 p->se.start_runtime = 0;
2455 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2455 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2456 p->se.avg_running = 0; 2456 p->se.avg_running = 0;
2457 2457
2458 #ifdef CONFIG_SCHEDSTATS 2458 #ifdef CONFIG_SCHEDSTATS
2459 p->se.wait_start = 0; 2459 p->se.wait_start = 0;
2460 p->se.wait_max = 0; 2460 p->se.wait_max = 0;
2461 p->se.wait_count = 0; 2461 p->se.wait_count = 0;
2462 p->se.wait_sum = 0; 2462 p->se.wait_sum = 0;
2463 2463
2464 p->se.sleep_start = 0; 2464 p->se.sleep_start = 0;
2465 p->se.sleep_max = 0; 2465 p->se.sleep_max = 0;
2466 p->se.sum_sleep_runtime = 0; 2466 p->se.sum_sleep_runtime = 0;
2467 2467
2468 p->se.block_start = 0; 2468 p->se.block_start = 0;
2469 p->se.block_max = 0; 2469 p->se.block_max = 0;
2470 p->se.exec_max = 0; 2470 p->se.exec_max = 0;
2471 p->se.slice_max = 0; 2471 p->se.slice_max = 0;
2472 2472
2473 p->se.nr_migrations_cold = 0; 2473 p->se.nr_migrations_cold = 0;
2474 p->se.nr_failed_migrations_affine = 0; 2474 p->se.nr_failed_migrations_affine = 0;
2475 p->se.nr_failed_migrations_running = 0; 2475 p->se.nr_failed_migrations_running = 0;
2476 p->se.nr_failed_migrations_hot = 0; 2476 p->se.nr_failed_migrations_hot = 0;
2477 p->se.nr_forced_migrations = 0; 2477 p->se.nr_forced_migrations = 0;
2478 p->se.nr_forced2_migrations = 0; 2478 p->se.nr_forced2_migrations = 0;
2479 2479
2480 p->se.nr_wakeups = 0; 2480 p->se.nr_wakeups = 0;
2481 p->se.nr_wakeups_sync = 0; 2481 p->se.nr_wakeups_sync = 0;
2482 p->se.nr_wakeups_migrate = 0; 2482 p->se.nr_wakeups_migrate = 0;
2483 p->se.nr_wakeups_local = 0; 2483 p->se.nr_wakeups_local = 0;
2484 p->se.nr_wakeups_remote = 0; 2484 p->se.nr_wakeups_remote = 0;
2485 p->se.nr_wakeups_affine = 0; 2485 p->se.nr_wakeups_affine = 0;
2486 p->se.nr_wakeups_affine_attempts = 0; 2486 p->se.nr_wakeups_affine_attempts = 0;
2487 p->se.nr_wakeups_passive = 0; 2487 p->se.nr_wakeups_passive = 0;
2488 p->se.nr_wakeups_idle = 0; 2488 p->se.nr_wakeups_idle = 0;
2489 2489
2490 #endif 2490 #endif
2491 2491
2492 INIT_LIST_HEAD(&p->rt.run_list); 2492 INIT_LIST_HEAD(&p->rt.run_list);
2493 p->se.on_rq = 0; 2493 p->se.on_rq = 0;
2494 INIT_LIST_HEAD(&p->se.group_node); 2494 INIT_LIST_HEAD(&p->se.group_node);
2495 2495
2496 #ifdef CONFIG_PREEMPT_NOTIFIERS 2496 #ifdef CONFIG_PREEMPT_NOTIFIERS
2497 INIT_HLIST_HEAD(&p->preempt_notifiers); 2497 INIT_HLIST_HEAD(&p->preempt_notifiers);
2498 #endif 2498 #endif
2499 2499
2500 /* 2500 /*
2501 * We mark the process as running here, but have not actually 2501 * We mark the process as running here, but have not actually
2502 * inserted it onto the runqueue yet. This guarantees that 2502 * inserted it onto the runqueue yet. This guarantees that
2503 * nobody will actually run it, and a signal or other external 2503 * nobody will actually run it, and a signal or other external
2504 * event cannot wake it up and insert it on the runqueue either. 2504 * event cannot wake it up and insert it on the runqueue either.
2505 */ 2505 */
2506 p->state = TASK_RUNNING; 2506 p->state = TASK_RUNNING;
2507 } 2507 }
2508 2508
2509 /* 2509 /*
2510 * fork()/clone()-time setup: 2510 * fork()/clone()-time setup:
2511 */ 2511 */
2512 void sched_fork(struct task_struct *p, int clone_flags) 2512 void sched_fork(struct task_struct *p, int clone_flags)
2513 { 2513 {
2514 int cpu = get_cpu(); 2514 int cpu = get_cpu();
2515 2515
2516 __sched_fork(p); 2516 __sched_fork(p);
2517 2517
2518 /* 2518 /*
2519 * Revert to default priority/policy on fork if requested. 2519 * Revert to default priority/policy on fork if requested.
2520 */ 2520 */
2521 if (unlikely(p->sched_reset_on_fork)) { 2521 if (unlikely(p->sched_reset_on_fork)) {
2522 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2522 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2523 p->policy = SCHED_NORMAL; 2523 p->policy = SCHED_NORMAL;
2524 p->normal_prio = p->static_prio; 2524 p->normal_prio = p->static_prio;
2525 } 2525 }
2526 2526
2527 if (PRIO_TO_NICE(p->static_prio) < 0) { 2527 if (PRIO_TO_NICE(p->static_prio) < 0) {
2528 p->static_prio = NICE_TO_PRIO(0); 2528 p->static_prio = NICE_TO_PRIO(0);
2529 p->normal_prio = p->static_prio; 2529 p->normal_prio = p->static_prio;
2530 set_load_weight(p); 2530 set_load_weight(p);
2531 } 2531 }
2532 2532
2533 /* 2533 /*
2534 * We don't need the reset flag anymore after the fork. It has 2534 * We don't need the reset flag anymore after the fork. It has
2535 * fulfilled its duty: 2535 * fulfilled its duty:
2536 */ 2536 */
2537 p->sched_reset_on_fork = 0; 2537 p->sched_reset_on_fork = 0;
2538 } 2538 }
2539 2539
2540 /* 2540 /*
2541 * Make sure we do not leak PI boosting priority to the child. 2541 * Make sure we do not leak PI boosting priority to the child.
2542 */ 2542 */
2543 p->prio = current->normal_prio; 2543 p->prio = current->normal_prio;
2544 2544
2545 if (!rt_prio(p->prio)) 2545 if (!rt_prio(p->prio))
2546 p->sched_class = &fair_sched_class; 2546 p->sched_class = &fair_sched_class;
2547 2547
2548 #ifdef CONFIG_SMP 2548 #ifdef CONFIG_SMP
2549 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2549 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2550 #endif 2550 #endif
2551 set_task_cpu(p, cpu); 2551 set_task_cpu(p, cpu);
2552 2552
2553 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2553 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2554 if (likely(sched_info_on())) 2554 if (likely(sched_info_on()))
2555 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2555 memset(&p->sched_info, 0, sizeof(p->sched_info));
2556 #endif 2556 #endif
2557 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2557 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2558 p->oncpu = 0; 2558 p->oncpu = 0;
2559 #endif 2559 #endif
2560 #ifdef CONFIG_PREEMPT 2560 #ifdef CONFIG_PREEMPT
2561 /* Want to start with kernel preemption disabled. */ 2561 /* Want to start with kernel preemption disabled. */
2562 task_thread_info(p)->preempt_count = 1; 2562 task_thread_info(p)->preempt_count = 1;
2563 #endif 2563 #endif
2564 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2564 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2565 2565
2566 put_cpu(); 2566 put_cpu();
2567 } 2567 }
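For reference, the priority arithmetic behind the reset-on-fork branch above, assuming the usual nice/priority mapping of this era (MAX_RT_PRIO = 100):

/*
 * Assumed mapping:
 *
 *   NICE_TO_PRIO(nice) = MAX_RT_PRIO + nice + 20   ->  nice   0 == prio 120
 *   PRIO_TO_NICE(prio) = prio - MAX_RT_PRIO - 20   ->  prio 110 == nice -10
 *
 * So PRIO_TO_NICE(p->static_prio) < 0 detects a boosted (negative-nice)
 * parent, and the reset path drops the child back to SCHED_NORMAL at
 * nice 0 (static_prio 120) without touching the parent.
 */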
2568 2568
2569 /* 2569 /*
2570 * wake_up_new_task - wake up a newly created task for the first time. 2570 * wake_up_new_task - wake up a newly created task for the first time.
2571 * 2571 *
2572 * This function will do some initial scheduler statistics housekeeping 2572 * This function will do some initial scheduler statistics housekeeping
2573 * that must be done for every newly created context, then puts the task 2573 * that must be done for every newly created context, then puts the task
2574 * on the runqueue and wakes it. 2574 * on the runqueue and wakes it.
2575 */ 2575 */
2576 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2576 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2577 { 2577 {
2578 unsigned long flags; 2578 unsigned long flags;
2579 struct rq *rq; 2579 struct rq *rq;
2580 2580
2581 rq = task_rq_lock(p, &flags); 2581 rq = task_rq_lock(p, &flags);
2582 BUG_ON(p->state != TASK_RUNNING); 2582 BUG_ON(p->state != TASK_RUNNING);
2583 update_rq_clock(rq); 2583 update_rq_clock(rq);
2584 2584
2585 if (!p->sched_class->task_new || !current->se.on_rq) { 2585 if (!p->sched_class->task_new || !current->se.on_rq) {
2586 activate_task(rq, p, 0); 2586 activate_task(rq, p, 0);
2587 } else { 2587 } else {
2588 /* 2588 /*
2589 * Let the scheduling class do new task startup 2589 * Let the scheduling class do new task startup
2590 * management (if any): 2590 * management (if any):
2591 */ 2591 */
2592 p->sched_class->task_new(rq, p); 2592 p->sched_class->task_new(rq, p);
2593 inc_nr_running(rq); 2593 inc_nr_running(rq);
2594 } 2594 }
2595 trace_sched_wakeup_new(rq, p, 1); 2595 trace_sched_wakeup_new(rq, p, 1);
2596 check_preempt_curr(rq, p, WF_FORK); 2596 check_preempt_curr(rq, p, WF_FORK);
2597 #ifdef CONFIG_SMP 2597 #ifdef CONFIG_SMP
2598 if (p->sched_class->task_wake_up) 2598 if (p->sched_class->task_wake_up)
2599 p->sched_class->task_wake_up(rq, p); 2599 p->sched_class->task_wake_up(rq, p);
2600 #endif 2600 #endif
2601 task_rq_unlock(rq, &flags); 2601 task_rq_unlock(rq, &flags);
2602 } 2602 }
2603 2603
2604 #ifdef CONFIG_PREEMPT_NOTIFIERS 2604 #ifdef CONFIG_PREEMPT_NOTIFIERS
2605 2605
2606 /** 2606 /**
2607 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2607 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2608 * @notifier: notifier struct to register 2608 * @notifier: notifier struct to register
2609 */ 2609 */
2610 void preempt_notifier_register(struct preempt_notifier *notifier) 2610 void preempt_notifier_register(struct preempt_notifier *notifier)
2611 { 2611 {
2612 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2612 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2613 } 2613 }
2614 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2614 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2615 2615
2616 /** 2616 /**
2617 * preempt_notifier_unregister - no longer interested in preemption notifications 2617 * preempt_notifier_unregister - no longer interested in preemption notifications
2618 * @notifier: notifier struct to unregister 2618 * @notifier: notifier struct to unregister
2619 * 2619 *
2620 * This is safe to call from within a preemption notifier. 2620 * This is safe to call from within a preemption notifier.
2621 */ 2621 */
2622 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2622 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2623 { 2623 {
2624 hlist_del(&notifier->link); 2624 hlist_del(&notifier->link);
2625 } 2625 }
2626 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2626 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2627 2627
2628 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2628 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2629 { 2629 {
2630 struct preempt_notifier *notifier; 2630 struct preempt_notifier *notifier;
2631 struct hlist_node *node; 2631 struct hlist_node *node;
2632 2632
2633 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2633 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2634 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2634 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2635 } 2635 }
2636 2636
2637 static void 2637 static void
2638 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2638 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2639 struct task_struct *next) 2639 struct task_struct *next)
2640 { 2640 {
2641 struct preempt_notifier *notifier; 2641 struct preempt_notifier *notifier;
2642 struct hlist_node *node; 2642 struct hlist_node *node;
2643 2643
2644 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2644 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2645 notifier->ops->sched_out(notifier, next); 2645 notifier->ops->sched_out(notifier, next);
2646 } 2646 }
2647 2647
2648 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2648 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2649 2649
2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2651 { 2651 {
2652 } 2652 }
2653 2653
2654 static void 2654 static void
2655 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2655 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2656 struct task_struct *next) 2656 struct task_struct *next)
2657 { 2657 {
2658 } 2658 }
2659 2659
2660 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2660 #endif /* CONFIG_PREEMPT_NOTIFIERS */
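A client of the notifier API above looks roughly like the sketch below; the my_* names are hypothetical, and preempt_notifier_init() is assumed to be the initializer provided by <linux/preempt.h> (KVM's vcpu load/put hooks are the in-tree user):

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* current was just (re)scheduled onto @cpu */
}

static void my_sched_out(struct preempt_notifier *pn,
			 struct task_struct *next)
{
	/* current is being switched out in favour of @next */
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

static struct preempt_notifier my_notifier;

/* Registers for notifications about the *current* task. */
static void my_watch_current(void)
{
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);
}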
2661 2661
2662 /** 2662 /**
2663 * prepare_task_switch - prepare to switch tasks 2663 * prepare_task_switch - prepare to switch tasks
2664 * @rq: the runqueue preparing to switch 2664 * @rq: the runqueue preparing to switch
2665 * @prev: the current task that is being switched out 2665 * @prev: the current task that is being switched out
2666 * @next: the task we are going to switch to. 2666 * @next: the task we are going to switch to.
2667 * 2667 *
2668 * This is called with the rq lock held and interrupts off. It must 2668 * This is called with the rq lock held and interrupts off. It must
2669 * be paired with a subsequent finish_task_switch after the context 2669 * be paired with a subsequent finish_task_switch after the context
2670 * switch. 2670 * switch.
2671 * 2671 *
2672 * prepare_task_switch sets up locking and calls architecture specific 2672 * prepare_task_switch sets up locking and calls architecture specific
2673 * hooks. 2673 * hooks.
2674 */ 2674 */
2675 static inline void 2675 static inline void
2676 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2676 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2677 struct task_struct *next) 2677 struct task_struct *next)
2678 { 2678 {
2679 fire_sched_out_preempt_notifiers(prev, next); 2679 fire_sched_out_preempt_notifiers(prev, next);
2680 prepare_lock_switch(rq, next); 2680 prepare_lock_switch(rq, next);
2681 prepare_arch_switch(next); 2681 prepare_arch_switch(next);
2682 } 2682 }
2683 2683
2684 /** 2684 /**
2685 * finish_task_switch - clean up after a task-switch 2685 * finish_task_switch - clean up after a task-switch
2686 * @rq: runqueue associated with task-switch 2686 * @rq: runqueue associated with task-switch
2687 * @prev: the thread we just switched away from. 2687 * @prev: the thread we just switched away from.
2688 * 2688 *
2689 * finish_task_switch must be called after the context switch, paired 2689 * finish_task_switch must be called after the context switch, paired
2690 * with a prepare_task_switch call before the context switch. 2690 * with a prepare_task_switch call before the context switch.
2691 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2691 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2692 * and do any other architecture-specific cleanup actions. 2692 * and do any other architecture-specific cleanup actions.
2693 * 2693 *
2694 * Note that we may have delayed dropping an mm in context_switch(). If 2694 * Note that we may have delayed dropping an mm in context_switch(). If
2695 * so, we finish that here outside of the runqueue lock. (Doing it 2695 * so, we finish that here outside of the runqueue lock. (Doing it
2696 * with the lock held can cause deadlocks; see schedule() for 2696 * with the lock held can cause deadlocks; see schedule() for
2697 * details.) 2697 * details.)
2698 */ 2698 */
2699 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2699 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2700 __releases(rq->lock) 2700 __releases(rq->lock)
2701 { 2701 {
2702 struct mm_struct *mm = rq->prev_mm; 2702 struct mm_struct *mm = rq->prev_mm;
2703 long prev_state; 2703 long prev_state;
2704 2704
2705 rq->prev_mm = NULL; 2705 rq->prev_mm = NULL;
2706 2706
2707 /* 2707 /*
2708 * A task struct has one reference for its use as "current". 2708 * A task struct has one reference for its use as "current".
2709 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2709 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2710 * schedule one last time. The schedule call will never return, and 2710 * schedule one last time. The schedule call will never return, and
2711 * the scheduled task must drop that reference. 2711 * the scheduled task must drop that reference.
2712 * The test for TASK_DEAD must occur while the runqueue locks are 2712 * The test for TASK_DEAD must occur while the runqueue locks are
2713 * still held, otherwise prev could be scheduled on another cpu, die 2713 * still held, otherwise prev could be scheduled on another cpu, die
2714 * there before we look at prev->state, and then the reference would 2714 * there before we look at prev->state, and then the reference would
2715 * be dropped twice. 2715 * be dropped twice.
2716 * Manfred Spraul <manfred@colorfullife.com> 2716 * Manfred Spraul <manfred@colorfullife.com>
2717 */ 2717 */
2718 prev_state = prev->state; 2718 prev_state = prev->state;
2719 finish_arch_switch(prev); 2719 finish_arch_switch(prev);
2720 perf_event_task_sched_in(current, cpu_of(rq)); 2720 perf_event_task_sched_in(current, cpu_of(rq));
2721 finish_lock_switch(rq, prev); 2721 finish_lock_switch(rq, prev);
2722 2722
2723 fire_sched_in_preempt_notifiers(current); 2723 fire_sched_in_preempt_notifiers(current);
2724 if (mm) 2724 if (mm)
2725 mmdrop(mm); 2725 mmdrop(mm);
2726 if (unlikely(prev_state == TASK_DEAD)) { 2726 if (unlikely(prev_state == TASK_DEAD)) {
2727 /* 2727 /*
2728 * Remove function-return probe instances associated with this 2728 * Remove function-return probe instances associated with this
2729 * task and put them back on the free list. 2729 * task and put them back on the free list.
2730 */ 2730 */
2731 kprobe_flush_task(prev); 2731 kprobe_flush_task(prev);
2732 put_task_struct(prev); 2732 put_task_struct(prev);
2733 } 2733 }
2734 } 2734 }
2735 2735
2736 #ifdef CONFIG_SMP 2736 #ifdef CONFIG_SMP
2737 2737
2738 /* assumes rq->lock is held */ 2738 /* assumes rq->lock is held */
2739 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2739 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2740 { 2740 {
2741 if (prev->sched_class->pre_schedule) 2741 if (prev->sched_class->pre_schedule)
2742 prev->sched_class->pre_schedule(rq, prev); 2742 prev->sched_class->pre_schedule(rq, prev);
2743 } 2743 }
2744 2744
2745 /* rq->lock is NOT held, but preemption is disabled */ 2745 /* rq->lock is NOT held, but preemption is disabled */
2746 static inline void post_schedule(struct rq *rq) 2746 static inline void post_schedule(struct rq *rq)
2747 { 2747 {
2748 if (rq->post_schedule) { 2748 if (rq->post_schedule) {
2749 unsigned long flags; 2749 unsigned long flags;
2750 2750
2751 spin_lock_irqsave(&rq->lock, flags); 2751 spin_lock_irqsave(&rq->lock, flags);
2752 if (rq->curr->sched_class->post_schedule) 2752 if (rq->curr->sched_class->post_schedule)
2753 rq->curr->sched_class->post_schedule(rq); 2753 rq->curr->sched_class->post_schedule(rq);
2754 spin_unlock_irqrestore(&rq->lock, flags); 2754 spin_unlock_irqrestore(&rq->lock, flags);
2755 2755
2756 rq->post_schedule = 0; 2756 rq->post_schedule = 0;
2757 } 2757 }
2758 } 2758 }
2759 2759
2760 #else 2760 #else
2761 2761
2762 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2762 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2763 { 2763 {
2764 } 2764 }
2765 2765
2766 static inline void post_schedule(struct rq *rq) 2766 static inline void post_schedule(struct rq *rq)
2767 { 2767 {
2768 } 2768 }
2769 2769
2770 #endif 2770 #endif
2771 2771
2772 /** 2772 /**
2773 * schedule_tail - first thing a freshly forked thread must call. 2773 * schedule_tail - first thing a freshly forked thread must call.
2774 * @prev: the thread we just switched away from. 2774 * @prev: the thread we just switched away from.
2775 */ 2775 */
2776 asmlinkage void schedule_tail(struct task_struct *prev) 2776 asmlinkage void schedule_tail(struct task_struct *prev)
2777 __releases(rq->lock) 2777 __releases(rq->lock)
2778 { 2778 {
2779 struct rq *rq = this_rq(); 2779 struct rq *rq = this_rq();
2780 2780
2781 finish_task_switch(rq, prev); 2781 finish_task_switch(rq, prev);
2782 2782
2783 /* 2783 /*
2784 * FIXME: do we need to worry about rq being invalidated by the 2784 * FIXME: do we need to worry about rq being invalidated by the
2785 * task_switch? 2785 * task_switch?
2786 */ 2786 */
2787 post_schedule(rq); 2787 post_schedule(rq);
2788 2788
2789 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2789 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2790 /* In this case, finish_task_switch does not reenable preemption */ 2790 /* In this case, finish_task_switch does not reenable preemption */
2791 preempt_enable(); 2791 preempt_enable();
2792 #endif 2792 #endif
2793 if (current->set_child_tid) 2793 if (current->set_child_tid)
2794 put_user(task_pid_vnr(current), current->set_child_tid); 2794 put_user(task_pid_vnr(current), current->set_child_tid);
2795 } 2795 }
2796 2796
2797 /* 2797 /*
2798 * context_switch - switch to the new MM and the new 2798 * context_switch - switch to the new MM and the new
2799 * thread's register state. 2799 * thread's register state.
2800 */ 2800 */
2801 static inline void 2801 static inline void
2802 context_switch(struct rq *rq, struct task_struct *prev, 2802 context_switch(struct rq *rq, struct task_struct *prev,
2803 struct task_struct *next) 2803 struct task_struct *next)
2804 { 2804 {
2805 struct mm_struct *mm, *oldmm; 2805 struct mm_struct *mm, *oldmm;
2806 2806
2807 prepare_task_switch(rq, prev, next); 2807 prepare_task_switch(rq, prev, next);
2808 trace_sched_switch(rq, prev, next); 2808 trace_sched_switch(rq, prev, next);
2809 mm = next->mm; 2809 mm = next->mm;
2810 oldmm = prev->active_mm; 2810 oldmm = prev->active_mm;
2811 /* 2811 /*
2812 * For paravirt, this is coupled with an exit in switch_to to 2812 * For paravirt, this is coupled with an exit in switch_to to
2813 * combine the page table reload and the switch backend into 2813 * combine the page table reload and the switch backend into
2814 * one hypercall. 2814 * one hypercall.
2815 */ 2815 */
2816 arch_start_context_switch(prev); 2816 arch_start_context_switch(prev);
2817 2817
2818 if (unlikely(!mm)) { 2818 if (unlikely(!mm)) {
2819 next->active_mm = oldmm; 2819 next->active_mm = oldmm;
2820 atomic_inc(&oldmm->mm_count); 2820 atomic_inc(&oldmm->mm_count);
2821 enter_lazy_tlb(oldmm, next); 2821 enter_lazy_tlb(oldmm, next);
2822 } else 2822 } else
2823 switch_mm(oldmm, mm, next); 2823 switch_mm(oldmm, mm, next);
2824 2824
2825 if (unlikely(!prev->mm)) { 2825 if (unlikely(!prev->mm)) {
2826 prev->active_mm = NULL; 2826 prev->active_mm = NULL;
2827 rq->prev_mm = oldmm; 2827 rq->prev_mm = oldmm;
2828 } 2828 }
2829 /* 2829 /*
2830 * Since the runqueue lock will be released by the next 2830 * Since the runqueue lock will be released by the next
2831 * task (which is an invalid locking op but in the case 2831 * task (which is an invalid locking op but in the case
2832 * of the scheduler it's an obvious special-case), we 2832 * of the scheduler it's an obvious special-case), we
2833 * do an early lockdep release here: 2833 * do an early lockdep release here:
2834 */ 2834 */
2835 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2835 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2836 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2836 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2837 #endif 2837 #endif
2838 2838
2839 /* Here we just switch the register state and the stack. */ 2839 /* Here we just switch the register state and the stack. */
2840 switch_to(prev, next, prev); 2840 switch_to(prev, next, prev);
2841 2841
2842 barrier(); 2842 barrier();
2843 /* 2843 /*
2844 * this_rq must be evaluated again because prev may have moved 2844 * this_rq must be evaluated again because prev may have moved
2845 * CPUs since it called schedule(), thus the 'rq' on its stack 2845 * CPUs since it called schedule(), thus the 'rq' on its stack
2846 * frame will be invalid. 2846 * frame will be invalid.
2847 */ 2847 */
2848 finish_task_switch(this_rq(), prev); 2848 finish_task_switch(this_rq(), prev);
2849 } 2849 }
2850 2850
2851 /* 2851 /*
2852 * nr_running, nr_uninterruptible and nr_context_switches: 2852 * nr_running, nr_uninterruptible and nr_context_switches:
2853 * 2853 *
2854 * externally visible scheduler statistics: current number of runnable 2854 * externally visible scheduler statistics: current number of runnable
2855 * threads, current number of uninterruptible-sleeping threads, total 2855 * threads, current number of uninterruptible-sleeping threads, total
2856 * number of context switches performed since bootup. 2856 * number of context switches performed since bootup.
2857 */ 2857 */
2858 unsigned long nr_running(void) 2858 unsigned long nr_running(void)
2859 { 2859 {
2860 unsigned long i, sum = 0; 2860 unsigned long i, sum = 0;
2861 2861
2862 for_each_online_cpu(i) 2862 for_each_online_cpu(i)
2863 sum += cpu_rq(i)->nr_running; 2863 sum += cpu_rq(i)->nr_running;
2864 2864
2865 return sum; 2865 return sum;
2866 } 2866 }
2867 2867
2868 unsigned long nr_uninterruptible(void) 2868 unsigned long nr_uninterruptible(void)
2869 { 2869 {
2870 unsigned long i, sum = 0; 2870 unsigned long i, sum = 0;
2871 2871
2872 for_each_possible_cpu(i) 2872 for_each_possible_cpu(i)
2873 sum += cpu_rq(i)->nr_uninterruptible; 2873 sum += cpu_rq(i)->nr_uninterruptible;
2874 2874
2875 /* 2875 /*
2876 * Since we read the counters lockless, it might be slightly 2876 * Since we read the counters lockless, it might be slightly
2877 * inaccurate. Do not allow it to go below zero though: 2877 * inaccurate. Do not allow it to go below zero though:
2878 */ 2878 */
2879 if (unlikely((long)sum < 0)) 2879 if (unlikely((long)sum < 0))
2880 sum = 0; 2880 sum = 0;
2881 2881
2882 return sum; 2882 return sum;
2883 } 2883 }
2884 2884
2885 unsigned long long nr_context_switches(void) 2885 unsigned long long nr_context_switches(void)
2886 { 2886 {
2887 int i; 2887 int i;
2888 unsigned long long sum = 0; 2888 unsigned long long sum = 0;
2889 2889
2890 for_each_possible_cpu(i) 2890 for_each_possible_cpu(i)
2891 sum += cpu_rq(i)->nr_switches; 2891 sum += cpu_rq(i)->nr_switches;
2892 2892
2893 return sum; 2893 return sum;
2894 } 2894 }
2895 2895
2896 unsigned long nr_iowait(void) 2896 unsigned long nr_iowait(void)
2897 { 2897 {
2898 unsigned long i, sum = 0; 2898 unsigned long i, sum = 0;
2899 2899
2900 for_each_possible_cpu(i) 2900 for_each_possible_cpu(i)
2901 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2901 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2902 2902
2903 return sum; 2903 return sum;
2904 } 2904 }
2905 2905
2906 unsigned long nr_iowait_cpu(void) 2906 unsigned long nr_iowait_cpu(void)
2907 { 2907 {
2908 struct rq *this = this_rq(); 2908 struct rq *this = this_rq();
2909 return atomic_read(&this->nr_iowait); 2909 return atomic_read(&this->nr_iowait);
2910 } 2910 }
2911 2911
2912 unsigned long this_cpu_load(void) 2912 unsigned long this_cpu_load(void)
2913 { 2913 {
2914 struct rq *this = this_rq(); 2914 struct rq *this = this_rq();
2915 return this->cpu_load[0]; 2915 return this->cpu_load[0];
2916 } 2916 }
2917 2917
2918 2918
2919 /* Variables and functions for calc_load */ 2919 /* Variables and functions for calc_load */
2920 static atomic_long_t calc_load_tasks; 2920 static atomic_long_t calc_load_tasks;
2921 static unsigned long calc_load_update; 2921 static unsigned long calc_load_update;
2922 unsigned long avenrun[3]; 2922 unsigned long avenrun[3];
2923 EXPORT_SYMBOL(avenrun); 2923 EXPORT_SYMBOL(avenrun);
2924 2924
2925 /** 2925 /**
2926 * get_avenrun - get the load average array 2926 * get_avenrun - get the load average array
2927 * @loads: pointer to dest load array 2927 * @loads: pointer to dest load array
2928 * @offset: offset to add 2928 * @offset: offset to add
2929 * @shift: shift count to shift the result left 2929 * @shift: shift count to shift the result left
2930 * 2930 *
2931 * These values are estimates at best, so no need for locking. 2931 * These values are estimates at best, so no need for locking.
2932 */ 2932 */
2933 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2933 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2934 { 2934 {
2935 loads[0] = (avenrun[0] + offset) << shift; 2935 loads[0] = (avenrun[0] + offset) << shift;
2936 loads[1] = (avenrun[1] + offset) << shift; 2936 loads[1] = (avenrun[1] + offset) << shift;
2937 loads[2] = (avenrun[2] + offset) << shift; 2937 loads[2] = (avenrun[2] + offset) << shift;
2938 } 2938 }
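The returned values are in FSHIFT fixed-point form. A sketch of the usual /proc/loadavg-style conversion to a decimal reading (LOAD_INT/LOAD_FRAC are redefined locally here; the FIXED_1/200 offset rounds to the nearest hundredth):

#include <linux/kernel.h>
#include <linux/sched.h>	/* FSHIFT, FIXED_1, get_avenrun() */

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static void print_loadavg_sketch(void)
{
	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1 / 200, 0);

	printk(KERN_INFO "load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
	       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
	       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
}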
2939 2939
2940 static unsigned long 2940 static unsigned long
2941 calc_load(unsigned long load, unsigned long exp, unsigned long active) 2941 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2942 { 2942 {
2943 load *= exp; 2943 load *= exp;
2944 load += active * (FIXED_1 - exp); 2944 load += active * (FIXED_1 - exp);
2945 return load >> FSHIFT; 2945 return load >> FSHIFT;
2946 } 2946 }
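A worked step of this fixed-point exponential average, assuming the usual constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884, roughly 2048/e^(5 sec/1 min)):

/*
 * Worked example with the assumed constants:
 *
 *   previous 1-minute average of 1.00  ->  load   = 2048
 *   3 tasks runnable/uninterruptible   ->  active = 3 * 2048 = 6144
 *
 *   new = (2048 * 1884 + 6144 * (2048 - 1884)) >> 11
 *       = (3858432 + 1007616) >> 11
 *       = 4866048 >> 11
 *       = 2376                         ->  about 1.16
 *
 * Each LOAD_FREQ interval therefore moves the estimate ~8% of the way
 * (164/2048) towards the instantaneous task count.
 */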
2947 2947
2948 /* 2948 /*
2949 * calc_global_load - update the avenrun load estimates 10 ticks after the 2949 * calc_global_load - update the avenrun load estimates 10 ticks after the
2950 * CPUs have updated calc_load_tasks. 2950 * CPUs have updated calc_load_tasks.
2951 */ 2951 */
2952 void calc_global_load(void) 2952 void calc_global_load(void)
2953 { 2953 {
2954 unsigned long upd = calc_load_update + 10; 2954 unsigned long upd = calc_load_update + 10;
2955 long active; 2955 long active;
2956 2956
2957 if (time_before(jiffies, upd)) 2957 if (time_before(jiffies, upd))
2958 return; 2958 return;
2959 2959
2960 active = atomic_long_read(&calc_load_tasks); 2960 active = atomic_long_read(&calc_load_tasks);
2961 active = active > 0 ? active * FIXED_1 : 0; 2961 active = active > 0 ? active * FIXED_1 : 0;
2962 2962
2963 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2963 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2964 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2964 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2965 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2965 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2966 2966
2967 calc_load_update += LOAD_FREQ; 2967 calc_load_update += LOAD_FREQ;
2968 } 2968 }
2969 2969
2970 /* 2970 /*
2971 * Either called from update_cpu_load() or from a cpu going idle 2971 * Either called from update_cpu_load() or from a cpu going idle
2972 */ 2972 */
2973 static void calc_load_account_active(struct rq *this_rq) 2973 static void calc_load_account_active(struct rq *this_rq)
2974 { 2974 {
2975 long nr_active, delta; 2975 long nr_active, delta;
2976 2976
2977 nr_active = this_rq->nr_running; 2977 nr_active = this_rq->nr_running;
2978 nr_active += (long) this_rq->nr_uninterruptible; 2978 nr_active += (long) this_rq->nr_uninterruptible;
2979 2979
2980 if (nr_active != this_rq->calc_load_active) { 2980 if (nr_active != this_rq->calc_load_active) {
2981 delta = nr_active - this_rq->calc_load_active; 2981 delta = nr_active - this_rq->calc_load_active;
2982 this_rq->calc_load_active = nr_active; 2982 this_rq->calc_load_active = nr_active;
2983 atomic_long_add(delta, &calc_load_tasks); 2983 atomic_long_add(delta, &calc_load_tasks);
2984 } 2984 }
2985 } 2985 }
2986 2986
2987 /* 2987 /*
2988 * Externally visible per-cpu scheduler statistics: 2988 * Externally visible per-cpu scheduler statistics:
2989 * cpu_nr_migrations(cpu) - number of migrations into that cpu 2989 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2990 */ 2990 */
2991 u64 cpu_nr_migrations(int cpu) 2991 u64 cpu_nr_migrations(int cpu)
2992 { 2992 {
2993 return cpu_rq(cpu)->nr_migrations_in; 2993 return cpu_rq(cpu)->nr_migrations_in;
2994 } 2994 }
2995 2995
2996 /* 2996 /*
2997 * Update rq->cpu_load[] statistics. This function is usually called every 2997 * Update rq->cpu_load[] statistics. This function is usually called every
2998 * scheduler tick (TICK_NSEC). 2998 * scheduler tick (TICK_NSEC).
2999 */ 2999 */
3000 static void update_cpu_load(struct rq *this_rq) 3000 static void update_cpu_load(struct rq *this_rq)
3001 { 3001 {
3002 unsigned long this_load = this_rq->load.weight; 3002 unsigned long this_load = this_rq->load.weight;
3003 int i, scale; 3003 int i, scale;
3004 3004
3005 this_rq->nr_load_updates++; 3005 this_rq->nr_load_updates++;
3006 3006
3007 /* Update our load: */ 3007 /* Update our load: */
3008 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3008 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3009 unsigned long old_load, new_load; 3009 unsigned long old_load, new_load;
3010 3010
3011 /* scale is effectively 1 << i now, and >> i divides by scale */ 3011 /* scale is effectively 1 << i now, and >> i divides by scale */
3012 3012
3013 old_load = this_rq->cpu_load[i]; 3013 old_load = this_rq->cpu_load[i];
3014 new_load = this_load; 3014 new_load = this_load;
3015 /* 3015 /*
3016 * Round up the averaging division if load is increasing. This 3016 * Round up the averaging division if load is increasing. This
3017 * prevents us from getting stuck on 9 if the load is 10, for 3017 * prevents us from getting stuck on 9 if the load is 10, for
3018 * example. 3018 * example.
3019 */ 3019 */
3020 if (new_load > old_load) 3020 if (new_load > old_load)
3021 new_load += scale-1; 3021 new_load += scale-1;
3022 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3022 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3023 } 3023 }
3024 3024
3025 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3025 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3026 this_rq->calc_load_update += LOAD_FREQ; 3026 this_rq->calc_load_update += LOAD_FREQ;
3027 calc_load_account_active(this_rq); 3027 calc_load_account_active(this_rq);
3028 } 3028 }
3029 } 3029 }
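A worked step of the per-index decay above: index 0 tracks the instantaneous runqueue weight, while index i only moves 1/2^i of the way towards it each tick.

/*
 * Example for i = 2 (scale = 4), old_load = 1024, this_load = 2048:
 *
 *   new_load    = 2048 + (4 - 1)       = 2051   (round up, load rising)
 *   cpu_load[2] = (1024 * 3 + 2051) >> 2
 *               = 5123 >> 2
 *               = 1280                 = 1024 + (2048 - 1024) / 4
 *
 * i = 0 degenerates to cpu_load[0] = this_load, the raw runqueue weight.
 */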
3030 3030
3031 #ifdef CONFIG_SMP 3031 #ifdef CONFIG_SMP
3032 3032
3033 /* 3033 /*
3034 * double_rq_lock - safely lock two runqueues 3034 * double_rq_lock - safely lock two runqueues
3035 * 3035 *
3036 * Note this does not disable interrupts like task_rq_lock; 3036 * Note this does not disable interrupts like task_rq_lock;
3037 * you need to do so manually before calling. 3037 * you need to do so manually before calling.
3038 */ 3038 */
3039 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 3039 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3040 __acquires(rq1->lock) 3040 __acquires(rq1->lock)
3041 __acquires(rq2->lock) 3041 __acquires(rq2->lock)
3042 { 3042 {
3043 BUG_ON(!irqs_disabled()); 3043 BUG_ON(!irqs_disabled());
3044 if (rq1 == rq2) { 3044 if (rq1 == rq2) {
3045 spin_lock(&rq1->lock); 3045 spin_lock(&rq1->lock);
3046 __acquire(rq2->lock); /* Fake it out ;) */ 3046 __acquire(rq2->lock); /* Fake it out ;) */
3047 } else { 3047 } else {
3048 if (rq1 < rq2) { 3048 if (rq1 < rq2) {
3049 spin_lock(&rq1->lock); 3049 spin_lock(&rq1->lock);
3050 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3050 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3051 } else { 3051 } else {
3052 spin_lock(&rq2->lock); 3052 spin_lock(&rq2->lock);
3053 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3053 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3054 } 3054 }
3055 } 3055 }
3056 update_rq_clock(rq1); 3056 update_rq_clock(rq1);
3057 update_rq_clock(rq2); 3057 update_rq_clock(rq2);
3058 } 3058 }
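The rq1 < rq2 test above is the usual address-ordering trick: every path that locks a pair of runqueues takes them lowest-address first, so two CPUs locking the same pair can never deadlock against each other. A rough user-space analogue with POSIX mutexes (illustrative sketch only, not kernel code; build with -pthread):

#include <pthread.h>
#include <stdint.h>

/* Lock two mutexes in a globally consistent order (lowest address first),
 * mirroring the rq1 < rq2 ordering used by double_rq_lock(). */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Both argument orders end up taking the locks in the same order. */
        lock_pair(&m1, &m2);
        unlock_pair(&m1, &m2);
        lock_pair(&m2, &m1);
        unlock_pair(&m2, &m1);
        return 0;
}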
3059 3059
3060 /* 3060 /*
3061 * double_rq_unlock - safely unlock two runqueues 3061 * double_rq_unlock - safely unlock two runqueues
3062 * 3062 *
3063 * Note this does not restore interrupts like task_rq_unlock; 3063 * Note this does not restore interrupts like task_rq_unlock;
3064 * you need to do so manually after calling. 3064 * you need to do so manually after calling.
3065 */ 3065 */
3066 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 3066 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3067 __releases(rq1->lock) 3067 __releases(rq1->lock)
3068 __releases(rq2->lock) 3068 __releases(rq2->lock)
3069 { 3069 {
3070 spin_unlock(&rq1->lock); 3070 spin_unlock(&rq1->lock);
3071 if (rq1 != rq2) 3071 if (rq1 != rq2)
3072 spin_unlock(&rq2->lock); 3072 spin_unlock(&rq2->lock);
3073 else 3073 else
3074 __release(rq2->lock); 3074 __release(rq2->lock);
3075 } 3075 }
3076 3076
3077 /* 3077 /*
3078 * If dest_cpu is allowed for this process, migrate the task to it. 3078 * If dest_cpu is allowed for this process, migrate the task to it.
3079 * This is accomplished by forcing the cpu_allowed mask to only 3079 * This is accomplished by forcing the cpu_allowed mask to only
3080 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 3080 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3081 * the cpu_allowed mask is restored. 3081 * the cpu_allowed mask is restored.
3082 */ 3082 */
3083 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3083 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3084 { 3084 {
3085 struct migration_req req; 3085 struct migration_req req;
3086 unsigned long flags; 3086 unsigned long flags;
3087 struct rq *rq; 3087 struct rq *rq;
3088 3088
3089 rq = task_rq_lock(p, &flags); 3089 rq = task_rq_lock(p, &flags);
3090 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3090 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3091 || unlikely(!cpu_active(dest_cpu))) 3091 || unlikely(!cpu_active(dest_cpu)))
3092 goto out; 3092 goto out;
3093 3093
3094 /* force the process onto the specified CPU */ 3094 /* force the process onto the specified CPU */
3095 if (migrate_task(p, dest_cpu, &req)) { 3095 if (migrate_task(p, dest_cpu, &req)) {
3096 /* Need to wait for migration thread (might exit: take ref). */ 3096 /* Need to wait for migration thread (might exit: take ref). */
3097 struct task_struct *mt = rq->migration_thread; 3097 struct task_struct *mt = rq->migration_thread;
3098 3098
3099 get_task_struct(mt); 3099 get_task_struct(mt);
3100 task_rq_unlock(rq, &flags); 3100 task_rq_unlock(rq, &flags);
3101 wake_up_process(mt); 3101 wake_up_process(mt);
3102 put_task_struct(mt); 3102 put_task_struct(mt);
3103 wait_for_completion(&req.done); 3103 wait_for_completion(&req.done);
3104 3104
3105 return; 3105 return;
3106 } 3106 }
3107 out: 3107 out:
3108 task_rq_unlock(rq, &flags); 3108 task_rq_unlock(rq, &flags);
3109 } 3109 }
3110 3110
3111 /* 3111 /*
3112 * sched_exec - execve() is a valuable balancing opportunity, because at 3112 * sched_exec - execve() is a valuable balancing opportunity, because at
3113 * this point the task has the smallest effective memory and cache footprint. 3113 * this point the task has the smallest effective memory and cache footprint.
3114 */ 3114 */
3115 void sched_exec(void) 3115 void sched_exec(void)
3116 { 3116 {
3117 int new_cpu, this_cpu = get_cpu(); 3117 int new_cpu, this_cpu = get_cpu();
3118 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3118 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3119 put_cpu(); 3119 put_cpu();
3120 if (new_cpu != this_cpu) 3120 if (new_cpu != this_cpu)
3121 sched_migrate_task(current, new_cpu); 3121 sched_migrate_task(current, new_cpu);
3122 } 3122 }
3123 3123
3124 /* 3124 /*
3125 * pull_task - move a task from a remote runqueue to the local runqueue. 3125 * pull_task - move a task from a remote runqueue to the local runqueue.
3126 * Both runqueues must be locked. 3126 * Both runqueues must be locked.
3127 */ 3127 */
3128 static void pull_task(struct rq *src_rq, struct task_struct *p, 3128 static void pull_task(struct rq *src_rq, struct task_struct *p,
3129 struct rq *this_rq, int this_cpu) 3129 struct rq *this_rq, int this_cpu)
3130 { 3130 {
3131 deactivate_task(src_rq, p, 0); 3131 deactivate_task(src_rq, p, 0);
3132 set_task_cpu(p, this_cpu); 3132 set_task_cpu(p, this_cpu);
3133 activate_task(this_rq, p, 0); 3133 activate_task(this_rq, p, 0);
3134 /* 3134 /*
3135 * Note that idle threads have a prio of MAX_PRIO, so this test 3135 * Note that idle threads have a prio of MAX_PRIO, so this test
3136 * is always true for them. 3136 * is always true for them.
3137 */ 3137 */
3138 check_preempt_curr(this_rq, p, 0); 3138 check_preempt_curr(this_rq, p, 0);
3139 } 3139 }
3140 3140
3141 /* 3141 /*
3142 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3142 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3143 */ 3143 */
3144 static 3144 static
3145 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3145 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3146 struct sched_domain *sd, enum cpu_idle_type idle, 3146 struct sched_domain *sd, enum cpu_idle_type idle,
3147 int *all_pinned) 3147 int *all_pinned)
3148 { 3148 {
3149 int tsk_cache_hot = 0; 3149 int tsk_cache_hot = 0;
3150 /* 3150 /*
3151 * We do not migrate tasks that are: 3151 * We do not migrate tasks that are:
3152 * 1) running (obviously), or 3152 * 1) running (obviously), or
3153 * 2) not allowed to migrate to this CPU due to cpus_allowed, or 3153 * 2) not allowed to migrate to this CPU due to cpus_allowed, or
3154 * 3) cache-hot on their current CPU. 3154 * 3) cache-hot on their current CPU.
3155 */ 3155 */
3156 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 3156 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3157 schedstat_inc(p, se.nr_failed_migrations_affine); 3157 schedstat_inc(p, se.nr_failed_migrations_affine);
3158 return 0; 3158 return 0;
3159 } 3159 }
3160 *all_pinned = 0; 3160 *all_pinned = 0;
3161 3161
3162 if (task_running(rq, p)) { 3162 if (task_running(rq, p)) {
3163 schedstat_inc(p, se.nr_failed_migrations_running); 3163 schedstat_inc(p, se.nr_failed_migrations_running);
3164 return 0; 3164 return 0;
3165 } 3165 }
3166 3166
3167 /* 3167 /*
3168 * Aggressive migration if: 3168 * Aggressive migration if:
3169 * 1) task is cache cold, or 3169 * 1) task is cache cold, or
3170 * 2) too many balance attempts have failed. 3170 * 2) too many balance attempts have failed.
3171 */ 3171 */
3172 3172
3173 tsk_cache_hot = task_hot(p, rq->clock, sd); 3173 tsk_cache_hot = task_hot(p, rq->clock, sd);
3174 if (!tsk_cache_hot || 3174 if (!tsk_cache_hot ||
3175 sd->nr_balance_failed > sd->cache_nice_tries) { 3175 sd->nr_balance_failed > sd->cache_nice_tries) {
3176 #ifdef CONFIG_SCHEDSTATS 3176 #ifdef CONFIG_SCHEDSTATS
3177 if (tsk_cache_hot) { 3177 if (tsk_cache_hot) {
3178 schedstat_inc(sd, lb_hot_gained[idle]); 3178 schedstat_inc(sd, lb_hot_gained[idle]);
3179 schedstat_inc(p, se.nr_forced_migrations); 3179 schedstat_inc(p, se.nr_forced_migrations);
3180 } 3180 }
3181 #endif 3181 #endif
3182 return 1; 3182 return 1;
3183 } 3183 }
3184 3184
3185 if (tsk_cache_hot) { 3185 if (tsk_cache_hot) {
3186 schedstat_inc(p, se.nr_failed_migrations_hot); 3186 schedstat_inc(p, se.nr_failed_migrations_hot);
3187 return 0; 3187 return 0;
3188 } 3188 }
3189 return 1; 3189 return 1;
3190 } 3190 }
3191 3191
3192 static unsigned long 3192 static unsigned long
3193 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3193 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3194 unsigned long max_load_move, struct sched_domain *sd, 3194 unsigned long max_load_move, struct sched_domain *sd,
3195 enum cpu_idle_type idle, int *all_pinned, 3195 enum cpu_idle_type idle, int *all_pinned,
3196 int *this_best_prio, struct rq_iterator *iterator) 3196 int *this_best_prio, struct rq_iterator *iterator)
3197 { 3197 {
3198 int loops = 0, pulled = 0, pinned = 0; 3198 int loops = 0, pulled = 0, pinned = 0;
3199 struct task_struct *p; 3199 struct task_struct *p;
3200 long rem_load_move = max_load_move; 3200 long rem_load_move = max_load_move;
3201 3201
3202 if (max_load_move == 0) 3202 if (max_load_move == 0)
3203 goto out; 3203 goto out;
3204 3204
3205 pinned = 1; 3205 pinned = 1;
3206 3206
3207 /* 3207 /*
3208 * Start the load-balancing iterator: 3208 * Start the load-balancing iterator:
3209 */ 3209 */
3210 p = iterator->start(iterator->arg); 3210 p = iterator->start(iterator->arg);
3211 next: 3211 next:
3212 if (!p || loops++ > sysctl_sched_nr_migrate) 3212 if (!p || loops++ > sysctl_sched_nr_migrate)
3213 goto out; 3213 goto out;
3214 3214
3215 if ((p->se.load.weight >> 1) > rem_load_move || 3215 if ((p->se.load.weight >> 1) > rem_load_move ||
3216 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3216 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3217 p = iterator->next(iterator->arg); 3217 p = iterator->next(iterator->arg);
3218 goto next; 3218 goto next;
3219 } 3219 }
3220 3220
3221 pull_task(busiest, p, this_rq, this_cpu); 3221 pull_task(busiest, p, this_rq, this_cpu);
3222 pulled++; 3222 pulled++;
3223 rem_load_move -= p->se.load.weight; 3223 rem_load_move -= p->se.load.weight;
3224 3224
3225 #ifdef CONFIG_PREEMPT 3225 #ifdef CONFIG_PREEMPT
3226 /* 3226 /*
3227 * NEWIDLE balancing is a source of latency, so preemptible kernels 3227 * NEWIDLE balancing is a source of latency, so preemptible kernels
3228 * will stop after the first task is pulled to minimize the critical 3228 * will stop after the first task is pulled to minimize the critical
3229 * section. 3229 * section.
3230 */ 3230 */
3231 if (idle == CPU_NEWLY_IDLE) 3231 if (idle == CPU_NEWLY_IDLE)
3232 goto out; 3232 goto out;
3233 #endif 3233 #endif
3234 3234
3235 /* 3235 /*
3236 * We only want to steal up to the prescribed amount of weighted load. 3236 * We only want to steal up to the prescribed amount of weighted load.
3237 */ 3237 */
3238 if (rem_load_move > 0) { 3238 if (rem_load_move > 0) {
3239 if (p->prio < *this_best_prio) 3239 if (p->prio < *this_best_prio)
3240 *this_best_prio = p->prio; 3240 *this_best_prio = p->prio;
3241 p = iterator->next(iterator->arg); 3241 p = iterator->next(iterator->arg);
3242 goto next; 3242 goto next;
3243 } 3243 }
3244 out: 3244 out:
3245 /* 3245 /*
3246 * Right now, this is one of only two places pull_task() is called, 3246 * Right now, this is one of only two places pull_task() is called,
3247 * so we can safely collect pull_task() stats here rather than 3247 * so we can safely collect pull_task() stats here rather than
3248 * inside pull_task(). 3248 * inside pull_task().
3249 */ 3249 */
3250 schedstat_add(sd, lb_gained[idle], pulled); 3250 schedstat_add(sd, lb_gained[idle], pulled);
3251 3251
3252 if (all_pinned) 3252 if (all_pinned)
3253 *all_pinned = pinned; 3253 *all_pinned = pinned;
3254 3254
3255 return max_load_move - rem_load_move; 3255 return max_load_move - rem_load_move;
3256 } 3256 }
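A toy model of the rem_load_move accounting in the loop above (user-space C with made-up task weights; the sysctl_sched_nr_migrate cap and the preemption early-out are left out of the sketch): tasks whose weight exceeds twice the remaining budget are skipped, and pulling stops once the budget is spent.

#include <stdio.h>

struct toy_task {
        const char *name;
        unsigned long weight;
};

/* Pull tasks until the weighted-load budget runs out, skipping any task
 * whose weight is more than twice the remaining budget. */
static unsigned long pull_up_to(const struct toy_task *tasks, int nr,
                                unsigned long max_load_move)
{
        long rem = max_load_move;
        unsigned long moved = 0;
        int i;

        for (i = 0; i < nr && rem > 0; i++) {
                if ((tasks[i].weight >> 1) > (unsigned long)rem)
                        continue;
                rem -= tasks[i].weight;
                moved += tasks[i].weight;
                printf("pulled %s (weight %lu), budget left %ld\n",
                       tasks[i].name, tasks[i].weight, rem);
        }
        return moved;
}

int main(void)
{
        const struct toy_task busiest[] = {
                { "A", 1024 }, { "B", 3072 }, { "C", 1024 }, { "D", 335 },
        };

        pull_up_to(busiest, 4, 2048);   /* pulls A and C, skips B, never reaches D */
        return 0;
}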
3257 3257
3258 /* 3258 /*
3259 * move_tasks tries to move up to max_load_move weighted load from busiest to 3259 * move_tasks tries to move up to max_load_move weighted load from busiest to
3260 * this_rq, as part of a balancing operation within domain "sd". 3260 * this_rq, as part of a balancing operation within domain "sd".
3261 * Returns 1 if successful and 0 otherwise. 3261 * Returns 1 if successful and 0 otherwise.
3262 * 3262 *
3263 * Called with both runqueues locked. 3263 * Called with both runqueues locked.
3264 */ 3264 */
3265 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3265 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3266 unsigned long max_load_move, 3266 unsigned long max_load_move,
3267 struct sched_domain *sd, enum cpu_idle_type idle, 3267 struct sched_domain *sd, enum cpu_idle_type idle,
3268 int *all_pinned) 3268 int *all_pinned)
3269 { 3269 {
3270 const struct sched_class *class = sched_class_highest; 3270 const struct sched_class *class = sched_class_highest;
3271 unsigned long total_load_moved = 0; 3271 unsigned long total_load_moved = 0;
3272 int this_best_prio = this_rq->curr->prio; 3272 int this_best_prio = this_rq->curr->prio;
3273 3273
3274 do { 3274 do {
3275 total_load_moved += 3275 total_load_moved +=
3276 class->load_balance(this_rq, this_cpu, busiest, 3276 class->load_balance(this_rq, this_cpu, busiest,
3277 max_load_move - total_load_moved, 3277 max_load_move - total_load_moved,
3278 sd, idle, all_pinned, &this_best_prio); 3278 sd, idle, all_pinned, &this_best_prio);
3279 class = class->next; 3279 class = class->next;
3280 3280
3281 #ifdef CONFIG_PREEMPT 3281 #ifdef CONFIG_PREEMPT
3282 /* 3282 /*
3283 * NEWIDLE balancing is a source of latency, so preemptible 3283 * NEWIDLE balancing is a source of latency, so preemptible
3284 * kernels will stop after the first task is pulled to minimize 3284 * kernels will stop after the first task is pulled to minimize
3285 * the critical section. 3285 * the critical section.
3286 */ 3286 */
3287 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3287 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3288 break; 3288 break;
3289 #endif 3289 #endif
3290 } while (class && max_load_move > total_load_moved); 3290 } while (class && max_load_move > total_load_moved);
3291 3291
3292 return total_load_moved > 0; 3292 return total_load_moved > 0;
3293 } 3293 }
3294 3294
3295 static int 3295 static int
3296 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 3296 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3297 struct sched_domain *sd, enum cpu_idle_type idle, 3297 struct sched_domain *sd, enum cpu_idle_type idle,
3298 struct rq_iterator *iterator) 3298 struct rq_iterator *iterator)
3299 { 3299 {
3300 struct task_struct *p = iterator->start(iterator->arg); 3300 struct task_struct *p = iterator->start(iterator->arg);
3301 int pinned = 0; 3301 int pinned = 0;
3302 3302
3303 while (p) { 3303 while (p) {
3304 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3304 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3305 pull_task(busiest, p, this_rq, this_cpu); 3305 pull_task(busiest, p, this_rq, this_cpu);
3306 /* 3306 /*
3307 * Right now, this is only the second place pull_task() 3307 * Right now, this is only the second place pull_task()
3308 * is called, so we can safely collect pull_task() 3308 * is called, so we can safely collect pull_task()
3309 * stats here rather than inside pull_task(). 3309 * stats here rather than inside pull_task().
3310 */ 3310 */
3311 schedstat_inc(sd, lb_gained[idle]); 3311 schedstat_inc(sd, lb_gained[idle]);
3312 3312
3313 return 1; 3313 return 1;
3314 } 3314 }
3315 p = iterator->next(iterator->arg); 3315 p = iterator->next(iterator->arg);
3316 } 3316 }
3317 3317
3318 return 0; 3318 return 0;
3319 } 3319 }
3320 3320
3321 /* 3321 /*
3322 * move_one_task tries to move exactly one task from busiest to this_rq, as 3322 * move_one_task tries to move exactly one task from busiest to this_rq, as
3323 * part of active balancing operations within "domain". 3323 * part of active balancing operations within "domain".
3324 * Returns 1 if successful and 0 otherwise. 3324 * Returns 1 if successful and 0 otherwise.
3325 * 3325 *
3326 * Called with both runqueues locked. 3326 * Called with both runqueues locked.
3327 */ 3327 */
3328 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 3328 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3329 struct sched_domain *sd, enum cpu_idle_type idle) 3329 struct sched_domain *sd, enum cpu_idle_type idle)
3330 { 3330 {
3331 const struct sched_class *class; 3331 const struct sched_class *class;
3332 3332
3333 for_each_class(class) { 3333 for_each_class(class) {
3334 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3334 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3335 return 1; 3335 return 1;
3336 } 3336 }
3337 3337
3338 return 0; 3338 return 0;
3339 } 3339 }
3340 /********** Helpers for find_busiest_group ************************/ 3340 /********** Helpers for find_busiest_group ************************/
3341 /* 3341 /*
3342 * sd_lb_stats - Structure to store the statistics of a sched_domain 3342 * sd_lb_stats - Structure to store the statistics of a sched_domain
3343 * during load balancing. 3343 * during load balancing.
3344 */ 3344 */
3345 struct sd_lb_stats { 3345 struct sd_lb_stats {
3346 struct sched_group *busiest; /* Busiest group in this sd */ 3346 struct sched_group *busiest; /* Busiest group in this sd */
3347 struct sched_group *this; /* Local group in this sd */ 3347 struct sched_group *this; /* Local group in this sd */
3348 unsigned long total_load; /* Total load of all groups in sd */ 3348 unsigned long total_load; /* Total load of all groups in sd */
3349 unsigned long total_pwr; /* Total power of all groups in sd */ 3349 unsigned long total_pwr; /* Total power of all groups in sd */
3350 unsigned long avg_load; /* Average load across all groups in sd */ 3350 unsigned long avg_load; /* Average load across all groups in sd */
3351 3351
3352 /** Statistics of this group */ 3352 /** Statistics of this group */
3353 unsigned long this_load; 3353 unsigned long this_load;
3354 unsigned long this_load_per_task; 3354 unsigned long this_load_per_task;
3355 unsigned long this_nr_running; 3355 unsigned long this_nr_running;
3356 3356
3357 /* Statistics of the busiest group */ 3357 /* Statistics of the busiest group */
3358 unsigned long max_load; 3358 unsigned long max_load;
3359 unsigned long busiest_load_per_task; 3359 unsigned long busiest_load_per_task;
3360 unsigned long busiest_nr_running; 3360 unsigned long busiest_nr_running;
3361 3361
3362 int group_imb; /* Is there imbalance in this sd */ 3362 int group_imb; /* Is there imbalance in this sd */
3363 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3363 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3364 int power_savings_balance; /* Is powersave balance needed for this sd */ 3364 int power_savings_balance; /* Is powersave balance needed for this sd */
3365 struct sched_group *group_min; /* Least loaded group in sd */ 3365 struct sched_group *group_min; /* Least loaded group in sd */
3366 struct sched_group *group_leader; /* Group which relieves group_min */ 3366 struct sched_group *group_leader; /* Group which relieves group_min */
3367 unsigned long min_load_per_task; /* load_per_task in group_min */ 3367 unsigned long min_load_per_task; /* load_per_task in group_min */
3368 unsigned long leader_nr_running; /* Nr running of group_leader */ 3368 unsigned long leader_nr_running; /* Nr running of group_leader */
3369 unsigned long min_nr_running; /* Nr running of group_min */ 3369 unsigned long min_nr_running; /* Nr running of group_min */
3370 #endif 3370 #endif
3371 }; 3371 };
3372 3372
3373 /* 3373 /*
3374 * sg_lb_stats - stats of a sched_group required for load_balancing 3374 * sg_lb_stats - stats of a sched_group required for load_balancing
3375 */ 3375 */
3376 struct sg_lb_stats { 3376 struct sg_lb_stats {
3377 unsigned long avg_load; /*Avg load across the CPUs of the group */ 3377 unsigned long avg_load; /*Avg load across the CPUs of the group */
3378 unsigned long group_load; /* Total load over the CPUs of the group */ 3378 unsigned long group_load; /* Total load over the CPUs of the group */
3379 unsigned long sum_nr_running; /* Nr tasks running in the group */ 3379 unsigned long sum_nr_running; /* Nr tasks running in the group */
3380 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 3380 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3381 unsigned long group_capacity; 3381 unsigned long group_capacity;
3382 int group_imb; /* Is there an imbalance in the group ? */ 3382 int group_imb; /* Is there an imbalance in the group ? */
3383 }; 3383 };
3384 3384
3385 /** 3385 /**
3386 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 3386 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3387 * @group: The group whose first cpu is to be returned. 3387 * @group: The group whose first cpu is to be returned.
3388 */ 3388 */
3389 static inline unsigned int group_first_cpu(struct sched_group *group) 3389 static inline unsigned int group_first_cpu(struct sched_group *group)
3390 { 3390 {
3391 return cpumask_first(sched_group_cpus(group)); 3391 return cpumask_first(sched_group_cpus(group));
3392 } 3392 }
3393 3393
3394 /** 3394 /**
3395 * get_sd_load_idx - Obtain the load index for a given sched domain. 3395 * get_sd_load_idx - Obtain the load index for a given sched domain.
3396 * @sd: The sched_domain whose load_idx is to be obtained. 3396 * @sd: The sched_domain whose load_idx is to be obtained.
3397 * @idle: The idle status of the CPU whose sd's load_idx is being obtained. 3397 * @idle: The idle status of the CPU whose sd's load_idx is being obtained.
3398 */ 3398 */
3399 static inline int get_sd_load_idx(struct sched_domain *sd, 3399 static inline int get_sd_load_idx(struct sched_domain *sd,
3400 enum cpu_idle_type idle) 3400 enum cpu_idle_type idle)
3401 { 3401 {
3402 int load_idx; 3402 int load_idx;
3403 3403
3404 switch (idle) { 3404 switch (idle) {
3405 case CPU_NOT_IDLE: 3405 case CPU_NOT_IDLE:
3406 load_idx = sd->busy_idx; 3406 load_idx = sd->busy_idx;
3407 break; 3407 break;
3408 3408
3409 case CPU_NEWLY_IDLE: 3409 case CPU_NEWLY_IDLE:
3410 load_idx = sd->newidle_idx; 3410 load_idx = sd->newidle_idx;
3411 break; 3411 break;
3412 default: 3412 default:
3413 load_idx = sd->idle_idx; 3413 load_idx = sd->idle_idx;
3414 break; 3414 break;
3415 } 3415 }
3416 3416
3417 return load_idx; 3417 return load_idx;
3418 } 3418 }
3419 3419
3420 3420
3421 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3421 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3422 /** 3422 /**
3423 * init_sd_power_savings_stats - Initialize power savings statistics for 3423 * init_sd_power_savings_stats - Initialize power savings statistics for
3424 * the given sched_domain, during load balancing. 3424 * the given sched_domain, during load balancing.
3425 * 3425 *
3426 * @sd: Sched domain whose power-savings statistics are to be initialized. 3426 * @sd: Sched domain whose power-savings statistics are to be initialized.
3427 * @sds: Variable containing the statistics for sd. 3427 * @sds: Variable containing the statistics for sd.
3428 * @idle: Idle status of the CPU at which we're performing load-balancing. 3428 * @idle: Idle status of the CPU at which we're performing load-balancing.
3429 */ 3429 */
3430 static inline void init_sd_power_savings_stats(struct sched_domain *sd, 3430 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3431 struct sd_lb_stats *sds, enum cpu_idle_type idle) 3431 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3432 { 3432 {
3433 /* 3433 /*
3434 * Busy processors will not participate in power savings 3434 * Busy processors will not participate in power savings
3435 * balance. 3435 * balance.
3436 */ 3436 */
3437 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 3437 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3438 sds->power_savings_balance = 0; 3438 sds->power_savings_balance = 0;
3439 else { 3439 else {
3440 sds->power_savings_balance = 1; 3440 sds->power_savings_balance = 1;
3441 sds->min_nr_running = ULONG_MAX; 3441 sds->min_nr_running = ULONG_MAX;
3442 sds->leader_nr_running = 0; 3442 sds->leader_nr_running = 0;
3443 } 3443 }
3444 } 3444 }
3445 3445
3446 /** 3446 /**
3447 * update_sd_power_savings_stats - Update the power saving stats for a 3447 * update_sd_power_savings_stats - Update the power saving stats for a
3448 * sched_domain while performing load balancing. 3448 * sched_domain while performing load balancing.
3449 * 3449 *
3450 * @group: sched_group belonging to the sched_domain under consideration. 3450 * @group: sched_group belonging to the sched_domain under consideration.
3451 * @sds: Variable containing the statistics of the sched_domain 3451 * @sds: Variable containing the statistics of the sched_domain
3452 * @local_group: Does group contain the CPU for which we're performing 3452 * @local_group: Does group contain the CPU for which we're performing
3453 * load balancing ? 3453 * load balancing ?
3454 * @sgs: Variable containing the statistics of the group. 3454 * @sgs: Variable containing the statistics of the group.
3455 */ 3455 */
3456 static inline void update_sd_power_savings_stats(struct sched_group *group, 3456 static inline void update_sd_power_savings_stats(struct sched_group *group,
3457 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) 3457 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3458 { 3458 {
3459 3459
3460 if (!sds->power_savings_balance) 3460 if (!sds->power_savings_balance)
3461 return; 3461 return;
3462 3462
3463 /* 3463 /*
3464 * If the local group is idle or completely loaded 3464 * If the local group is idle or completely loaded
3465 * no need to do power savings balance at this domain 3465 * no need to do power savings balance at this domain
3466 */ 3466 */
3467 if (local_group && (sds->this_nr_running >= sgs->group_capacity || 3467 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3468 !sds->this_nr_running)) 3468 !sds->this_nr_running))
3469 sds->power_savings_balance = 0; 3469 sds->power_savings_balance = 0;
3470 3470
3471 /* 3471 /*
3472 * If a group is already running at full capacity or idle, 3472 * If a group is already running at full capacity or idle,
3473 * don't include that group in power savings calculations 3473 * don't include that group in power savings calculations
3474 */ 3474 */
3475 if (!sds->power_savings_balance || 3475 if (!sds->power_savings_balance ||
3476 sgs->sum_nr_running >= sgs->group_capacity || 3476 sgs->sum_nr_running >= sgs->group_capacity ||
3477 !sgs->sum_nr_running) 3477 !sgs->sum_nr_running)
3478 return; 3478 return;
3479 3479
3480 /* 3480 /*
3481 * Calculate the group which has the least non-idle load. 3481 * Calculate the group which has the least non-idle load.
3482 * This is the group from which we need to pick up the load 3482 * This is the group from which we need to pick up the load
3483 * for saving power 3483 * for saving power
3484 */ 3484 */
3485 if ((sgs->sum_nr_running < sds->min_nr_running) || 3485 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3486 (sgs->sum_nr_running == sds->min_nr_running && 3486 (sgs->sum_nr_running == sds->min_nr_running &&
3487 group_first_cpu(group) > group_first_cpu(sds->group_min))) { 3487 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3488 sds->group_min = group; 3488 sds->group_min = group;
3489 sds->min_nr_running = sgs->sum_nr_running; 3489 sds->min_nr_running = sgs->sum_nr_running;
3490 sds->min_load_per_task = sgs->sum_weighted_load / 3490 sds->min_load_per_task = sgs->sum_weighted_load /
3491 sgs->sum_nr_running; 3491 sgs->sum_nr_running;
3492 } 3492 }
3493 3493
3494 /* 3494 /*
3495 * Calculate the group which is nearly at its 3495 * Calculate the group which is nearly at its
3496 * capacity but still has some space to pick up load 3496 * capacity but still has some space to pick up load
3497 * from another group and save more power 3497 * from another group and save more power
3498 */ 3498 */
3499 if (sgs->sum_nr_running + 1 > sgs->group_capacity) 3499 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3500 return; 3500 return;
3501 3501
3502 if (sgs->sum_nr_running > sds->leader_nr_running || 3502 if (sgs->sum_nr_running > sds->leader_nr_running ||
3503 (sgs->sum_nr_running == sds->leader_nr_running && 3503 (sgs->sum_nr_running == sds->leader_nr_running &&
3504 group_first_cpu(group) < group_first_cpu(sds->group_leader))) { 3504 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3505 sds->group_leader = group; 3505 sds->group_leader = group;
3506 sds->leader_nr_running = sgs->sum_nr_running; 3506 sds->leader_nr_running = sgs->sum_nr_running;
3507 } 3507 }
3508 } 3508 }
3509 3509
3510 /** 3510 /**
3511 * check_power_save_busiest_group - see if there is potential for some power-savings balance 3511 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3512 * @sds: Variable containing the statistics of the sched_domain 3512 * @sds: Variable containing the statistics of the sched_domain
3513 * under consideration. 3513 * under consideration.
3514 * @this_cpu: Cpu at which we're currently performing load-balancing. 3514 * @this_cpu: Cpu at which we're currently performing load-balancing.
3515 * @imbalance: Variable to store the imbalance. 3515 * @imbalance: Variable to store the imbalance.
3516 * 3516 *
3517 * Description: 3517 * Description:
3518 * Check if we have potential to perform some power-savings balance. 3518 * Check if we have potential to perform some power-savings balance.
3519 * If yes, set the busiest group to be the least loaded group in the 3519 * If yes, set the busiest group to be the least loaded group in the
3520 * sched_domain, so that its CPUs can be put to idle. 3520 * sched_domain, so that its CPUs can be put to idle.
3521 * 3521 *
3522 * Returns 1 if there is potential to perform power-savings balance. 3522 * Returns 1 if there is potential to perform power-savings balance.
3523 * Else returns 0. 3523 * Else returns 0.
3524 */ 3524 */
3525 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3525 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3526 int this_cpu, unsigned long *imbalance) 3526 int this_cpu, unsigned long *imbalance)
3527 { 3527 {
3528 if (!sds->power_savings_balance) 3528 if (!sds->power_savings_balance)
3529 return 0; 3529 return 0;
3530 3530
3531 if (sds->this != sds->group_leader || 3531 if (sds->this != sds->group_leader ||
3532 sds->group_leader == sds->group_min) 3532 sds->group_leader == sds->group_min)
3533 return 0; 3533 return 0;
3534 3534
3535 *imbalance = sds->min_load_per_task; 3535 *imbalance = sds->min_load_per_task;
3536 sds->busiest = sds->group_min; 3536 sds->busiest = sds->group_min;
3537 3537
3538 return 1; 3538 return 1;
3539 3539
3540 } 3540 }
3541 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3541 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3542 static inline void init_sd_power_savings_stats(struct sched_domain *sd, 3542 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3543 struct sd_lb_stats *sds, enum cpu_idle_type idle) 3543 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3544 { 3544 {
3545 return; 3545 return;
3546 } 3546 }
3547 3547
3548 static inline void update_sd_power_savings_stats(struct sched_group *group, 3548 static inline void update_sd_power_savings_stats(struct sched_group *group,
3549 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) 3549 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3550 { 3550 {
3551 return; 3551 return;
3552 } 3552 }
3553 3553
3554 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3554 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3555 int this_cpu, unsigned long *imbalance) 3555 int this_cpu, unsigned long *imbalance)
3556 { 3556 {
3557 return 0; 3557 return 0;
3558 } 3558 }
3559 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3559 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3560 3560
3561 3561
3562 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3562 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3563 { 3563 {
3564 return SCHED_LOAD_SCALE; 3564 return SCHED_LOAD_SCALE;
3565 } 3565 }
3566 3566
3567 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 3567 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3568 { 3568 {
3569 return default_scale_freq_power(sd, cpu); 3569 return default_scale_freq_power(sd, cpu);
3570 } 3570 }
3571 3571
3572 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 3572 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3573 { 3573 {
3574 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3574 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3575 unsigned long smt_gain = sd->smt_gain; 3575 unsigned long smt_gain = sd->smt_gain;
3576 3576
3577 smt_gain /= weight; 3577 smt_gain /= weight;
3578 3578
3579 return smt_gain; 3579 return smt_gain;
3580 } 3580 }
3581 3581
3582 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3582 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3583 { 3583 {
3584 return default_scale_smt_power(sd, cpu); 3584 return default_scale_smt_power(sd, cpu);
3585 } 3585 }
3586 3586
3587 unsigned long scale_rt_power(int cpu) 3587 unsigned long scale_rt_power(int cpu)
3588 { 3588 {
3589 struct rq *rq = cpu_rq(cpu); 3589 struct rq *rq = cpu_rq(cpu);
3590 u64 total, available; 3590 u64 total, available;
3591 3591
3592 sched_avg_update(rq); 3592 sched_avg_update(rq);
3593 3593
3594 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3594 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3595 available = total - rq->rt_avg; 3595 available = total - rq->rt_avg;
3596 3596
3597 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 3597 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3598 total = SCHED_LOAD_SCALE; 3598 total = SCHED_LOAD_SCALE;
3599 3599
3600 total >>= SCHED_LOAD_SHIFT; 3600 total >>= SCHED_LOAD_SHIFT;
3601 3601
3602 return div_u64(available, total); 3602 return div_u64(available, total);
3603 } 3603 }
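In effect scale_rt_power() returns the fraction of the averaging window left over after real-time activity, expressed in SCHED_LOAD_SCALE units. A stand-alone sketch with invented figures (a window where 25% of the time went to RT work leaves roughly 768/1024 of the CPU for fair tasks):

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1ULL << SCHED_LOAD_SHIFT)

/* Stand-alone model of scale_rt_power(): share of the window not used
 * by RT tasks, scaled to SCHED_LOAD_SCALE. */
static uint64_t model_scale_rt_power(uint64_t total, uint64_t rt_avg)
{
        uint64_t available = total - rt_avg;

        if ((int64_t)total < (int64_t)SCHED_LOAD_SCALE)
                total = SCHED_LOAD_SCALE;
        total >>= SCHED_LOAD_SHIFT;

        return available / total;
}

int main(void)
{
        /* A 1ms window with 0.25ms of RT execution: ~0.75 * 1024 = 768. */
        printf("%llu\n",
               (unsigned long long)model_scale_rt_power(1000000, 250000));
        return 0;
}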
3604 3604
3605 static void update_cpu_power(struct sched_domain *sd, int cpu) 3605 static void update_cpu_power(struct sched_domain *sd, int cpu)
3606 { 3606 {
3607 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3607 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3608 unsigned long power = SCHED_LOAD_SCALE; 3608 unsigned long power = SCHED_LOAD_SCALE;
3609 struct sched_group *sdg = sd->groups; 3609 struct sched_group *sdg = sd->groups;
3610 3610
3611 if (sched_feat(ARCH_POWER)) 3611 if (sched_feat(ARCH_POWER))
3612 power *= arch_scale_freq_power(sd, cpu); 3612 power *= arch_scale_freq_power(sd, cpu);
3613 else 3613 else
3614 power *= default_scale_freq_power(sd, cpu); 3614 power *= default_scale_freq_power(sd, cpu);
3615 3615
3616 power >>= SCHED_LOAD_SHIFT; 3616 power >>= SCHED_LOAD_SHIFT;
3617 3617
3618 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3618 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3619 if (sched_feat(ARCH_POWER)) 3619 if (sched_feat(ARCH_POWER))
3620 power *= arch_scale_smt_power(sd, cpu); 3620 power *= arch_scale_smt_power(sd, cpu);
3621 else 3621 else
3622 power *= default_scale_smt_power(sd, cpu); 3622 power *= default_scale_smt_power(sd, cpu);
3623 3623
3624 power >>= SCHED_LOAD_SHIFT; 3624 power >>= SCHED_LOAD_SHIFT;
3625 } 3625 }
3626 3626
3627 power *= scale_rt_power(cpu); 3627 power *= scale_rt_power(cpu);
3628 power >>= SCHED_LOAD_SHIFT; 3628 power >>= SCHED_LOAD_SHIFT;
3629 3629
3630 if (!power) 3630 if (!power)
3631 power = 1; 3631 power = 1;
3632 3632
3633 sdg->cpu_power = power; 3633 sdg->cpu_power = power;
3634 } 3634 }
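The three scalings above are chained fixed-point multiplications, each factor expressed in SCHED_LOAD_SCALE units. A worked example with assumed figures (full frequency, an SMT sibling sharing an assumed smt_gain of 1178 with one other thread, and the 768 RT leftover from the previous sketch):

#include <stdio.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE;
        unsigned long freq_scale = 1024;        /* running at full frequency    */
        unsigned long smt_scale = 1178 / 2;     /* assumed smt_gain, 2 siblings */
        unsigned long rt_scale = 768;           /* ~25% of time spent in RT     */

        /* Same shape as update_cpu_power(): multiply, then renormalise. */
        power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
        power = (power * smt_scale) >> SCHED_LOAD_SHIFT;
        power = (power * rt_scale) >> SCHED_LOAD_SHIFT;
        if (!power)
                power = 1;

        printf("cpu_power = %lu\n", power);     /* 441 on these figures */
        return 0;
}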
3635 3635
3636 static void update_group_power(struct sched_domain *sd, int cpu) 3636 static void update_group_power(struct sched_domain *sd, int cpu)
3637 { 3637 {
3638 struct sched_domain *child = sd->child; 3638 struct sched_domain *child = sd->child;
3639 struct sched_group *group, *sdg = sd->groups; 3639 struct sched_group *group, *sdg = sd->groups;
3640 unsigned long power; 3640 unsigned long power;
3641 3641
3642 if (!child) { 3642 if (!child) {
3643 update_cpu_power(sd, cpu); 3643 update_cpu_power(sd, cpu);
3644 return; 3644 return;
3645 } 3645 }
3646 3646
3647 power = 0; 3647 power = 0;
3648 3648
3649 group = child->groups; 3649 group = child->groups;
3650 do { 3650 do {
3651 power += group->cpu_power; 3651 power += group->cpu_power;
3652 group = group->next; 3652 group = group->next;
3653 } while (group != child->groups); 3653 } while (group != child->groups);
3654 3654
3655 sdg->cpu_power = power; 3655 sdg->cpu_power = power;
3656 } 3656 }
3657 3657
3658 /** 3658 /**
3659 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3659 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3660 * @sd: The sched_domain whose statistics are to be updated. 3660 * @sd: The sched_domain whose statistics are to be updated.
3661 * @group: sched_group whose statistics are to be updated. 3661 * @group: sched_group whose statistics are to be updated.
3662 * @this_cpu: Cpu for which load balance is currently performed. 3662 * @this_cpu: Cpu for which load balance is currently performed.
3663 * @idle: Idle status of this_cpu 3663 * @idle: Idle status of this_cpu
3664 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3664 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3665 * @sd_idle: Idle status of the sched_domain containing group. 3665 * @sd_idle: Idle status of the sched_domain containing group.
3666 * @local_group: Does group contain this_cpu. 3666 * @local_group: Does group contain this_cpu.
3667 * @cpus: Set of cpus considered for load balancing. 3667 * @cpus: Set of cpus considered for load balancing.
3668 * @balance: Should we balance. 3668 * @balance: Should we balance.
3669 * @sgs: variable to hold the statistics for this group. 3669 * @sgs: variable to hold the statistics for this group.
3670 */ 3670 */
3671 static inline void update_sg_lb_stats(struct sched_domain *sd, 3671 static inline void update_sg_lb_stats(struct sched_domain *sd,
3672 struct sched_group *group, int this_cpu, 3672 struct sched_group *group, int this_cpu,
3673 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3673 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3674 int local_group, const struct cpumask *cpus, 3674 int local_group, const struct cpumask *cpus,
3675 int *balance, struct sg_lb_stats *sgs) 3675 int *balance, struct sg_lb_stats *sgs)
3676 { 3676 {
3677 unsigned long load, max_cpu_load, min_cpu_load; 3677 unsigned long load, max_cpu_load, min_cpu_load;
3678 int i; 3678 int i;
3679 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3679 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3680 unsigned long sum_avg_load_per_task; 3680 unsigned long sum_avg_load_per_task;
3681 unsigned long avg_load_per_task; 3681 unsigned long avg_load_per_task;
3682 3682
3683 if (local_group) { 3683 if (local_group) {
3684 balance_cpu = group_first_cpu(group); 3684 balance_cpu = group_first_cpu(group);
3685 if (balance_cpu == this_cpu) 3685 if (balance_cpu == this_cpu)
3686 update_group_power(sd, this_cpu); 3686 update_group_power(sd, this_cpu);
3687 } 3687 }
3688 3688
3689 /* Tally up the load of all CPUs in the group */ 3689 /* Tally up the load of all CPUs in the group */
3690 sum_avg_load_per_task = avg_load_per_task = 0; 3690 sum_avg_load_per_task = avg_load_per_task = 0;
3691 max_cpu_load = 0; 3691 max_cpu_load = 0;
3692 min_cpu_load = ~0UL; 3692 min_cpu_load = ~0UL;
3693 3693
3694 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3694 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3695 struct rq *rq = cpu_rq(i); 3695 struct rq *rq = cpu_rq(i);
3696 3696
3697 if (*sd_idle && rq->nr_running) 3697 if (*sd_idle && rq->nr_running)
3698 *sd_idle = 0; 3698 *sd_idle = 0;
3699 3699
3700 /* Bias balancing toward cpus of our domain */ 3700 /* Bias balancing toward cpus of our domain */
3701 if (local_group) { 3701 if (local_group) {
3702 if (idle_cpu(i) && !first_idle_cpu) { 3702 if (idle_cpu(i) && !first_idle_cpu) {
3703 first_idle_cpu = 1; 3703 first_idle_cpu = 1;
3704 balance_cpu = i; 3704 balance_cpu = i;
3705 } 3705 }
3706 3706
3707 load = target_load(i, load_idx); 3707 load = target_load(i, load_idx);
3708 } else { 3708 } else {
3709 load = source_load(i, load_idx); 3709 load = source_load(i, load_idx);
3710 if (load > max_cpu_load) 3710 if (load > max_cpu_load)
3711 max_cpu_load = load; 3711 max_cpu_load = load;
3712 if (min_cpu_load > load) 3712 if (min_cpu_load > load)
3713 min_cpu_load = load; 3713 min_cpu_load = load;
3714 } 3714 }
3715 3715
3716 sgs->group_load += load; 3716 sgs->group_load += load;
3717 sgs->sum_nr_running += rq->nr_running; 3717 sgs->sum_nr_running += rq->nr_running;
3718 sgs->sum_weighted_load += weighted_cpuload(i); 3718 sgs->sum_weighted_load += weighted_cpuload(i);
3719 3719
3720 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3720 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3721 } 3721 }
3722 3722
3723 /* 3723 /*
3724 * The first idle cpu or the first cpu (busiest) in this sched group 3724 * The first idle cpu or the first cpu (busiest) in this sched group
3725 * is eligible for doing load balancing at this and above 3725 * is eligible for doing load balancing at this and above
3726 * domains. In the newly idle case, we will allow all the cpus 3726 * domains. In the newly idle case, we will allow all the cpus
3727 * to do the newly idle load balance. 3727 * to do the newly idle load balance.
3728 */ 3728 */
3729 if (idle != CPU_NEWLY_IDLE && local_group && 3729 if (idle != CPU_NEWLY_IDLE && local_group &&
3730 balance_cpu != this_cpu && balance) { 3730 balance_cpu != this_cpu && balance) {
3731 *balance = 0; 3731 *balance = 0;
3732 return; 3732 return;
3733 } 3733 }
3734 3734
3735 /* Adjust by relative CPU power of the group */ 3735 /* Adjust by relative CPU power of the group */
3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3737 3737
3738 3738
3739 /* 3739 /*
3740 * Consider the group unbalanced when the imbalance is larger 3740 * Consider the group unbalanced when the imbalance is larger
3741 * than the average weight of two tasks. 3741 * than the average weight of two tasks.
3742 * 3742 *
3743 * APZ: with cgroup the avg task weight can vary wildly and 3743 * APZ: with cgroup the avg task weight can vary wildly and
3744 * might not be a suitable number - should we keep a 3744 * might not be a suitable number - should we keep a
3745 * normalized nr_running number somewhere that negates 3745 * normalized nr_running number somewhere that negates
3746 * the hierarchy? 3746 * the hierarchy?
3747 */ 3747 */
3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / 3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3749 group->cpu_power; 3749 group->cpu_power;
3750 3750
3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3752 sgs->group_imb = 1; 3752 sgs->group_imb = 1;
3753 3753
3754 sgs->group_capacity = 3754 sgs->group_capacity =
3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3756 } 3756 }
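A worked example of the per-group figures derived above, using made-up numbers for a two-CPU group (user-space sketch; DIV_ROUND_CLOSEST is open-coded):

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL

int main(void)
{
        /* Invented figures for a two-CPU group. */
        unsigned long cpu_power = 2048;         /* sum of both CPUs' power    */
        unsigned long group_load = 3072;        /* sum of both CPUs' loads    */
        unsigned long max_cpu_load = 2560;      /* heaviest CPU in the group  */
        unsigned long min_cpu_load = 512;       /* lightest CPU in the group  */
        unsigned long avg_load_per_task = 512;

        /* Group load normalised by the group's relative CPU power. */
        unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;

        /* Internally imbalanced: the spread between the group's CPUs is
         * larger than twice the average task weight. */
        int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

        /* DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE), open-coded. */
        unsigned long capacity = (cpu_power + SCHED_LOAD_SCALE / 2)
                                        / SCHED_LOAD_SCALE;

        printf("avg_load=%lu group_imb=%d capacity=%lu\n",
               avg_load, group_imb, capacity);  /* 1536, 1, 2 */
        return 0;
}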
3757 3757
3758 /** 3758 /**
3759 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3759 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3760 * @sd: sched_domain whose statistics are to be updated. 3760 * @sd: sched_domain whose statistics are to be updated.
3761 * @this_cpu: Cpu for which load balance is currently performed. 3761 * @this_cpu: Cpu for which load balance is currently performed.
3762 * @idle: Idle status of this_cpu 3762 * @idle: Idle status of this_cpu
3763 * @sd_idle: Idle status of the sched_domain containing group. 3763 * @sd_idle: Idle status of the sched_domain containing group.
3764 * @cpus: Set of cpus considered for load balancing. 3764 * @cpus: Set of cpus considered for load balancing.
3765 * @balance: Should we balance. 3765 * @balance: Should we balance.
3766 * @sds: variable to hold the statistics for this sched_domain. 3766 * @sds: variable to hold the statistics for this sched_domain.
3767 */ 3767 */
3768 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3768 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3769 enum cpu_idle_type idle, int *sd_idle, 3769 enum cpu_idle_type idle, int *sd_idle,
3770 const struct cpumask *cpus, int *balance, 3770 const struct cpumask *cpus, int *balance,
3771 struct sd_lb_stats *sds) 3771 struct sd_lb_stats *sds)
3772 { 3772 {
3773 struct sched_domain *child = sd->child; 3773 struct sched_domain *child = sd->child;
3774 struct sched_group *group = sd->groups; 3774 struct sched_group *group = sd->groups;
3775 struct sg_lb_stats sgs; 3775 struct sg_lb_stats sgs;
3776 int load_idx, prefer_sibling = 0; 3776 int load_idx, prefer_sibling = 0;
3777 3777
3778 if (child && child->flags & SD_PREFER_SIBLING) 3778 if (child && child->flags & SD_PREFER_SIBLING)
3779 prefer_sibling = 1; 3779 prefer_sibling = 1;
3780 3780
3781 init_sd_power_savings_stats(sd, sds, idle); 3781 init_sd_power_savings_stats(sd, sds, idle);
3782 load_idx = get_sd_load_idx(sd, idle); 3782 load_idx = get_sd_load_idx(sd, idle);
3783 3783
3784 do { 3784 do {
3785 int local_group; 3785 int local_group;
3786 3786
3787 local_group = cpumask_test_cpu(this_cpu, 3787 local_group = cpumask_test_cpu(this_cpu,
3788 sched_group_cpus(group)); 3788 sched_group_cpus(group));
3789 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3791 local_group, cpus, balance, &sgs); 3791 local_group, cpus, balance, &sgs);
3792 3792
3793 if (local_group && balance && !(*balance)) 3793 if (local_group && balance && !(*balance))
3794 return; 3794 return;
3795 3795
3796 sds->total_load += sgs.group_load; 3796 sds->total_load += sgs.group_load;
3797 sds->total_pwr += group->cpu_power; 3797 sds->total_pwr += group->cpu_power;
3798 3798
3799 /* 3799 /*
3800 * In case the child domain prefers tasks go to siblings 3800 * In case the child domain prefers tasks go to siblings
3801 * first, lower the group capacity to one so that we'll try 3801 * first, lower the group capacity to one so that we'll try
3802 * and move all the excess tasks away. 3802 * and move all the excess tasks away.
3803 */ 3803 */
3804 if (prefer_sibling) 3804 if (prefer_sibling)
3805 sgs.group_capacity = min(sgs.group_capacity, 1UL); 3805 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3806 3806
3807 if (local_group) { 3807 if (local_group) {
3808 sds->this_load = sgs.avg_load; 3808 sds->this_load = sgs.avg_load;
3809 sds->this = group; 3809 sds->this = group;
3810 sds->this_nr_running = sgs.sum_nr_running; 3810 sds->this_nr_running = sgs.sum_nr_running;
3811 sds->this_load_per_task = sgs.sum_weighted_load; 3811 sds->this_load_per_task = sgs.sum_weighted_load;
3812 } else if (sgs.avg_load > sds->max_load && 3812 } else if (sgs.avg_load > sds->max_load &&
3813 (sgs.sum_nr_running > sgs.group_capacity || 3813 (sgs.sum_nr_running > sgs.group_capacity ||
3814 sgs.group_imb)) { 3814 sgs.group_imb)) {
3815 sds->max_load = sgs.avg_load; 3815 sds->max_load = sgs.avg_load;
3816 sds->busiest = group; 3816 sds->busiest = group;
3817 sds->busiest_nr_running = sgs.sum_nr_running; 3817 sds->busiest_nr_running = sgs.sum_nr_running;
3818 sds->busiest_load_per_task = sgs.sum_weighted_load; 3818 sds->busiest_load_per_task = sgs.sum_weighted_load;
3819 sds->group_imb = sgs.group_imb; 3819 sds->group_imb = sgs.group_imb;
3820 } 3820 }
3821 3821
3822 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3822 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3823 group = group->next; 3823 group = group->next;
3824 } while (group != sd->groups); 3824 } while (group != sd->groups);
3825 } 3825 }
3826 3826
3827 /** 3827 /**
3828 * fix_small_imbalance - Calculate the minor imbalance that exists 3828 * fix_small_imbalance - Calculate the minor imbalance that exists
3829 * amongst the groups of a sched_domain, during 3829 * amongst the groups of a sched_domain, during
3830 * load balancing. 3830 * load balancing.
3831 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3831 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3832 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3832 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3833 * @imbalance: Variable to store the imbalance. 3833 * @imbalance: Variable to store the imbalance.
3834 */ 3834 */
3835 static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3835 static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3836 int this_cpu, unsigned long *imbalance) 3836 int this_cpu, unsigned long *imbalance)
3837 { 3837 {
3838 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3838 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3839 unsigned int imbn = 2; 3839 unsigned int imbn = 2;
3840 3840
3841 if (sds->this_nr_running) { 3841 if (sds->this_nr_running) {
3842 sds->this_load_per_task /= sds->this_nr_running; 3842 sds->this_load_per_task /= sds->this_nr_running;
3843 if (sds->busiest_load_per_task > 3843 if (sds->busiest_load_per_task >
3844 sds->this_load_per_task) 3844 sds->this_load_per_task)
3845 imbn = 1; 3845 imbn = 1;
3846 } else 3846 } else
3847 sds->this_load_per_task = 3847 sds->this_load_per_task =
3848 cpu_avg_load_per_task(this_cpu); 3848 cpu_avg_load_per_task(this_cpu);
3849 3849
3850 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= 3850 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3851 sds->busiest_load_per_task * imbn) { 3851 sds->busiest_load_per_task * imbn) {
3852 *imbalance = sds->busiest_load_per_task; 3852 *imbalance = sds->busiest_load_per_task;
3853 return; 3853 return;
3854 } 3854 }
3855 3855
3856 /* 3856 /*
3857 * OK, we don't have enough imbalance to justify moving tasks; 3857 * OK, we don't have enough imbalance to justify moving tasks;
3858 * however, we may be able to increase the total CPU power used by 3858 * however, we may be able to increase the total CPU power used by
3859 * moving them. 3859 * moving them.
3860 */ 3860 */
3861 3861
3862 pwr_now += sds->busiest->cpu_power * 3862 pwr_now += sds->busiest->cpu_power *
3863 min(sds->busiest_load_per_task, sds->max_load); 3863 min(sds->busiest_load_per_task, sds->max_load);
3864 pwr_now += sds->this->cpu_power * 3864 pwr_now += sds->this->cpu_power *
3865 min(sds->this_load_per_task, sds->this_load); 3865 min(sds->this_load_per_task, sds->this_load);
3866 pwr_now /= SCHED_LOAD_SCALE; 3866 pwr_now /= SCHED_LOAD_SCALE;
3867 3867
3868 /* Amount of load we'd subtract */ 3868 /* Amount of load we'd subtract */
3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3870 sds->busiest->cpu_power; 3870 sds->busiest->cpu_power;
3871 if (sds->max_load > tmp) 3871 if (sds->max_load > tmp)
3872 pwr_move += sds->busiest->cpu_power * 3872 pwr_move += sds->busiest->cpu_power *
3873 min(sds->busiest_load_per_task, sds->max_load - tmp); 3873 min(sds->busiest_load_per_task, sds->max_load - tmp);
3874 3874
3875 /* Amount of load we'd add */ 3875 /* Amount of load we'd add */
3876 if (sds->max_load * sds->busiest->cpu_power < 3876 if (sds->max_load * sds->busiest->cpu_power <
3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3878 tmp = (sds->max_load * sds->busiest->cpu_power) / 3878 tmp = (sds->max_load * sds->busiest->cpu_power) /
3879 sds->this->cpu_power; 3879 sds->this->cpu_power;
3880 else 3880 else
3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3882 sds->this->cpu_power; 3882 sds->this->cpu_power;
3883 pwr_move += sds->this->cpu_power * 3883 pwr_move += sds->this->cpu_power *
3884 min(sds->this_load_per_task, sds->this_load + tmp); 3884 min(sds->this_load_per_task, sds->this_load + tmp);
3885 pwr_move /= SCHED_LOAD_SCALE; 3885 pwr_move /= SCHED_LOAD_SCALE;
3886 3886
3887 /* Move if we gain throughput */ 3887 /* Move if we gain throughput */
3888 if (pwr_move > pwr_now) 3888 if (pwr_move > pwr_now)
3889 *imbalance = sds->busiest_load_per_task; 3889 *imbalance = sds->busiest_load_per_task;
3890 } 3890 }
3891 3891
3892 /** 3892 /**
3893 * calculate_imbalance - Calculate the amount of imbalance present within the 3893 * calculate_imbalance - Calculate the amount of imbalance present within the
3894 * groups of a given sched_domain during load balance. 3894 * groups of a given sched_domain during load balance.
3895 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3895 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3896 * @this_cpu: Cpu for which currently load balance is being performed. 3896 * @this_cpu: Cpu for which currently load balance is being performed.
3897 * @imbalance: The variable to store the imbalance. 3897 * @imbalance: The variable to store the imbalance.
3898 */ 3898 */
3899 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3899 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3900 unsigned long *imbalance) 3900 unsigned long *imbalance)
3901 { 3901 {
3902 unsigned long max_pull; 3902 unsigned long max_pull;
3903 /* 3903 /*
3904 * In the presence of smp nice balancing, certain scenarios can have 3904 * In the presence of smp nice balancing, certain scenarios can have
3905 * max load less than avg load (as we skip the groups at or below 3905 * max load less than avg load (as we skip the groups at or below
3906 * their cpu_power while calculating max_load). 3906 * their cpu_power while calculating max_load).
3907 */ 3907 */
3908 if (sds->max_load < sds->avg_load) { 3908 if (sds->max_load < sds->avg_load) {
3909 *imbalance = 0; 3909 *imbalance = 0;
3910 return fix_small_imbalance(sds, this_cpu, imbalance); 3910 return fix_small_imbalance(sds, this_cpu, imbalance);
3911 } 3911 }
3912 3912
3913 /* Don't want to pull so many tasks that a group would go idle */ 3913 /* Don't want to pull so many tasks that a group would go idle */
3914 max_pull = min(sds->max_load - sds->avg_load, 3914 max_pull = min(sds->max_load - sds->avg_load,
3915 sds->max_load - sds->busiest_load_per_task); 3915 sds->max_load - sds->busiest_load_per_task);
3916 3916
3917 /* How much load to actually move to equalise the imbalance */ 3917 /* How much load to actually move to equalise the imbalance */
3918 *imbalance = min(max_pull * sds->busiest->cpu_power, 3918 *imbalance = min(max_pull * sds->busiest->cpu_power,
3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3920 / SCHED_LOAD_SCALE; 3920 / SCHED_LOAD_SCALE;
3921 3921
3922 /* 3922 /*
3923 * if *imbalance is less than the average load per runnable task, 3923 * if *imbalance is less than the average load per runnable task,
3924 * there is no guarantee that any tasks will be moved, so we'll have 3924 * there is no guarantee that any tasks will be moved, so we'll have
3925 * a think about bumping its value to force at least one task to be 3925 * a think about bumping its value to force at least one task to be
3926 * moved 3926 * moved
3927 */ 3927 */
3928 if (*imbalance < sds->busiest_load_per_task) 3928 if (*imbalance < sds->busiest_load_per_task)
3929 return fix_small_imbalance(sds, this_cpu, imbalance); 3929 return fix_small_imbalance(sds, this_cpu, imbalance);
3930 3930
3931 } 3931 }
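With assumed numbers, the two min() terms above work out as follows (stand-alone sketch, invented figures); since the result here, 512, is below busiest_load_per_task, the real code would then fall through to fix_small_imbalance():

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Invented per-group figures, all in weighted-load units. */
        unsigned long max_load = 2048;          /* busiest group's avg load */
        unsigned long avg_load = 1536;          /* domain-wide average      */
        unsigned long this_load = 1024;         /* local group's avg load   */
        unsigned long busiest_per_task = 1024;  /* avg task weight, busiest */
        unsigned long busiest_power = 1024;
        unsigned long this_power = 1024;
        unsigned long max_pull, imbalance;

        /* Don't pull so much that the busiest group would go idle. */
        max_pull = min_ul(max_load - avg_load, max_load - busiest_per_task);

        /* Weighted load to move to equalise the imbalance. */
        imbalance = min_ul(max_pull * busiest_power,
                           (avg_load - this_load) * this_power)
                        / SCHED_LOAD_SCALE;

        printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);    /* 512, 512 */
        return 0;
}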
3932 /******* find_busiest_group() helpers end here *********************/ 3932 /******* find_busiest_group() helpers end here *********************/
3933 3933
3934 /** 3934 /**
3935 * find_busiest_group - Returns the busiest group within the sched_domain 3935 * find_busiest_group - Returns the busiest group within the sched_domain
3936 * if there is an imbalance. If there isn't an imbalance, and 3936 * if there is an imbalance. If there isn't an imbalance, and
3937 * the user has opted for power-savings, it returns a group whose 3937 * the user has opted for power-savings, it returns a group whose
3938 * CPUs can be put to idle by rebalancing those tasks elsewhere, if 3938 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3939 * such a group exists. 3939 * such a group exists.
3940 * 3940 *
3941 * Also calculates the amount of weighted load which should be moved 3941 * Also calculates the amount of weighted load which should be moved
3942 * to restore balance. 3942 * to restore balance.
3943 * 3943 *
3944 * @sd: The sched_domain whose busiest group is to be returned. 3944 * @sd: The sched_domain whose busiest group is to be returned.
3945 * @this_cpu: The cpu for which load balancing is currently being performed. 3945 * @this_cpu: The cpu for which load balancing is currently being performed.
3946 * @imbalance: Variable which stores amount of weighted load which should 3946 * @imbalance: Variable which stores amount of weighted load which should
3947 * be moved to restore balance/put a group to idle. 3947 * be moved to restore balance/put a group to idle.
3948 * @idle: The idle status of this_cpu. 3948 * @idle: The idle status of this_cpu.
3949 * @sd_idle: The idleness of sd 3949 * @sd_idle: The idleness of sd
3950 * @cpus: The set of CPUs under consideration for load-balancing. 3950 * @cpus: The set of CPUs under consideration for load-balancing.
3951 * @balance: Pointer to a variable indicating if this_cpu 3951 * @balance: Pointer to a variable indicating if this_cpu
3952 * is the appropriate cpu to perform load balancing at this_level. 3952 * is the appropriate cpu to perform load balancing at this_level.
3953 * 3953 *
3954 * Returns: - the busiest group if imbalance exists. 3954 * Returns: - the busiest group if imbalance exists.
3955 * - If no imbalance and user has opted for power-savings balance, 3955 * - If no imbalance and user has opted for power-savings balance,
3956 * return the least loaded group whose CPUs can be 3956 * return the least loaded group whose CPUs can be
3957 * put to idle by rebalancing its tasks onto our group. 3957 * put to idle by rebalancing its tasks onto our group.
3958 */ 3958 */
3959 static struct sched_group * 3959 static struct sched_group *
3960 find_busiest_group(struct sched_domain *sd, int this_cpu, 3960 find_busiest_group(struct sched_domain *sd, int this_cpu,
3961 unsigned long *imbalance, enum cpu_idle_type idle, 3961 unsigned long *imbalance, enum cpu_idle_type idle,
3962 int *sd_idle, const struct cpumask *cpus, int *balance) 3962 int *sd_idle, const struct cpumask *cpus, int *balance)
3963 { 3963 {
3964 struct sd_lb_stats sds; 3964 struct sd_lb_stats sds;
3965 3965
3966 memset(&sds, 0, sizeof(sds)); 3966 memset(&sds, 0, sizeof(sds));
3967 3967
3968 /* 3968 /*
3969 * Compute the various statistics relevant for load balancing at 3969 * Compute the various statistics relevant for load balancing at
3970 * this level. 3970 * this level.
3971 */ 3971 */
3972 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3972 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3973 balance, &sds); 3973 balance, &sds);
3974 3974
3975 /* Cases where imbalance does not exist from POV of this_cpu */ 3975 /* Cases where imbalance does not exist from POV of this_cpu */
3976 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3976 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3977 * at this level. 3977 * at this level.
3978 * 2) There is no busy sibling group to pull from. 3978 * 2) There is no busy sibling group to pull from.
3979 * 3) This group is the busiest group. 3979 * 3) This group is the busiest group.
3980 * 4) This group is busier than the avg busyness at this 3980 * 4) This group is busier than the avg busyness at this
3981 * sched_domain. 3981 * sched_domain.
3982 * 5) The imbalance is within the specified limit. 3982 * 5) The imbalance is within the specified limit.
3983 * 6) Any rebalance would lead to ping-pong 3983 * 6) Any rebalance would lead to ping-pong
3984 */ 3984 */
3985 if (balance && !(*balance)) 3985 if (balance && !(*balance))
3986 goto ret; 3986 goto ret;
3987 3987
3988 if (!sds.busiest || sds.busiest_nr_running == 0) 3988 if (!sds.busiest || sds.busiest_nr_running == 0)
3989 goto out_balanced; 3989 goto out_balanced;
3990 3990
3991 if (sds.this_load >= sds.max_load) 3991 if (sds.this_load >= sds.max_load)
3992 goto out_balanced; 3992 goto out_balanced;
3993 3993
3994 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3994 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3995 3995
3996 if (sds.this_load >= sds.avg_load) 3996 if (sds.this_load >= sds.avg_load)
3997 goto out_balanced; 3997 goto out_balanced;
3998 3998
3999 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3999 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4000 goto out_balanced; 4000 goto out_balanced;
4001 4001
4002 sds.busiest_load_per_task /= sds.busiest_nr_running; 4002 sds.busiest_load_per_task /= sds.busiest_nr_running;
4003 if (sds.group_imb) 4003 if (sds.group_imb)
4004 sds.busiest_load_per_task = 4004 sds.busiest_load_per_task =
4005 min(sds.busiest_load_per_task, sds.avg_load); 4005 min(sds.busiest_load_per_task, sds.avg_load);
4006 4006
4007 /* 4007 /*
4008 * We're trying to get all the cpus to the average_load, so we don't 4008 * We're trying to get all the cpus to the average_load, so we don't
4009 * want to push ourselves above the average load, nor do we wish to 4009 * want to push ourselves above the average load, nor do we wish to
4010 * reduce the max loaded cpu below the average load, as either of these 4010 * reduce the max loaded cpu below the average load, as either of these
4011 * actions would just result in more rebalancing later, and ping-pong 4011 * actions would just result in more rebalancing later, and ping-pong
4012 * tasks around. Thus we look for the minimum possible imbalance. 4012 * tasks around. Thus we look for the minimum possible imbalance.
4013 * Negative imbalances (*we* are more loaded than anyone else) will 4013 * Negative imbalances (*we* are more loaded than anyone else) will
4014 * be counted as no imbalance for these purposes -- we can't fix that 4014 * be counted as no imbalance for these purposes -- we can't fix that
4015 * by pulling tasks to us. Be careful of negative numbers as they'll 4015 * by pulling tasks to us. Be careful of negative numbers as they'll
4016 * appear as very large values with unsigned longs. 4016 * appear as very large values with unsigned longs.
4017 */ 4017 */
4018 if (sds.max_load <= sds.busiest_load_per_task) 4018 if (sds.max_load <= sds.busiest_load_per_task)
4019 goto out_balanced; 4019 goto out_balanced;
4020 4020
4021 /* Looks like there is an imbalance. Compute it */ 4021 /* Looks like there is an imbalance. Compute it */
4022 calculate_imbalance(&sds, this_cpu, imbalance); 4022 calculate_imbalance(&sds, this_cpu, imbalance);
4023 return sds.busiest; 4023 return sds.busiest;
4024 4024
4025 out_balanced: 4025 out_balanced:
4026 /* 4026 /*
4027 * There is no obvious imbalance. But check if we can do some balancing 4027 * There is no obvious imbalance. But check if we can do some balancing
4028 * to save power. 4028 * to save power.
4029 */ 4029 */
4030 if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) 4030 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4031 return sds.busiest; 4031 return sds.busiest;
4032 ret: 4032 ret:
4033 *imbalance = 0; 4033 *imbalance = 0;
4034 return NULL; 4034 return NULL;
4035 } 4035 }
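To give the imbalance_pct test above a concrete reading: with an illustrative imbalance_pct of 125 (the real value is configured per sched_domain elsewhere and does not appear in this hunk), the check 100 * max_load <= imbalance_pct * this_load jumps to out_balanced unless the busiest group is more than 25% busier than this_cpu's group, so small load differences never trigger task pulls.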
4036 4036
4037 /* 4037 /*
4038 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4038 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4039 */ 4039 */
4040 static struct rq * 4040 static struct rq *
4041 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 4041 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4042 unsigned long imbalance, const struct cpumask *cpus) 4042 unsigned long imbalance, const struct cpumask *cpus)
4043 { 4043 {
4044 struct rq *busiest = NULL, *rq; 4044 struct rq *busiest = NULL, *rq;
4045 unsigned long max_load = 0; 4045 unsigned long max_load = 0;
4046 int i; 4046 int i;
4047 4047
4048 for_each_cpu(i, sched_group_cpus(group)) { 4048 for_each_cpu(i, sched_group_cpus(group)) {
4049 unsigned long power = power_of(i); 4049 unsigned long power = power_of(i);
4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4051 unsigned long wl; 4051 unsigned long wl;
4052 4052
4053 if (!cpumask_test_cpu(i, cpus)) 4053 if (!cpumask_test_cpu(i, cpus))
4054 continue; 4054 continue;
4055 4055
4056 rq = cpu_rq(i); 4056 rq = cpu_rq(i);
4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; 4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4058 wl /= power; 4058 wl /= power;
4059 4059
4060 if (capacity && rq->nr_running == 1 && wl > imbalance) 4060 if (capacity && rq->nr_running == 1 && wl > imbalance)
4061 continue; 4061 continue;
4062 4062
4063 if (wl > max_load) { 4063 if (wl > max_load) {
4064 max_load = wl; 4064 max_load = wl;
4065 busiest = rq; 4065 busiest = rq;
4066 } 4066 }
4067 } 4067 }
4068 4068
4069 return busiest; 4069 return busiest;
4070 } 4070 }
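find_busiest_queue() compares runqueues on a capacity-relative scale, wl = weighted_cpuload(i) * SCHED_LOAD_SCALE / power. Taking SCHED_LOAD_SCALE as 1024 and purely illustrative loads, a weighted load of 2048 on a cpu with power 1024 scores 2048, while the same load on a cpu with power 512 scores 4096, so the lower-capacity cpu is treated as the busier one, as intended.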
4071 4071
4072 /* 4072 /*
4073 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 4073 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4074 * any value works so long as it is large enough. 4074 * any value works so long as it is large enough.
4075 */ 4075 */
4076 #define MAX_PINNED_INTERVAL 512 4076 #define MAX_PINNED_INTERVAL 512
4077 4077
4078 /* Working cpumask for load_balance and load_balance_newidle. */ 4078 /* Working cpumask for load_balance and load_balance_newidle. */
4079 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4079 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4080 4080
4081 /* 4081 /*
4082 * Check this_cpu to ensure it is balanced within domain. Attempt to move 4082 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4083 * tasks if there is an imbalance. 4083 * tasks if there is an imbalance.
4084 */ 4084 */
4085 static int load_balance(int this_cpu, struct rq *this_rq, 4085 static int load_balance(int this_cpu, struct rq *this_rq,
4086 struct sched_domain *sd, enum cpu_idle_type idle, 4086 struct sched_domain *sd, enum cpu_idle_type idle,
4087 int *balance) 4087 int *balance)
4088 { 4088 {
4089 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 4089 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4090 struct sched_group *group; 4090 struct sched_group *group;
4091 unsigned long imbalance; 4091 unsigned long imbalance;
4092 struct rq *busiest; 4092 struct rq *busiest;
4093 unsigned long flags; 4093 unsigned long flags;
4094 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4094 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4095 4095
4096 cpumask_setall(cpus); 4096 cpumask_setall(cpus);
4097 4097
4098 /* 4098 /*
4099 * When power savings policy is enabled for the parent domain, idle 4099 * When power savings policy is enabled for the parent domain, idle
4100 * sibling can pick up load irrespective of busy siblings. In this case, 4100 * sibling can pick up load irrespective of busy siblings. In this case,
4101 * let the state of idle sibling percolate up as CPU_IDLE, instead of 4101 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4102 * portraying it as CPU_NOT_IDLE. 4102 * portraying it as CPU_NOT_IDLE.
4103 */ 4103 */
4104 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 4104 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4105 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4105 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4106 sd_idle = 1; 4106 sd_idle = 1;
4107 4107
4108 schedstat_inc(sd, lb_count[idle]); 4108 schedstat_inc(sd, lb_count[idle]);
4109 4109
4110 redo: 4110 redo:
4111 update_shares(sd); 4111 update_shares(sd);
4112 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 4112 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4113 cpus, balance); 4113 cpus, balance);
4114 4114
4115 if (*balance == 0) 4115 if (*balance == 0)
4116 goto out_balanced; 4116 goto out_balanced;
4117 4117
4118 if (!group) { 4118 if (!group) {
4119 schedstat_inc(sd, lb_nobusyg[idle]); 4119 schedstat_inc(sd, lb_nobusyg[idle]);
4120 goto out_balanced; 4120 goto out_balanced;
4121 } 4121 }
4122 4122
4123 busiest = find_busiest_queue(group, idle, imbalance, cpus); 4123 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4124 if (!busiest) { 4124 if (!busiest) {
4125 schedstat_inc(sd, lb_nobusyq[idle]); 4125 schedstat_inc(sd, lb_nobusyq[idle]);
4126 goto out_balanced; 4126 goto out_balanced;
4127 } 4127 }
4128 4128
4129 BUG_ON(busiest == this_rq); 4129 BUG_ON(busiest == this_rq);
4130 4130
4131 schedstat_add(sd, lb_imbalance[idle], imbalance); 4131 schedstat_add(sd, lb_imbalance[idle], imbalance);
4132 4132
4133 ld_moved = 0; 4133 ld_moved = 0;
4134 if (busiest->nr_running > 1) { 4134 if (busiest->nr_running > 1) {
4135 /* 4135 /*
4136 * Attempt to move tasks. If find_busiest_group has found 4136 * Attempt to move tasks. If find_busiest_group has found
4137 * an imbalance but busiest->nr_running <= 1, the group is 4137 * an imbalance but busiest->nr_running <= 1, the group is
4138 * still unbalanced. ld_moved simply stays zero, so it is 4138 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4139 * correctly treated as an imbalance.
4140 */ 4140 */
4141 local_irq_save(flags); 4141 local_irq_save(flags);
4142 double_rq_lock(this_rq, busiest); 4142 double_rq_lock(this_rq, busiest);
4143 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4143 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4144 imbalance, sd, idle, &all_pinned); 4144 imbalance, sd, idle, &all_pinned);
4145 double_rq_unlock(this_rq, busiest); 4145 double_rq_unlock(this_rq, busiest);
4146 local_irq_restore(flags); 4146 local_irq_restore(flags);
4147 4147
4148 /* 4148 /*
4149 * some other cpu did the load balance for us. 4149 * some other cpu did the load balance for us.
4150 */ 4150 */
4151 if (ld_moved && this_cpu != smp_processor_id()) 4151 if (ld_moved && this_cpu != smp_processor_id())
4152 resched_cpu(this_cpu); 4152 resched_cpu(this_cpu);
4153 4153
4154 /* All tasks on this runqueue were pinned by CPU affinity */ 4154 /* All tasks on this runqueue were pinned by CPU affinity */
4155 if (unlikely(all_pinned)) { 4155 if (unlikely(all_pinned)) {
4156 cpumask_clear_cpu(cpu_of(busiest), cpus); 4156 cpumask_clear_cpu(cpu_of(busiest), cpus);
4157 if (!cpumask_empty(cpus)) 4157 if (!cpumask_empty(cpus))
4158 goto redo; 4158 goto redo;
4159 goto out_balanced; 4159 goto out_balanced;
4160 } 4160 }
4161 } 4161 }
4162 4162
4163 if (!ld_moved) { 4163 if (!ld_moved) {
4164 schedstat_inc(sd, lb_failed[idle]); 4164 schedstat_inc(sd, lb_failed[idle]);
4165 sd->nr_balance_failed++; 4165 sd->nr_balance_failed++;
4166 4166
4167 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4167 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4168 4168
4169 spin_lock_irqsave(&busiest->lock, flags); 4169 spin_lock_irqsave(&busiest->lock, flags);
4170 4170
4171 /* don't kick the migration_thread if the curr 4171 /* don't kick the migration_thread if the curr
4172 * task on busiest cpu can't be moved to this_cpu 4172 * task on busiest cpu can't be moved to this_cpu
4173 */ 4173 */
4174 if (!cpumask_test_cpu(this_cpu, 4174 if (!cpumask_test_cpu(this_cpu,
4175 &busiest->curr->cpus_allowed)) { 4175 &busiest->curr->cpus_allowed)) {
4176 spin_unlock_irqrestore(&busiest->lock, flags); 4176 spin_unlock_irqrestore(&busiest->lock, flags);
4177 all_pinned = 1; 4177 all_pinned = 1;
4178 goto out_one_pinned; 4178 goto out_one_pinned;
4179 } 4179 }
4180 4180
4181 if (!busiest->active_balance) { 4181 if (!busiest->active_balance) {
4182 busiest->active_balance = 1; 4182 busiest->active_balance = 1;
4183 busiest->push_cpu = this_cpu; 4183 busiest->push_cpu = this_cpu;
4184 active_balance = 1; 4184 active_balance = 1;
4185 } 4185 }
4186 spin_unlock_irqrestore(&busiest->lock, flags); 4186 spin_unlock_irqrestore(&busiest->lock, flags);
4187 if (active_balance) 4187 if (active_balance)
4188 wake_up_process(busiest->migration_thread); 4188 wake_up_process(busiest->migration_thread);
4189 4189
4190 /* 4190 /*
4191 * We've kicked active balancing, reset the failure 4191 * We've kicked active balancing, reset the failure
4192 * counter. 4192 * counter.
4193 */ 4193 */
4194 sd->nr_balance_failed = sd->cache_nice_tries+1; 4194 sd->nr_balance_failed = sd->cache_nice_tries+1;
4195 } 4195 }
4196 } else 4196 } else
4197 sd->nr_balance_failed = 0; 4197 sd->nr_balance_failed = 0;
4198 4198
4199 if (likely(!active_balance)) { 4199 if (likely(!active_balance)) {
4200 /* We were unbalanced, so reset the balancing interval */ 4200 /* We were unbalanced, so reset the balancing interval */
4201 sd->balance_interval = sd->min_interval; 4201 sd->balance_interval = sd->min_interval;
4202 } else { 4202 } else {
4203 /* 4203 /*
4204 * If we've begun active balancing, start to back off. This 4204 * If we've begun active balancing, start to back off. This
4205 * case may not be covered by the all_pinned logic if there 4205 * case may not be covered by the all_pinned logic if there
4206 * is only 1 task on the busy runqueue (because we don't call 4206 * is only 1 task on the busy runqueue (because we don't call
4207 * move_tasks). 4207 * move_tasks).
4208 */ 4208 */
4209 if (sd->balance_interval < sd->max_interval) 4209 if (sd->balance_interval < sd->max_interval)
4210 sd->balance_interval *= 2; 4210 sd->balance_interval *= 2;
4211 } 4211 }
4212 4212
4213 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4213 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4214 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4214 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4215 ld_moved = -1; 4215 ld_moved = -1;
4216 4216
4217 goto out; 4217 goto out;
4218 4218
4219 out_balanced: 4219 out_balanced:
4220 schedstat_inc(sd, lb_balanced[idle]); 4220 schedstat_inc(sd, lb_balanced[idle]);
4221 4221
4222 sd->nr_balance_failed = 0; 4222 sd->nr_balance_failed = 0;
4223 4223
4224 out_one_pinned: 4224 out_one_pinned:
4225 /* tune up the balancing interval */ 4225 /* tune up the balancing interval */
4226 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4226 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4227 (sd->balance_interval < sd->max_interval)) 4227 (sd->balance_interval < sd->max_interval))
4228 sd->balance_interval *= 2; 4228 sd->balance_interval *= 2;
4229 4229
4230 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4230 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4231 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4231 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4232 ld_moved = -1; 4232 ld_moved = -1;
4233 else 4233 else
4234 ld_moved = 0; 4234 ld_moved = 0;
4235 out: 4235 out:
4236 if (ld_moved) 4236 if (ld_moved)
4237 update_shares(sd); 4237 update_shares(sd);
4238 return ld_moved; 4238 return ld_moved;
4239 } 4239 }
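Worth noting about the backoff above: out_one_pinned doubles sd->balance_interval (in milliseconds) while it is still below sd->max_interval, or below MAX_PINNED_INTERVAL (512) when every candidate task was pinned. With an illustrative starting interval of 8 ms, repeated pinned failures step through 16, 32, 64, ... up to the cap, so a runqueue full of affinity-pinned tasks is reprobed progressively less often.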
4240 4240
4241 /* 4241 /*
4242 * Check this_cpu to ensure it is balanced within domain. Attempt to move 4242 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4243 * tasks if there is an imbalance. 4243 * tasks if there is an imbalance.
4244 * 4244 *
4245 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). 4245 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4246 * this_rq is locked. 4246 * this_rq is locked.
4247 */ 4247 */
4248 static int 4248 static int
4249 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 4249 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4250 { 4250 {
4251 struct sched_group *group; 4251 struct sched_group *group;
4252 struct rq *busiest = NULL; 4252 struct rq *busiest = NULL;
4253 unsigned long imbalance; 4253 unsigned long imbalance;
4254 int ld_moved = 0; 4254 int ld_moved = 0;
4255 int sd_idle = 0; 4255 int sd_idle = 0;
4256 int all_pinned = 0; 4256 int all_pinned = 0;
4257 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4257 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4258 4258
4259 cpumask_setall(cpus); 4259 cpumask_setall(cpus);
4260 4260
4261 /* 4261 /*
4262 * When power savings policy is enabled for the parent domain, idle 4262 * When power savings policy is enabled for the parent domain, idle
4263 * sibling can pick up load irrespective of busy siblings. In this case, 4263 * sibling can pick up load irrespective of busy siblings. In this case,
4264 * let the state of idle sibling percolate up as IDLE, instead of 4264 * let the state of idle sibling percolate up as IDLE, instead of
4265 * portraying it as CPU_NOT_IDLE. 4265 * portraying it as CPU_NOT_IDLE.
4266 */ 4266 */
4267 if (sd->flags & SD_SHARE_CPUPOWER && 4267 if (sd->flags & SD_SHARE_CPUPOWER &&
4268 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4268 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4269 sd_idle = 1; 4269 sd_idle = 1;
4270 4270
4271 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 4271 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4272 redo: 4272 redo:
4273 update_shares_locked(this_rq, sd); 4273 update_shares_locked(this_rq, sd);
4274 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 4274 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4275 &sd_idle, cpus, NULL); 4275 &sd_idle, cpus, NULL);
4276 if (!group) { 4276 if (!group) {
4277 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 4277 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4278 goto out_balanced; 4278 goto out_balanced;
4279 } 4279 }
4280 4280
4281 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); 4281 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4282 if (!busiest) { 4282 if (!busiest) {
4283 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 4283 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4284 goto out_balanced; 4284 goto out_balanced;
4285 } 4285 }
4286 4286
4287 BUG_ON(busiest == this_rq); 4287 BUG_ON(busiest == this_rq);
4288 4288
4289 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 4289 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4290 4290
4291 ld_moved = 0; 4291 ld_moved = 0;
4292 if (busiest->nr_running > 1) { 4292 if (busiest->nr_running > 1) {
4293 /* Attempt to move tasks */ 4293 /* Attempt to move tasks */
4294 double_lock_balance(this_rq, busiest); 4294 double_lock_balance(this_rq, busiest);
4295 /* this_rq->clock is already updated */ 4295 /* this_rq->clock is already updated */
4296 update_rq_clock(busiest); 4296 update_rq_clock(busiest);
4297 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4297 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4298 imbalance, sd, CPU_NEWLY_IDLE, 4298 imbalance, sd, CPU_NEWLY_IDLE,
4299 &all_pinned); 4299 &all_pinned);
4300 double_unlock_balance(this_rq, busiest); 4300 double_unlock_balance(this_rq, busiest);
4301 4301
4302 if (unlikely(all_pinned)) { 4302 if (unlikely(all_pinned)) {
4303 cpumask_clear_cpu(cpu_of(busiest), cpus); 4303 cpumask_clear_cpu(cpu_of(busiest), cpus);
4304 if (!cpumask_empty(cpus)) 4304 if (!cpumask_empty(cpus))
4305 goto redo; 4305 goto redo;
4306 } 4306 }
4307 } 4307 }
4308 4308
4309 if (!ld_moved) { 4309 if (!ld_moved) {
4310 int active_balance = 0; 4310 int active_balance = 0;
4311 4311
4312 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 4312 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4313 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4313 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4314 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4314 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4315 return -1; 4315 return -1;
4316 4316
4317 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 4317 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4318 return -1; 4318 return -1;
4319 4319
4320 if (sd->nr_balance_failed++ < 2) 4320 if (sd->nr_balance_failed++ < 2)
4321 return -1; 4321 return -1;
4322 4322
4323 /* 4323 /*
4324 * The only task running on a non-idle cpu can be moved to this 4324 * The only task running on a non-idle cpu can be moved to this
4325 * cpu in an attempt to completely free up the other CPU 4325 * cpu in an attempt to completely free up the other CPU
4326 * package. The same method used to move tasks in load_balance() 4326 * package. The same method used to move tasks in load_balance()
4327 * has been extended for load_balance_newidle() to speed up 4327 * has been extended for load_balance_newidle() to speed up
4328 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) 4328 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4329 * 4329 *
4330 * The package power saving logic comes from 4330 * The package power saving logic comes from
4331 * find_busiest_group(). If there is no imbalance, then 4331 * find_busiest_group(). If there is no imbalance, then
4332 * f_b_g() will return NULL. However when sched_mc={1,2} then 4332 * f_b_g() will return NULL. However when sched_mc={1,2} then
4333 * f_b_g() will select a group from which a running task may be 4333 * f_b_g() will select a group from which a running task may be
4334 * pulled to this cpu in order to make the other package idle. 4334 * pulled to this cpu in order to make the other package idle.
4335 * If there is no opportunity to make a package idle and if 4335 * If there is no opportunity to make a package idle and if
4336 * there is no imbalance, then f_b_g() will return NULL and no 4336 * there is no imbalance, then f_b_g() will return NULL and no
4337 * action will be taken in load_balance_newidle(). 4337 * action will be taken in load_balance_newidle().
4338 * 4338 *
4339 * Under normal task pull operation due to imbalance, there 4339 * Under normal task pull operation due to imbalance, there
4340 * will be more than one task in the source run queue and 4340 * will be more than one task in the source run queue and
4341 * move_tasks() will succeed. ld_moved will be true and this 4341 * move_tasks() will succeed. ld_moved will be true and this
4342 * active balance code will not be triggered. 4342 * active balance code will not be triggered.
4343 */ 4343 */
4344 4344
4345 /* Lock busiest in correct order while this_rq is held */ 4345 /* Lock busiest in correct order while this_rq is held */
4346 double_lock_balance(this_rq, busiest); 4346 double_lock_balance(this_rq, busiest);
4347 4347
4348 /* 4348 /*
4349 * don't kick the migration_thread if the curr 4349 * don't kick the migration_thread if the curr
4350 * task on busiest cpu can't be moved to this_cpu 4350 * task on busiest cpu can't be moved to this_cpu
4351 */ 4351 */
4352 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 4352 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4353 double_unlock_balance(this_rq, busiest); 4353 double_unlock_balance(this_rq, busiest);
4354 all_pinned = 1; 4354 all_pinned = 1;
4355 return ld_moved; 4355 return ld_moved;
4356 } 4356 }
4357 4357
4358 if (!busiest->active_balance) { 4358 if (!busiest->active_balance) {
4359 busiest->active_balance = 1; 4359 busiest->active_balance = 1;
4360 busiest->push_cpu = this_cpu; 4360 busiest->push_cpu = this_cpu;
4361 active_balance = 1; 4361 active_balance = 1;
4362 } 4362 }
4363 4363
4364 double_unlock_balance(this_rq, busiest); 4364 double_unlock_balance(this_rq, busiest);
4365 /* 4365 /*
4366 * Should not call ttwu while holding a rq->lock 4366 * Should not call ttwu while holding a rq->lock
4367 */ 4367 */
4368 spin_unlock(&this_rq->lock); 4368 spin_unlock(&this_rq->lock);
4369 if (active_balance) 4369 if (active_balance)
4370 wake_up_process(busiest->migration_thread); 4370 wake_up_process(busiest->migration_thread);
4371 spin_lock(&this_rq->lock); 4371 spin_lock(&this_rq->lock);
4372 4372
4373 } else 4373 } else
4374 sd->nr_balance_failed = 0; 4374 sd->nr_balance_failed = 0;
4375 4375
4376 update_shares_locked(this_rq, sd); 4376 update_shares_locked(this_rq, sd);
4377 return ld_moved; 4377 return ld_moved;
4378 4378
4379 out_balanced: 4379 out_balanced:
4380 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 4380 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4381 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4381 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4382 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4382 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4383 return -1; 4383 return -1;
4384 sd->nr_balance_failed = 0; 4384 sd->nr_balance_failed = 0;
4385 4385
4386 return 0; 4386 return 0;
4387 } 4387 }
4388 4388
4389 /* 4389 /*
4390 * idle_balance is called by schedule() if this_cpu is about to become 4390 * idle_balance is called by schedule() if this_cpu is about to become
4391 * idle. Attempts to pull tasks from other CPUs. 4391 * idle. Attempts to pull tasks from other CPUs.
4392 */ 4392 */
4393 static void idle_balance(int this_cpu, struct rq *this_rq) 4393 static void idle_balance(int this_cpu, struct rq *this_rq)
4394 { 4394 {
4395 struct sched_domain *sd; 4395 struct sched_domain *sd;
4396 int pulled_task = 0; 4396 int pulled_task = 0;
4397 unsigned long next_balance = jiffies + HZ; 4397 unsigned long next_balance = jiffies + HZ;
4398 4398
4399 for_each_domain(this_cpu, sd) { 4399 for_each_domain(this_cpu, sd) {
4400 unsigned long interval; 4400 unsigned long interval;
4401 4401
4402 if (!(sd->flags & SD_LOAD_BALANCE)) 4402 if (!(sd->flags & SD_LOAD_BALANCE))
4403 continue; 4403 continue;
4404 4404
4405 if (sd->flags & SD_BALANCE_NEWIDLE) 4405 if (sd->flags & SD_BALANCE_NEWIDLE)
4406 /* If we've pulled tasks over, stop searching: */ 4406 /* If we've pulled tasks over, stop searching: */
4407 pulled_task = load_balance_newidle(this_cpu, this_rq, 4407 pulled_task = load_balance_newidle(this_cpu, this_rq,
4408 sd); 4408 sd);
4409 4409
4410 interval = msecs_to_jiffies(sd->balance_interval); 4410 interval = msecs_to_jiffies(sd->balance_interval);
4411 if (time_after(next_balance, sd->last_balance + interval)) 4411 if (time_after(next_balance, sd->last_balance + interval))
4412 next_balance = sd->last_balance + interval; 4412 next_balance = sd->last_balance + interval;
4413 if (pulled_task) 4413 if (pulled_task)
4414 break; 4414 break;
4415 } 4415 }
4416 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4416 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4417 /* 4417 /*
4418 * We are going idle. next_balance may be set based on 4418 * We are going idle. next_balance may be set based on
4419 * a busy processor. So reset next_balance. 4419 * a busy processor. So reset next_balance.
4420 */ 4420 */
4421 this_rq->next_balance = next_balance; 4421 this_rq->next_balance = next_balance;
4422 } 4422 }
4423 } 4423 }
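idle_balance() keeps next_balance as the minimum of sd->last_balance + interval over the domains it walks, starting from a default of jiffies + HZ. With illustrative deadlines of jiffies + 4 and jiffies + 16 for two domains, next_balance ends up at jiffies + 4, and this_rq->next_balance is reset to it when a task was pulled or when the old value (possibly set while busy) has already expired.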
4424 4424
4425 /* 4425 /*
4426 * active_load_balance is run by migration threads. It pushes running tasks 4426 * active_load_balance is run by migration threads. It pushes running tasks
4427 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 4427 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4428 * running on each physical CPU where possible, and avoids physical / 4428 * running on each physical CPU where possible, and avoids physical /
4429 * logical imbalances. 4429 * logical imbalances.
4430 * 4430 *
4431 * Called with busiest_rq locked. 4431 * Called with busiest_rq locked.
4432 */ 4432 */
4433 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 4433 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4434 { 4434 {
4435 int target_cpu = busiest_rq->push_cpu; 4435 int target_cpu = busiest_rq->push_cpu;
4436 struct sched_domain *sd; 4436 struct sched_domain *sd;
4437 struct rq *target_rq; 4437 struct rq *target_rq;
4438 4438
4439 /* Is there any task to move? */ 4439 /* Is there any task to move? */
4440 if (busiest_rq->nr_running <= 1) 4440 if (busiest_rq->nr_running <= 1)
4441 return; 4441 return;
4442 4442
4443 target_rq = cpu_rq(target_cpu); 4443 target_rq = cpu_rq(target_cpu);
4444 4444
4445 /* 4445 /*
4446 * This condition is "impossible"; if it occurs 4446 * This condition is "impossible"; if it occurs
4447 * we need to fix it. Originally reported by 4447 * we need to fix it. Originally reported by
4448 * Bjorn Helgaas on a 128-cpu setup. 4448 * Bjorn Helgaas on a 128-cpu setup.
4449 */ 4449 */
4450 BUG_ON(busiest_rq == target_rq); 4450 BUG_ON(busiest_rq == target_rq);
4451 4451
4452 /* move a task from busiest_rq to target_rq */ 4452 /* move a task from busiest_rq to target_rq */
4453 double_lock_balance(busiest_rq, target_rq); 4453 double_lock_balance(busiest_rq, target_rq);
4454 update_rq_clock(busiest_rq); 4454 update_rq_clock(busiest_rq);
4455 update_rq_clock(target_rq); 4455 update_rq_clock(target_rq);
4456 4456
4457 /* Search for an sd spanning us and the target CPU. */ 4457 /* Search for an sd spanning us and the target CPU. */
4458 for_each_domain(target_cpu, sd) { 4458 for_each_domain(target_cpu, sd) {
4459 if ((sd->flags & SD_LOAD_BALANCE) && 4459 if ((sd->flags & SD_LOAD_BALANCE) &&
4460 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 4460 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4461 break; 4461 break;
4462 } 4462 }
4463 4463
4464 if (likely(sd)) { 4464 if (likely(sd)) {
4465 schedstat_inc(sd, alb_count); 4465 schedstat_inc(sd, alb_count);
4466 4466
4467 if (move_one_task(target_rq, target_cpu, busiest_rq, 4467 if (move_one_task(target_rq, target_cpu, busiest_rq,
4468 sd, CPU_IDLE)) 4468 sd, CPU_IDLE))
4469 schedstat_inc(sd, alb_pushed); 4469 schedstat_inc(sd, alb_pushed);
4470 else 4470 else
4471 schedstat_inc(sd, alb_failed); 4471 schedstat_inc(sd, alb_failed);
4472 } 4472 }
4473 double_unlock_balance(busiest_rq, target_rq); 4473 double_unlock_balance(busiest_rq, target_rq);
4474 } 4474 }
4475 4475
4476 #ifdef CONFIG_NO_HZ 4476 #ifdef CONFIG_NO_HZ
4477 static struct { 4477 static struct {
4478 atomic_t load_balancer; 4478 atomic_t load_balancer;
4479 cpumask_var_t cpu_mask; 4479 cpumask_var_t cpu_mask;
4480 cpumask_var_t ilb_grp_nohz_mask; 4480 cpumask_var_t ilb_grp_nohz_mask;
4481 } nohz ____cacheline_aligned = { 4481 } nohz ____cacheline_aligned = {
4482 .load_balancer = ATOMIC_INIT(-1), 4482 .load_balancer = ATOMIC_INIT(-1),
4483 }; 4483 };
4484 4484
4485 int get_nohz_load_balancer(void) 4485 int get_nohz_load_balancer(void)
4486 { 4486 {
4487 return atomic_read(&nohz.load_balancer); 4487 return atomic_read(&nohz.load_balancer);
4488 } 4488 }
4489 4489
4490 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4490 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4491 /** 4491 /**
4492 * lowest_flag_domain - Return lowest sched_domain containing flag. 4492 * lowest_flag_domain - Return lowest sched_domain containing flag.
4493 * @cpu: The cpu whose lowest level of sched domain is to 4493 * @cpu: The cpu whose lowest level of sched domain is to
4494 * be returned. 4494 * be returned.
4495 * @flag: The flag to check for the lowest sched_domain 4495 * @flag: The flag to check for the lowest sched_domain
4496 * for the given cpu. 4496 * for the given cpu.
4497 * 4497 *
4498 * Returns the lowest sched_domain of a cpu which contains the given flag. 4498 * Returns the lowest sched_domain of a cpu which contains the given flag.
4499 */ 4499 */
4500 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) 4500 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4501 { 4501 {
4502 struct sched_domain *sd; 4502 struct sched_domain *sd;
4503 4503
4504 for_each_domain(cpu, sd) 4504 for_each_domain(cpu, sd)
4505 if (sd && (sd->flags & flag)) 4505 if (sd && (sd->flags & flag))
4506 break; 4506 break;
4507 4507
4508 return sd; 4508 return sd;
4509 } 4509 }
4510 4510
4511 /** 4511 /**
4512 * for_each_flag_domain - Iterates over sched_domains containing the flag. 4512 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4513 * @cpu: The cpu whose domains we're iterating over. 4513 * @cpu: The cpu whose domains we're iterating over.
4514 * @sd: variable holding the value of the power_savings_sd 4514 * @sd: variable holding the value of the power_savings_sd
4515 * for cpu. 4515 * for cpu.
4516 * @flag: The flag to filter the sched_domains to be iterated. 4516 * @flag: The flag to filter the sched_domains to be iterated.
4517 * 4517 *
4518 * Iterates over all the scheduler domains of a given cpu that have the 'flag' 4518 * Iterates over all the scheduler domains of a given cpu that have the 'flag'
4519 * set, starting from the lowest sched_domain to the highest. 4519 * set, starting from the lowest sched_domain to the highest.
4520 */ 4520 */
4521 #define for_each_flag_domain(cpu, sd, flag) \ 4521 #define for_each_flag_domain(cpu, sd, flag) \
4522 for (sd = lowest_flag_domain(cpu, flag); \ 4522 for (sd = lowest_flag_domain(cpu, flag); \
4523 (sd && (sd->flags & flag)); sd = sd->parent) 4523 (sd && (sd->flags & flag)); sd = sd->parent)
4524 4524
4525 /** 4525 /**
4526 * is_semi_idle_group - Checks if the given sched_group is semi-idle. 4526 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4527 * @ilb_group: group to be checked for semi-idleness 4527 * @ilb_group: group to be checked for semi-idleness
4528 * 4528 *
4529 * Returns: 1 if the group is semi-idle. 0 otherwise. 4529 * Returns: 1 if the group is semi-idle. 0 otherwise.
4530 * 4530 *
4531 * We define a sched_group to be semi-idle if it has at least one idle CPU 4531 * We define a sched_group to be semi-idle if it has at least one idle CPU
4532 * and at least one non-idle CPU. This helper function checks if the given 4532 * and at least one non-idle CPU. This helper function checks if the given
4533 * sched_group is semi-idle or not. 4533 * sched_group is semi-idle or not.
4534 */ 4534 */
4535 static inline int is_semi_idle_group(struct sched_group *ilb_group) 4535 static inline int is_semi_idle_group(struct sched_group *ilb_group)
4536 { 4536 {
4537 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 4537 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4538 sched_group_cpus(ilb_group)); 4538 sched_group_cpus(ilb_group));
4539 4539
4540 /* 4540 /*
4541 * A sched_group is semi-idle when it has at least one busy cpu 4541 * A sched_group is semi-idle when it has at least one busy cpu
4542 * and at least one idle cpu. 4542 * and at least one idle cpu.
4543 */ 4543 */
4544 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 4544 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4545 return 0; 4545 return 0;
4546 4546
4547 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 4547 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4548 return 0; 4548 return 0;
4549 4549
4550 return 1; 4550 return 1;
4551 } 4551 }
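A concrete, hypothetical reading of the two checks above: if ilb_group spans cpus {4,5,6,7} and nohz.cpu_mask currently holds {5,7}, the intersection {5,7} is neither empty nor equal to the whole group, so the group has both busy and tickless-idle cpus and is_semi_idle_group() returns 1. An all-busy group yields an empty intersection and an all-idle group yields an intersection equal to the group; both return 0.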
4552 /** 4552 /**
4553 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4553 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4554 * @cpu: The cpu which is nominating a new idle_load_balancer. 4554 * @cpu: The cpu which is nominating a new idle_load_balancer.
4555 * 4555 *
4556 * Returns: The id of the idle load balancer if it exists; 4556 * Returns: The id of the idle load balancer if it exists;
4557 * else, a value >= nr_cpu_ids. 4557 * else, a value >= nr_cpu_ids.
4558 * 4558 *
4559 * This algorithm picks the idle load balancer such that it belongs to a 4559 * This algorithm picks the idle load balancer such that it belongs to a
4560 * semi-idle powersavings sched_domain. The idea is to avoid waking up 4560 * semi-idle powersavings sched_domain. The idea is to avoid waking up
4561 * completely idle packages/cores just for the purpose of idle load balancing 4561 * completely idle packages/cores just for the purpose of idle load balancing
4562 * when there are other idle cpus which are better suited for that job. 4562 * when there are other idle cpus which are better suited for that job.
4563 */ 4563 */
4564 static int find_new_ilb(int cpu) 4564 static int find_new_ilb(int cpu)
4565 { 4565 {
4566 struct sched_domain *sd; 4566 struct sched_domain *sd;
4567 struct sched_group *ilb_group; 4567 struct sched_group *ilb_group;
4568 4568
4569 /* 4569 /*
4570 * Have idle load balancer selection from semi-idle packages only 4570 * Have idle load balancer selection from semi-idle packages only
4571 * when power-aware load balancing is enabled 4571 * when power-aware load balancing is enabled
4572 */ 4572 */
4573 if (!(sched_smt_power_savings || sched_mc_power_savings)) 4573 if (!(sched_smt_power_savings || sched_mc_power_savings))
4574 goto out_done; 4574 goto out_done;
4575 4575
4576 /* 4576 /*
4577 * Optimize for the case when we have no idle CPUs or only one 4577 * Optimize for the case when we have no idle CPUs or only one
4578 * idle CPU. Don't walk the sched_domain hierarchy in such cases 4578 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4579 */ 4579 */
4580 if (cpumask_weight(nohz.cpu_mask) < 2) 4580 if (cpumask_weight(nohz.cpu_mask) < 2)
4581 goto out_done; 4581 goto out_done;
4582 4582
4583 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4583 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4584 ilb_group = sd->groups; 4584 ilb_group = sd->groups;
4585 4585
4586 do { 4586 do {
4587 if (is_semi_idle_group(ilb_group)) 4587 if (is_semi_idle_group(ilb_group))
4588 return cpumask_first(nohz.ilb_grp_nohz_mask); 4588 return cpumask_first(nohz.ilb_grp_nohz_mask);
4589 4589
4590 ilb_group = ilb_group->next; 4590 ilb_group = ilb_group->next;
4591 4591
4592 } while (ilb_group != sd->groups); 4592 } while (ilb_group != sd->groups);
4593 } 4593 }
4594 4594
4595 out_done: 4595 out_done:
4596 return cpumask_first(nohz.cpu_mask); 4596 return cpumask_first(nohz.cpu_mask);
4597 } 4597 }
4598 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4598 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4599 static inline int find_new_ilb(int call_cpu) 4599 static inline int find_new_ilb(int call_cpu)
4600 { 4600 {
4601 return cpumask_first(nohz.cpu_mask); 4601 return cpumask_first(nohz.cpu_mask);
4602 } 4602 }
4603 #endif 4603 #endif
4604 4604
4605 /* 4605 /*
4606 * This routine will try to nominate the ilb (idle load balancing) 4606 * This routine will try to nominate the ilb (idle load balancing)
4607 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle 4607 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle
4608 * load balancing on behalf of all those cpus. If all the cpus in the system 4608 * load balancing on behalf of all those cpus. If all the cpus in the system
4609 * go into this tickless mode, then there will be no ilb owner (as there is 4609 * go into this tickless mode, then there will be no ilb owner (as there is
4610 * no need for one) and all the cpus will sleep till the next wakeup event 4610 * no need for one) and all the cpus will sleep till the next wakeup event
4611 * arrives... 4611 * arrives...
4612 * 4612 *
4613 * For the ilb owner, the tick is not stopped, and this tick will be used 4613 * For the ilb owner, the tick is not stopped, and this tick will be used
4614 * for idle load balancing. The ilb owner will still be part of 4614 * for idle load balancing. The ilb owner will still be part of
4615 * nohz.cpu_mask. 4615 * nohz.cpu_mask.
4616 * 4616 *
4617 * While stopping the tick, this cpu will become the ilb owner if there 4617 * While stopping the tick, this cpu will become the ilb owner if there
4618 * is no other owner. It will remain the owner till that cpu becomes busy 4618 * is no other owner. It will remain the owner till that cpu becomes busy
4619 * or till all cpus in the system stop their ticks, at which point 4619 * or till all cpus in the system stop their ticks, at which point
4620 * there is no need for an ilb owner. 4620 * there is no need for an ilb owner.
4621 * 4621 *
4622 * When the ilb owner becomes busy, it nominates another owner, during the 4622 * When the ilb owner becomes busy, it nominates another owner, during the
4623 * next busy scheduler_tick() 4623 * next busy scheduler_tick()
4624 */ 4624 */
4625 int select_nohz_load_balancer(int stop_tick) 4625 int select_nohz_load_balancer(int stop_tick)
4626 { 4626 {
4627 int cpu = smp_processor_id(); 4627 int cpu = smp_processor_id();
4628 4628
4629 if (stop_tick) { 4629 if (stop_tick) {
4630 cpu_rq(cpu)->in_nohz_recently = 1; 4630 cpu_rq(cpu)->in_nohz_recently = 1;
4631 4631
4632 if (!cpu_active(cpu)) { 4632 if (!cpu_active(cpu)) {
4633 if (atomic_read(&nohz.load_balancer) != cpu) 4633 if (atomic_read(&nohz.load_balancer) != cpu)
4634 return 0; 4634 return 0;
4635 4635
4636 /* 4636 /*
4637 * If we are going offline and still the leader, 4637 * If we are going offline and still the leader,
4638 * give up! 4638 * give up!
4639 */ 4639 */
4640 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4640 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4641 BUG(); 4641 BUG();
4642 4642
4643 return 0; 4643 return 0;
4644 } 4644 }
4645 4645
4646 cpumask_set_cpu(cpu, nohz.cpu_mask); 4646 cpumask_set_cpu(cpu, nohz.cpu_mask);
4647 4647
4648 /* time for ilb owner also to sleep */ 4648 /* time for ilb owner also to sleep */
4649 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4649 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4650 if (atomic_read(&nohz.load_balancer) == cpu) 4650 if (atomic_read(&nohz.load_balancer) == cpu)
4651 atomic_set(&nohz.load_balancer, -1); 4651 atomic_set(&nohz.load_balancer, -1);
4652 return 0; 4652 return 0;
4653 } 4653 }
4654 4654
4655 if (atomic_read(&nohz.load_balancer) == -1) { 4655 if (atomic_read(&nohz.load_balancer) == -1) {
4656 /* make me the ilb owner */ 4656 /* make me the ilb owner */
4657 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4657 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4658 return 1; 4658 return 1;
4659 } else if (atomic_read(&nohz.load_balancer) == cpu) { 4659 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4660 int new_ilb; 4660 int new_ilb;
4661 4661
4662 if (!(sched_smt_power_savings || 4662 if (!(sched_smt_power_savings ||
4663 sched_mc_power_savings)) 4663 sched_mc_power_savings))
4664 return 1; 4664 return 1;
4665 /* 4665 /*
4666 * Check to see if there is a more power-efficient 4666 * Check to see if there is a more power-efficient
4667 * ilb. 4667 * ilb.
4668 */ 4668 */
4669 new_ilb = find_new_ilb(cpu); 4669 new_ilb = find_new_ilb(cpu);
4670 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 4670 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4671 atomic_set(&nohz.load_balancer, -1); 4671 atomic_set(&nohz.load_balancer, -1);
4672 resched_cpu(new_ilb); 4672 resched_cpu(new_ilb);
4673 return 0; 4673 return 0;
4674 } 4674 }
4675 return 1; 4675 return 1;
4676 } 4676 }
4677 } else { 4677 } else {
4678 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4678 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4679 return 0; 4679 return 0;
4680 4680
4681 cpumask_clear_cpu(cpu, nohz.cpu_mask); 4681 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4682 4682
4683 if (atomic_read(&nohz.load_balancer) == cpu) 4683 if (atomic_read(&nohz.load_balancer) == cpu)
4684 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4684 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4685 BUG(); 4685 BUG();
4686 } 4686 }
4687 return 0; 4687 return 0;
4688 } 4688 }
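The ownership hand-off above is a compare-and-swap election on nohz.load_balancer, with -1 meaning "no owner". The following user-space analogue, written with C11 atomics rather than the kernel's atomic_t API, is only meant to illustrate why at most one tickless cpu can win the ilb role:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);	/* -1: no ilb owner */

/* Returns 1 if @cpu became the idle load balancer, 0 otherwise. */
static int try_become_ilb(int cpu)
{
	int expected = -1;

	/* Mirrors atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1. */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

int main(void)
{
	printf("cpu1 wins: %d\n", try_become_ilb(1));	/* 1: first claimant wins */
	printf("cpu2 wins: %d\n", try_become_ilb(2));	/* 0: owner already set */
	printf("owner: %d\n", atomic_load(&load_balancer));	/* 1 */
	return 0;
}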
4689 #endif 4689 #endif
4690 4690
4691 static DEFINE_SPINLOCK(balancing); 4691 static DEFINE_SPINLOCK(balancing);
4692 4692
4693 /* 4693 /*
4694 * It checks each scheduling domain to see if it is due to be balanced, 4694 * It checks each scheduling domain to see if it is due to be balanced,
4695 * and initiates a balancing operation if so. 4695 * and initiates a balancing operation if so.
4696 * 4696 *
4697 * Balancing parameters are set up in arch_init_sched_domains. 4697 * Balancing parameters are set up in arch_init_sched_domains.
4698 */ 4698 */
4699 static void rebalance_domains(int cpu, enum cpu_idle_type idle) 4699 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4700 { 4700 {
4701 int balance = 1; 4701 int balance = 1;
4702 struct rq *rq = cpu_rq(cpu); 4702 struct rq *rq = cpu_rq(cpu);
4703 unsigned long interval; 4703 unsigned long interval;
4704 struct sched_domain *sd; 4704 struct sched_domain *sd;
4705 /* Earliest time when we have to do rebalance again */ 4705 /* Earliest time when we have to do rebalance again */
4706 unsigned long next_balance = jiffies + 60*HZ; 4706 unsigned long next_balance = jiffies + 60*HZ;
4707 int update_next_balance = 0; 4707 int update_next_balance = 0;
4708 int need_serialize; 4708 int need_serialize;
4709 4709
4710 for_each_domain(cpu, sd) { 4710 for_each_domain(cpu, sd) {
4711 if (!(sd->flags & SD_LOAD_BALANCE)) 4711 if (!(sd->flags & SD_LOAD_BALANCE))
4712 continue; 4712 continue;
4713 4713
4714 interval = sd->balance_interval; 4714 interval = sd->balance_interval;
4715 if (idle != CPU_IDLE) 4715 if (idle != CPU_IDLE)
4716 interval *= sd->busy_factor; 4716 interval *= sd->busy_factor;
4717 4717
4718 /* scale ms to jiffies */ 4718 /* scale ms to jiffies */
4719 interval = msecs_to_jiffies(interval); 4719 interval = msecs_to_jiffies(interval);
4720 if (unlikely(!interval)) 4720 if (unlikely(!interval))
4721 interval = 1; 4721 interval = 1;
4722 if (interval > HZ*NR_CPUS/10) 4722 if (interval > HZ*NR_CPUS/10)
4723 interval = HZ*NR_CPUS/10; 4723 interval = HZ*NR_CPUS/10;
4724 4724
4725 need_serialize = sd->flags & SD_SERIALIZE; 4725 need_serialize = sd->flags & SD_SERIALIZE;
4726 4726
4727 if (need_serialize) { 4727 if (need_serialize) {
4728 if (!spin_trylock(&balancing)) 4728 if (!spin_trylock(&balancing))
4729 goto out; 4729 goto out;
4730 } 4730 }
4731 4731
4732 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4732 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4733 if (load_balance(cpu, rq, sd, idle, &balance)) { 4733 if (load_balance(cpu, rq, sd, idle, &balance)) {
4734 /* 4734 /*
4735 * We've pulled tasks over so either we're no 4735 * We've pulled tasks over so either we're no
4736 * longer idle, or one of our SMT siblings is 4736 * longer idle, or one of our SMT siblings is
4737 * not idle. 4737 * not idle.
4738 */ 4738 */
4739 idle = CPU_NOT_IDLE; 4739 idle = CPU_NOT_IDLE;
4740 } 4740 }
4741 sd->last_balance = jiffies; 4741 sd->last_balance = jiffies;
4742 } 4742 }
4743 if (need_serialize) 4743 if (need_serialize)
4744 spin_unlock(&balancing); 4744 spin_unlock(&balancing);
4745 out: 4745 out:
4746 if (time_after(next_balance, sd->last_balance + interval)) { 4746 if (time_after(next_balance, sd->last_balance + interval)) {
4747 next_balance = sd->last_balance + interval; 4747 next_balance = sd->last_balance + interval;
4748 update_next_balance = 1; 4748 update_next_balance = 1;
4749 } 4749 }
4750 4750
4751 /* 4751 /*
4752 * Stop the load balance at this level. There is another 4752 * Stop the load balance at this level. There is another
4753 * CPU in our sched group which is doing load balancing more 4753 * CPU in our sched group which is doing load balancing more
4754 * actively. 4754 * actively.
4755 */ 4755 */
4756 if (!balance) 4756 if (!balance)
4757 break; 4757 break;
4758 } 4758 }
4759 4759
4760 /* 4760 /*
4761 * next_balance will be updated only when there is a need. 4761 * next_balance will be updated only when there is a need.
4762 * When the cpu is attached to null domain for ex, it will not be 4762 * When the cpu is attached to null domain for ex, it will not be
4763 * updated. 4763 * updated.
4764 */ 4764 */
4765 if (likely(update_next_balance)) 4765 if (likely(update_next_balance))
4766 rq->next_balance = next_balance; 4766 rq->next_balance = next_balance;
4767 } 4767 }
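The interval arithmetic in rebalance_domains() is done in milliseconds until the msecs_to_jiffies() call. With purely illustrative values of balance_interval = 64 and busy_factor = 32, a non-idle cpu considers this domain only about every 64 * 32 = 2048 ms, while an idle cpu keeps the raw 64 ms; the result is then converted to jiffies, forced to at least 1, and clamped to HZ*NR_CPUS/10.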
4768 4768
4769 /* 4769 /*
4770 * run_rebalance_domains is triggered when needed from the scheduler tick. 4770 * run_rebalance_domains is triggered when needed from the scheduler tick.
4771 * In CONFIG_NO_HZ case, the idle load balance owner will do the 4771 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4772 * rebalancing for all the cpus for whom scheduler ticks are stopped. 4772 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4773 */ 4773 */
4774 static void run_rebalance_domains(struct softirq_action *h) 4774 static void run_rebalance_domains(struct softirq_action *h)
4775 { 4775 {
4776 int this_cpu = smp_processor_id(); 4776 int this_cpu = smp_processor_id();
4777 struct rq *this_rq = cpu_rq(this_cpu); 4777 struct rq *this_rq = cpu_rq(this_cpu);
4778 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4778 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4779 CPU_IDLE : CPU_NOT_IDLE; 4779 CPU_IDLE : CPU_NOT_IDLE;
4780 4780
4781 rebalance_domains(this_cpu, idle); 4781 rebalance_domains(this_cpu, idle);
4782 4782
4783 #ifdef CONFIG_NO_HZ 4783 #ifdef CONFIG_NO_HZ
4784 /* 4784 /*
4785 * If this cpu is the owner for idle load balancing, then do the 4785 * If this cpu is the owner for idle load balancing, then do the
4786 * balancing on behalf of the other idle cpus whose ticks are 4786 * balancing on behalf of the other idle cpus whose ticks are
4787 * stopped. 4787 * stopped.
4788 */ 4788 */
4789 if (this_rq->idle_at_tick && 4789 if (this_rq->idle_at_tick &&
4790 atomic_read(&nohz.load_balancer) == this_cpu) { 4790 atomic_read(&nohz.load_balancer) == this_cpu) {
4791 struct rq *rq; 4791 struct rq *rq;
4792 int balance_cpu; 4792 int balance_cpu;
4793 4793
4794 for_each_cpu(balance_cpu, nohz.cpu_mask) { 4794 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4795 if (balance_cpu == this_cpu) 4795 if (balance_cpu == this_cpu)
4796 continue; 4796 continue;
4797 4797
4798 /* 4798 /*
4799 * If this cpu gets work to do, stop the load balancing 4799 * If this cpu gets work to do, stop the load balancing
4800 * work being done for other cpus. The next load 4800 * work being done for other cpus. The next load
4801 * balancing owner will pick it up. 4801 * balancing owner will pick it up.
4802 */ 4802 */
4803 if (need_resched()) 4803 if (need_resched())
4804 break; 4804 break;
4805 4805
4806 rebalance_domains(balance_cpu, CPU_IDLE); 4806 rebalance_domains(balance_cpu, CPU_IDLE);
4807 4807
4808 rq = cpu_rq(balance_cpu); 4808 rq = cpu_rq(balance_cpu);
4809 if (time_after(this_rq->next_balance, rq->next_balance)) 4809 if (time_after(this_rq->next_balance, rq->next_balance))
4810 this_rq->next_balance = rq->next_balance; 4810 this_rq->next_balance = rq->next_balance;
4811 } 4811 }
4812 } 4812 }
4813 #endif 4813 #endif
4814 } 4814 }
4815 4815
4816 static inline int on_null_domain(int cpu) 4816 static inline int on_null_domain(int cpu)
4817 { 4817 {
4818 return !rcu_dereference(cpu_rq(cpu)->sd); 4818 return !rcu_dereference(cpu_rq(cpu)->sd);
4819 } 4819 }
4820 4820
4821 /* 4821 /*
4822 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4822 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4823 * 4823 *
4824 * In case of CONFIG_NO_HZ, this is the place where we nominate a new 4824 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4825 * idle load balancing owner or decide to stop the periodic load balancing, 4825 * idle load balancing owner or decide to stop the periodic load balancing,
4826 * if the whole system is idle. 4826 * if the whole system is idle.
4827 */ 4827 */
4828 static inline void trigger_load_balance(struct rq *rq, int cpu) 4828 static inline void trigger_load_balance(struct rq *rq, int cpu)
4829 { 4829 {
4830 #ifdef CONFIG_NO_HZ 4830 #ifdef CONFIG_NO_HZ
4831 /* 4831 /*
4832 * If we were in the nohz mode recently and busy at the current 4832 * If we were in the nohz mode recently and busy at the current
4833 * scheduler tick, then check if we need to nominate a new idle 4833 * scheduler tick, then check if we need to nominate a new idle
4834 * load balancer. 4834 * load balancer.
4835 */ 4835 */
4836 if (rq->in_nohz_recently && !rq->idle_at_tick) { 4836 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4837 rq->in_nohz_recently = 0; 4837 rq->in_nohz_recently = 0;
4838 4838
4839 if (atomic_read(&nohz.load_balancer) == cpu) { 4839 if (atomic_read(&nohz.load_balancer) == cpu) {
4840 cpumask_clear_cpu(cpu, nohz.cpu_mask); 4840 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4841 atomic_set(&nohz.load_balancer, -1); 4841 atomic_set(&nohz.load_balancer, -1);
4842 } 4842 }
4843 4843
4844 if (atomic_read(&nohz.load_balancer) == -1) { 4844 if (atomic_read(&nohz.load_balancer) == -1) {
4845 int ilb = find_new_ilb(cpu); 4845 int ilb = find_new_ilb(cpu);
4846 4846
4847 if (ilb < nr_cpu_ids) 4847 if (ilb < nr_cpu_ids)
4848 resched_cpu(ilb); 4848 resched_cpu(ilb);
4849 } 4849 }
4850 } 4850 }
4851 4851
4852 /* 4852 /*
4853 * If this cpu is idle and doing idle load balancing for all the 4853 * If this cpu is idle and doing idle load balancing for all the
4854 * cpus with ticks stopped, is it time for that to stop? 4854 * cpus with ticks stopped, is it time for that to stop?
4855 */ 4855 */
4856 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4856 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4857 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4857 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4858 resched_cpu(cpu); 4858 resched_cpu(cpu);
4859 return; 4859 return;
4860 } 4860 }
4861 4861
4862 /* 4862 /*
4863 * If this cpu is idle and the idle load balancing is done by 4863 * If this cpu is idle and the idle load balancing is done by
4864 * someone else, then there is no need to raise the SCHED_SOFTIRQ 4864 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4865 */ 4865 */
4866 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4866 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4867 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4867 cpumask_test_cpu(cpu, nohz.cpu_mask))
4868 return; 4868 return;
4869 #endif 4869 #endif
4870 /* Don't need to rebalance while attached to NULL domain */ 4870 /* Don't need to rebalance while attached to NULL domain */
4871 if (time_after_eq(jiffies, rq->next_balance) && 4871 if (time_after_eq(jiffies, rq->next_balance) &&
4872 likely(!on_null_domain(cpu))) 4872 likely(!on_null_domain(cpu)))
4873 raise_softirq(SCHED_SOFTIRQ); 4873 raise_softirq(SCHED_SOFTIRQ);
4874 } 4874 }
4875 4875
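Aside: the SCHED_SOFTIRQ raised by trigger_load_balance() only does anything because its handler is registered at scheduler-init time elsewhere in this file (not part of this hunk). Paraphrased, the wiring looks roughly like this, with run_rebalance_domains() being the handler that ends up calling rebalance_domains() and the nohz path above:

/* In sched_init(), under CONFIG_SMP (paraphrased, not shown in this hunk): */
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);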
4876 #else /* CONFIG_SMP */ 4876 #else /* CONFIG_SMP */
4877 4877
4878 /* 4878 /*
4879 * on UP we do not need to balance between CPUs: 4879 * on UP we do not need to balance between CPUs:
4880 */ 4880 */
4881 static inline void idle_balance(int cpu, struct rq *rq) 4881 static inline void idle_balance(int cpu, struct rq *rq)
4882 { 4882 {
4883 } 4883 }
4884 4884
4885 #endif 4885 #endif
4886 4886
4887 DEFINE_PER_CPU(struct kernel_stat, kstat); 4887 DEFINE_PER_CPU(struct kernel_stat, kstat);
4888 4888
4889 EXPORT_PER_CPU_SYMBOL(kstat); 4889 EXPORT_PER_CPU_SYMBOL(kstat);
4890 4890
4891 /* 4891 /*
4892 * Return any ns on the sched_clock that have not yet been accounted in 4892 * Return any ns on the sched_clock that have not yet been accounted in
4893 * @p in case that task is currently running. 4893 * @p in case that task is currently running.
4894 * 4894 *
4895 * Called with task_rq_lock() held on @rq. 4895 * Called with task_rq_lock() held on @rq.
4896 */ 4896 */
4897 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 4897 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4898 { 4898 {
4899 u64 ns = 0; 4899 u64 ns = 0;
4900 4900
4901 if (task_current(rq, p)) { 4901 if (task_current(rq, p)) {
4902 update_rq_clock(rq); 4902 update_rq_clock(rq);
4903 ns = rq->clock - p->se.exec_start; 4903 ns = rq->clock - p->se.exec_start;
4904 if ((s64)ns < 0) 4904 if ((s64)ns < 0)
4905 ns = 0; 4905 ns = 0;
4906 } 4906 }
4907 4907
4908 return ns; 4908 return ns;
4909 } 4909 }
4910 4910
4911 unsigned long long task_delta_exec(struct task_struct *p) 4911 unsigned long long task_delta_exec(struct task_struct *p)
4912 { 4912 {
4913 unsigned long flags; 4913 unsigned long flags;
4914 struct rq *rq; 4914 struct rq *rq;
4915 u64 ns = 0; 4915 u64 ns = 0;
4916 4916
4917 rq = task_rq_lock(p, &flags); 4917 rq = task_rq_lock(p, &flags);
4918 ns = do_task_delta_exec(p, rq); 4918 ns = do_task_delta_exec(p, rq);
4919 task_rq_unlock(rq, &flags); 4919 task_rq_unlock(rq, &flags);
4920 4920
4921 return ns; 4921 return ns;
4922 } 4922 }
4923 4923
4924 /* 4924 /*
4925 * Return accounted runtime for the task. 4925 * Return accounted runtime for the task.
4926 * In case the task is currently running, return the runtime plus current's 4926 * In case the task is currently running, return the runtime plus current's
4927 * pending runtime that has not been accounted yet. 4927 * pending runtime that has not been accounted yet.
4928 */ 4928 */
4929 unsigned long long task_sched_runtime(struct task_struct *p) 4929 unsigned long long task_sched_runtime(struct task_struct *p)
4930 { 4930 {
4931 unsigned long flags; 4931 unsigned long flags;
4932 struct rq *rq; 4932 struct rq *rq;
4933 u64 ns = 0; 4933 u64 ns = 0;
4934 4934
4935 rq = task_rq_lock(p, &flags); 4935 rq = task_rq_lock(p, &flags);
4936 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 4936 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4937 task_rq_unlock(rq, &flags); 4937 task_rq_unlock(rq, &flags);
4938 4938
4939 return ns; 4939 return ns;
4940 } 4940 }
4941 4941
4942 /* 4942 /*
4943 * Return sum_exec_runtime for the thread group. 4943 * Return sum_exec_runtime for the thread group.
4944 * In case the task is currently running, return the sum plus current's 4944 * In case the task is currently running, return the sum plus current's
4945 * pending runtime that has not been accounted yet. 4945 * pending runtime that has not been accounted yet.
4946 * 4946 *
4947 * Note that the thread group might have other running tasks as well, 4947 * Note that the thread group might have other running tasks as well,
4948 * so the return value does not include other pending runtime that other 4948 * so the return value does not include other pending runtime that other
4949 * running tasks might have. 4949 * running tasks might have.
4950 */ 4950 */
4951 unsigned long long thread_group_sched_runtime(struct task_struct *p) 4951 unsigned long long thread_group_sched_runtime(struct task_struct *p)
4952 { 4952 {
4953 struct task_cputime totals; 4953 struct task_cputime totals;
4954 unsigned long flags; 4954 unsigned long flags;
4955 struct rq *rq; 4955 struct rq *rq;
4956 u64 ns; 4956 u64 ns;
4957 4957
4958 rq = task_rq_lock(p, &flags); 4958 rq = task_rq_lock(p, &flags);
4959 thread_group_cputime(p, &totals); 4959 thread_group_cputime(p, &totals);
4960 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 4960 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4961 task_rq_unlock(rq, &flags); 4961 task_rq_unlock(rq, &flags);
4962 4962
4963 return ns; 4963 return ns;
4964 } 4964 }
4965 4965
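task_sched_runtime() and thread_group_sched_runtime() sit underneath the POSIX per-thread and per-process CPU clocks (via kernel/posix-cpu-timers.c), so their effect is easiest to see from userspace. A minimal probe, assuming nothing beyond a standard libc (link with -lrt on older toolchains):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* Per-thread CPU time; backed by task_sched_runtime(). */
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
        printf("thread  cputime: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);

        /* Whole thread group; backed by thread_group_sched_runtime(). */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
        printf("process cputime: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}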
4966 /* 4966 /*
4967 * Account user cpu time to a process. 4967 * Account user cpu time to a process.
4968 * @p: the process that the cpu time gets accounted to 4968 * @p: the process that the cpu time gets accounted to
4969 * @cputime: the cpu time spent in user space since the last update 4969 * @cputime: the cpu time spent in user space since the last update
4970 * @cputime_scaled: cputime scaled by cpu frequency 4970 * @cputime_scaled: cputime scaled by cpu frequency
4971 */ 4971 */
4972 void account_user_time(struct task_struct *p, cputime_t cputime, 4972 void account_user_time(struct task_struct *p, cputime_t cputime,
4973 cputime_t cputime_scaled) 4973 cputime_t cputime_scaled)
4974 { 4974 {
4975 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4975 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4976 cputime64_t tmp; 4976 cputime64_t tmp;
4977 4977
4978 /* Add user time to process. */ 4978 /* Add user time to process. */
4979 p->utime = cputime_add(p->utime, cputime); 4979 p->utime = cputime_add(p->utime, cputime);
4980 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 4980 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4981 account_group_user_time(p, cputime); 4981 account_group_user_time(p, cputime);
4982 4982
4983 /* Add user time to cpustat. */ 4983 /* Add user time to cpustat. */
4984 tmp = cputime_to_cputime64(cputime); 4984 tmp = cputime_to_cputime64(cputime);
4985 if (TASK_NICE(p) > 0) 4985 if (TASK_NICE(p) > 0)
4986 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4986 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4987 else 4987 else
4988 cpustat->user = cputime64_add(cpustat->user, tmp); 4988 cpustat->user = cputime64_add(cpustat->user, tmp);
4989 4989
4990 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 4990 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4991 /* Account for user time used */ 4991 /* Account for user time used */
4992 acct_update_integrals(p); 4992 acct_update_integrals(p);
4993 } 4993 }
4994 4994
4995 /* 4995 /*
4996 * Account guest cpu time to a process. 4996 * Account guest cpu time to a process.
4997 * @p: the process that the cpu time gets accounted to 4997 * @p: the process that the cpu time gets accounted to
4998 * @cputime: the cpu time spent in virtual machine since the last update 4998 * @cputime: the cpu time spent in virtual machine since the last update
4999 * @cputime_scaled: cputime scaled by cpu frequency 4999 * @cputime_scaled: cputime scaled by cpu frequency
5000 */ 5000 */
5001 static void account_guest_time(struct task_struct *p, cputime_t cputime, 5001 static void account_guest_time(struct task_struct *p, cputime_t cputime,
5002 cputime_t cputime_scaled) 5002 cputime_t cputime_scaled)
5003 { 5003 {
5004 cputime64_t tmp; 5004 cputime64_t tmp;
5005 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5005 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5006 5006
5007 tmp = cputime_to_cputime64(cputime); 5007 tmp = cputime_to_cputime64(cputime);
5008 5008
5009 /* Add guest time to process. */ 5009 /* Add guest time to process. */
5010 p->utime = cputime_add(p->utime, cputime); 5010 p->utime = cputime_add(p->utime, cputime);
5011 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 5011 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5012 account_group_user_time(p, cputime); 5012 account_group_user_time(p, cputime);
5013 p->gtime = cputime_add(p->gtime, cputime); 5013 p->gtime = cputime_add(p->gtime, cputime);
5014 5014
5015 /* Add guest time to cpustat. */ 5015 /* Add guest time to cpustat. */
5016 cpustat->user = cputime64_add(cpustat->user, tmp); 5016 cpustat->user = cputime64_add(cpustat->user, tmp);
5017 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5017 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5018 } 5018 }
5019 5019
5020 /* 5020 /*
5021 * Account system cpu time to a process. 5021 * Account system cpu time to a process.
5022 * @p: the process that the cpu time gets accounted to 5022 * @p: the process that the cpu time gets accounted to
5023 * @hardirq_offset: the offset to subtract from hardirq_count() 5023 * @hardirq_offset: the offset to subtract from hardirq_count()
5024 * @cputime: the cpu time spent in kernel space since the last update 5024 * @cputime: the cpu time spent in kernel space since the last update
5025 * @cputime_scaled: cputime scaled by cpu frequency 5025 * @cputime_scaled: cputime scaled by cpu frequency
5026 */ 5026 */
5027 void account_system_time(struct task_struct *p, int hardirq_offset, 5027 void account_system_time(struct task_struct *p, int hardirq_offset,
5028 cputime_t cputime, cputime_t cputime_scaled) 5028 cputime_t cputime, cputime_t cputime_scaled)
5029 { 5029 {
5030 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5030 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5031 cputime64_t tmp; 5031 cputime64_t tmp;
5032 5032
5033 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 5033 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
5034 account_guest_time(p, cputime, cputime_scaled); 5034 account_guest_time(p, cputime, cputime_scaled);
5035 return; 5035 return;
5036 } 5036 }
5037 5037
5038 /* Add system time to process. */ 5038 /* Add system time to process. */
5039 p->stime = cputime_add(p->stime, cputime); 5039 p->stime = cputime_add(p->stime, cputime);
5040 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 5040 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
5041 account_group_system_time(p, cputime); 5041 account_group_system_time(p, cputime);
5042 5042
5043 /* Add system time to cpustat. */ 5043 /* Add system time to cpustat. */
5044 tmp = cputime_to_cputime64(cputime); 5044 tmp = cputime_to_cputime64(cputime);
5045 if (hardirq_count() - hardirq_offset) 5045 if (hardirq_count() - hardirq_offset)
5046 cpustat->irq = cputime64_add(cpustat->irq, tmp); 5046 cpustat->irq = cputime64_add(cpustat->irq, tmp);
5047 else if (softirq_count()) 5047 else if (softirq_count())
5048 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 5048 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
5049 else 5049 else
5050 cpustat->system = cputime64_add(cpustat->system, tmp); 5050 cpustat->system = cputime64_add(cpustat->system, tmp);
5051 5051
5052 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 5052 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
5053 5053
5054 /* Account for system time used */ 5054 /* Account for system time used */
5055 acct_update_integrals(p); 5055 acct_update_integrals(p);
5056 } 5056 }
5057 5057
5058 /* 5058 /*
5059 * Account for involuntary wait time. 5059 * Account for involuntary wait time.
5060 * @cputime: the cpu time spent in involuntary wait 5060 * @cputime: the cpu time spent in involuntary wait
5061 */ 5061 */
5062 void account_steal_time(cputime_t cputime) 5062 void account_steal_time(cputime_t cputime)
5063 { 5063 {
5064 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5064 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5065 cputime64_t cputime64 = cputime_to_cputime64(cputime); 5065 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5066 5066
5067 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 5067 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
5068 } 5068 }
5069 5069
5070 /* 5070 /*
5071 * Account for idle time. 5071 * Account for idle time.
5072 * @cputime: the cpu time spent in idle wait 5072 * @cputime: the cpu time spent in idle wait
5073 */ 5073 */
5074 void account_idle_time(cputime_t cputime) 5074 void account_idle_time(cputime_t cputime)
5075 { 5075 {
5076 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5076 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5077 cputime64_t cputime64 = cputime_to_cputime64(cputime); 5077 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5078 struct rq *rq = this_rq(); 5078 struct rq *rq = this_rq();
5079 5079
5080 if (atomic_read(&rq->nr_iowait) > 0) 5080 if (atomic_read(&rq->nr_iowait) > 0)
5081 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 5081 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
5082 else 5082 else
5083 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 5083 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
5084 } 5084 }
5085 5085
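The cpustat buckets filled in above (user, nice, system, idle, iowait, irq, softirq, steal, guest) are what /proc/stat reports on its aggregate "cpu" line, in that order. A small userspace reader, shown only to illustrate the mapping; nothing in it is specific to this commit:

#include <stdio.h>

int main(void)
{
        unsigned long long user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
        unsigned long long irq = 0, softirq = 0, steal = 0, guest = 0;
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
                return 1;
        /* Field order mirrors the cpustat members updated above. */
        if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
                   &user, &nice, &system, &idle, &iowait, &irq,
                   &softirq, &steal, &guest) < 4) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("user=%llu nice=%llu system=%llu idle=%llu iowait=%llu "
               "irq=%llu softirq=%llu steal=%llu guest=%llu\n",
               user, nice, system, idle, iowait, irq, softirq, steal, guest);
        return 0;
}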
5086 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 5086 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
5087 5087
5088 /* 5088 /*
5089 * Account a single tick of cpu time. 5089 * Account a single tick of cpu time.
5090 * @p: the process that the cpu time gets accounted to 5090 * @p: the process that the cpu time gets accounted to
5091 * @user_tick: indicates if the tick is a user or a system tick 5091 * @user_tick: indicates if the tick is a user or a system tick
5092 */ 5092 */
5093 void account_process_tick(struct task_struct *p, int user_tick) 5093 void account_process_tick(struct task_struct *p, int user_tick)
5094 { 5094 {
5095 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 5095 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5096 struct rq *rq = this_rq(); 5096 struct rq *rq = this_rq();
5097 5097
5098 if (user_tick) 5098 if (user_tick)
5099 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 5099 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5100 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5100 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5101 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 5101 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5102 one_jiffy_scaled); 5102 one_jiffy_scaled);
5103 else 5103 else
5104 account_idle_time(cputime_one_jiffy); 5104 account_idle_time(cputime_one_jiffy);
5105 } 5105 }
5106 5106
5107 /* 5107 /*
5108 * Account multiple ticks of steal time. 5108 * Account multiple ticks of steal time.
5109 * @p: the process from which the cpu time has been stolen 5109 * @p: the process from which the cpu time has been stolen
5110 * @ticks: number of stolen ticks 5110 * @ticks: number of stolen ticks
5111 */ 5111 */
5112 void account_steal_ticks(unsigned long ticks) 5112 void account_steal_ticks(unsigned long ticks)
5113 { 5113 {
5114 account_steal_time(jiffies_to_cputime(ticks)); 5114 account_steal_time(jiffies_to_cputime(ticks));
5115 } 5115 }
5116 5116
5117 /* 5117 /*
5118 * Account multiple ticks of idle time. 5118 * Account multiple ticks of idle time.
5119 * @ticks: number of idle ticks 5119 * @ticks: number of idle ticks
5120 */ 5120 */
5121 void account_idle_ticks(unsigned long ticks) 5121 void account_idle_ticks(unsigned long ticks)
5122 { 5122 {
5123 account_idle_time(jiffies_to_cputime(ticks)); 5123 account_idle_time(jiffies_to_cputime(ticks));
5124 } 5124 }
5125 5125
5126 #endif 5126 #endif
5127 5127
5128 /* 5128 /*
5129 * Use precise platform statistics if available: 5129 * Use precise platform statistics if available:
5130 */ 5130 */
5131 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 5131 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
5132 cputime_t task_utime(struct task_struct *p) 5132 cputime_t task_utime(struct task_struct *p)
5133 { 5133 {
5134 return p->utime; 5134 return p->utime;
5135 } 5135 }
5136 5136
5137 cputime_t task_stime(struct task_struct *p) 5137 cputime_t task_stime(struct task_struct *p)
5138 { 5138 {
5139 return p->stime; 5139 return p->stime;
5140 } 5140 }
5141 #else 5141 #else
5142 cputime_t task_utime(struct task_struct *p) 5142 cputime_t task_utime(struct task_struct *p)
5143 { 5143 {
5144 clock_t utime = cputime_to_clock_t(p->utime), 5144 clock_t utime = cputime_to_clock_t(p->utime),
5145 total = utime + cputime_to_clock_t(p->stime); 5145 total = utime + cputime_to_clock_t(p->stime);
5146 u64 temp; 5146 u64 temp;
5147 5147
5148 /* 5148 /*
5149 * Use CFS's precise accounting: 5149 * Use CFS's precise accounting:
5150 */ 5150 */
5151 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5151 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
5152 5152
5153 if (total) { 5153 if (total) {
5154 temp *= utime; 5154 temp *= utime;
5155 do_div(temp, total); 5155 do_div(temp, total);
5156 } 5156 }
5157 utime = (clock_t)temp; 5157 utime = (clock_t)temp;
5158 5158
5159 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5159 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
5160 return p->prev_utime; 5160 return p->prev_utime;
5161 } 5161 }
5162 5162
5163 cputime_t task_stime(struct task_struct *p) 5163 cputime_t task_stime(struct task_struct *p)
5164 { 5164 {
5165 clock_t stime; 5165 clock_t stime;
5166 5166
5167 /* 5167 /*
5168 * Use CFS's precise accounting. (we subtract utime from 5168 * Use CFS's precise accounting. (we subtract utime from
5169 * the total, to make sure the total observed by userspace 5169 * the total, to make sure the total observed by userspace
5170 * grows monotonically - apps rely on that): 5170 * grows monotonically - apps rely on that):
5171 */ 5171 */
5172 stime = nsec_to_clock_t(p->se.sum_exec_runtime) - 5172 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5173 cputime_to_clock_t(task_utime(p)); 5173 cputime_to_clock_t(task_utime(p));
5174 5174
5175 if (stime >= 0) 5175 if (stime >= 0)
5176 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5176 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
5177 5177
5178 return p->prev_stime; 5178 return p->prev_stime;
5179 } 5179 }
5180 #endif 5180 #endif
5181 5181
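The #else branch above splits the precise CFS runtime (sum_exec_runtime) between user and system time in proportion to the sampled tick counters, then keeps the result monotonic via prev_utime/prev_stime. A standalone arithmetic sketch of that split, using made-up numbers; nothing here is kernel API:

#include <stdio.h>

int main(void)
{
        /* Hypothetical sampled values, all in clock ticks. */
        unsigned long long utime = 300, stime = 100;    /* tick-based samples */
        unsigned long long sum_exec = 500;              /* precise CFS runtime */
        unsigned long long total = utime + stime;

        /* Same proportional split as task_utime()/task_stime() above. */
        unsigned long long scaled_utime = total ? sum_exec * utime / total : sum_exec;
        unsigned long long scaled_stime = sum_exec - scaled_utime;

        printf("utime=%llu stime=%llu (of %llu)\n",
               scaled_utime, scaled_stime, sum_exec);   /* utime=375 stime=125 */
        return 0;
}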
5182 inline cputime_t task_gtime(struct task_struct *p) 5182 inline cputime_t task_gtime(struct task_struct *p)
5183 { 5183 {
5184 return p->gtime; 5184 return p->gtime;
5185 } 5185 }
5186 5186
5187 /* 5187 /*
5188 * This function gets called by the timer code, with HZ frequency. 5188 * This function gets called by the timer code, with HZ frequency.
5189 * We call it with interrupts disabled. 5189 * We call it with interrupts disabled.
5190 * 5190 *
5191 * It also gets called by the fork code, when changing the parent's 5191 * It also gets called by the fork code, when changing the parent's
5192 * timeslices. 5192 * timeslices.
5193 */ 5193 */
5194 void scheduler_tick(void) 5194 void scheduler_tick(void)
5195 { 5195 {
5196 int cpu = smp_processor_id(); 5196 int cpu = smp_processor_id();
5197 struct rq *rq = cpu_rq(cpu); 5197 struct rq *rq = cpu_rq(cpu);
5198 struct task_struct *curr = rq->curr; 5198 struct task_struct *curr = rq->curr;
5199 5199
5200 sched_clock_tick(); 5200 sched_clock_tick();
5201 5201
5202 spin_lock(&rq->lock); 5202 spin_lock(&rq->lock);
5203 update_rq_clock(rq); 5203 update_rq_clock(rq);
5204 update_cpu_load(rq); 5204 update_cpu_load(rq);
5205 curr->sched_class->task_tick(rq, curr, 0); 5205 curr->sched_class->task_tick(rq, curr, 0);
5206 spin_unlock(&rq->lock); 5206 spin_unlock(&rq->lock);
5207 5207
5208 perf_event_task_tick(curr, cpu); 5208 perf_event_task_tick(curr, cpu);
5209 5209
5210 #ifdef CONFIG_SMP 5210 #ifdef CONFIG_SMP
5211 rq->idle_at_tick = idle_cpu(cpu); 5211 rq->idle_at_tick = idle_cpu(cpu);
5212 trigger_load_balance(rq, cpu); 5212 trigger_load_balance(rq, cpu);
5213 #endif 5213 #endif
5214 } 5214 }
5215 5215
5216 notrace unsigned long get_parent_ip(unsigned long addr) 5216 notrace unsigned long get_parent_ip(unsigned long addr)
5217 { 5217 {
5218 if (in_lock_functions(addr)) { 5218 if (in_lock_functions(addr)) {
5219 addr = CALLER_ADDR2; 5219 addr = CALLER_ADDR2;
5220 if (in_lock_functions(addr)) 5220 if (in_lock_functions(addr))
5221 addr = CALLER_ADDR3; 5221 addr = CALLER_ADDR3;
5222 } 5222 }
5223 return addr; 5223 return addr;
5224 } 5224 }
5225 5225
5226 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 5226 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5227 defined(CONFIG_PREEMPT_TRACER)) 5227 defined(CONFIG_PREEMPT_TRACER))
5228 5228
5229 void __kprobes add_preempt_count(int val) 5229 void __kprobes add_preempt_count(int val)
5230 { 5230 {
5231 #ifdef CONFIG_DEBUG_PREEMPT 5231 #ifdef CONFIG_DEBUG_PREEMPT
5232 /* 5232 /*
5233 * Underflow? 5233 * Underflow?
5234 */ 5234 */
5235 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 5235 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5236 return; 5236 return;
5237 #endif 5237 #endif
5238 preempt_count() += val; 5238 preempt_count() += val;
5239 #ifdef CONFIG_DEBUG_PREEMPT 5239 #ifdef CONFIG_DEBUG_PREEMPT
5240 /* 5240 /*
5241 * Spinlock count overflowing soon? 5241 * Spinlock count overflowing soon?
5242 */ 5242 */
5243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 5243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5244 PREEMPT_MASK - 10); 5244 PREEMPT_MASK - 10);
5245 #endif 5245 #endif
5246 if (preempt_count() == val) 5246 if (preempt_count() == val)
5247 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 5247 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5248 } 5248 }
5249 EXPORT_SYMBOL(add_preempt_count); 5249 EXPORT_SYMBOL(add_preempt_count);
5250 5250
5251 void __kprobes sub_preempt_count(int val) 5251 void __kprobes sub_preempt_count(int val)
5252 { 5252 {
5253 #ifdef CONFIG_DEBUG_PREEMPT 5253 #ifdef CONFIG_DEBUG_PREEMPT
5254 /* 5254 /*
5255 * Underflow? 5255 * Underflow?
5256 */ 5256 */
5257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 5257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5258 return; 5258 return;
5259 /* 5259 /*
5260 * Is the spinlock portion underflowing? 5260 * Is the spinlock portion underflowing?
5261 */ 5261 */
5262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 5262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5263 !(preempt_count() & PREEMPT_MASK))) 5263 !(preempt_count() & PREEMPT_MASK)))
5264 return; 5264 return;
5265 #endif 5265 #endif
5266 5266
5267 if (preempt_count() == val) 5267 if (preempt_count() == val)
5268 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 5268 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5269 preempt_count() -= val; 5269 preempt_count() -= val;
5270 } 5270 }
5271 EXPORT_SYMBOL(sub_preempt_count); 5271 EXPORT_SYMBOL(sub_preempt_count);
5272 5272
5273 #endif 5273 #endif
5274 5274
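add_preempt_count()/sub_preempt_count() are normally reached through preempt_disable()/preempt_enable() (and through the spinlock and softirq code that nests on top of them) rather than called directly. A minimal sketch of the usual pattern, assuming a kernel-module context; my_counter is a hypothetical per-cpu variable:

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, my_counter);       /* hypothetical */

static void bump_local_counter(void)
{
        /*
         * Raising the preempt count (which goes through add_preempt_count()
         * when the debug/tracing variants above are built in) keeps this
         * task on its current cpu while it touches per-cpu state.
         */
        preempt_disable();
        __get_cpu_var(my_counter)++;
        preempt_enable();
}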
5275 /* 5275 /*
5276 * Print scheduling while atomic bug: 5276 * Print scheduling while atomic bug:
5277 */ 5277 */
5278 static noinline void __schedule_bug(struct task_struct *prev) 5278 static noinline void __schedule_bug(struct task_struct *prev)
5279 { 5279 {
5280 struct pt_regs *regs = get_irq_regs(); 5280 struct pt_regs *regs = get_irq_regs();
5281 5281
5282 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 5282 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5283 prev->comm, prev->pid, preempt_count()); 5283 prev->comm, prev->pid, preempt_count());
5284 5284
5285 debug_show_held_locks(prev); 5285 debug_show_held_locks(prev);
5286 print_modules(); 5286 print_modules();
5287 if (irqs_disabled()) 5287 if (irqs_disabled())
5288 print_irqtrace_events(prev); 5288 print_irqtrace_events(prev);
5289 5289
5290 if (regs) 5290 if (regs)
5291 show_regs(regs); 5291 show_regs(regs);
5292 else 5292 else
5293 dump_stack(); 5293 dump_stack();
5294 } 5294 }
5295 5295
5296 /* 5296 /*
5297 * Various schedule()-time debugging checks and statistics: 5297 * Various schedule()-time debugging checks and statistics:
5298 */ 5298 */
5299 static inline void schedule_debug(struct task_struct *prev) 5299 static inline void schedule_debug(struct task_struct *prev)
5300 { 5300 {
5301 /* 5301 /*
5302 * Test if we are atomic. Since do_exit() needs to call into 5302 * Test if we are atomic. Since do_exit() needs to call into
5303 * schedule() atomically, we ignore that path for now. 5303 * schedule() atomically, we ignore that path for now.
5304 * Otherwise, whine if we are scheduling when we should not be. 5304 * Otherwise, whine if we are scheduling when we should not be.
5305 */ 5305 */
5306 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 5306 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
5307 __schedule_bug(prev); 5307 __schedule_bug(prev);
5308 5308
5309 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 5309 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5310 5310
5311 schedstat_inc(this_rq(), sched_count); 5311 schedstat_inc(this_rq(), sched_count);
5312 #ifdef CONFIG_SCHEDSTATS 5312 #ifdef CONFIG_SCHEDSTATS
5313 if (unlikely(prev->lock_depth >= 0)) { 5313 if (unlikely(prev->lock_depth >= 0)) {
5314 schedstat_inc(this_rq(), bkl_count); 5314 schedstat_inc(this_rq(), bkl_count);
5315 schedstat_inc(prev, sched_info.bkl_count); 5315 schedstat_inc(prev, sched_info.bkl_count);
5316 } 5316 }
5317 #endif 5317 #endif
5318 } 5318 }
5319 5319
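The in_atomic_preempt_off() check above is what produces the familiar "BUG: scheduling while atomic" splat. A hypothetical driver path that would trigger it (kernel-module context assumed; my_lock is made up):

#include <linux/spinlock.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(my_lock);        /* hypothetical */

static void broken_path(void)
{
        spin_lock(&my_lock);    /* raises preempt_count */
        msleep(10);             /* sleeps -> schedule() with preempt_count != 0 */
        spin_unlock(&my_lock);
}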
5320 static void put_prev_task(struct rq *rq, struct task_struct *p) 5320 static void put_prev_task(struct rq *rq, struct task_struct *p)
5321 { 5321 {
5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5323 5323
5324 update_avg(&p->se.avg_running, runtime); 5324 update_avg(&p->se.avg_running, runtime);
5325 5325
5326 if (p->state == TASK_RUNNING) { 5326 if (p->state == TASK_RUNNING) {
5327 /* 5327 /*
5328 * In order to avoid avg_overlap growing stale when we are 5328 * In order to avoid avg_overlap growing stale when we are
5329 * indeed overlapping and hence not getting put to sleep, grow 5329 * indeed overlapping and hence not getting put to sleep, grow
5330 * the avg_overlap on preemption. 5330 * the avg_overlap on preemption.
5331 * 5331 *
5332 * We use the average preemption runtime because that 5332 * We use the average preemption runtime because that
5333 * correlates to the amount of cache footprint a task can 5333 * correlates to the amount of cache footprint a task can
5334 * build up. 5334 * build up.
5335 */ 5335 */
5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5337 update_avg(&p->se.avg_overlap, runtime); 5337 update_avg(&p->se.avg_overlap, runtime);
5338 } else { 5338 } else {
5339 update_avg(&p->se.avg_running, 0); 5339 update_avg(&p->se.avg_running, 0);
5340 } 5340 }
5341 p->sched_class->put_prev_task(rq, p); 5341 p->sched_class->put_prev_task(rq, p);
5342 } 5342 }
5343 5343
5344 /* 5344 /*
5345 * Pick up the highest-prio task: 5345 * Pick up the highest-prio task:
5346 */ 5346 */
5347 static inline struct task_struct * 5347 static inline struct task_struct *
5348 pick_next_task(struct rq *rq) 5348 pick_next_task(struct rq *rq)
5349 { 5349 {
5350 const struct sched_class *class; 5350 const struct sched_class *class;
5351 struct task_struct *p; 5351 struct task_struct *p;
5352 5352
5353 /* 5353 /*
5354 * Optimization: we know that if all tasks are in 5354 * Optimization: we know that if all tasks are in
5355 * the fair class we can call that function directly: 5355 * the fair class we can call that function directly:
5356 */ 5356 */
5357 if (likely(rq->nr_running == rq->cfs.nr_running)) { 5357 if (likely(rq->nr_running == rq->cfs.nr_running)) {
5358 p = fair_sched_class.pick_next_task(rq); 5358 p = fair_sched_class.pick_next_task(rq);
5359 if (likely(p)) 5359 if (likely(p))
5360 return p; 5360 return p;
5361 } 5361 }
5362 5362
5363 class = sched_class_highest; 5363 class = sched_class_highest;
5364 for ( ; ; ) { 5364 for ( ; ; ) {
5365 p = class->pick_next_task(rq); 5365 p = class->pick_next_task(rq);
5366 if (p) 5366 if (p)
5367 return p; 5367 return p;
5368 /* 5368 /*
5369 * Will never be NULL as the idle class always 5369 * Will never be NULL as the idle class always
5370 * returns a non-NULL p: 5370 * returns a non-NULL p:
5371 */ 5371 */
5372 class = class->next; 5372 class = class->next;
5373 } 5373 }
5374 } 5374 }
5375 5375
5376 /* 5376 /*
5377 * schedule() is the main scheduler function. 5377 * schedule() is the main scheduler function.
5378 */ 5378 */
5379 asmlinkage void __sched schedule(void) 5379 asmlinkage void __sched schedule(void)
5380 { 5380 {
5381 struct task_struct *prev, *next; 5381 struct task_struct *prev, *next;
5382 unsigned long *switch_count; 5382 unsigned long *switch_count;
5383 struct rq *rq; 5383 struct rq *rq;
5384 int cpu; 5384 int cpu;
5385 5385
5386 need_resched: 5386 need_resched:
5387 preempt_disable(); 5387 preempt_disable();
5388 cpu = smp_processor_id(); 5388 cpu = smp_processor_id();
5389 rq = cpu_rq(cpu); 5389 rq = cpu_rq(cpu);
5390 rcu_sched_qs(cpu); 5390 rcu_sched_qs(cpu);
5391 prev = rq->curr; 5391 prev = rq->curr;
5392 switch_count = &prev->nivcsw; 5392 switch_count = &prev->nivcsw;
5393 5393
5394 release_kernel_lock(prev); 5394 release_kernel_lock(prev);
5395 need_resched_nonpreemptible: 5395 need_resched_nonpreemptible:
5396 5396
5397 schedule_debug(prev); 5397 schedule_debug(prev);
5398 5398
5399 if (sched_feat(HRTICK)) 5399 if (sched_feat(HRTICK))
5400 hrtick_clear(rq); 5400 hrtick_clear(rq);
5401 5401
5402 spin_lock_irq(&rq->lock); 5402 spin_lock_irq(&rq->lock);
5403 update_rq_clock(rq); 5403 update_rq_clock(rq);
5404 clear_tsk_need_resched(prev); 5404 clear_tsk_need_resched(prev);
5405 5405
5406 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 5406 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5407 if (unlikely(signal_pending_state(prev->state, prev))) 5407 if (unlikely(signal_pending_state(prev->state, prev)))
5408 prev->state = TASK_RUNNING; 5408 prev->state = TASK_RUNNING;
5409 else 5409 else
5410 deactivate_task(rq, prev, 1); 5410 deactivate_task(rq, prev, 1);
5411 switch_count = &prev->nvcsw; 5411 switch_count = &prev->nvcsw;
5412 } 5412 }
5413 5413
5414 pre_schedule(rq, prev); 5414 pre_schedule(rq, prev);
5415 5415
5416 if (unlikely(!rq->nr_running)) 5416 if (unlikely(!rq->nr_running))
5417 idle_balance(cpu, rq); 5417 idle_balance(cpu, rq);
5418 5418
5419 put_prev_task(rq, prev); 5419 put_prev_task(rq, prev);
5420 next = pick_next_task(rq); 5420 next = pick_next_task(rq);
5421 5421
5422 if (likely(prev != next)) { 5422 if (likely(prev != next)) {
5423 sched_info_switch(prev, next); 5423 sched_info_switch(prev, next);
5424 perf_event_task_sched_out(prev, next, cpu); 5424 perf_event_task_sched_out(prev, next, cpu);
5425 5425
5426 rq->nr_switches++; 5426 rq->nr_switches++;
5427 rq->curr = next; 5427 rq->curr = next;
5428 ++*switch_count; 5428 ++*switch_count;
5429 5429
5430 context_switch(rq, prev, next); /* unlocks the rq */ 5430 context_switch(rq, prev, next); /* unlocks the rq */
5431 /* 5431 /*
5432 * the context switch might have flipped the stack from under 5432 * the context switch might have flipped the stack from under
5433 * us, hence refresh the local variables. 5433 * us, hence refresh the local variables.
5434 */ 5434 */
5435 cpu = smp_processor_id(); 5435 cpu = smp_processor_id();
5436 rq = cpu_rq(cpu); 5436 rq = cpu_rq(cpu);
5437 } else 5437 } else
5438 spin_unlock_irq(&rq->lock); 5438 spin_unlock_irq(&rq->lock);
5439 5439
5440 post_schedule(rq); 5440 post_schedule(rq);
5441 5441
5442 if (unlikely(reacquire_kernel_lock(current) < 0)) 5442 if (unlikely(reacquire_kernel_lock(current) < 0))
5443 goto need_resched_nonpreemptible; 5443 goto need_resched_nonpreemptible;
5444 5444
5445 preempt_enable_no_resched(); 5445 preempt_enable_no_resched();
5446 if (need_resched()) 5446 if (need_resched())
5447 goto need_resched; 5447 goto need_resched;
5448 } 5448 }
5449 EXPORT_SYMBOL(schedule); 5449 EXPORT_SYMBOL(schedule);
5450 5450
5451 #ifdef CONFIG_SMP 5451 #ifdef CONFIG_SMP
5452 /* 5452 /*
5453 * Look out! "owner" is an entirely speculative pointer 5453 * Look out! "owner" is an entirely speculative pointer
5454 * access and not reliable. 5454 * access and not reliable.
5455 */ 5455 */
5456 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 5456 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5457 { 5457 {
5458 unsigned int cpu; 5458 unsigned int cpu;
5459 struct rq *rq; 5459 struct rq *rq;
5460 5460
5461 if (!sched_feat(OWNER_SPIN)) 5461 if (!sched_feat(OWNER_SPIN))
5462 return 0; 5462 return 0;
5463 5463
5464 #ifdef CONFIG_DEBUG_PAGEALLOC 5464 #ifdef CONFIG_DEBUG_PAGEALLOC
5465 /* 5465 /*
5466 * Need to access the cpu field knowing that 5466 * Need to access the cpu field knowing that
5467 * DEBUG_PAGEALLOC could have unmapped it if 5467 * DEBUG_PAGEALLOC could have unmapped it if
5468 * the mutex owner just released it and exited. 5468 * the mutex owner just released it and exited.
5469 */ 5469 */
5470 if (probe_kernel_address(&owner->cpu, cpu)) 5470 if (probe_kernel_address(&owner->cpu, cpu))
5471 goto out; 5471 goto out;
5472 #else 5472 #else
5473 cpu = owner->cpu; 5473 cpu = owner->cpu;
5474 #endif 5474 #endif
5475 5475
5476 /* 5476 /*
5477 * Even if the access succeeded (likely case), 5477 * Even if the access succeeded (likely case),
5478 * the cpu field may no longer be valid. 5478 * the cpu field may no longer be valid.
5479 */ 5479 */
5480 if (cpu >= nr_cpumask_bits) 5480 if (cpu >= nr_cpumask_bits)
5481 goto out; 5481 goto out;
5482 5482
5483 /* 5483 /*
5484 * We need to validate that we can do a 5484 * We need to validate that we can do a
5485 * get_cpu() and that we have the percpu area. 5485 * get_cpu() and that we have the percpu area.
5486 */ 5486 */
5487 if (!cpu_online(cpu)) 5487 if (!cpu_online(cpu))
5488 goto out; 5488 goto out;
5489 5489
5490 rq = cpu_rq(cpu); 5490 rq = cpu_rq(cpu);
5491 5491
5492 for (;;) { 5492 for (;;) {
5493 /* 5493 /*
5494 * Owner changed, break to re-assess state. 5494 * Owner changed, break to re-assess state.
5495 */ 5495 */
5496 if (lock->owner != owner) 5496 if (lock->owner != owner)
5497 break; 5497 break;
5498 5498
5499 /* 5499 /*
5500 * Is that owner really running on that cpu? 5500 * Is that owner really running on that cpu?
5501 */ 5501 */
5502 if (task_thread_info(rq->curr) != owner || need_resched()) 5502 if (task_thread_info(rq->curr) != owner || need_resched())
5503 return 0; 5503 return 0;
5504 5504
5505 cpu_relax(); 5505 cpu_relax();
5506 } 5506 }
5507 out: 5507 out:
5508 return 1; 5508 return 1;
5509 } 5509 }
5510 #endif 5510 #endif
5511 5511
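mutex_spin_on_owner() is used by the mutex slowpath to decide whether optimistic spinning is still worthwhile: it keeps returning non-zero while the lock owner still appears to be running on its cpu, and returns 0 once the owner scheduled out or this cpu itself needs to reschedule. Schematically (this is not the real kernel/mutex.c slowpath; my_try_acquire() and my_block_on() are hypothetical placeholders):

/* Schematic caller, not the real mutex slowpath. */
for (;;) {
        struct thread_info *owner = ACCESS_ONCE(lock->owner);

        if (owner && !mutex_spin_on_owner(lock, owner))
                break;                  /* owner went to sleep: stop spinning */

        if (my_try_acquire(lock))       /* hypothetical acquisition attempt */
                return 0;

        cpu_relax();
}
return my_block_on(lock);               /* hypothetical: fall back to sleeping */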
5512 #ifdef CONFIG_PREEMPT 5512 #ifdef CONFIG_PREEMPT
5513 /* 5513 /*
5514 * this is the entry point to schedule() from in-kernel preemption 5514 * this is the entry point to schedule() from in-kernel preemption
5515 * off of preempt_enable. Kernel preemption off of the return-from-interrupt 5515 * off of preempt_enable. Kernel preemption off of the return-from-interrupt
5516 * path is handled by preempt_schedule_irq() below instead. 5516 * path is handled by preempt_schedule_irq() below instead.
5517 */ 5517 */
5518 asmlinkage void __sched preempt_schedule(void) 5518 asmlinkage void __sched preempt_schedule(void)
5519 { 5519 {
5520 struct thread_info *ti = current_thread_info(); 5520 struct thread_info *ti = current_thread_info();
5521 5521
5522 /* 5522 /*
5523 * If there is a non-zero preempt_count or interrupts are disabled, 5523 * If there is a non-zero preempt_count or interrupts are disabled,
5524 * we do not want to preempt the current task. Just return.. 5524 * we do not want to preempt the current task. Just return..
5525 */ 5525 */
5526 if (likely(ti->preempt_count || irqs_disabled())) 5526 if (likely(ti->preempt_count || irqs_disabled()))
5527 return; 5527 return;
5528 5528
5529 do { 5529 do {
5530 add_preempt_count(PREEMPT_ACTIVE); 5530 add_preempt_count(PREEMPT_ACTIVE);
5531 schedule(); 5531 schedule();
5532 sub_preempt_count(PREEMPT_ACTIVE); 5532 sub_preempt_count(PREEMPT_ACTIVE);
5533 5533
5534 /* 5534 /*
5535 * Check again in case we missed a preemption opportunity 5535 * Check again in case we missed a preemption opportunity
5536 * between schedule and now. 5536 * between schedule and now.
5537 */ 5537 */
5538 barrier(); 5538 barrier();
5539 } while (need_resched()); 5539 } while (need_resched());
5540 } 5540 }
5541 EXPORT_SYMBOL(preempt_schedule); 5541 EXPORT_SYMBOL(preempt_schedule);
5542 5542
5543 /* 5543 /*
5544 * this is the entry point to schedule() from kernel preemption 5544 * this is the entry point to schedule() from kernel preemption
5545 * off of irq context. 5545 * off of irq context.
5546 * Note that this is called and returns with irqs disabled. This will 5546 * Note that this is called and returns with irqs disabled. This will
5547 * protect us against recursive calling from irq. 5547 * protect us against recursive calling from irq.
5548 */ 5548 */
5549 asmlinkage void __sched preempt_schedule_irq(void) 5549 asmlinkage void __sched preempt_schedule_irq(void)
5550 { 5550 {
5551 struct thread_info *ti = current_thread_info(); 5551 struct thread_info *ti = current_thread_info();
5552 5552
5553 /* Catch callers which need to be fixed */ 5553 /* Catch callers which need to be fixed */
5554 BUG_ON(ti->preempt_count || !irqs_disabled()); 5554 BUG_ON(ti->preempt_count || !irqs_disabled());
5555 5555
5556 do { 5556 do {
5557 add_preempt_count(PREEMPT_ACTIVE); 5557 add_preempt_count(PREEMPT_ACTIVE);
5558 local_irq_enable(); 5558 local_irq_enable();
5559 schedule(); 5559 schedule();
5560 local_irq_disable(); 5560 local_irq_disable();
5561 sub_preempt_count(PREEMPT_ACTIVE); 5561 sub_preempt_count(PREEMPT_ACTIVE);
5562 5562
5563 /* 5563 /*
5564 * Check again in case we missed a preemption opportunity 5564 * Check again in case we missed a preemption opportunity
5565 * between schedule and now. 5565 * between schedule and now.
5566 */ 5566 */
5567 barrier(); 5567 barrier();
5568 } while (need_resched()); 5568 } while (need_resched());
5569 } 5569 }
5570 5570
5571 #endif /* CONFIG_PREEMPT */ 5571 #endif /* CONFIG_PREEMPT */
5572 5572
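For reference, preempt_schedule() above is reached from preempt_enable(): the macro first drops the count with preempt_enable_no_resched() and then, if TIF_NEED_RESCHED is set, calls preempt_schedule() via preempt_check_resched(). Roughly (paraphrasing include/linux/preempt.h, which is not part of this hunk):

/* Paraphrased, not the verbatim header text. */
#define preempt_enable()                                                \
do {                                                                    \
        preempt_enable_no_resched();                                    \
        barrier();                                                      \
        preempt_check_resched();  /* -> preempt_schedule() if needed */ \
} while (0)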
5573 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 5573 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5574 void *key) 5574 void *key)
5575 { 5575 {
5576 return try_to_wake_up(curr->private, mode, wake_flags); 5576 return try_to_wake_up(curr->private, mode, wake_flags);
5577 } 5577 }
5578 EXPORT_SYMBOL(default_wake_function); 5578 EXPORT_SYMBOL(default_wake_function);
5579 5579
5580 /* 5580 /*
5581 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 5581 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
5582 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 5582 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
5583 * number) then we wake all the non-exclusive tasks and one exclusive task. 5583 * number) then we wake all the non-exclusive tasks and one exclusive task.
5584 * 5584 *
5585 * There are circumstances in which we can try to wake a task which has already 5585 * There are circumstances in which we can try to wake a task which has already
5586 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5586 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5587 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5587 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5588 */ 5588 */
5589 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5589 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5590 int nr_exclusive, int wake_flags, void *key) 5590 int nr_exclusive, int wake_flags, void *key)
5591 { 5591 {
5592 wait_queue_t *curr, *next; 5592 wait_queue_t *curr, *next;
5593 5593
5594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5595 unsigned flags = curr->flags; 5595 unsigned flags = curr->flags;
5596 5596
5597 if (curr->func(curr, mode, wake_flags, key) && 5597 if (curr->func(curr, mode, wake_flags, key) &&
5598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5599 break; 5599 break;
5600 } 5600 }
5601 } 5601 }
5602 5602
5603 /** 5603 /**
5604 * __wake_up - wake up threads blocked on a waitqueue. 5604 * __wake_up - wake up threads blocked on a waitqueue.
5605 * @q: the waitqueue 5605 * @q: the waitqueue
5606 * @mode: which threads 5606 * @mode: which threads
5607 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5607 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5608 * @key: is directly passed to the wakeup function 5608 * @key: is directly passed to the wakeup function
5609 * 5609 *
5610 * It may be assumed that this function implies a write memory barrier before 5610 * It may be assumed that this function implies a write memory barrier before
5611 * changing the task state if and only if any tasks are woken up. 5611 * changing the task state if and only if any tasks are woken up.
5612 */ 5612 */
5613 void __wake_up(wait_queue_head_t *q, unsigned int mode, 5613 void __wake_up(wait_queue_head_t *q, unsigned int mode,
5614 int nr_exclusive, void *key) 5614 int nr_exclusive, void *key)
5615 { 5615 {
5616 unsigned long flags; 5616 unsigned long flags;
5617 5617
5618 spin_lock_irqsave(&q->lock, flags); 5618 spin_lock_irqsave(&q->lock, flags);
5619 __wake_up_common(q, mode, nr_exclusive, 0, key); 5619 __wake_up_common(q, mode, nr_exclusive, 0, key);
5620 spin_unlock_irqrestore(&q->lock, flags); 5620 spin_unlock_irqrestore(&q->lock, flags);
5621 } 5621 }
5622 EXPORT_SYMBOL(__wake_up); 5622 EXPORT_SYMBOL(__wake_up);
5623 5623
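__wake_up() is rarely called directly; most code reaches it through the wake_up*() wrappers paired with wait_event*(). A minimal producer/consumer sketch, assuming a kernel-module context (my_wq and my_data_ready are hypothetical):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);  /* hypothetical */
static int my_data_ready;               /* hypothetical condition */

/* Consumer: sleeps until the condition becomes true. */
static int wait_for_data(void)
{
        return wait_event_interruptible(my_wq, my_data_ready);
}

/* Producer: make the condition true, then wake the sleepers. */
static void data_arrived(void)
{
        my_data_ready = 1;
        /* wake_up_interruptible() expands to __wake_up(q, TASK_INTERRUPTIBLE, 1, NULL) */
        wake_up_interruptible(&my_wq);
}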
5624 /* 5624 /*
5625 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 5625 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5626 */ 5626 */
5627 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 5627 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5628 { 5628 {
5629 __wake_up_common(q, mode, 1, 0, NULL); 5629 __wake_up_common(q, mode, 1, 0, NULL);
5630 } 5630 }
5631 5631
5632 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 5632 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5633 { 5633 {
5634 __wake_up_common(q, mode, 1, 0, key); 5634 __wake_up_common(q, mode, 1, 0, key);
5635 } 5635 }
5636 5636
5637 /** 5637 /**
5638 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 5638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
5639 * @q: the waitqueue 5639 * @q: the waitqueue
5640 * @mode: which threads 5640 * @mode: which threads
5641 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5642 * @key: opaque value to be passed to wakeup targets 5642 * @key: opaque value to be passed to wakeup targets
5643 * 5643 *
5644 * The sync wakeup differs in that the waker knows that it will schedule 5644 * The sync wakeup differs in that the waker knows that it will schedule
5645 * away soon, so while the target thread will be woken up, it will not 5645 * away soon, so while the target thread will be woken up, it will not
5646 * be migrated to another CPU - ie. the two threads are 'synchronized' 5646 * be migrated to another CPU - ie. the two threads are 'synchronized'
5647 * with each other. This can prevent needless bouncing between CPUs. 5647 * with each other. This can prevent needless bouncing between CPUs.
5648 * 5648 *
5649 * On UP it can prevent extra preemption. 5649 * On UP it can prevent extra preemption.
5650 * 5650 *
5651 * It may be assumed that this function implies a write memory barrier before 5651 * It may be assumed that this function implies a write memory barrier before
5652 * changing the task state if and only if any tasks are woken up. 5652 * changing the task state if and only if any tasks are woken up.
5653 */ 5653 */
5654 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5654 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5655 int nr_exclusive, void *key) 5655 int nr_exclusive, void *key)
5656 { 5656 {
5657 unsigned long flags; 5657 unsigned long flags;
5658 int wake_flags = WF_SYNC; 5658 int wake_flags = WF_SYNC;
5659 5659
5660 if (unlikely(!q)) 5660 if (unlikely(!q))
5661 return; 5661 return;
5662 5662
5663 if (unlikely(!nr_exclusive)) 5663 if (unlikely(!nr_exclusive))
5664 wake_flags = 0; 5664 wake_flags = 0;
5665 5665
5666 spin_lock_irqsave(&q->lock, flags); 5666 spin_lock_irqsave(&q->lock, flags);
5667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 5667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5668 spin_unlock_irqrestore(&q->lock, flags); 5668 spin_unlock_irqrestore(&q->lock, flags);
5669 } 5669 }
5670 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5670 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5671 5671
5672 /* 5672 /*
5673 * __wake_up_sync - see __wake_up_sync_key() 5673 * __wake_up_sync - see __wake_up_sync_key()
5674 */ 5674 */
5675 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5675 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5676 { 5676 {
5677 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 5677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5678 } 5678 }
5679 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5679 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5680 5680
5681 /** 5681 /**
5682 * complete: - signals a single thread waiting on this completion 5682 * complete: - signals a single thread waiting on this completion
5683 * @x: holds the state of this particular completion 5683 * @x: holds the state of this particular completion
5684 * 5684 *
5685 * This will wake up a single thread waiting on this completion. Threads will be 5685 * This will wake up a single thread waiting on this completion. Threads will be
5686 * awakened in the same order in which they were queued. 5686 * awakened in the same order in which they were queued.
5687 * 5687 *
5688 * See also complete_all(), wait_for_completion() and related routines. 5688 * See also complete_all(), wait_for_completion() and related routines.
5689 * 5689 *
5690 * It may be assumed that this function implies a write memory barrier before 5690 * It may be assumed that this function implies a write memory barrier before
5691 * changing the task state if and only if any tasks are woken up. 5691 * changing the task state if and only if any tasks are woken up.
5692 */ 5692 */
5693 void complete(struct completion *x) 5693 void complete(struct completion *x)
5694 { 5694 {
5695 unsigned long flags; 5695 unsigned long flags;
5696 5696
5697 spin_lock_irqsave(&x->wait.lock, flags); 5697 spin_lock_irqsave(&x->wait.lock, flags);
5698 x->done++; 5698 x->done++;
5699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 5699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5700 spin_unlock_irqrestore(&x->wait.lock, flags); 5700 spin_unlock_irqrestore(&x->wait.lock, flags);
5701 } 5701 }
5702 EXPORT_SYMBOL(complete); 5702 EXPORT_SYMBOL(complete);
5703 5703
5704 /** 5704 /**
5705 * complete_all: - signals all threads waiting on this completion 5705 * complete_all: - signals all threads waiting on this completion
5706 * @x: holds the state of this particular completion 5706 * @x: holds the state of this particular completion
5707 * 5707 *
5708 * This will wake up all threads waiting on this particular completion event. 5708 * This will wake up all threads waiting on this particular completion event.
5709 * 5709 *
5710 * It may be assumed that this function implies a write memory barrier before 5710 * It may be assumed that this function implies a write memory barrier before
5711 * changing the task state if and only if any tasks are woken up. 5711 * changing the task state if and only if any tasks are woken up.
5712 */ 5712 */
5713 void complete_all(struct completion *x) 5713 void complete_all(struct completion *x)
5714 { 5714 {
5715 unsigned long flags; 5715 unsigned long flags;
5716 5716
5717 spin_lock_irqsave(&x->wait.lock, flags); 5717 spin_lock_irqsave(&x->wait.lock, flags);
5718 x->done += UINT_MAX/2; 5718 x->done += UINT_MAX/2;
5719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 5719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5720 spin_unlock_irqrestore(&x->wait.lock, flags); 5720 spin_unlock_irqrestore(&x->wait.lock, flags);
5721 } 5721 }
5722 EXPORT_SYMBOL(complete_all); 5722 EXPORT_SYMBOL(complete_all);
5723 5723
5724 static inline long __sched 5724 static inline long __sched
5725 do_wait_for_common(struct completion *x, long timeout, int state) 5725 do_wait_for_common(struct completion *x, long timeout, int state)
5726 { 5726 {
5727 if (!x->done) { 5727 if (!x->done) {
5728 DECLARE_WAITQUEUE(wait, current); 5728 DECLARE_WAITQUEUE(wait, current);
5729 5729
5730 wait.flags |= WQ_FLAG_EXCLUSIVE; 5730 wait.flags |= WQ_FLAG_EXCLUSIVE;
5731 __add_wait_queue_tail(&x->wait, &wait); 5731 __add_wait_queue_tail(&x->wait, &wait);
5732 do { 5732 do {
5733 if (signal_pending_state(state, current)) { 5733 if (signal_pending_state(state, current)) {
5734 timeout = -ERESTARTSYS; 5734 timeout = -ERESTARTSYS;
5735 break; 5735 break;
5736 } 5736 }
5737 __set_current_state(state); 5737 __set_current_state(state);
5738 spin_unlock_irq(&x->wait.lock); 5738 spin_unlock_irq(&x->wait.lock);
5739 timeout = schedule_timeout(timeout); 5739 timeout = schedule_timeout(timeout);
5740 spin_lock_irq(&x->wait.lock); 5740 spin_lock_irq(&x->wait.lock);
5741 } while (!x->done && timeout); 5741 } while (!x->done && timeout);
5742 __remove_wait_queue(&x->wait, &wait); 5742 __remove_wait_queue(&x->wait, &wait);
5743 if (!x->done) 5743 if (!x->done)
5744 return timeout; 5744 return timeout;
5745 } 5745 }
5746 x->done--; 5746 x->done--;
5747 return timeout ?: 1; 5747 return timeout ?: 1;
5748 } 5748 }
5749 5749
5750 static long __sched 5750 static long __sched
5751 wait_for_common(struct completion *x, long timeout, int state) 5751 wait_for_common(struct completion *x, long timeout, int state)
5752 { 5752 {
5753 might_sleep(); 5753 might_sleep();
5754 5754
5755 spin_lock_irq(&x->wait.lock); 5755 spin_lock_irq(&x->wait.lock);
5756 timeout = do_wait_for_common(x, timeout, state); 5756 timeout = do_wait_for_common(x, timeout, state);
5757 spin_unlock_irq(&x->wait.lock); 5757 spin_unlock_irq(&x->wait.lock);
5758 return timeout; 5758 return timeout;
5759 } 5759 }
5760 5760
5761 /** 5761 /**
5762 * wait_for_completion: - waits for completion of a task 5762 * wait_for_completion: - waits for completion of a task
5763 * @x: holds the state of this particular completion 5763 * @x: holds the state of this particular completion
5764 * 5764 *
5765 * This waits to be signaled for completion of a specific task. It is NOT 5765 * This waits to be signaled for completion of a specific task. It is NOT
5766 * interruptible and there is no timeout. 5766 * interruptible and there is no timeout.
5767 * 5767 *
5768 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 5768 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
5769 * and interrupt capability. Also see complete(). 5769 * and interrupt capability. Also see complete().
5770 */ 5770 */
5771 void __sched wait_for_completion(struct completion *x) 5771 void __sched wait_for_completion(struct completion *x)
5772 { 5772 {
5773 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 5773 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5774 } 5774 }
5775 EXPORT_SYMBOL(wait_for_completion); 5775 EXPORT_SYMBOL(wait_for_completion);
5776 5776
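Together, complete() and wait_for_completion() give the usual "wait for another context to finish something" pattern. A minimal sketch, assuming a kernel-module context (helper_thread and setup_done are hypothetical):

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

static DECLARE_COMPLETION(setup_done);          /* hypothetical */

static int helper_thread(void *unused)
{
        /* ... do the setup work ... */
        complete(&setup_done);                  /* wakes one waiter, FIFO order */
        return 0;
}

static int start_and_wait(void)
{
        struct task_struct *tsk = kthread_run(helper_thread, NULL, "helper");

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);
        wait_for_completion(&setup_done);       /* uninterruptible, no timeout */
        return 0;
}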
5777 /** 5777 /**
5778 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 5778 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
5779 * @x: holds the state of this particular completion 5779 * @x: holds the state of this particular completion
5780 * @timeout: timeout value in jiffies 5780 * @timeout: timeout value in jiffies
5781 * 5781 *
5782 * This waits for either a completion of a specific task to be signaled or for a 5782 * This waits for either a completion of a specific task to be signaled or for a
5783 * specified timeout to expire. The timeout is in jiffies. It is not 5783 * specified timeout to expire. The timeout is in jiffies. It is not
5784 * interruptible. 5784 * interruptible.
5785 */ 5785 */
5786 unsigned long __sched 5786 unsigned long __sched
5787 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 5787 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5788 { 5788 {
5789 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 5789 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5790 } 5790 }
5791 EXPORT_SYMBOL(wait_for_completion_timeout); 5791 EXPORT_SYMBOL(wait_for_completion_timeout);
5792 5792
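The one subtlety in wait_for_completion_timeout() is its return value: 0 means the timeout expired first, while any non-zero value means the completion fired with that many jiffies of the budget left (never 0, per the 'timeout ?: 1' in do_wait_for_common()). A fragment continuing the hypothetical setup_done example above:

unsigned long left;

left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(100));
if (!left)
        return -ETIMEDOUT;      /* timed out */
/* completed, with 'left' jiffies of the 100ms budget remaining */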
5793 /** 5793 /**
5794 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 5794 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
5795 * @x: holds the state of this particular completion 5795 * @x: holds the state of this particular completion
5796 * 5796 *
5797 * This waits for completion of a specific task to be signaled. It is 5797 * This waits for completion of a specific task to be signaled. It is
5798 * interruptible. 5798 * interruptible.
5799 */ 5799 */
5800 int __sched wait_for_completion_interruptible(struct completion *x) 5800 int __sched wait_for_completion_interruptible(struct completion *x)
5801 { 5801 {
5802 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 5802 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5803 if (t == -ERESTARTSYS) 5803 if (t == -ERESTARTSYS)
5804 return t; 5804 return t;
5805 return 0; 5805 return 0;
5806 } 5806 }
5807 EXPORT_SYMBOL(wait_for_completion_interruptible); 5807 EXPORT_SYMBOL(wait_for_completion_interruptible);
5808 5808
5809 /** 5809 /**
5810 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 5810 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
5811 * @x: holds the state of this particular completion 5811 * @x: holds the state of this particular completion
5812 * @timeout: timeout value in jiffies 5812 * @timeout: timeout value in jiffies
5813 * 5813 *
5814 * This waits for either a completion of a specific task to be signaled or for a 5814 * This waits for either a completion of a specific task to be signaled or for a
5815 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 5815 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
5816 */ 5816 */
5817 unsigned long __sched 5817 unsigned long __sched
5818 wait_for_completion_interruptible_timeout(struct completion *x, 5818 wait_for_completion_interruptible_timeout(struct completion *x,
5819 unsigned long timeout) 5819 unsigned long timeout)
5820 { 5820 {
5821 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 5821 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5822 } 5822 }
5823 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 5823 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5824 5824
5825 /** 5825 /**
5826 * wait_for_completion_killable: - waits for completion of a task (killable) 5826 * wait_for_completion_killable: - waits for completion of a task (killable)
5827 * @x: holds the state of this particular completion 5827 * @x: holds the state of this particular completion
5828 * 5828 *
5829 * This waits to be signaled for completion of a specific task. It can be 5829 * This waits to be signaled for completion of a specific task. It can be
5830 * interrupted by a kill signal. 5830 * interrupted by a kill signal.
5831 */ 5831 */
5832 int __sched wait_for_completion_killable(struct completion *x) 5832 int __sched wait_for_completion_killable(struct completion *x)
5833 { 5833 {
5834 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 5834 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5835 if (t == -ERESTARTSYS) 5835 if (t == -ERESTARTSYS)
5836 return t; 5836 return t;
5837 return 0; 5837 return 0;
5838 } 5838 }
5839 EXPORT_SYMBOL(wait_for_completion_killable); 5839 EXPORT_SYMBOL(wait_for_completion_killable);
5840 5840
5841 /** 5841 /**
5842 * try_wait_for_completion - try to decrement a completion without blocking 5842 * try_wait_for_completion - try to decrement a completion without blocking
5843 * @x: completion structure 5843 * @x: completion structure
5844 * 5844 *
5845 * Returns: 0 if a decrement cannot be done without blocking 5845 * Returns: 0 if a decrement cannot be done without blocking
5846 * 1 if a decrement succeeded. 5846 * 1 if a decrement succeeded.
5847 * 5847 *
5848 * If a completion is being used as a counting completion, 5848 * If a completion is being used as a counting completion,
5849 * attempt to decrement the counter without blocking. This 5849 * attempt to decrement the counter without blocking. This
5850 * enables us to avoid waiting if the resource the completion 5850 * enables us to avoid waiting if the resource the completion
5851 * is protecting is not available. 5851 * is protecting is not available.
5852 */ 5852 */
5853 bool try_wait_for_completion(struct completion *x) 5853 bool try_wait_for_completion(struct completion *x)
5854 { 5854 {
5855 int ret = 1; 5855 int ret = 1;
5856 5856
5857 spin_lock_irq(&x->wait.lock); 5857 spin_lock_irq(&x->wait.lock);
5858 if (!x->done) 5858 if (!x->done)
5859 ret = 0; 5859 ret = 0;
5860 else 5860 else
5861 x->done--; 5861 x->done--;
5862 spin_unlock_irq(&x->wait.lock); 5862 spin_unlock_irq(&x->wait.lock);
5863 return ret; 5863 return ret;
5864 } 5864 }
5865 EXPORT_SYMBOL(try_wait_for_completion); 5865 EXPORT_SYMBOL(try_wait_for_completion);
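A hedged usage sketch for the non-blocking variant (illustrative; 'slots' is a hypothetical counting completion that a producer complete()s once per free resource):

/* Illustrative only; not part of this commit. */
static void example_claim_slot(struct completion *slots)
{
        if (try_wait_for_completion(slots))
                return;                 /* consumed one count without sleeping */
        /* nothing available yet; fall back to the blocking wait above */
        wait_for_completion(slots);
}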
5866 5866
5867 /** 5867 /**
5868 * completion_done - Test to see if a completion has any waiters 5868 * completion_done - Test to see if a completion has any waiters
5869 * @x: completion structure 5869 * @x: completion structure
5870 * 5870 *
5871 * Returns: 0 if there are waiters (wait_for_completion() in progress) 5871 * Returns: 0 if there are waiters (wait_for_completion() in progress)
5872 * 1 if there are no waiters. 5872 * 1 if there are no waiters.
5873 * 5873 *
5874 */ 5874 */
5875 bool completion_done(struct completion *x) 5875 bool completion_done(struct completion *x)
5876 { 5876 {
5877 int ret = 1; 5877 int ret = 1;
5878 5878
5879 spin_lock_irq(&x->wait.lock); 5879 spin_lock_irq(&x->wait.lock);
5880 if (!x->done) 5880 if (!x->done)
5881 ret = 0; 5881 ret = 0;
5882 spin_unlock_irq(&x->wait.lock); 5882 spin_unlock_irq(&x->wait.lock);
5883 return ret; 5883 return ret;
5884 } 5884 }
5885 EXPORT_SYMBOL(completion_done); 5885 EXPORT_SYMBOL(completion_done);
5886 5886
5887 static long __sched 5887 static long __sched
5888 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 5888 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5889 { 5889 {
5890 unsigned long flags; 5890 unsigned long flags;
5891 wait_queue_t wait; 5891 wait_queue_t wait;
5892 5892
5893 init_waitqueue_entry(&wait, current); 5893 init_waitqueue_entry(&wait, current);
5894 5894
5895 __set_current_state(state); 5895 __set_current_state(state);
5896 5896
5897 spin_lock_irqsave(&q->lock, flags); 5897 spin_lock_irqsave(&q->lock, flags);
5898 __add_wait_queue(q, &wait); 5898 __add_wait_queue(q, &wait);
5899 spin_unlock(&q->lock); 5899 spin_unlock(&q->lock);
5900 timeout = schedule_timeout(timeout); 5900 timeout = schedule_timeout(timeout);
5901 spin_lock_irq(&q->lock); 5901 spin_lock_irq(&q->lock);
5902 __remove_wait_queue(q, &wait); 5902 __remove_wait_queue(q, &wait);
5903 spin_unlock_irqrestore(&q->lock, flags); 5903 spin_unlock_irqrestore(&q->lock, flags);
5904 5904
5905 return timeout; 5905 return timeout;
5906 } 5906 }
5907 5907
5908 void __sched interruptible_sleep_on(wait_queue_head_t *q) 5908 void __sched interruptible_sleep_on(wait_queue_head_t *q)
5909 { 5909 {
5910 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 5910 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5911 } 5911 }
5912 EXPORT_SYMBOL(interruptible_sleep_on); 5912 EXPORT_SYMBOL(interruptible_sleep_on);
5913 5913
5914 long __sched 5914 long __sched
5915 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 5915 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5916 { 5916 {
5917 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 5917 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5918 } 5918 }
5919 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 5919 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5920 5920
5921 void __sched sleep_on(wait_queue_head_t *q) 5921 void __sched sleep_on(wait_queue_head_t *q)
5922 { 5922 {
5923 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 5923 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5924 } 5924 }
5925 EXPORT_SYMBOL(sleep_on); 5925 EXPORT_SYMBOL(sleep_on);
5926 5926
5927 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 5927 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5928 { 5928 {
5929 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 5929 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5930 } 5930 }
5931 EXPORT_SYMBOL(sleep_on_timeout); 5931 EXPORT_SYMBOL(sleep_on_timeout);
5932 5932
5933 #ifdef CONFIG_RT_MUTEXES 5933 #ifdef CONFIG_RT_MUTEXES
5934 5934
5935 /* 5935 /*
5936 * rt_mutex_setprio - set the current priority of a task 5936 * rt_mutex_setprio - set the current priority of a task
5937 * @p: task 5937 * @p: task
5938 * @prio: prio value (kernel-internal form) 5938 * @prio: prio value (kernel-internal form)
5939 * 5939 *
5940 * This function changes the 'effective' priority of a task. It does 5940 * This function changes the 'effective' priority of a task. It does
5941 * not touch ->normal_prio like __setscheduler(). 5941 * not touch ->normal_prio like __setscheduler().
5942 * 5942 *
5943 * Used by the rt_mutex code to implement priority inheritance logic. 5943 * Used by the rt_mutex code to implement priority inheritance logic.
5944 */ 5944 */
5945 void rt_mutex_setprio(struct task_struct *p, int prio) 5945 void rt_mutex_setprio(struct task_struct *p, int prio)
5946 { 5946 {
5947 unsigned long flags; 5947 unsigned long flags;
5948 int oldprio, on_rq, running; 5948 int oldprio, on_rq, running;
5949 struct rq *rq; 5949 struct rq *rq;
5950 const struct sched_class *prev_class = p->sched_class; 5950 const struct sched_class *prev_class = p->sched_class;
5951 5951
5952 BUG_ON(prio < 0 || prio > MAX_PRIO); 5952 BUG_ON(prio < 0 || prio > MAX_PRIO);
5953 5953
5954 rq = task_rq_lock(p, &flags); 5954 rq = task_rq_lock(p, &flags);
5955 update_rq_clock(rq); 5955 update_rq_clock(rq);
5956 5956
5957 oldprio = p->prio; 5957 oldprio = p->prio;
5958 on_rq = p->se.on_rq; 5958 on_rq = p->se.on_rq;
5959 running = task_current(rq, p); 5959 running = task_current(rq, p);
5960 if (on_rq) 5960 if (on_rq)
5961 dequeue_task(rq, p, 0); 5961 dequeue_task(rq, p, 0);
5962 if (running) 5962 if (running)
5963 p->sched_class->put_prev_task(rq, p); 5963 p->sched_class->put_prev_task(rq, p);
5964 5964
5965 if (rt_prio(prio)) 5965 if (rt_prio(prio))
5966 p->sched_class = &rt_sched_class; 5966 p->sched_class = &rt_sched_class;
5967 else 5967 else
5968 p->sched_class = &fair_sched_class; 5968 p->sched_class = &fair_sched_class;
5969 5969
5970 p->prio = prio; 5970 p->prio = prio;
5971 5971
5972 if (running) 5972 if (running)
5973 p->sched_class->set_curr_task(rq); 5973 p->sched_class->set_curr_task(rq);
5974 if (on_rq) { 5974 if (on_rq) {
5975 enqueue_task(rq, p, 0); 5975 enqueue_task(rq, p, 0);
5976 5976
5977 check_class_changed(rq, p, prev_class, oldprio, running); 5977 check_class_changed(rq, p, prev_class, oldprio, running);
5978 } 5978 }
5979 task_rq_unlock(rq, &flags); 5979 task_rq_unlock(rq, &flags);
5980 } 5980 }
5981 5981
5982 #endif 5982 #endif
5983 5983
5984 void set_user_nice(struct task_struct *p, long nice) 5984 void set_user_nice(struct task_struct *p, long nice)
5985 { 5985 {
5986 int old_prio, delta, on_rq; 5986 int old_prio, delta, on_rq;
5987 unsigned long flags; 5987 unsigned long flags;
5988 struct rq *rq; 5988 struct rq *rq;
5989 5989
5990 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 5990 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5991 return; 5991 return;
5992 /* 5992 /*
5993 * We have to be careful, if called from sys_setpriority(), 5993 * We have to be careful, if called from sys_setpriority(),
5994 * the task might be in the middle of scheduling on another CPU. 5994 * the task might be in the middle of scheduling on another CPU.
5995 */ 5995 */
5996 rq = task_rq_lock(p, &flags); 5996 rq = task_rq_lock(p, &flags);
5997 update_rq_clock(rq); 5997 update_rq_clock(rq);
5998 /* 5998 /*
5999 * The RT priorities are set via sched_setscheduler(), but we still 5999 * The RT priorities are set via sched_setscheduler(), but we still
6000 * allow the 'normal' nice value to be set - but as expected 6000 * allow the 'normal' nice value to be set - but as expected
6001 * it won't have any effect on scheduling until the task is 6001 * it won't have any effect on scheduling until the task is

6002 * SCHED_FIFO/SCHED_RR: 6002 * SCHED_FIFO/SCHED_RR:
6003 */ 6003 */
6004 if (task_has_rt_policy(p)) { 6004 if (task_has_rt_policy(p)) {
6005 p->static_prio = NICE_TO_PRIO(nice); 6005 p->static_prio = NICE_TO_PRIO(nice);
6006 goto out_unlock; 6006 goto out_unlock;
6007 } 6007 }
6008 on_rq = p->se.on_rq; 6008 on_rq = p->se.on_rq;
6009 if (on_rq) 6009 if (on_rq)
6010 dequeue_task(rq, p, 0); 6010 dequeue_task(rq, p, 0);
6011 6011
6012 p->static_prio = NICE_TO_PRIO(nice); 6012 p->static_prio = NICE_TO_PRIO(nice);
6013 set_load_weight(p); 6013 set_load_weight(p);
6014 old_prio = p->prio; 6014 old_prio = p->prio;
6015 p->prio = effective_prio(p); 6015 p->prio = effective_prio(p);
6016 delta = p->prio - old_prio; 6016 delta = p->prio - old_prio;
6017 6017
6018 if (on_rq) { 6018 if (on_rq) {
6019 enqueue_task(rq, p, 0); 6019 enqueue_task(rq, p, 0);
6020 /* 6020 /*
6021 * If the task increased its priority or is running and 6021 * If the task increased its priority or is running and
6022 * lowered its priority, then reschedule its CPU: 6022 * lowered its priority, then reschedule its CPU:
6023 */ 6023 */
6024 if (delta < 0 || (delta > 0 && task_running(rq, p))) 6024 if (delta < 0 || (delta > 0 && task_running(rq, p)))
6025 resched_task(rq->curr); 6025 resched_task(rq->curr);
6026 } 6026 }
6027 out_unlock: 6027 out_unlock:
6028 task_rq_unlock(rq, &flags); 6028 task_rq_unlock(rq, &flags);
6029 } 6029 }
6030 EXPORT_SYMBOL(set_user_nice); 6030 EXPORT_SYMBOL(set_user_nice);
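For context, a sketch of the typical in-kernel caller (illustrative; the helper name and the value -5 are arbitrary examples within the [-20, 19] range validated above):

/* Illustrative only; not part of this commit. */
static void example_renice_kthread(struct task_struct *tsk)
{
        /* for RT-policy tasks only static_prio is updated, see above */
        set_user_nice(tsk, -5);
}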
6031 6031
6032 /* 6032 /*
6033 * can_nice - check if a task can reduce its nice value 6033 * can_nice - check if a task can reduce its nice value
6034 * @p: task 6034 * @p: task
6035 * @nice: nice value 6035 * @nice: nice value
6036 */ 6036 */
6037 int can_nice(const struct task_struct *p, const int nice) 6037 int can_nice(const struct task_struct *p, const int nice)
6038 { 6038 {
6039 /* convert nice value [19,-20] to rlimit style value [1,40] */ 6039 /* convert nice value [19,-20] to rlimit style value [1,40] */
6040 int nice_rlim = 20 - nice; 6040 int nice_rlim = 20 - nice;
6041 6041
6042 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 6042 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
6043 capable(CAP_SYS_NICE)); 6043 capable(CAP_SYS_NICE));
6044 } 6044 }
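A worked example of the conversion above may help review (illustrative arithmetic only): with nice_rlim = 20 - nice, nice 19 maps to 1, nice 0 to 20 and nice -20 to 40, so an RLIMIT_NICE of 40 (or CAP_SYS_NICE) permits any nice value.

/*
 * Illustrative mapping, not part of this commit:
 *   nice  19  ->  nice_rlim  1
 *   nice   0  ->  nice_rlim 20
 *   nice -20  ->  nice_rlim 40   (needs RLIMIT_NICE >= 40 or CAP_SYS_NICE)
 */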
6045 6045
6046 #ifdef __ARCH_WANT_SYS_NICE 6046 #ifdef __ARCH_WANT_SYS_NICE
6047 6047
6048 /* 6048 /*
6049 * sys_nice - change the priority of the current process. 6049 * sys_nice - change the priority of the current process.
6050 * @increment: priority increment 6050 * @increment: priority increment
6051 * 6051 *
6052 * sys_setpriority is a more generic, but much slower function that 6052 * sys_setpriority is a more generic, but much slower function that
6053 * does similar things. 6053 * does similar things.
6054 */ 6054 */
6055 SYSCALL_DEFINE1(nice, int, increment) 6055 SYSCALL_DEFINE1(nice, int, increment)
6056 { 6056 {
6057 long nice, retval; 6057 long nice, retval;
6058 6058
6059 /* 6059 /*
6060 * Setpriority might change our priority at the same moment. 6060 * Setpriority might change our priority at the same moment.
6061 * We don't have to worry. Conceptually one call occurs first 6061 * We don't have to worry. Conceptually one call occurs first
6062 * and we have a single winner. 6062 * and we have a single winner.
6063 */ 6063 */
6064 if (increment < -40) 6064 if (increment < -40)
6065 increment = -40; 6065 increment = -40;
6066 if (increment > 40) 6066 if (increment > 40)
6067 increment = 40; 6067 increment = 40;
6068 6068
6069 nice = TASK_NICE(current) + increment; 6069 nice = TASK_NICE(current) + increment;
6070 if (nice < -20) 6070 if (nice < -20)
6071 nice = -20; 6071 nice = -20;
6072 if (nice > 19) 6072 if (nice > 19)
6073 nice = 19; 6073 nice = 19;
6074 6074
6075 if (increment < 0 && !can_nice(current, nice)) 6075 if (increment < 0 && !can_nice(current, nice))
6076 return -EPERM; 6076 return -EPERM;
6077 6077
6078 retval = security_task_setnice(current, nice); 6078 retval = security_task_setnice(current, nice);
6079 if (retval) 6079 if (retval)
6080 return retval; 6080 return retval;
6081 6081
6082 set_user_nice(current, nice); 6082 set_user_nice(current, nice);
6083 return 0; 6083 return 0;
6084 } 6084 }
6085 6085
6086 #endif 6086 #endif
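For review, a worked example of the clamping in sys_nice() above (illustrative numbers): with a current nice of 10 and an increment of -40, the increment is first clamped to -40, the resulting nice of -30 is clamped to -20, and because the increment is negative the call must still pass can_nice() and security_task_setnice() before set_user_nice() runs.

/*
 * Illustrative only: nice(-40) from a task currently at nice 10
 *   increment -> clamped to -40
 *   nice      -> 10 + (-40) = -30 -> clamped to -20
 *   negative increment -> requires can_nice(current, -20), i.e.
 *   RLIMIT_NICE >= 40 or CAP_SYS_NICE.
 */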
6087 6087
6088 /** 6088 /**
6089 * task_prio - return the priority value of a given task. 6089 * task_prio - return the priority value of a given task.
6090 * @p: the task in question. 6090 * @p: the task in question.
6091 * 6091 *
6092 * This is the priority value as seen by users in /proc. 6092 * This is the priority value as seen by users in /proc.
6093 * RT tasks are offset by -200. Normal tasks are centered 6093 * RT tasks are offset by -200. Normal tasks are centered
6094 * around 0, value goes from -16 to +15. 6094 * around 0, value goes from -16 to +15.
6095 */ 6095 */
6096 int task_prio(const struct task_struct *p) 6096 int task_prio(const struct task_struct *p)
6097 { 6097 {
6098 return p->prio - MAX_RT_PRIO; 6098 return p->prio - MAX_RT_PRIO;
6099 } 6099 }
6100 6100
6101 /** 6101 /**
6102 * task_nice - return the nice value of a given task. 6102 * task_nice - return the nice value of a given task.
6103 * @p: the task in question. 6103 * @p: the task in question.
6104 */ 6104 */
6105 int task_nice(const struct task_struct *p) 6105 int task_nice(const struct task_struct *p)
6106 { 6106 {
6107 return TASK_NICE(p); 6107 return TASK_NICE(p);
6108 } 6108 }
6109 EXPORT_SYMBOL(task_nice); 6109 EXPORT_SYMBOL(task_nice);
6110 6110
6111 /** 6111 /**
6112 * idle_cpu - is a given cpu idle currently? 6112 * idle_cpu - is a given cpu idle currently?
6113 * @cpu: the processor in question. 6113 * @cpu: the processor in question.
6114 */ 6114 */
6115 int idle_cpu(int cpu) 6115 int idle_cpu(int cpu)
6116 { 6116 {
6117 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 6117 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
6118 } 6118 }
6119 6119
6120 /** 6120 /**
6121 * idle_task - return the idle task for a given cpu. 6121 * idle_task - return the idle task for a given cpu.
6122 * @cpu: the processor in question. 6122 * @cpu: the processor in question.
6123 */ 6123 */
6124 struct task_struct *idle_task(int cpu) 6124 struct task_struct *idle_task(int cpu)
6125 { 6125 {
6126 return cpu_rq(cpu)->idle; 6126 return cpu_rq(cpu)->idle;
6127 } 6127 }
6128 6128
6129 /** 6129 /**
6130 * find_process_by_pid - find a process with a matching PID value. 6130 * find_process_by_pid - find a process with a matching PID value.
6131 * @pid: the pid in question. 6131 * @pid: the pid in question.
6132 */ 6132 */
6133 static struct task_struct *find_process_by_pid(pid_t pid) 6133 static struct task_struct *find_process_by_pid(pid_t pid)
6134 { 6134 {
6135 return pid ? find_task_by_vpid(pid) : current; 6135 return pid ? find_task_by_vpid(pid) : current;
6136 } 6136 }
6137 6137
6138 /* Actually do priority change: must hold rq lock. */ 6138 /* Actually do priority change: must hold rq lock. */
6139 static void 6139 static void
6140 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 6140 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6141 { 6141 {
6142 BUG_ON(p->se.on_rq); 6142 BUG_ON(p->se.on_rq);
6143 6143
6144 p->policy = policy; 6144 p->policy = policy;
6145 switch (p->policy) { 6145 switch (p->policy) {
6146 case SCHED_NORMAL: 6146 case SCHED_NORMAL:
6147 case SCHED_BATCH: 6147 case SCHED_BATCH:
6148 case SCHED_IDLE: 6148 case SCHED_IDLE:
6149 p->sched_class = &fair_sched_class; 6149 p->sched_class = &fair_sched_class;
6150 break; 6150 break;
6151 case SCHED_FIFO: 6151 case SCHED_FIFO:
6152 case SCHED_RR: 6152 case SCHED_RR:
6153 p->sched_class = &rt_sched_class; 6153 p->sched_class = &rt_sched_class;
6154 break; 6154 break;
6155 } 6155 }
6156 6156
6157 p->rt_priority = prio; 6157 p->rt_priority = prio;
6158 p->normal_prio = normal_prio(p); 6158 p->normal_prio = normal_prio(p);
6159 /* we are holding p->pi_lock already */ 6159 /* we are holding p->pi_lock already */
6160 p->prio = rt_mutex_getprio(p); 6160 p->prio = rt_mutex_getprio(p);
6161 set_load_weight(p); 6161 set_load_weight(p);
6162 } 6162 }
6163 6163
6164 /* 6164 /*
6165 * check the target process has a UID that matches the current process's 6165 * check the target process has a UID that matches the current process's
6166 */ 6166 */
6167 static bool check_same_owner(struct task_struct *p) 6167 static bool check_same_owner(struct task_struct *p)
6168 { 6168 {
6169 const struct cred *cred = current_cred(), *pcred; 6169 const struct cred *cred = current_cred(), *pcred;
6170 bool match; 6170 bool match;
6171 6171
6172 rcu_read_lock(); 6172 rcu_read_lock();
6173 pcred = __task_cred(p); 6173 pcred = __task_cred(p);
6174 match = (cred->euid == pcred->euid || 6174 match = (cred->euid == pcred->euid ||
6175 cred->euid == pcred->uid); 6175 cred->euid == pcred->uid);
6176 rcu_read_unlock(); 6176 rcu_read_unlock();
6177 return match; 6177 return match;
6178 } 6178 }
6179 6179
6180 static int __sched_setscheduler(struct task_struct *p, int policy, 6180 static int __sched_setscheduler(struct task_struct *p, int policy,
6181 struct sched_param *param, bool user) 6181 struct sched_param *param, bool user)
6182 { 6182 {
6183 int retval, oldprio, oldpolicy = -1, on_rq, running; 6183 int retval, oldprio, oldpolicy = -1, on_rq, running;
6184 unsigned long flags; 6184 unsigned long flags;
6185 const struct sched_class *prev_class = p->sched_class; 6185 const struct sched_class *prev_class = p->sched_class;
6186 struct rq *rq; 6186 struct rq *rq;
6187 int reset_on_fork; 6187 int reset_on_fork;
6188 6188
6189 /* may grab non-irq protected spin_locks */ 6189 /* may grab non-irq protected spin_locks */
6190 BUG_ON(in_interrupt()); 6190 BUG_ON(in_interrupt());
6191 recheck: 6191 recheck:
6192 /* double check policy once rq lock held */ 6192 /* double check policy once rq lock held */
6193 if (policy < 0) { 6193 if (policy < 0) {
6194 reset_on_fork = p->sched_reset_on_fork; 6194 reset_on_fork = p->sched_reset_on_fork;
6195 policy = oldpolicy = p->policy; 6195 policy = oldpolicy = p->policy;
6196 } else { 6196 } else {
6197 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 6197 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6198 policy &= ~SCHED_RESET_ON_FORK; 6198 policy &= ~SCHED_RESET_ON_FORK;
6199 6199
6200 if (policy != SCHED_FIFO && policy != SCHED_RR && 6200 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6201 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6201 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6202 policy != SCHED_IDLE) 6202 policy != SCHED_IDLE)
6203 return -EINVAL; 6203 return -EINVAL;
6204 } 6204 }
6205 6205
6206 /* 6206 /*
6207 * Valid priorities for SCHED_FIFO and SCHED_RR are 6207 * Valid priorities for SCHED_FIFO and SCHED_RR are
6208 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6208 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
6209 * SCHED_BATCH and SCHED_IDLE is 0. 6209 * SCHED_BATCH and SCHED_IDLE is 0.
6210 */ 6210 */
6211 if (param->sched_priority < 0 || 6211 if (param->sched_priority < 0 ||
6212 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 6212 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
6213 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 6213 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
6214 return -EINVAL; 6214 return -EINVAL;
6215 if (rt_policy(policy) != (param->sched_priority != 0)) 6215 if (rt_policy(policy) != (param->sched_priority != 0))
6216 return -EINVAL; 6216 return -EINVAL;
6217 6217
6218 /* 6218 /*
6219 * Allow unprivileged RT tasks to decrease priority: 6219 * Allow unprivileged RT tasks to decrease priority:
6220 */ 6220 */
6221 if (user && !capable(CAP_SYS_NICE)) { 6221 if (user && !capable(CAP_SYS_NICE)) {
6222 if (rt_policy(policy)) { 6222 if (rt_policy(policy)) {
6223 unsigned long rlim_rtprio; 6223 unsigned long rlim_rtprio;
6224 6224
6225 if (!lock_task_sighand(p, &flags)) 6225 if (!lock_task_sighand(p, &flags))
6226 return -ESRCH; 6226 return -ESRCH;
6227 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 6227 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
6228 unlock_task_sighand(p, &flags); 6228 unlock_task_sighand(p, &flags);
6229 6229
6230 /* can't set/change the rt policy */ 6230 /* can't set/change the rt policy */
6231 if (policy != p->policy && !rlim_rtprio) 6231 if (policy != p->policy && !rlim_rtprio)
6232 return -EPERM; 6232 return -EPERM;
6233 6233
6234 /* can't increase priority */ 6234 /* can't increase priority */
6235 if (param->sched_priority > p->rt_priority && 6235 if (param->sched_priority > p->rt_priority &&
6236 param->sched_priority > rlim_rtprio) 6236 param->sched_priority > rlim_rtprio)
6237 return -EPERM; 6237 return -EPERM;
6238 } 6238 }
6239 /* 6239 /*
6240 * Like positive nice levels, don't allow tasks to 6240 * Like positive nice levels, don't allow tasks to
6241 * move out of SCHED_IDLE either: 6241 * move out of SCHED_IDLE either:
6242 */ 6242 */
6243 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 6243 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
6244 return -EPERM; 6244 return -EPERM;
6245 6245
6246 /* can't change other user's priorities */ 6246 /* can't change other user's priorities */
6247 if (!check_same_owner(p)) 6247 if (!check_same_owner(p))
6248 return -EPERM; 6248 return -EPERM;
6249 6249
6250 /* Normal users shall not reset the sched_reset_on_fork flag */ 6250 /* Normal users shall not reset the sched_reset_on_fork flag */
6251 if (p->sched_reset_on_fork && !reset_on_fork) 6251 if (p->sched_reset_on_fork && !reset_on_fork)
6252 return -EPERM; 6252 return -EPERM;
6253 } 6253 }
6254 6254
6255 if (user) { 6255 if (user) {
6256 #ifdef CONFIG_RT_GROUP_SCHED 6256 #ifdef CONFIG_RT_GROUP_SCHED
6257 /* 6257 /*
6258 * Do not allow realtime tasks into groups that have no runtime 6258 * Do not allow realtime tasks into groups that have no runtime
6259 * assigned. 6259 * assigned.
6260 */ 6260 */
6261 if (rt_bandwidth_enabled() && rt_policy(policy) && 6261 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6262 task_group(p)->rt_bandwidth.rt_runtime == 0) 6262 task_group(p)->rt_bandwidth.rt_runtime == 0)
6263 return -EPERM; 6263 return -EPERM;
6264 #endif 6264 #endif
6265 6265
6266 retval = security_task_setscheduler(p, policy, param); 6266 retval = security_task_setscheduler(p, policy, param);
6267 if (retval) 6267 if (retval)
6268 return retval; 6268 return retval;
6269 } 6269 }
6270 6270
6271 /* 6271 /*
6272 * make sure no PI-waiters arrive (or leave) while we are 6272 * make sure no PI-waiters arrive (or leave) while we are
6273 * changing the priority of the task: 6273 * changing the priority of the task:
6274 */ 6274 */
6275 spin_lock_irqsave(&p->pi_lock, flags); 6275 spin_lock_irqsave(&p->pi_lock, flags);
6276 /* 6276 /*
6277 * To be able to change p->policy safely, the appropriate 6277 * To be able to change p->policy safely, the appropriate
6278 * runqueue lock must be held. 6278 * runqueue lock must be held.
6279 */ 6279 */
6280 rq = __task_rq_lock(p); 6280 rq = __task_rq_lock(p);
6281 /* recheck policy now with rq lock held */ 6281 /* recheck policy now with rq lock held */
6282 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6282 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6283 policy = oldpolicy = -1; 6283 policy = oldpolicy = -1;
6284 __task_rq_unlock(rq); 6284 __task_rq_unlock(rq);
6285 spin_unlock_irqrestore(&p->pi_lock, flags); 6285 spin_unlock_irqrestore(&p->pi_lock, flags);
6286 goto recheck; 6286 goto recheck;
6287 } 6287 }
6288 update_rq_clock(rq); 6288 update_rq_clock(rq);
6289 on_rq = p->se.on_rq; 6289 on_rq = p->se.on_rq;
6290 running = task_current(rq, p); 6290 running = task_current(rq, p);
6291 if (on_rq) 6291 if (on_rq)
6292 deactivate_task(rq, p, 0); 6292 deactivate_task(rq, p, 0);
6293 if (running) 6293 if (running)
6294 p->sched_class->put_prev_task(rq, p); 6294 p->sched_class->put_prev_task(rq, p);
6295 6295
6296 p->sched_reset_on_fork = reset_on_fork; 6296 p->sched_reset_on_fork = reset_on_fork;
6297 6297
6298 oldprio = p->prio; 6298 oldprio = p->prio;
6299 __setscheduler(rq, p, policy, param->sched_priority); 6299 __setscheduler(rq, p, policy, param->sched_priority);
6300 6300
6301 if (running) 6301 if (running)
6302 p->sched_class->set_curr_task(rq); 6302 p->sched_class->set_curr_task(rq);
6303 if (on_rq) { 6303 if (on_rq) {
6304 activate_task(rq, p, 0); 6304 activate_task(rq, p, 0);
6305 6305
6306 check_class_changed(rq, p, prev_class, oldprio, running); 6306 check_class_changed(rq, p, prev_class, oldprio, running);
6307 } 6307 }
6308 __task_rq_unlock(rq); 6308 __task_rq_unlock(rq);
6309 spin_unlock_irqrestore(&p->pi_lock, flags); 6309 spin_unlock_irqrestore(&p->pi_lock, flags);
6310 6310
6311 rt_mutex_adjust_pi(p); 6311 rt_mutex_adjust_pi(p);
6312 6312
6313 return 0; 6313 return 0;
6314 } 6314 }
6315 6315
6316 /** 6316 /**
6317 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 6317 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
6318 * @p: the task in question. 6318 * @p: the task in question.
6319 * @policy: new policy. 6319 * @policy: new policy.
6320 * @param: structure containing the new RT priority. 6320 * @param: structure containing the new RT priority.
6321 * 6321 *
6322 * NOTE that the task may be already dead. 6322 * NOTE that the task may be already dead.
6323 */ 6323 */
6324 int sched_setscheduler(struct task_struct *p, int policy, 6324 int sched_setscheduler(struct task_struct *p, int policy,
6325 struct sched_param *param) 6325 struct sched_param *param)
6326 { 6326 {
6327 return __sched_setscheduler(p, policy, param, true); 6327 return __sched_setscheduler(p, policy, param, true);
6328 } 6328 }
6329 EXPORT_SYMBOL_GPL(sched_setscheduler); 6329 EXPORT_SYMBOL_GPL(sched_setscheduler);
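A sketch of the usual in-kernel caller (illustrative; the helper name and priority choice are examples, not something this commit adds):

/* Illustrative only; not part of this commit. */
static int example_make_fifo(struct task_struct *tsk)
{
        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };

        /*
         * Permission checks apply here; contexts without CAP_SYS_NICE
         * use sched_setscheduler_nocheck() below instead.
         */
        return sched_setscheduler(tsk, SCHED_FIFO, &param);
}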
6330 6330
6331 /** 6331 /**
6332 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 6332 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
6333 * @p: the task in question. 6333 * @p: the task in question.
6334 * @policy: new policy. 6334 * @policy: new policy.
6335 * @param: structure containing the new RT priority. 6335 * @param: structure containing the new RT priority.
6336 * 6336 *
6337 * Just like sched_setscheduler, only don't bother checking if the 6337 * Just like sched_setscheduler, only don't bother checking if the
6338 * current context has permission. For example, this is needed in 6338 * current context has permission. For example, this is needed in
6339 * stop_machine(): we create temporary high priority worker threads, 6339 * stop_machine(): we create temporary high priority worker threads,
6340 * but our caller might not have that capability. 6340 * but our caller might not have that capability.
6341 */ 6341 */
6342 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 6342 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6343 struct sched_param *param) 6343 struct sched_param *param)
6344 { 6344 {
6345 return __sched_setscheduler(p, policy, param, false); 6345 return __sched_setscheduler(p, policy, param, false);
6346 } 6346 }
6347 6347
6348 static int 6348 static int
6349 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 6349 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6350 { 6350 {
6351 struct sched_param lparam; 6351 struct sched_param lparam;
6352 struct task_struct *p; 6352 struct task_struct *p;
6353 int retval; 6353 int retval;
6354 6354
6355 if (!param || pid < 0) 6355 if (!param || pid < 0)
6356 return -EINVAL; 6356 return -EINVAL;
6357 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 6357 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6358 return -EFAULT; 6358 return -EFAULT;
6359 6359
6360 rcu_read_lock(); 6360 rcu_read_lock();
6361 retval = -ESRCH; 6361 retval = -ESRCH;
6362 p = find_process_by_pid(pid); 6362 p = find_process_by_pid(pid);
6363 if (p != NULL) 6363 if (p != NULL)
6364 retval = sched_setscheduler(p, policy, &lparam); 6364 retval = sched_setscheduler(p, policy, &lparam);
6365 rcu_read_unlock(); 6365 rcu_read_unlock();
6366 6366
6367 return retval; 6367 return retval;
6368 } 6368 }
6369 6369
6370 /** 6370 /**
6371 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 6371 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6372 * @pid: the pid in question. 6372 * @pid: the pid in question.
6373 * @policy: new policy. 6373 * @policy: new policy.
6374 * @param: structure containing the new RT priority. 6374 * @param: structure containing the new RT priority.
6375 */ 6375 */
6376 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 6376 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
6377 struct sched_param __user *, param) 6377 struct sched_param __user *, param)
6378 { 6378 {
6379 /* negative values for policy are not valid */ 6379 /* negative values for policy are not valid */
6380 if (policy < 0) 6380 if (policy < 0)
6381 return -EINVAL; 6381 return -EINVAL;
6382 6382
6383 return do_sched_setscheduler(pid, policy, param); 6383 return do_sched_setscheduler(pid, policy, param);
6384 } 6384 }
6385 6385
6386 /** 6386 /**
6387 * sys_sched_setparam - set/change the RT priority of a thread 6387 * sys_sched_setparam - set/change the RT priority of a thread
6388 * @pid: the pid in question. 6388 * @pid: the pid in question.
6389 * @param: structure containing the new RT priority. 6389 * @param: structure containing the new RT priority.
6390 */ 6390 */
6391 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 6391 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6392 { 6392 {
6393 return do_sched_setscheduler(pid, -1, param); 6393 return do_sched_setscheduler(pid, -1, param);
6394 } 6394 }
6395 6395
6396 /** 6396 /**
6397 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 6397 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6398 * @pid: the pid in question. 6398 * @pid: the pid in question.
6399 */ 6399 */
6400 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 6400 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6401 { 6401 {
6402 struct task_struct *p; 6402 struct task_struct *p;
6403 int retval; 6403 int retval;
6404 6404
6405 if (pid < 0) 6405 if (pid < 0)
6406 return -EINVAL; 6406 return -EINVAL;
6407 6407
6408 retval = -ESRCH; 6408 retval = -ESRCH;
6409 read_lock(&tasklist_lock); 6409 read_lock(&tasklist_lock);
6410 p = find_process_by_pid(pid); 6410 p = find_process_by_pid(pid);
6411 if (p) { 6411 if (p) {
6412 retval = security_task_getscheduler(p); 6412 retval = security_task_getscheduler(p);
6413 if (!retval) 6413 if (!retval)
6414 retval = p->policy 6414 retval = p->policy
6415 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6415 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6416 } 6416 }
6417 read_unlock(&tasklist_lock); 6417 read_unlock(&tasklist_lock);
6418 return retval; 6418 return retval;
6419 } 6419 }
6420 6420
6421 /** 6421 /**
6422 * sys_sched_getparam - get the RT priority of a thread 6422 * sys_sched_getparam - get the RT priority of a thread
6423 * @pid: the pid in question. 6423 * @pid: the pid in question.
6424 * @param: structure containing the RT priority. 6424 * @param: structure containing the RT priority.
6425 */ 6425 */
6426 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 6426 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6427 { 6427 {
6428 struct sched_param lp; 6428 struct sched_param lp;
6429 struct task_struct *p; 6429 struct task_struct *p;
6430 int retval; 6430 int retval;
6431 6431
6432 if (!param || pid < 0) 6432 if (!param || pid < 0)
6433 return -EINVAL; 6433 return -EINVAL;
6434 6434
6435 read_lock(&tasklist_lock); 6435 read_lock(&tasklist_lock);
6436 p = find_process_by_pid(pid); 6436 p = find_process_by_pid(pid);
6437 retval = -ESRCH; 6437 retval = -ESRCH;
6438 if (!p) 6438 if (!p)
6439 goto out_unlock; 6439 goto out_unlock;
6440 6440
6441 retval = security_task_getscheduler(p); 6441 retval = security_task_getscheduler(p);
6442 if (retval) 6442 if (retval)
6443 goto out_unlock; 6443 goto out_unlock;
6444 6444
6445 lp.sched_priority = p->rt_priority; 6445 lp.sched_priority = p->rt_priority;
6446 read_unlock(&tasklist_lock); 6446 read_unlock(&tasklist_lock);
6447 6447
6448 /* 6448 /*
6449 * This one might sleep, we cannot do it with a spinlock held ... 6449 * This one might sleep, we cannot do it with a spinlock held ...
6450 */ 6450 */
6451 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 6451 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6452 6452
6453 return retval; 6453 return retval;
6454 6454
6455 out_unlock: 6455 out_unlock:
6456 read_unlock(&tasklist_lock); 6456 read_unlock(&tasklist_lock);
6457 return retval; 6457 return retval;
6458 } 6458 }
6459 6459
6460 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 6460 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6461 { 6461 {
6462 cpumask_var_t cpus_allowed, new_mask; 6462 cpumask_var_t cpus_allowed, new_mask;
6463 struct task_struct *p; 6463 struct task_struct *p;
6464 int retval; 6464 int retval;
6465 6465
6466 get_online_cpus(); 6466 get_online_cpus();
6467 read_lock(&tasklist_lock); 6467 read_lock(&tasklist_lock);
6468 6468
6469 p = find_process_by_pid(pid); 6469 p = find_process_by_pid(pid);
6470 if (!p) { 6470 if (!p) {
6471 read_unlock(&tasklist_lock); 6471 read_unlock(&tasklist_lock);
6472 put_online_cpus(); 6472 put_online_cpus();
6473 return -ESRCH; 6473 return -ESRCH;
6474 } 6474 }
6475 6475
6476 /* 6476 /*
6477 * It is not safe to call set_cpus_allowed with the 6477 * It is not safe to call set_cpus_allowed with the
6478 * tasklist_lock held. We will bump the task_struct's 6478 * tasklist_lock held. We will bump the task_struct's
6479 * usage count and then drop tasklist_lock. 6479 * usage count and then drop tasklist_lock.
6480 */ 6480 */
6481 get_task_struct(p); 6481 get_task_struct(p);
6482 read_unlock(&tasklist_lock); 6482 read_unlock(&tasklist_lock);
6483 6483
6484 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6484 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6485 retval = -ENOMEM; 6485 retval = -ENOMEM;
6486 goto out_put_task; 6486 goto out_put_task;
6487 } 6487 }
6488 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 6488 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6489 retval = -ENOMEM; 6489 retval = -ENOMEM;
6490 goto out_free_cpus_allowed; 6490 goto out_free_cpus_allowed;
6491 } 6491 }
6492 retval = -EPERM; 6492 retval = -EPERM;
6493 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 6493 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
6494 goto out_unlock; 6494 goto out_unlock;
6495 6495
6496 retval = security_task_setscheduler(p, 0, NULL); 6496 retval = security_task_setscheduler(p, 0, NULL);
6497 if (retval) 6497 if (retval)
6498 goto out_unlock; 6498 goto out_unlock;
6499 6499
6500 cpuset_cpus_allowed(p, cpus_allowed); 6500 cpuset_cpus_allowed(p, cpus_allowed);
6501 cpumask_and(new_mask, in_mask, cpus_allowed); 6501 cpumask_and(new_mask, in_mask, cpus_allowed);
6502 again: 6502 again:
6503 retval = set_cpus_allowed_ptr(p, new_mask); 6503 retval = set_cpus_allowed_ptr(p, new_mask);
6504 6504
6505 if (!retval) { 6505 if (!retval) {
6506 cpuset_cpus_allowed(p, cpus_allowed); 6506 cpuset_cpus_allowed(p, cpus_allowed);
6507 if (!cpumask_subset(new_mask, cpus_allowed)) { 6507 if (!cpumask_subset(new_mask, cpus_allowed)) {
6508 /* 6508 /*
6509 * We must have raced with a concurrent cpuset 6509 * We must have raced with a concurrent cpuset
6510 * update. Just reset the cpus_allowed to the 6510 * update. Just reset the cpus_allowed to the
6511 * cpuset's cpus_allowed 6511 * cpuset's cpus_allowed
6512 */ 6512 */
6513 cpumask_copy(new_mask, cpus_allowed); 6513 cpumask_copy(new_mask, cpus_allowed);
6514 goto again; 6514 goto again;
6515 } 6515 }
6516 } 6516 }
6517 out_unlock: 6517 out_unlock:
6518 free_cpumask_var(new_mask); 6518 free_cpumask_var(new_mask);
6519 out_free_cpus_allowed: 6519 out_free_cpus_allowed:
6520 free_cpumask_var(cpus_allowed); 6520 free_cpumask_var(cpus_allowed);
6521 out_put_task: 6521 out_put_task:
6522 put_task_struct(p); 6522 put_task_struct(p);
6523 put_online_cpus(); 6523 put_online_cpus();
6524 return retval; 6524 return retval;
6525 } 6525 }
6526 6526
6527 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 6527 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6528 struct cpumask *new_mask) 6528 struct cpumask *new_mask)
6529 { 6529 {
6530 if (len < cpumask_size()) 6530 if (len < cpumask_size())
6531 cpumask_clear(new_mask); 6531 cpumask_clear(new_mask);
6532 else if (len > cpumask_size()) 6532 else if (len > cpumask_size())
6533 len = cpumask_size(); 6533 len = cpumask_size();
6534 6534
6535 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 6535 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6536 } 6536 }
6537 6537
6538 /** 6538 /**
6539 * sys_sched_setaffinity - set the cpu affinity of a process 6539 * sys_sched_setaffinity - set the cpu affinity of a process
6540 * @pid: pid of the process 6540 * @pid: pid of the process
6541 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6541 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6542 * @user_mask_ptr: user-space pointer to the new cpu mask 6542 * @user_mask_ptr: user-space pointer to the new cpu mask
6543 */ 6543 */
6544 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 6544 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6545 unsigned long __user *, user_mask_ptr) 6545 unsigned long __user *, user_mask_ptr)
6546 { 6546 {
6547 cpumask_var_t new_mask; 6547 cpumask_var_t new_mask;
6548 int retval; 6548 int retval;
6549 6549
6550 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 6550 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6551 return -ENOMEM; 6551 return -ENOMEM;
6552 6552
6553 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 6553 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6554 if (retval == 0) 6554 if (retval == 0)
6555 retval = sched_setaffinity(pid, new_mask); 6555 retval = sched_setaffinity(pid, new_mask);
6556 free_cpumask_var(new_mask); 6556 free_cpumask_var(new_mask);
6557 return retval; 6557 return retval;
6558 } 6558 }
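To make the @len semantics concrete, a user-space sketch of the counterpart call (illustrative glibc wrapper usage, clearly not kernel code):

/* Illustrative user-space example; not part of this commit. */
#define _GNU_SOURCE
#include <sched.h>

int example_pin_to_cpu0(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        /* len is the bitmask size in bytes; pid 0 means the calling thread */
        return sched_setaffinity(0, sizeof(set), &set);
}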
6559 6559
6560 long sched_getaffinity(pid_t pid, struct cpumask *mask) 6560 long sched_getaffinity(pid_t pid, struct cpumask *mask)
6561 { 6561 {
6562 struct task_struct *p; 6562 struct task_struct *p;
6563 int retval; 6563 int retval;
6564 6564
6565 get_online_cpus(); 6565 get_online_cpus();
6566 read_lock(&tasklist_lock); 6566 read_lock(&tasklist_lock);
6567 6567
6568 retval = -ESRCH; 6568 retval = -ESRCH;
6569 p = find_process_by_pid(pid); 6569 p = find_process_by_pid(pid);
6570 if (!p) 6570 if (!p)
6571 goto out_unlock; 6571 goto out_unlock;
6572 6572
6573 retval = security_task_getscheduler(p); 6573 retval = security_task_getscheduler(p);
6574 if (retval) 6574 if (retval)
6575 goto out_unlock; 6575 goto out_unlock;
6576 6576
6577 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6577 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6578 6578
6579 out_unlock: 6579 out_unlock:
6580 read_unlock(&tasklist_lock); 6580 read_unlock(&tasklist_lock);
6581 put_online_cpus(); 6581 put_online_cpus();
6582 6582
6583 return retval; 6583 return retval;
6584 } 6584 }
6585 6585
6586 /** 6586 /**
6587 * sys_sched_getaffinity - get the cpu affinity of a process 6587 * sys_sched_getaffinity - get the cpu affinity of a process
6588 * @pid: pid of the process 6588 * @pid: pid of the process
6589 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6589 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6590 * @user_mask_ptr: user-space pointer to hold the current cpu mask 6590 * @user_mask_ptr: user-space pointer to hold the current cpu mask
6591 */ 6591 */
6592 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 6592 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6593 unsigned long __user *, user_mask_ptr) 6593 unsigned long __user *, user_mask_ptr)
6594 { 6594 {
6595 int ret; 6595 int ret;
6596 cpumask_var_t mask; 6596 cpumask_var_t mask;
6597 6597
6598 if (len < cpumask_size()) 6598 if (len < cpumask_size())
6599 return -EINVAL; 6599 return -EINVAL;
6600 6600
6601 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 6601 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6602 return -ENOMEM; 6602 return -ENOMEM;
6603 6603
6604 ret = sched_getaffinity(pid, mask); 6604 ret = sched_getaffinity(pid, mask);
6605 if (ret == 0) { 6605 if (ret == 0) {
6606 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 6606 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
6607 ret = -EFAULT; 6607 ret = -EFAULT;
6608 else 6608 else
6609 ret = cpumask_size(); 6609 ret = cpumask_size();
6610 } 6610 }
6611 free_cpumask_var(mask); 6611 free_cpumask_var(mask);
6612 6612
6613 return ret; 6613 return ret;
6614 } 6614 }
6615 6615
6616 /** 6616 /**
6617 * sys_sched_yield - yield the current processor to other threads. 6617 * sys_sched_yield - yield the current processor to other threads.
6618 * 6618 *
6619 * This function yields the current CPU to other tasks. If there are no 6619 * This function yields the current CPU to other tasks. If there are no
6620 * other threads running on this CPU then this function will return. 6620 * other threads running on this CPU then this function will return.
6621 */ 6621 */
6622 SYSCALL_DEFINE0(sched_yield) 6622 SYSCALL_DEFINE0(sched_yield)
6623 { 6623 {
6624 struct rq *rq = this_rq_lock(); 6624 struct rq *rq = this_rq_lock();
6625 6625
6626 schedstat_inc(rq, yld_count); 6626 schedstat_inc(rq, yld_count);
6627 current->sched_class->yield_task(rq); 6627 current->sched_class->yield_task(rq);
6628 6628
6629 /* 6629 /*
6630 * Since we are going to call schedule() anyway, there's 6630 * Since we are going to call schedule() anyway, there's
6631 * no need to preempt or enable interrupts: 6631 * no need to preempt or enable interrupts:
6632 */ 6632 */
6633 __release(rq->lock); 6633 __release(rq->lock);
6634 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6634 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6635 _raw_spin_unlock(&rq->lock); 6635 _raw_spin_unlock(&rq->lock);
6636 preempt_enable_no_resched(); 6636 preempt_enable_no_resched();
6637 6637
6638 schedule(); 6638 schedule();
6639 6639
6640 return 0; 6640 return 0;
6641 } 6641 }
6642 6642
6643 static inline int should_resched(void) 6643 static inline int should_resched(void)
6644 { 6644 {
6645 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 6645 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6646 } 6646 }
6647 6647
6648 static void __cond_resched(void) 6648 static void __cond_resched(void)
6649 { 6649 {
6650 add_preempt_count(PREEMPT_ACTIVE); 6650 add_preempt_count(PREEMPT_ACTIVE);
6651 schedule(); 6651 schedule();
6652 sub_preempt_count(PREEMPT_ACTIVE); 6652 sub_preempt_count(PREEMPT_ACTIVE);
6653 } 6653 }
6654 6654
6655 int __sched _cond_resched(void) 6655 int __sched _cond_resched(void)
6656 { 6656 {
6657 if (should_resched()) { 6657 if (should_resched()) {
6658 __cond_resched(); 6658 __cond_resched();
6659 return 1; 6659 return 1;
6660 } 6660 }
6661 return 0; 6661 return 0;
6662 } 6662 }
6663 EXPORT_SYMBOL(_cond_resched); 6663 EXPORT_SYMBOL(_cond_resched);
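For context, a typical call site uses the cond_resched() wrapper around _cond_resched(); a minimal sketch (function and parameter names are hypothetical):

/* Illustrative only; not part of this commit. */
static void example_long_loop(unsigned long nr)
{
        unsigned long i;

        for (i = 0; i < nr; i++) {
                /* ... some non-atomic per-item work ... */
                cond_resched();         /* reschedules only if need_resched() */
        }
}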
6664 6664
6665 /* 6665 /*
6666 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 6666 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6667 * call schedule, and on return reacquire the lock. 6667 * call schedule, and on return reacquire the lock.
6668 * 6668 *
6669 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6669 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6670 * operations here to prevent schedule() from being called twice (once via 6670 * operations here to prevent schedule() from being called twice (once via
6671 * spin_unlock(), once by hand). 6671 * spin_unlock(), once by hand).
6672 */ 6672 */
6673 int __cond_resched_lock(spinlock_t *lock) 6673 int __cond_resched_lock(spinlock_t *lock)
6674 { 6674 {
6675 int resched = should_resched(); 6675 int resched = should_resched();
6676 int ret = 0; 6676 int ret = 0;
6677 6677
6678 lockdep_assert_held(lock); 6678 lockdep_assert_held(lock);
6679 6679
6680 if (spin_needbreak(lock) || resched) { 6680 if (spin_needbreak(lock) || resched) {
6681 spin_unlock(lock); 6681 spin_unlock(lock);
6682 if (resched) 6682 if (resched)
6683 __cond_resched(); 6683 __cond_resched();
6684 else 6684 else
6685 cpu_relax(); 6685 cpu_relax();
6686 ret = 1; 6686 ret = 1;
6687 spin_lock(lock); 6687 spin_lock(lock);
6688 } 6688 }
6689 return ret; 6689 return ret;
6690 } 6690 }
6691 EXPORT_SYMBOL(__cond_resched_lock); 6691 EXPORT_SYMBOL(__cond_resched_lock);
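Similarly, a hedged sketch of a caller of the cond_resched_lock() wrapper (illustrative; names are hypothetical), showing that the lock may be dropped and retaken inside the loop:

/* Illustrative only; not part of this commit. */
static void example_scan_under_lock(spinlock_t *lock, unsigned long nr)
{
        unsigned long i;

        spin_lock(lock);
        for (i = 0; i < nr; i++) {
                /* ... work that needs 'lock' ... */
                if (cond_resched_lock(lock)) {
                        /* lock was released and re-acquired; revalidate state */
                }
        }
        spin_unlock(lock);
}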
6692 6692
6693 int __sched __cond_resched_softirq(void) 6693 int __sched __cond_resched_softirq(void)
6694 { 6694 {
6695 BUG_ON(!in_softirq()); 6695 BUG_ON(!in_softirq());
6696 6696
6697 if (should_resched()) { 6697 if (should_resched()) {
6698 local_bh_enable(); 6698 local_bh_enable();
6699 __cond_resched(); 6699 __cond_resched();
6700 local_bh_disable(); 6700 local_bh_disable();
6701 return 1; 6701 return 1;
6702 } 6702 }
6703 return 0; 6703 return 0;
6704 } 6704 }
6705 EXPORT_SYMBOL(__cond_resched_softirq); 6705 EXPORT_SYMBOL(__cond_resched_softirq);
6706 6706
6707 /** 6707 /**
6708 * yield - yield the current processor to other threads. 6708 * yield - yield the current processor to other threads.
6709 * 6709 *
6710 * This is a shortcut for kernel-space yielding - it marks the 6710 * This is a shortcut for kernel-space yielding - it marks the
6711 * thread runnable and calls sys_sched_yield(). 6711 * thread runnable and calls sys_sched_yield().
6712 */ 6712 */
6713 void __sched yield(void) 6713 void __sched yield(void)
6714 { 6714 {
6715 set_current_state(TASK_RUNNING); 6715 set_current_state(TASK_RUNNING);
6716 sys_sched_yield(); 6716 sys_sched_yield();
6717 } 6717 }
6718 EXPORT_SYMBOL(yield); 6718 EXPORT_SYMBOL(yield);
6719 6719
6720 /* 6720 /*
6721 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6721 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6722 * that process accounting knows that this is a task in IO wait state. 6722 * that process accounting knows that this is a task in IO wait state.
6723 */ 6723 */
6724 void __sched io_schedule(void) 6724 void __sched io_schedule(void)
6725 { 6725 {
6726 struct rq *rq = raw_rq(); 6726 struct rq *rq = raw_rq();
6727 6727
6728 delayacct_blkio_start(); 6728 delayacct_blkio_start();
6729 atomic_inc(&rq->nr_iowait); 6729 atomic_inc(&rq->nr_iowait);
6730 current->in_iowait = 1; 6730 current->in_iowait = 1;
6731 schedule(); 6731 schedule();
6732 current->in_iowait = 0; 6732 current->in_iowait = 0;
6733 atomic_dec(&rq->nr_iowait); 6733 atomic_dec(&rq->nr_iowait);
6734 delayacct_blkio_end(); 6734 delayacct_blkio_end();
6735 } 6735 }
6736 EXPORT_SYMBOL(io_schedule); 6736 EXPORT_SYMBOL(io_schedule);
6737 6737
6738 long __sched io_schedule_timeout(long timeout) 6738 long __sched io_schedule_timeout(long timeout)
6739 { 6739 {
6740 struct rq *rq = raw_rq(); 6740 struct rq *rq = raw_rq();
6741 long ret; 6741 long ret;
6742 6742
6743 delayacct_blkio_start(); 6743 delayacct_blkio_start();
6744 atomic_inc(&rq->nr_iowait); 6744 atomic_inc(&rq->nr_iowait);
6745 current->in_iowait = 1; 6745 current->in_iowait = 1;
6746 ret = schedule_timeout(timeout); 6746 ret = schedule_timeout(timeout);
6747 current->in_iowait = 0; 6747 current->in_iowait = 0;
6748 atomic_dec(&rq->nr_iowait); 6748 atomic_dec(&rq->nr_iowait);
6749 delayacct_blkio_end(); 6749 delayacct_blkio_end();
6750 return ret; 6750 return ret;
6751 } 6751 }
6752 6752
6753 /** 6753 /**
6754 * sys_sched_get_priority_max - return maximum RT priority. 6754 * sys_sched_get_priority_max - return maximum RT priority.
6755 * @policy: scheduling class. 6755 * @policy: scheduling class.
6756 * 6756 *
6757 * this syscall returns the maximum rt_priority that can be used 6757 * this syscall returns the maximum rt_priority that can be used
6758 * by a given scheduling class. 6758 * by a given scheduling class.
6759 */ 6759 */
6760 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 6760 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6761 { 6761 {
6762 int ret = -EINVAL; 6762 int ret = -EINVAL;
6763 6763
6764 switch (policy) { 6764 switch (policy) {
6765 case SCHED_FIFO: 6765 case SCHED_FIFO:
6766 case SCHED_RR: 6766 case SCHED_RR:
6767 ret = MAX_USER_RT_PRIO-1; 6767 ret = MAX_USER_RT_PRIO-1;
6768 break; 6768 break;
6769 case SCHED_NORMAL: 6769 case SCHED_NORMAL:
6770 case SCHED_BATCH: 6770 case SCHED_BATCH:
6771 case SCHED_IDLE: 6771 case SCHED_IDLE:
6772 ret = 0; 6772 ret = 0;
6773 break; 6773 break;
6774 } 6774 }
6775 return ret; 6775 return ret;
6776 } 6776 }
6777 6777
6778 /** 6778 /**
6779 * sys_sched_get_priority_min - return minimum RT priority. 6779 * sys_sched_get_priority_min - return minimum RT priority.
6780 * @policy: scheduling class. 6780 * @policy: scheduling class.
6781 * 6781 *
6782 * this syscall returns the minimum rt_priority that can be used 6782 * this syscall returns the minimum rt_priority that can be used
6783 * by a given scheduling class. 6783 * by a given scheduling class.
6784 */ 6784 */
6785 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 6785 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6786 { 6786 {
6787 int ret = -EINVAL; 6787 int ret = -EINVAL;
6788 6788
6789 switch (policy) { 6789 switch (policy) {
6790 case SCHED_FIFO: 6790 case SCHED_FIFO:
6791 case SCHED_RR: 6791 case SCHED_RR:
6792 ret = 1; 6792 ret = 1;
6793 break; 6793 break;
6794 case SCHED_NORMAL: 6794 case SCHED_NORMAL:
6795 case SCHED_BATCH: 6795 case SCHED_BATCH:
6796 case SCHED_IDLE: 6796 case SCHED_IDLE:
6797 ret = 0; 6797 ret = 0;
6798 } 6798 }
6799 return ret; 6799 return ret;
6800 } 6800 }
6801 6801
6802 /** 6802 /**
6803 * sys_sched_rr_get_interval - return the default timeslice of a process. 6803 * sys_sched_rr_get_interval - return the default timeslice of a process.
6804 * @pid: pid of the process. 6804 * @pid: pid of the process.
6805 * @interval: userspace pointer to the timeslice value. 6805 * @interval: userspace pointer to the timeslice value.
6806 * 6806 *
6807 * this syscall writes the default timeslice value of a given process 6807 * this syscall writes the default timeslice value of a given process
6808 * into the user-space timespec buffer. A value of '0' means infinity. 6808 * into the user-space timespec buffer. A value of '0' means infinity.
6809 */ 6809 */
6810 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 6810 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6811 struct timespec __user *, interval) 6811 struct timespec __user *, interval)
6812 { 6812 {
6813 struct task_struct *p; 6813 struct task_struct *p;
6814 unsigned int time_slice; 6814 unsigned int time_slice;
6815 int retval; 6815 int retval;
6816 struct timespec t; 6816 struct timespec t;
6817 6817
6818 if (pid < 0) 6818 if (pid < 0)
6819 return -EINVAL; 6819 return -EINVAL;
6820 6820
6821 retval = -ESRCH; 6821 retval = -ESRCH;
6822 read_lock(&tasklist_lock); 6822 read_lock(&tasklist_lock);
6823 p = find_process_by_pid(pid); 6823 p = find_process_by_pid(pid);
6824 if (!p) 6824 if (!p)
6825 goto out_unlock; 6825 goto out_unlock;
6826 6826
6827 retval = security_task_getscheduler(p); 6827 retval = security_task_getscheduler(p);
6828 if (retval) 6828 if (retval)
6829 goto out_unlock; 6829 goto out_unlock;
6830 6830
6831 time_slice = p->sched_class->get_rr_interval(p); 6831 time_slice = p->sched_class->get_rr_interval(p);
6832 6832
6833 read_unlock(&tasklist_lock); 6833 read_unlock(&tasklist_lock);
6834 jiffies_to_timespec(time_slice, &t); 6834 jiffies_to_timespec(time_slice, &t);
6835 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6835 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6836 return retval; 6836 return retval;
6837 6837
6838 out_unlock: 6838 out_unlock:
6839 read_unlock(&tasklist_lock); 6839 read_unlock(&tasklist_lock);
6840 return retval; 6840 return retval;
6841 } 6841 }
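A user-space sketch of the counterpart call (illustrative, not kernel code) showing how the timespec is consumed; a 0/0 result means "infinite" per the kernel-doc above:

/* Illustrative user-space example; not part of this commit. */
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <time.h>

int example_print_timeslice(pid_t pid)
{
        struct timespec ts;

        if (sched_rr_get_interval(pid, &ts) != 0)
                return -1;
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}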
6842 6842
6843 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 6843 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
6844 6844
6845 void sched_show_task(struct task_struct *p) 6845 void sched_show_task(struct task_struct *p)
6846 { 6846 {
6847 unsigned long free = 0; 6847 unsigned long free = 0;
6848 unsigned state; 6848 unsigned state;
6849 6849
6850 state = p->state ? __ffs(p->state) + 1 : 0; 6850 state = p->state ? __ffs(p->state) + 1 : 0;
6851 printk(KERN_INFO "%-13.13s %c", p->comm, 6851 printk(KERN_INFO "%-13.13s %c", p->comm,
6852 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 6852 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6853 #if BITS_PER_LONG == 32 6853 #if BITS_PER_LONG == 32
6854 if (state == TASK_RUNNING) 6854 if (state == TASK_RUNNING)
6855 printk(KERN_CONT " running "); 6855 printk(KERN_CONT " running ");
6856 else 6856 else
6857 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 6857 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
6858 #else 6858 #else
6859 if (state == TASK_RUNNING) 6859 if (state == TASK_RUNNING)
6860 printk(KERN_CONT " running task "); 6860 printk(KERN_CONT " running task ");
6861 else 6861 else
6862 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6862 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6863 #endif 6863 #endif
6864 #ifdef CONFIG_DEBUG_STACK_USAGE 6864 #ifdef CONFIG_DEBUG_STACK_USAGE
6865 free = stack_not_used(p); 6865 free = stack_not_used(p);
6866 #endif 6866 #endif
6867 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 6867 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6868 task_pid_nr(p), task_pid_nr(p->real_parent), 6868 task_pid_nr(p), task_pid_nr(p->real_parent),
6869 (unsigned long)task_thread_info(p)->flags); 6869 (unsigned long)task_thread_info(p)->flags);
6870 6870
6871 show_stack(p, NULL); 6871 show_stack(p, NULL);
6872 } 6872 }
6873 6873
6874 void show_state_filter(unsigned long state_filter) 6874 void show_state_filter(unsigned long state_filter)
6875 { 6875 {
6876 struct task_struct *g, *p; 6876 struct task_struct *g, *p;
6877 6877
6878 #if BITS_PER_LONG == 32 6878 #if BITS_PER_LONG == 32
6879 printk(KERN_INFO 6879 printk(KERN_INFO
6880 " task PC stack pid father\n"); 6880 " task PC stack pid father\n");
6881 #else 6881 #else
6882 printk(KERN_INFO 6882 printk(KERN_INFO
6883 " task PC stack pid father\n"); 6883 " task PC stack pid father\n");
6884 #endif 6884 #endif
6885 read_lock(&tasklist_lock); 6885 read_lock(&tasklist_lock);
6886 do_each_thread(g, p) { 6886 do_each_thread(g, p) {
6887 /* 6887 /*
6888 * reset the NMI-timeout, listing all files on a slow 6888 * reset the NMI-timeout, listing all files on a slow
6889 * console might take a lot of time: 6889 * console might take a lot of time:
6890 */ 6890 */
6891 touch_nmi_watchdog(); 6891 touch_nmi_watchdog();
6892 if (!state_filter || (p->state & state_filter)) 6892 if (!state_filter || (p->state & state_filter))
6893 sched_show_task(p); 6893 sched_show_task(p);
6894 } while_each_thread(g, p); 6894 } while_each_thread(g, p);
6895 6895
6896 touch_all_softlockup_watchdogs(); 6896 touch_all_softlockup_watchdogs();
6897 6897
6898 #ifdef CONFIG_SCHED_DEBUG 6898 #ifdef CONFIG_SCHED_DEBUG
6899 sysrq_sched_debug_show(); 6899 sysrq_sched_debug_show();
6900 #endif 6900 #endif
6901 read_unlock(&tasklist_lock); 6901 read_unlock(&tasklist_lock);
6902 /* 6902 /*
6903 * Only show locks if all tasks are dumped: 6903 * Only show locks if all tasks are dumped:
6904 */ 6904 */
6905 if (state_filter == -1) 6905 if (state_filter == -1)
6906 debug_show_all_locks(); 6906 debug_show_all_locks();
6907 } 6907 }
6908 6908
6909 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 6909 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6910 { 6910 {
6911 idle->sched_class = &idle_sched_class; 6911 idle->sched_class = &idle_sched_class;
6912 } 6912 }
6913 6913
6914 /** 6914 /**
6915 * init_idle - set up an idle thread for a given CPU 6915 * init_idle - set up an idle thread for a given CPU
6916 * @idle: task in question 6916 * @idle: task in question
6917 * @cpu: cpu the idle task belongs to 6917 * @cpu: cpu the idle task belongs to
6918 * 6918 *
6919 * NOTE: this function does not set the idle thread's NEED_RESCHED 6919 * NOTE: this function does not set the idle thread's NEED_RESCHED
6920 * flag, to make booting more robust. 6920 * flag, to make booting more robust.
6921 */ 6921 */
6922 void __cpuinit init_idle(struct task_struct *idle, int cpu) 6922 void __cpuinit init_idle(struct task_struct *idle, int cpu)
6923 { 6923 {
6924 struct rq *rq = cpu_rq(cpu); 6924 struct rq *rq = cpu_rq(cpu);
6925 unsigned long flags; 6925 unsigned long flags;
6926 6926
6927 spin_lock_irqsave(&rq->lock, flags); 6927 spin_lock_irqsave(&rq->lock, flags);
6928 6928
6929 __sched_fork(idle); 6929 __sched_fork(idle);
6930 idle->se.exec_start = sched_clock(); 6930 idle->se.exec_start = sched_clock();
6931 6931
6932 idle->prio = idle->normal_prio = MAX_PRIO; 6932 idle->prio = idle->normal_prio = MAX_PRIO;
6933 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6933 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6934 __set_task_cpu(idle, cpu); 6934 __set_task_cpu(idle, cpu);
6935 6935
6936 rq->curr = rq->idle = idle; 6936 rq->curr = rq->idle = idle;
6937 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6937 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6938 idle->oncpu = 1; 6938 idle->oncpu = 1;
6939 #endif 6939 #endif
6940 spin_unlock_irqrestore(&rq->lock, flags); 6940 spin_unlock_irqrestore(&rq->lock, flags);
6941 6941
6942 /* Set the preempt count _outside_ the spinlocks! */ 6942 /* Set the preempt count _outside_ the spinlocks! */
6943 #if defined(CONFIG_PREEMPT) 6943 #if defined(CONFIG_PREEMPT)
6944 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 6944 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
6945 #else 6945 #else
6946 task_thread_info(idle)->preempt_count = 0; 6946 task_thread_info(idle)->preempt_count = 0;
6947 #endif 6947 #endif
6948 /* 6948 /*
6949 * The idle tasks have their own, simple scheduling class: 6949 * The idle tasks have their own, simple scheduling class:
6950 */ 6950 */
6951 idle->sched_class = &idle_sched_class; 6951 idle->sched_class = &idle_sched_class;
6952 ftrace_graph_init_task(idle); 6952 ftrace_graph_init_task(idle);
6953 } 6953 }
6954 6954
6955 /* 6955 /*
6956 * In a system that switches off the HZ timer nohz_cpu_mask 6956 * In a system that switches off the HZ timer nohz_cpu_mask
6957 * indicates which cpus entered this state. This is used 6957 * indicates which cpus entered this state. This is used
6958 * in the rcu update to wait only for active cpus. For systems 6958 * in the rcu update to wait only for active cpus. For systems
6959 * which do not switch off the HZ timer nohz_cpu_mask should 6959 * which do not switch off the HZ timer nohz_cpu_mask should
6960 * always be CPU_BITS_NONE. 6960 * always be CPU_BITS_NONE.
6961 */ 6961 */
6962 cpumask_var_t nohz_cpu_mask; 6962 cpumask_var_t nohz_cpu_mask;
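
/*
 * A minimal sketch of the cpumask_var_t pattern used for masks like the one
 * declared above (kernel-internal context assumed; the mask name and call
 * sites are illustrative, not part of this diff).  The same helpers appear
 * throughout the hunks below.
 */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static cpumask_var_t demo_mask;         /* hypothetical mask, for illustration */

static int demo_mask_init(void)
{
        if (!alloc_cpumask_var(&demo_mask, GFP_KERNEL))
                return -ENOMEM;         /* may allocate with CONFIG_CPUMASK_OFFSTACK */
        cpumask_clear(demo_mask);
        cpumask_set_cpu(0, demo_mask);  /* mark CPU0 as having entered the state */
        if (cpumask_test_cpu(0, demo_mask))
                ;                       /* a reader (e.g. RCU) could skip waiting on it */
        return 0;
}

static void demo_mask_exit(void)
{
        free_cpumask_var(demo_mask);
}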
6963 6963
6964 /* 6964 /*
6965 * Increase the granularity value when there are more CPUs, 6965 * Increase the granularity value when there are more CPUs,
6966 * because with more CPUs the 'effective latency' as visible 6966 * because with more CPUs the 'effective latency' as visible
6967 * to users decreases. But the relationship is not linear, 6967 * to users decreases. But the relationship is not linear,
6968 * so pick a second-best guess by going with the log2 of the 6968 * so pick a second-best guess by going with the log2 of the
6969 * number of CPUs. 6969 * number of CPUs.
6970 * 6970 *
6971 * This idea comes from the SD scheduler of Con Kolivas: 6971 * This idea comes from the SD scheduler of Con Kolivas:
6972 */ 6972 */
6973 static inline void sched_init_granularity(void) 6973 static inline void sched_init_granularity(void)
6974 { 6974 {
6975 unsigned int factor = 1 + ilog2(num_online_cpus()); 6975 unsigned int factor = 1 + ilog2(num_online_cpus());
6976 const unsigned long limit = 200000000; 6976 const unsigned long limit = 200000000;
6977 6977
6978 sysctl_sched_min_granularity *= factor; 6978 sysctl_sched_min_granularity *= factor;
6979 if (sysctl_sched_min_granularity > limit) 6979 if (sysctl_sched_min_granularity > limit)
6980 sysctl_sched_min_granularity = limit; 6980 sysctl_sched_min_granularity = limit;
6981 6981
6982 sysctl_sched_latency *= factor; 6982 sysctl_sched_latency *= factor;
6983 if (sysctl_sched_latency > limit) 6983 if (sysctl_sched_latency > limit)
6984 sysctl_sched_latency = limit; 6984 sysctl_sched_latency = limit;
6985 6985
6986 sysctl_sched_wakeup_granularity *= factor; 6986 sysctl_sched_wakeup_granularity *= factor;
6987 6987
6988 sysctl_sched_shares_ratelimit *= factor; 6988 sysctl_sched_shares_ratelimit *= factor;
6989 } 6989 }
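
/*
 * Worked numbers for the scaling above: factor = 1 + ilog2(ncpus), each
 * tunable is multiplied by factor and clamped at 200 ms (200000000 ns).
 * The 20 ms base value below is an illustrative assumption, not read from
 * this file, and ilog2() is open-coded so the sketch compiles standalone.
 */
#include <stdio.h>

static unsigned int ilog2_demo(unsigned int n)  /* floor(log2(n)), n >= 1 */
{
        unsigned int r = 0;

        while (n >>= 1)
                r++;
        return r;
}

int main(void)
{
        const unsigned long limit = 200000000UL;  /* 200 ms cap, as above */
        unsigned long base = 20000000UL;          /* assumed 20 ms default */
        unsigned int ncpus;

        for (ncpus = 1; ncpus <= 4096; ncpus *= 4) {
                unsigned int factor = 1 + ilog2_demo(ncpus);
                unsigned long scaled = base * factor;

                if (scaled > limit)
                        scaled = limit;
                printf("%4u CPUs -> factor %2u -> %9lu ns\n", ncpus, factor, scaled);
        }
        return 0;
}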
6990 6990
6991 #ifdef CONFIG_SMP 6991 #ifdef CONFIG_SMP
6992 /* 6992 /*
6993 * This is how migration works: 6993 * This is how migration works:
6994 * 6994 *
6995 * 1) we queue a struct migration_req structure in the source CPU's 6995 * 1) we queue a struct migration_req structure in the source CPU's
6996 * runqueue and wake up that CPU's migration thread. 6996 * runqueue and wake up that CPU's migration thread.
6997 * 2) we down() the locked semaphore => thread blocks. 6997 * 2) we down() the locked semaphore => thread blocks.
6998 * 3) migration thread wakes up (implicitly it forces the migrated 6998 * 3) migration thread wakes up (implicitly it forces the migrated
6999 * thread off the CPU) 6999 * thread off the CPU)
7000 * 4) it gets the migration request and checks whether the migrated 7000 * 4) it gets the migration request and checks whether the migrated
7001 * task is still in the wrong runqueue. 7001 * task is still in the wrong runqueue.
7002 * 5) if it's in the wrong runqueue then the migration thread removes 7002 * 5) if it's in the wrong runqueue then the migration thread removes
7003 * it and puts it into the right queue. 7003 * it and puts it into the right queue.
7004 * 6) migration thread up()s the semaphore. 7004 * 6) migration thread up()s the semaphore.
7005 * 7) we wake up and the migration is done. 7005 * 7) we wake up and the migration is done.
7006 */ 7006 */
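
/*
 * A minimal sketch of the request/completion handshake the steps above
 * describe, using the generic kernel completion API.  'struct demo_req'
 * and the helper names are illustrative, not the scheduler's own types;
 * the real caller below also juggles the runqueue lock.
 */
#include <linux/completion.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct demo_req {
        struct list_head list;
        struct completion done;
};

/* caller side: queue the request, kick the worker thread, block until done */
static void demo_submit(struct list_head *queue, struct task_struct *worker,
                        spinlock_t *lock)
{
        struct demo_req req;

        init_completion(&req.done);
        spin_lock_irq(lock);
        list_add_tail(&req.list, queue);        /* step 1: queue the request   */
        spin_unlock_irq(lock);
        wake_up_process(worker);                /* step 1: wake the worker     */
        wait_for_completion(&req.done);         /* steps 2 and 7: block, return */
}

/* worker side would list_del() the request, act on it, then complete(&req->done) */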
7007 7007
7008 /* 7008 /*
7009 * Change a given task's CPU affinity. Migrate the thread to a 7009 * Change a given task's CPU affinity. Migrate the thread to a
7010 * proper CPU and schedule it away if the CPU it's executing on 7010 * proper CPU and schedule it away if the CPU it's executing on
7011 * is removed from the allowed bitmask. 7011 * is removed from the allowed bitmask.
7012 * 7012 *
7013 * NOTE: the caller must have a valid reference to the task, the 7013 * NOTE: the caller must have a valid reference to the task, the
7014 * task must not exit() & deallocate itself prematurely. The 7014 * task must not exit() & deallocate itself prematurely. The
7015 * call is not atomic; no spinlocks may be held. 7015 * call is not atomic; no spinlocks may be held.
7016 */ 7016 */
7017 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 7017 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7018 { 7018 {
7019 struct migration_req req; 7019 struct migration_req req;
7020 unsigned long flags; 7020 unsigned long flags;
7021 struct rq *rq; 7021 struct rq *rq;
7022 int ret = 0; 7022 int ret = 0;
7023 7023
7024 rq = task_rq_lock(p, &flags); 7024 rq = task_rq_lock(p, &flags);
7025 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7025 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
7026 ret = -EINVAL; 7026 ret = -EINVAL;
7027 goto out; 7027 goto out;
7028 } 7028 }
7029 7029
7030 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 7030 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
7031 !cpumask_equal(&p->cpus_allowed, new_mask))) { 7031 !cpumask_equal(&p->cpus_allowed, new_mask))) {
7032 ret = -EINVAL; 7032 ret = -EINVAL;
7033 goto out; 7033 goto out;
7034 } 7034 }
7035 7035
7036 if (p->sched_class->set_cpus_allowed) 7036 if (p->sched_class->set_cpus_allowed)
7037 p->sched_class->set_cpus_allowed(p, new_mask); 7037 p->sched_class->set_cpus_allowed(p, new_mask);
7038 else { 7038 else {
7039 cpumask_copy(&p->cpus_allowed, new_mask); 7039 cpumask_copy(&p->cpus_allowed, new_mask);
7040 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 7040 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
7041 } 7041 }
7042 7042
7043 /* Can the task run on the task's current CPU? If so, we're done */ 7043 /* Can the task run on the task's current CPU? If so, we're done */
7044 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7044 if (cpumask_test_cpu(task_cpu(p), new_mask))
7045 goto out; 7045 goto out;
7046 7046
7047 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7047 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
7048 /* Need help from migration thread: drop lock and wait. */ 7048 /* Need help from migration thread: drop lock and wait. */
7049 struct task_struct *mt = rq->migration_thread; 7049 struct task_struct *mt = rq->migration_thread;
7050 7050
7051 get_task_struct(mt); 7051 get_task_struct(mt);
7052 task_rq_unlock(rq, &flags); 7052 task_rq_unlock(rq, &flags);
7053 wake_up_process(rq->migration_thread); 7053 wake_up_process(rq->migration_thread);
7054 put_task_struct(mt); 7054 put_task_struct(mt);
7055 wait_for_completion(&req.done); 7055 wait_for_completion(&req.done);
7056 tlb_migrate_finish(p->mm); 7056 tlb_migrate_finish(p->mm);
7057 return 0; 7057 return 0;
7058 } 7058 }
7059 out: 7059 out:
7060 task_rq_unlock(rq, &flags); 7060 task_rq_unlock(rq, &flags);
7061 7061
7062 return ret; 7062 return ret;
7063 } 7063 }
7064 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 7064 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
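
/*
 * A sketch of how a caller might use the export above to pin a task to a
 * single CPU.  The task pointer and target CPU are placeholders, and error
 * handling is reduced to the return code (-EINVAL if 'cpu' is offline, per
 * the checks in the function above).
 */
#include <linux/cpumask.h>
#include <linux/sched.h>

static int demo_pin_task(struct task_struct *p, int cpu)
{
        return set_cpus_allowed_ptr(p, cpumask_of(cpu));
}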
7065 7065
7066 /* 7066 /*
7067 * Move (not current) task off this cpu, onto dest cpu. We're doing 7067 * Move (not current) task off this cpu, onto dest cpu. We're doing
7068 * this because either it can't run here any more (set_cpus_allowed() 7068 * this because either it can't run here any more (set_cpus_allowed()
7069 * away from this CPU, or CPU going down), or because we're 7069 * away from this CPU, or CPU going down), or because we're
7070 * attempting to rebalance this task on exec (sched_exec). 7070 * attempting to rebalance this task on exec (sched_exec).
7071 * 7071 *
7072 * So we race with normal scheduler movements, but that's OK, as long 7072 * So we race with normal scheduler movements, but that's OK, as long
7073 * as the task is no longer on this CPU. 7073 * as the task is no longer on this CPU.
7074 * 7074 *
7075 * Returns non-zero if task was successfully migrated. 7075 * Returns non-zero if task was successfully migrated.
7076 */ 7076 */
7077 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7077 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7078 { 7078 {
7079 struct rq *rq_dest, *rq_src; 7079 struct rq *rq_dest, *rq_src;
7080 int ret = 0, on_rq; 7080 int ret = 0, on_rq;
7081 7081
7082 if (unlikely(!cpu_active(dest_cpu))) 7082 if (unlikely(!cpu_active(dest_cpu)))
7083 return ret; 7083 return ret;
7084 7084
7085 rq_src = cpu_rq(src_cpu); 7085 rq_src = cpu_rq(src_cpu);
7086 rq_dest = cpu_rq(dest_cpu); 7086 rq_dest = cpu_rq(dest_cpu);
7087 7087
7088 double_rq_lock(rq_src, rq_dest); 7088 double_rq_lock(rq_src, rq_dest);
7089 /* Already moved. */ 7089 /* Already moved. */
7090 if (task_cpu(p) != src_cpu) 7090 if (task_cpu(p) != src_cpu)
7091 goto done; 7091 goto done;
7092 /* Affinity changed (again). */ 7092 /* Affinity changed (again). */
7093 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7093 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7094 goto fail; 7094 goto fail;
7095 7095
7096 on_rq = p->se.on_rq; 7096 on_rq = p->se.on_rq;
7097 if (on_rq) 7097 if (on_rq)
7098 deactivate_task(rq_src, p, 0); 7098 deactivate_task(rq_src, p, 0);
7099 7099
7100 set_task_cpu(p, dest_cpu); 7100 set_task_cpu(p, dest_cpu);
7101 if (on_rq) { 7101 if (on_rq) {
7102 activate_task(rq_dest, p, 0); 7102 activate_task(rq_dest, p, 0);
7103 check_preempt_curr(rq_dest, p, 0); 7103 check_preempt_curr(rq_dest, p, 0);
7104 } 7104 }
7105 done: 7105 done:
7106 ret = 1; 7106 ret = 1;
7107 fail: 7107 fail:
7108 double_rq_unlock(rq_src, rq_dest); 7108 double_rq_unlock(rq_src, rq_dest);
7109 return ret; 7109 return ret;
7110 } 7110 }
7111 7111
7112 #define RCU_MIGRATION_IDLE 0 7112 #define RCU_MIGRATION_IDLE 0
7113 #define RCU_MIGRATION_NEED_QS 1 7113 #define RCU_MIGRATION_NEED_QS 1
7114 #define RCU_MIGRATION_GOT_QS 2 7114 #define RCU_MIGRATION_GOT_QS 2
7115 #define RCU_MIGRATION_MUST_SYNC 3 7115 #define RCU_MIGRATION_MUST_SYNC 3
7116 7116
7117 /* 7117 /*
7118 * migration_thread - this is a highprio system thread that performs 7118 * migration_thread - this is a highprio system thread that performs
7119 * thread migration by bumping thread off CPU then 'pushing' onto 7119 * thread migration by bumping thread off CPU then 'pushing' onto
7120 * another runqueue. 7120 * another runqueue.
7121 */ 7121 */
7122 static int migration_thread(void *data) 7122 static int migration_thread(void *data)
7123 { 7123 {
7124 int badcpu; 7124 int badcpu;
7125 int cpu = (long)data; 7125 int cpu = (long)data;
7126 struct rq *rq; 7126 struct rq *rq;
7127 7127
7128 rq = cpu_rq(cpu); 7128 rq = cpu_rq(cpu);
7129 BUG_ON(rq->migration_thread != current); 7129 BUG_ON(rq->migration_thread != current);
7130 7130
7131 set_current_state(TASK_INTERRUPTIBLE); 7131 set_current_state(TASK_INTERRUPTIBLE);
7132 while (!kthread_should_stop()) { 7132 while (!kthread_should_stop()) {
7133 struct migration_req *req; 7133 struct migration_req *req;
7134 struct list_head *head; 7134 struct list_head *head;
7135 7135
7136 spin_lock_irq(&rq->lock); 7136 spin_lock_irq(&rq->lock);
7137 7137
7138 if (cpu_is_offline(cpu)) { 7138 if (cpu_is_offline(cpu)) {
7139 spin_unlock_irq(&rq->lock); 7139 spin_unlock_irq(&rq->lock);
7140 break; 7140 break;
7141 } 7141 }
7142 7142
7143 if (rq->active_balance) { 7143 if (rq->active_balance) {
7144 active_load_balance(rq, cpu); 7144 active_load_balance(rq, cpu);
7145 rq->active_balance = 0; 7145 rq->active_balance = 0;
7146 } 7146 }
7147 7147
7148 head = &rq->migration_queue; 7148 head = &rq->migration_queue;
7149 7149
7150 if (list_empty(head)) { 7150 if (list_empty(head)) {
7151 spin_unlock_irq(&rq->lock); 7151 spin_unlock_irq(&rq->lock);
7152 schedule(); 7152 schedule();
7153 set_current_state(TASK_INTERRUPTIBLE); 7153 set_current_state(TASK_INTERRUPTIBLE);
7154 continue; 7154 continue;
7155 } 7155 }
7156 req = list_entry(head->next, struct migration_req, list); 7156 req = list_entry(head->next, struct migration_req, list);
7157 list_del_init(head->next); 7157 list_del_init(head->next);
7158 7158
7159 if (req->task != NULL) { 7159 if (req->task != NULL) {
7160 spin_unlock(&rq->lock); 7160 spin_unlock(&rq->lock);
7161 __migrate_task(req->task, cpu, req->dest_cpu); 7161 __migrate_task(req->task, cpu, req->dest_cpu);
7162 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7162 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7163 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7163 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7164 spin_unlock(&rq->lock); 7164 spin_unlock(&rq->lock);
7165 } else { 7165 } else {
7166 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7166 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7167 spin_unlock(&rq->lock); 7167 spin_unlock(&rq->lock);
7168 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7168 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7169 } 7169 }
7170 local_irq_enable(); 7170 local_irq_enable();
7171 7171
7172 complete(&req->done); 7172 complete(&req->done);
7173 } 7173 }
7174 __set_current_state(TASK_RUNNING); 7174 __set_current_state(TASK_RUNNING);
7175 7175
7176 return 0; 7176 return 0;
7177 } 7177 }
7178 7178
7179 #ifdef CONFIG_HOTPLUG_CPU 7179 #ifdef CONFIG_HOTPLUG_CPU
7180 7180
7181 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) 7181 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7182 { 7182 {
7183 int ret; 7183 int ret;
7184 7184
7185 local_irq_disable(); 7185 local_irq_disable();
7186 ret = __migrate_task(p, src_cpu, dest_cpu); 7186 ret = __migrate_task(p, src_cpu, dest_cpu);
7187 local_irq_enable(); 7187 local_irq_enable();
7188 return ret; 7188 return ret;
7189 } 7189 }
7190 7190
7191 /* 7191 /*
7192 * Figure out where task on dead CPU should go, use force if necessary. 7192 * Figure out where task on dead CPU should go, use force if necessary.
7193 */ 7193 */
7194 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7194 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7195 { 7195 {
7196 int dest_cpu; 7196 int dest_cpu;
7197 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); 7197 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7198 7198
7199 again: 7199 again:
7200 /* Look for allowed, online CPU in same node. */ 7200 /* Look for allowed, online CPU in same node. */
7201 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7201 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7202 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7202 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7203 goto move; 7203 goto move;
7204 7204
7205 /* Any allowed, online CPU? */ 7205 /* Any allowed, online CPU? */
7206 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7206 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7207 if (dest_cpu < nr_cpu_ids) 7207 if (dest_cpu < nr_cpu_ids)
7208 goto move; 7208 goto move;
7209 7209
7210 /* No more Mr. Nice Guy. */ 7210 /* No more Mr. Nice Guy. */
7211 if (dest_cpu >= nr_cpu_ids) { 7211 if (dest_cpu >= nr_cpu_ids) {
7212 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7212 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7213 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7213 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7214 7214
7215 /* 7215 /*
7216 * Don't tell them about moving exiting tasks or 7216 * Don't tell them about moving exiting tasks or
7217 * kernel threads (both mm NULL), since they never 7217 * kernel threads (both mm NULL), since they never
7218 * leave kernel. 7218 * leave kernel.
7219 */ 7219 */
7220 if (p->mm && printk_ratelimit()) { 7220 if (p->mm && printk_ratelimit()) {
7221 printk(KERN_INFO "process %d (%s) no " 7221 printk(KERN_INFO "process %d (%s) no "
7222 "longer affine to cpu%d\n", 7222 "longer affine to cpu%d\n",
7223 task_pid_nr(p), p->comm, dead_cpu); 7223 task_pid_nr(p), p->comm, dead_cpu);
7224 } 7224 }
7225 } 7225 }
7226 7226
7227 move: 7227 move:
7228 /* It can have affinity changed while we were choosing. */ 7228 /* It can have affinity changed while we were choosing. */
7229 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7229 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7230 goto again; 7230 goto again;
7231 } 7231 }
7232 7232
7233 /* 7233 /*
7234 * While a dead CPU has no uninterruptible tasks queued at this point, 7234 * While a dead CPU has no uninterruptible tasks queued at this point,
7235 * it might still have a nonzero ->nr_uninterruptible counter, because 7235 * it might still have a nonzero ->nr_uninterruptible counter, because
7236 * for performance reasons the counter is not strictly tracking tasks to 7236 * for performance reasons the counter is not strictly tracking tasks to
7237 * their home CPUs. So we just add the counter to another CPU's counter, 7237 * their home CPUs. So we just add the counter to another CPU's counter,
7238 * to keep the global sum constant after CPU-down: 7238 * to keep the global sum constant after CPU-down:
7239 */ 7239 */
7240 static void migrate_nr_uninterruptible(struct rq *rq_src) 7240 static void migrate_nr_uninterruptible(struct rq *rq_src)
7241 { 7241 {
7242 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7242 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
7243 unsigned long flags; 7243 unsigned long flags;
7244 7244
7245 local_irq_save(flags); 7245 local_irq_save(flags);
7246 double_rq_lock(rq_src, rq_dest); 7246 double_rq_lock(rq_src, rq_dest);
7247 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 7247 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
7248 rq_src->nr_uninterruptible = 0; 7248 rq_src->nr_uninterruptible = 0;
7249 double_rq_unlock(rq_src, rq_dest); 7249 double_rq_unlock(rq_src, rq_dest);
7250 local_irq_restore(flags); 7250 local_irq_restore(flags);
7251 } 7251 }
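
/*
 * Worked numbers for the comment above: if the dead CPU's runqueue had
 * nr_uninterruptible == 3 and the chosen online CPU had 5, the transfer
 * leaves 0 and 8, so the sum across runqueues (what the load-average code
 * consumes) is still 8.  The values are illustrative only.
 */
#include <stdio.h>

int main(void)
{
        long src = 3, dest = 5;         /* assumed per-runqueue counters */
        long before = src + dest;

        dest += src;                    /* rq_dest->nr_uninterruptible += ... */
        src = 0;                        /* rq_src->nr_uninterruptible = 0     */
        printf("sum before=%ld after=%ld\n", before, src + dest);
        return 0;
}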
7252 7252
7253 /* Run through task list and migrate tasks from the dead cpu. */ 7253 /* Run through task list and migrate tasks from the dead cpu. */
7254 static void migrate_live_tasks(int src_cpu) 7254 static void migrate_live_tasks(int src_cpu)
7255 { 7255 {
7256 struct task_struct *p, *t; 7256 struct task_struct *p, *t;
7257 7257
7258 read_lock(&tasklist_lock); 7258 read_lock(&tasklist_lock);
7259 7259
7260 do_each_thread(t, p) { 7260 do_each_thread(t, p) {
7261 if (p == current) 7261 if (p == current)
7262 continue; 7262 continue;
7263 7263
7264 if (task_cpu(p) == src_cpu) 7264 if (task_cpu(p) == src_cpu)
7265 move_task_off_dead_cpu(src_cpu, p); 7265 move_task_off_dead_cpu(src_cpu, p);
7266 } while_each_thread(t, p); 7266 } while_each_thread(t, p);
7267 7267
7268 read_unlock(&tasklist_lock); 7268 read_unlock(&tasklist_lock);
7269 } 7269 }
7270 7270
7271 /* 7271 /*
7272 * Schedules idle task to be the next runnable task on current CPU. 7272 * Schedules idle task to be the next runnable task on current CPU.
7273 * It does so by boosting its priority to highest possible. 7273 * It does so by boosting its priority to highest possible.
7274 * Used by CPU offline code. 7274 * Used by CPU offline code.
7275 */ 7275 */
7276 void sched_idle_next(void) 7276 void sched_idle_next(void)
7277 { 7277 {
7278 int this_cpu = smp_processor_id(); 7278 int this_cpu = smp_processor_id();
7279 struct rq *rq = cpu_rq(this_cpu); 7279 struct rq *rq = cpu_rq(this_cpu);
7280 struct task_struct *p = rq->idle; 7280 struct task_struct *p = rq->idle;
7281 unsigned long flags; 7281 unsigned long flags;
7282 7282
7283 /* cpu has to be offline */ 7283 /* cpu has to be offline */
7284 BUG_ON(cpu_online(this_cpu)); 7284 BUG_ON(cpu_online(this_cpu));
7285 7285
7286 /* 7286 /*
7287 * Strictly not necessary since rest of the CPUs are stopped by now 7287 * Strictly not necessary since rest of the CPUs are stopped by now
7288 * and interrupts disabled on the current cpu. 7288 * and interrupts disabled on the current cpu.
7289 */ 7289 */
7290 spin_lock_irqsave(&rq->lock, flags); 7290 spin_lock_irqsave(&rq->lock, flags);
7291 7291
7292 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7292 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7293 7293
7294 update_rq_clock(rq); 7294 update_rq_clock(rq);
7295 activate_task(rq, p, 0); 7295 activate_task(rq, p, 0);
7296 7296
7297 spin_unlock_irqrestore(&rq->lock, flags); 7297 spin_unlock_irqrestore(&rq->lock, flags);
7298 } 7298 }
7299 7299
7300 /* 7300 /*
7301 * Ensures that the idle task is using init_mm right before its cpu goes 7301 * Ensures that the idle task is using init_mm right before its cpu goes
7302 * offline. 7302 * offline.
7303 */ 7303 */
7304 void idle_task_exit(void) 7304 void idle_task_exit(void)
7305 { 7305 {
7306 struct mm_struct *mm = current->active_mm; 7306 struct mm_struct *mm = current->active_mm;
7307 7307
7308 BUG_ON(cpu_online(smp_processor_id())); 7308 BUG_ON(cpu_online(smp_processor_id()));
7309 7309
7310 if (mm != &init_mm) 7310 if (mm != &init_mm)
7311 switch_mm(mm, &init_mm, current); 7311 switch_mm(mm, &init_mm, current);
7312 mmdrop(mm); 7312 mmdrop(mm);
7313 } 7313 }
7314 7314
7315 /* called under rq->lock with disabled interrupts */ 7315 /* called under rq->lock with disabled interrupts */
7316 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 7316 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7317 { 7317 {
7318 struct rq *rq = cpu_rq(dead_cpu); 7318 struct rq *rq = cpu_rq(dead_cpu);
7319 7319
7320 /* Must be exiting, otherwise would be on tasklist. */ 7320 /* Must be exiting, otherwise would be on tasklist. */
7321 BUG_ON(!p->exit_state); 7321 BUG_ON(!p->exit_state);
7322 7322
7323 /* Cannot have done final schedule yet: would have vanished. */ 7323 /* Cannot have done final schedule yet: would have vanished. */
7324 BUG_ON(p->state == TASK_DEAD); 7324 BUG_ON(p->state == TASK_DEAD);
7325 7325
7326 get_task_struct(p); 7326 get_task_struct(p);
7327 7327
7328 /* 7328 /*
7329 * Drop lock around migration; if someone else moves it, 7329 * Drop lock around migration; if someone else moves it,
7330 * that's OK. No task can be added to this CPU, so iteration is 7330 * that's OK. No task can be added to this CPU, so iteration is
7331 * fine. 7331 * fine.
7332 */ 7332 */
7333 spin_unlock_irq(&rq->lock); 7333 spin_unlock_irq(&rq->lock);
7334 move_task_off_dead_cpu(dead_cpu, p); 7334 move_task_off_dead_cpu(dead_cpu, p);
7335 spin_lock_irq(&rq->lock); 7335 spin_lock_irq(&rq->lock);
7336 7336
7337 put_task_struct(p); 7337 put_task_struct(p);
7338 } 7338 }
7339 7339
7340 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 7340 /* release_task() removes task from tasklist, so we won't find dead tasks. */
7341 static void migrate_dead_tasks(unsigned int dead_cpu) 7341 static void migrate_dead_tasks(unsigned int dead_cpu)
7342 { 7342 {
7343 struct rq *rq = cpu_rq(dead_cpu); 7343 struct rq *rq = cpu_rq(dead_cpu);
7344 struct task_struct *next; 7344 struct task_struct *next;
7345 7345
7346 for ( ; ; ) { 7346 for ( ; ; ) {
7347 if (!rq->nr_running) 7347 if (!rq->nr_running)
7348 break; 7348 break;
7349 update_rq_clock(rq); 7349 update_rq_clock(rq);
7350 next = pick_next_task(rq); 7350 next = pick_next_task(rq);
7351 if (!next) 7351 if (!next)
7352 break; 7352 break;
7353 next->sched_class->put_prev_task(rq, next); 7353 next->sched_class->put_prev_task(rq, next);
7354 migrate_dead(dead_cpu, next); 7354 migrate_dead(dead_cpu, next);
7355 7355
7356 } 7356 }
7357 } 7357 }
7358 7358
7359 /* 7359 /*
7360 * remove the tasks which were accounted by rq from calc_load_tasks. 7360 * remove the tasks which were accounted by rq from calc_load_tasks.
7361 */ 7361 */
7362 static void calc_global_load_remove(struct rq *rq) 7362 static void calc_global_load_remove(struct rq *rq)
7363 { 7363 {
7364 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7364 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7365 rq->calc_load_active = 0; 7365 rq->calc_load_active = 0;
7366 } 7366 }
7367 #endif /* CONFIG_HOTPLUG_CPU */ 7367 #endif /* CONFIG_HOTPLUG_CPU */
7368 7368
7369 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7369 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
7370 7370
7371 static struct ctl_table sd_ctl_dir[] = { 7371 static struct ctl_table sd_ctl_dir[] = {
7372 { 7372 {
7373 .procname = "sched_domain", 7373 .procname = "sched_domain",
7374 .mode = 0555, 7374 .mode = 0555,
7375 }, 7375 },
7376 {0, }, 7376 {}
7377 }; 7377 };
7378 7378
7379 static struct ctl_table sd_ctl_root[] = { 7379 static struct ctl_table sd_ctl_root[] = {
7380 { 7380 {
7381 .ctl_name = CTL_KERN,
7382 .procname = "kernel", 7381 .procname = "kernel",
7383 .mode = 0555, 7382 .mode = 0555,
7384 .child = sd_ctl_dir, 7383 .child = sd_ctl_dir,
7385 }, 7384 },
7386 {0, }, 7385 {}
7387 }; 7386 };
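
/*
 * A minimal sketch of the post-binary-sysctl table style the hunk above
 * switches to: only .procname/.mode/.child (or .data/.maxlen/.proc_handler
 * for leaf entries) are set, the terminator is an empty entry, and no
 * .ctl_name or .strategy fields remain.  'demo_value' and the "demo"
 * directory are illustrative, not part of this commit.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static int demo_value;

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_value",
                .data           = &demo_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {}
};

static struct ctl_table demo_root[] = {
        {
                .procname       = "demo",
                .mode           = 0555,
                .child          = demo_table,
        },
        {}
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
        demo_header = register_sysctl_table(demo_root);
        return demo_header ? 0 : -ENOMEM;
}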
7388 7387
7389 static struct ctl_table *sd_alloc_ctl_entry(int n) 7388 static struct ctl_table *sd_alloc_ctl_entry(int n)
7390 { 7389 {
7391 struct ctl_table *entry = 7390 struct ctl_table *entry =
7392 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 7391 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
7393 7392
7394 return entry; 7393 return entry;
7395 } 7394 }
7396 7395
7397 static void sd_free_ctl_entry(struct ctl_table **tablep) 7396 static void sd_free_ctl_entry(struct ctl_table **tablep)
7398 { 7397 {
7399 struct ctl_table *entry; 7398 struct ctl_table *entry;
7400 7399
7401 /* 7400 /*
7402 * In the intermediate directories, both the child directory and 7401 * In the intermediate directories, both the child directory and
7403 * procname are dynamically allocated and could fail but the mode 7402 * procname are dynamically allocated and could fail but the mode
7404 * will always be set. In the lowest directory the names are 7403 * will always be set. In the lowest directory the names are
7405 * static strings and all have proc handlers. 7404 * static strings and all have proc handlers.
7406 */ 7405 */
7407 for (entry = *tablep; entry->mode; entry++) { 7406 for (entry = *tablep; entry->mode; entry++) {
7408 if (entry->child) 7407 if (entry->child)
7409 sd_free_ctl_entry(&entry->child); 7408 sd_free_ctl_entry(&entry->child);
7410 if (entry->proc_handler == NULL) 7409 if (entry->proc_handler == NULL)
7411 kfree(entry->procname); 7410 kfree(entry->procname);
7412 } 7411 }
7413 7412
7414 kfree(*tablep); 7413 kfree(*tablep);
7415 *tablep = NULL; 7414 *tablep = NULL;
7416 } 7415 }
7417 7416
7418 static void 7417 static void
7419 set_table_entry(struct ctl_table *entry, 7418 set_table_entry(struct ctl_table *entry,
7420 const char *procname, void *data, int maxlen, 7419 const char *procname, void *data, int maxlen,
7421 mode_t mode, proc_handler *proc_handler) 7420 mode_t mode, proc_handler *proc_handler)
7422 { 7421 {
7423 entry->procname = procname; 7422 entry->procname = procname;
7424 entry->data = data; 7423 entry->data = data;
7425 entry->maxlen = maxlen; 7424 entry->maxlen = maxlen;
7426 entry->mode = mode; 7425 entry->mode = mode;
7427 entry->proc_handler = proc_handler; 7426 entry->proc_handler = proc_handler;
7428 } 7427 }
7429 7428
7430 static struct ctl_table * 7429 static struct ctl_table *
7431 sd_alloc_ctl_domain_table(struct sched_domain *sd) 7430 sd_alloc_ctl_domain_table(struct sched_domain *sd)
7432 { 7431 {
7433 struct ctl_table *table = sd_alloc_ctl_entry(13); 7432 struct ctl_table *table = sd_alloc_ctl_entry(13);
7434 7433
7435 if (table == NULL) 7434 if (table == NULL)
7436 return NULL; 7435 return NULL;
7437 7436
7438 set_table_entry(&table[0], "min_interval", &sd->min_interval, 7437 set_table_entry(&table[0], "min_interval", &sd->min_interval,
7439 sizeof(long), 0644, proc_doulongvec_minmax); 7438 sizeof(long), 0644, proc_doulongvec_minmax);
7440 set_table_entry(&table[1], "max_interval", &sd->max_interval, 7439 set_table_entry(&table[1], "max_interval", &sd->max_interval,
7441 sizeof(long), 0644, proc_doulongvec_minmax); 7440 sizeof(long), 0644, proc_doulongvec_minmax);
7442 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 7441 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
7443 sizeof(int), 0644, proc_dointvec_minmax); 7442 sizeof(int), 0644, proc_dointvec_minmax);
7444 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 7443 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
7445 sizeof(int), 0644, proc_dointvec_minmax); 7444 sizeof(int), 0644, proc_dointvec_minmax);
7446 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 7445 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
7447 sizeof(int), 0644, proc_dointvec_minmax); 7446 sizeof(int), 0644, proc_dointvec_minmax);
7448 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 7447 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
7449 sizeof(int), 0644, proc_dointvec_minmax); 7448 sizeof(int), 0644, proc_dointvec_minmax);
7450 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 7449 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
7451 sizeof(int), 0644, proc_dointvec_minmax); 7450 sizeof(int), 0644, proc_dointvec_minmax);
7452 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 7451 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
7453 sizeof(int), 0644, proc_dointvec_minmax); 7452 sizeof(int), 0644, proc_dointvec_minmax);
7454 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 7453 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
7455 sizeof(int), 0644, proc_dointvec_minmax); 7454 sizeof(int), 0644, proc_dointvec_minmax);
7456 set_table_entry(&table[9], "cache_nice_tries", 7455 set_table_entry(&table[9], "cache_nice_tries",
7457 &sd->cache_nice_tries, 7456 &sd->cache_nice_tries,
7458 sizeof(int), 0644, proc_dointvec_minmax); 7457 sizeof(int), 0644, proc_dointvec_minmax);
7459 set_table_entry(&table[10], "flags", &sd->flags, 7458 set_table_entry(&table[10], "flags", &sd->flags,
7460 sizeof(int), 0644, proc_dointvec_minmax); 7459 sizeof(int), 0644, proc_dointvec_minmax);
7461 set_table_entry(&table[11], "name", sd->name, 7460 set_table_entry(&table[11], "name", sd->name,
7462 CORENAME_MAX_SIZE, 0444, proc_dostring); 7461 CORENAME_MAX_SIZE, 0444, proc_dostring);
7463 /* &table[12] is terminator */ 7462 /* &table[12] is terminator */
7464 7463
7465 return table; 7464 return table;
7466 } 7465 }
7467 7466
7468 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 7467 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7469 { 7468 {
7470 struct ctl_table *entry, *table; 7469 struct ctl_table *entry, *table;
7471 struct sched_domain *sd; 7470 struct sched_domain *sd;
7472 int domain_num = 0, i; 7471 int domain_num = 0, i;
7473 char buf[32]; 7472 char buf[32];
7474 7473
7475 for_each_domain(cpu, sd) 7474 for_each_domain(cpu, sd)
7476 domain_num++; 7475 domain_num++;
7477 entry = table = sd_alloc_ctl_entry(domain_num + 1); 7476 entry = table = sd_alloc_ctl_entry(domain_num + 1);
7478 if (table == NULL) 7477 if (table == NULL)
7479 return NULL; 7478 return NULL;
7480 7479
7481 i = 0; 7480 i = 0;
7482 for_each_domain(cpu, sd) { 7481 for_each_domain(cpu, sd) {
7483 snprintf(buf, 32, "domain%d", i); 7482 snprintf(buf, 32, "domain%d", i);
7484 entry->procname = kstrdup(buf, GFP_KERNEL); 7483 entry->procname = kstrdup(buf, GFP_KERNEL);
7485 entry->mode = 0555; 7484 entry->mode = 0555;
7486 entry->child = sd_alloc_ctl_domain_table(sd); 7485 entry->child = sd_alloc_ctl_domain_table(sd);
7487 entry++; 7486 entry++;
7488 i++; 7487 i++;
7489 } 7488 }
7490 return table; 7489 return table;
7491 } 7490 }
7492 7491
7493 static struct ctl_table_header *sd_sysctl_header; 7492 static struct ctl_table_header *sd_sysctl_header;
7494 static void register_sched_domain_sysctl(void) 7493 static void register_sched_domain_sysctl(void)
7495 { 7494 {
7496 int i, cpu_num = num_online_cpus(); 7495 int i, cpu_num = num_online_cpus();
7497 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7496 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7498 char buf[32]; 7497 char buf[32];
7499 7498
7500 WARN_ON(sd_ctl_dir[0].child); 7499 WARN_ON(sd_ctl_dir[0].child);
7501 sd_ctl_dir[0].child = entry; 7500 sd_ctl_dir[0].child = entry;
7502 7501
7503 if (entry == NULL) 7502 if (entry == NULL)
7504 return; 7503 return;
7505 7504
7506 for_each_online_cpu(i) { 7505 for_each_online_cpu(i) {
7507 snprintf(buf, 32, "cpu%d", i); 7506 snprintf(buf, 32, "cpu%d", i);
7508 entry->procname = kstrdup(buf, GFP_KERNEL); 7507 entry->procname = kstrdup(buf, GFP_KERNEL);
7509 entry->mode = 0555; 7508 entry->mode = 0555;
7510 entry->child = sd_alloc_ctl_cpu_table(i); 7509 entry->child = sd_alloc_ctl_cpu_table(i);
7511 entry++; 7510 entry++;
7512 } 7511 }
7513 7512
7514 WARN_ON(sd_sysctl_header); 7513 WARN_ON(sd_sysctl_header);
7515 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 7514 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
7516 } 7515 }
7517 7516
7518 /* may be called multiple times per register */ 7517 /* may be called multiple times per register */
7519 static void unregister_sched_domain_sysctl(void) 7518 static void unregister_sched_domain_sysctl(void)
7520 { 7519 {
7521 if (sd_sysctl_header) 7520 if (sd_sysctl_header)
7522 unregister_sysctl_table(sd_sysctl_header); 7521 unregister_sysctl_table(sd_sysctl_header);
7523 sd_sysctl_header = NULL; 7522 sd_sysctl_header = NULL;
7524 if (sd_ctl_dir[0].child) 7523 if (sd_ctl_dir[0].child)
7525 sd_free_ctl_entry(&sd_ctl_dir[0].child); 7524 sd_free_ctl_entry(&sd_ctl_dir[0].child);
7526 } 7525 }
7527 #else 7526 #else
7528 static void register_sched_domain_sysctl(void) 7527 static void register_sched_domain_sysctl(void)
7529 { 7528 {
7530 } 7529 }
7531 static void unregister_sched_domain_sysctl(void) 7530 static void unregister_sched_domain_sysctl(void)
7532 { 7531 {
7533 } 7532 }
7534 #endif 7533 #endif
7535 7534
7536 static void set_rq_online(struct rq *rq) 7535 static void set_rq_online(struct rq *rq)
7537 { 7536 {
7538 if (!rq->online) { 7537 if (!rq->online) {
7539 const struct sched_class *class; 7538 const struct sched_class *class;
7540 7539
7541 cpumask_set_cpu(rq->cpu, rq->rd->online); 7540 cpumask_set_cpu(rq->cpu, rq->rd->online);
7542 rq->online = 1; 7541 rq->online = 1;
7543 7542
7544 for_each_class(class) { 7543 for_each_class(class) {
7545 if (class->rq_online) 7544 if (class->rq_online)
7546 class->rq_online(rq); 7545 class->rq_online(rq);
7547 } 7546 }
7548 } 7547 }
7549 } 7548 }
7550 7549
7551 static void set_rq_offline(struct rq *rq) 7550 static void set_rq_offline(struct rq *rq)
7552 { 7551 {
7553 if (rq->online) { 7552 if (rq->online) {
7554 const struct sched_class *class; 7553 const struct sched_class *class;
7555 7554
7556 for_each_class(class) { 7555 for_each_class(class) {
7557 if (class->rq_offline) 7556 if (class->rq_offline)
7558 class->rq_offline(rq); 7557 class->rq_offline(rq);
7559 } 7558 }
7560 7559
7561 cpumask_clear_cpu(rq->cpu, rq->rd->online); 7560 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7562 rq->online = 0; 7561 rq->online = 0;
7563 } 7562 }
7564 } 7563 }
7565 7564
7566 /* 7565 /*
7567 * migration_call - callback that gets triggered when a CPU is added. 7566 * migration_call - callback that gets triggered when a CPU is added.
7568 * Here we can start up the necessary migration thread for the new CPU. 7567 * Here we can start up the necessary migration thread for the new CPU.
7569 */ 7568 */
7570 static int __cpuinit 7569 static int __cpuinit
7571 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 7570 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7572 { 7571 {
7573 struct task_struct *p; 7572 struct task_struct *p;
7574 int cpu = (long)hcpu; 7573 int cpu = (long)hcpu;
7575 unsigned long flags; 7574 unsigned long flags;
7576 struct rq *rq; 7575 struct rq *rq;
7577 7576
7578 switch (action) { 7577 switch (action) {
7579 7578
7580 case CPU_UP_PREPARE: 7579 case CPU_UP_PREPARE:
7581 case CPU_UP_PREPARE_FROZEN: 7580 case CPU_UP_PREPARE_FROZEN:
7582 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 7581 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
7583 if (IS_ERR(p)) 7582 if (IS_ERR(p))
7584 return NOTIFY_BAD; 7583 return NOTIFY_BAD;
7585 kthread_bind(p, cpu); 7584 kthread_bind(p, cpu);
7586 /* Must be high prio: stop_machine expects to yield to it. */ 7585 /* Must be high prio: stop_machine expects to yield to it. */
7587 rq = task_rq_lock(p, &flags); 7586 rq = task_rq_lock(p, &flags);
7588 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7587 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7589 task_rq_unlock(rq, &flags); 7588 task_rq_unlock(rq, &flags);
7590 get_task_struct(p); 7589 get_task_struct(p);
7591 cpu_rq(cpu)->migration_thread = p; 7590 cpu_rq(cpu)->migration_thread = p;
7592 rq->calc_load_update = calc_load_update; 7591 rq->calc_load_update = calc_load_update;
7593 break; 7592 break;
7594 7593
7595 case CPU_ONLINE: 7594 case CPU_ONLINE:
7596 case CPU_ONLINE_FROZEN: 7595 case CPU_ONLINE_FROZEN:
7597 /* Strictly unnecessary, as first user will wake it. */ 7596 /* Strictly unnecessary, as first user will wake it. */
7598 wake_up_process(cpu_rq(cpu)->migration_thread); 7597 wake_up_process(cpu_rq(cpu)->migration_thread);
7599 7598
7600 /* Update our root-domain */ 7599 /* Update our root-domain */
7601 rq = cpu_rq(cpu); 7600 rq = cpu_rq(cpu);
7602 spin_lock_irqsave(&rq->lock, flags); 7601 spin_lock_irqsave(&rq->lock, flags);
7603 if (rq->rd) { 7602 if (rq->rd) {
7604 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7603 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7605 7604
7606 set_rq_online(rq); 7605 set_rq_online(rq);
7607 } 7606 }
7608 spin_unlock_irqrestore(&rq->lock, flags); 7607 spin_unlock_irqrestore(&rq->lock, flags);
7609 break; 7608 break;
7610 7609
7611 #ifdef CONFIG_HOTPLUG_CPU 7610 #ifdef CONFIG_HOTPLUG_CPU
7612 case CPU_UP_CANCELED: 7611 case CPU_UP_CANCELED:
7613 case CPU_UP_CANCELED_FROZEN: 7612 case CPU_UP_CANCELED_FROZEN:
7614 if (!cpu_rq(cpu)->migration_thread) 7613 if (!cpu_rq(cpu)->migration_thread)
7615 break; 7614 break;
7616 /* Unbind it from offline cpu so it can run. Fall thru. */ 7615 /* Unbind it from offline cpu so it can run. Fall thru. */
7617 kthread_bind(cpu_rq(cpu)->migration_thread, 7616 kthread_bind(cpu_rq(cpu)->migration_thread,
7618 cpumask_any(cpu_online_mask)); 7617 cpumask_any(cpu_online_mask));
7619 kthread_stop(cpu_rq(cpu)->migration_thread); 7618 kthread_stop(cpu_rq(cpu)->migration_thread);
7620 put_task_struct(cpu_rq(cpu)->migration_thread); 7619 put_task_struct(cpu_rq(cpu)->migration_thread);
7621 cpu_rq(cpu)->migration_thread = NULL; 7620 cpu_rq(cpu)->migration_thread = NULL;
7622 break; 7621 break;
7623 7622
7624 case CPU_DEAD: 7623 case CPU_DEAD:
7625 case CPU_DEAD_FROZEN: 7624 case CPU_DEAD_FROZEN:
7626 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ 7625 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7627 migrate_live_tasks(cpu); 7626 migrate_live_tasks(cpu);
7628 rq = cpu_rq(cpu); 7627 rq = cpu_rq(cpu);
7629 kthread_stop(rq->migration_thread); 7628 kthread_stop(rq->migration_thread);
7630 put_task_struct(rq->migration_thread); 7629 put_task_struct(rq->migration_thread);
7631 rq->migration_thread = NULL; 7630 rq->migration_thread = NULL;
7632 /* Idle task back to normal (off runqueue, low prio) */ 7631 /* Idle task back to normal (off runqueue, low prio) */
7633 spin_lock_irq(&rq->lock); 7632 spin_lock_irq(&rq->lock);
7634 update_rq_clock(rq); 7633 update_rq_clock(rq);
7635 deactivate_task(rq, rq->idle, 0); 7634 deactivate_task(rq, rq->idle, 0);
7636 rq->idle->static_prio = MAX_PRIO; 7635 rq->idle->static_prio = MAX_PRIO;
7637 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7636 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7638 rq->idle->sched_class = &idle_sched_class; 7637 rq->idle->sched_class = &idle_sched_class;
7639 migrate_dead_tasks(cpu); 7638 migrate_dead_tasks(cpu);
7640 spin_unlock_irq(&rq->lock); 7639 spin_unlock_irq(&rq->lock);
7641 cpuset_unlock(); 7640 cpuset_unlock();
7642 migrate_nr_uninterruptible(rq); 7641 migrate_nr_uninterruptible(rq);
7643 BUG_ON(rq->nr_running != 0); 7642 BUG_ON(rq->nr_running != 0);
7644 calc_global_load_remove(rq); 7643 calc_global_load_remove(rq);
7645 /* 7644 /*
7646 * No need to migrate the tasks: it was best-effort if 7645 * No need to migrate the tasks: it was best-effort if
7647 * they didn't take sched_hotcpu_mutex. Just wake up 7646 * they didn't take sched_hotcpu_mutex. Just wake up
7648 * the requestors. 7647 * the requestors.
7649 */ 7648 */
7650 spin_lock_irq(&rq->lock); 7649 spin_lock_irq(&rq->lock);
7651 while (!list_empty(&rq->migration_queue)) { 7650 while (!list_empty(&rq->migration_queue)) {
7652 struct migration_req *req; 7651 struct migration_req *req;
7653 7652
7654 req = list_entry(rq->migration_queue.next, 7653 req = list_entry(rq->migration_queue.next,
7655 struct migration_req, list); 7654 struct migration_req, list);
7656 list_del_init(&req->list); 7655 list_del_init(&req->list);
7657 spin_unlock_irq(&rq->lock); 7656 spin_unlock_irq(&rq->lock);
7658 complete(&req->done); 7657 complete(&req->done);
7659 spin_lock_irq(&rq->lock); 7658 spin_lock_irq(&rq->lock);
7660 } 7659 }
7661 spin_unlock_irq(&rq->lock); 7660 spin_unlock_irq(&rq->lock);
7662 break; 7661 break;
7663 7662
7664 case CPU_DYING: 7663 case CPU_DYING:
7665 case CPU_DYING_FROZEN: 7664 case CPU_DYING_FROZEN:
7666 /* Update our root-domain */ 7665 /* Update our root-domain */
7667 rq = cpu_rq(cpu); 7666 rq = cpu_rq(cpu);
7668 spin_lock_irqsave(&rq->lock, flags); 7667 spin_lock_irqsave(&rq->lock, flags);
7669 if (rq->rd) { 7668 if (rq->rd) {
7670 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7669 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7671 set_rq_offline(rq); 7670 set_rq_offline(rq);
7672 } 7671 }
7673 spin_unlock_irqrestore(&rq->lock, flags); 7672 spin_unlock_irqrestore(&rq->lock, flags);
7674 break; 7673 break;
7675 #endif 7674 #endif
7676 } 7675 }
7677 return NOTIFY_OK; 7676 return NOTIFY_OK;
7678 } 7677 }
7679 7678
7680 /* 7679 /*
7681 * Register at high priority so that task migration (migrate_all_tasks) 7680 * Register at high priority so that task migration (migrate_all_tasks)
7682 * happens before everything else. This has to be lower priority than 7681 * happens before everything else. This has to be lower priority than
7683 * the notifier in the perf_event subsystem, though. 7682 * the notifier in the perf_event subsystem, though.
7684 */ 7683 */
7685 static struct notifier_block __cpuinitdata migration_notifier = { 7684 static struct notifier_block __cpuinitdata migration_notifier = {
7686 .notifier_call = migration_call, 7685 .notifier_call = migration_call,
7687 .priority = 10 7686 .priority = 10
7688 }; 7687 };
7689 7688
7690 static int __init migration_init(void) 7689 static int __init migration_init(void)
7691 { 7690 {
7692 void *cpu = (void *)(long)smp_processor_id(); 7691 void *cpu = (void *)(long)smp_processor_id();
7693 int err; 7692 int err;
7694 7693
7695 /* Start one for the boot CPU: */ 7694 /* Start one for the boot CPU: */
7696 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 7695 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
7697 BUG_ON(err == NOTIFY_BAD); 7696 BUG_ON(err == NOTIFY_BAD);
7698 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7697 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7699 register_cpu_notifier(&migration_notifier); 7698 register_cpu_notifier(&migration_notifier);
7700 7699
7701 return 0; 7700 return 0;
7702 } 7701 }
7703 early_initcall(migration_init); 7702 early_initcall(migration_init);
7704 #endif 7703 #endif
7705 7704
7706 #ifdef CONFIG_SMP 7705 #ifdef CONFIG_SMP
7707 7706
7708 #ifdef CONFIG_SCHED_DEBUG 7707 #ifdef CONFIG_SCHED_DEBUG
7709 7708
7710 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7709 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7711 struct cpumask *groupmask) 7710 struct cpumask *groupmask)
7712 { 7711 {
7713 struct sched_group *group = sd->groups; 7712 struct sched_group *group = sd->groups;
7714 char str[256]; 7713 char str[256];
7715 7714
7716 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 7715 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
7717 cpumask_clear(groupmask); 7716 cpumask_clear(groupmask);
7718 7717
7719 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 7718 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7720 7719
7721 if (!(sd->flags & SD_LOAD_BALANCE)) { 7720 if (!(sd->flags & SD_LOAD_BALANCE)) {
7722 printk("does not load-balance\n"); 7721 printk("does not load-balance\n");
7723 if (sd->parent) 7722 if (sd->parent)
7724 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 7723 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
7725 " has parent"); 7724 " has parent");
7726 return -1; 7725 return -1;
7727 } 7726 }
7728 7727
7729 printk(KERN_CONT "span %s level %s\n", str, sd->name); 7728 printk(KERN_CONT "span %s level %s\n", str, sd->name);
7730 7729
7731 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 7730 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
7732 printk(KERN_ERR "ERROR: domain->span does not contain " 7731 printk(KERN_ERR "ERROR: domain->span does not contain "
7733 "CPU%d\n", cpu); 7732 "CPU%d\n", cpu);
7734 } 7733 }
7735 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 7734 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
7736 printk(KERN_ERR "ERROR: domain->groups does not contain" 7735 printk(KERN_ERR "ERROR: domain->groups does not contain"
7737 " CPU%d\n", cpu); 7736 " CPU%d\n", cpu);
7738 } 7737 }
7739 7738
7740 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 7739 printk(KERN_DEBUG "%*s groups:", level + 1, "");
7741 do { 7740 do {
7742 if (!group) { 7741 if (!group) {
7743 printk("\n"); 7742 printk("\n");
7744 printk(KERN_ERR "ERROR: group is NULL\n"); 7743 printk(KERN_ERR "ERROR: group is NULL\n");
7745 break; 7744 break;
7746 } 7745 }
7747 7746
7748 if (!group->cpu_power) { 7747 if (!group->cpu_power) {
7749 printk(KERN_CONT "\n"); 7748 printk(KERN_CONT "\n");
7750 printk(KERN_ERR "ERROR: domain->cpu_power not " 7749 printk(KERN_ERR "ERROR: domain->cpu_power not "
7751 "set\n"); 7750 "set\n");
7752 break; 7751 break;
7753 } 7752 }
7754 7753
7755 if (!cpumask_weight(sched_group_cpus(group))) { 7754 if (!cpumask_weight(sched_group_cpus(group))) {
7756 printk(KERN_CONT "\n"); 7755 printk(KERN_CONT "\n");
7757 printk(KERN_ERR "ERROR: empty group\n"); 7756 printk(KERN_ERR "ERROR: empty group\n");
7758 break; 7757 break;
7759 } 7758 }
7760 7759
7761 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 7760 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
7762 printk(KERN_CONT "\n"); 7761 printk(KERN_CONT "\n");
7763 printk(KERN_ERR "ERROR: repeated CPUs\n"); 7762 printk(KERN_ERR "ERROR: repeated CPUs\n");
7764 break; 7763 break;
7765 } 7764 }
7766 7765
7767 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7766 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7768 7767
7769 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7768 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7770 7769
7771 printk(KERN_CONT " %s", str); 7770 printk(KERN_CONT " %s", str);
7772 if (group->cpu_power != SCHED_LOAD_SCALE) { 7771 if (group->cpu_power != SCHED_LOAD_SCALE) {
7773 printk(KERN_CONT " (cpu_power = %d)", 7772 printk(KERN_CONT " (cpu_power = %d)",
7774 group->cpu_power); 7773 group->cpu_power);
7775 } 7774 }
7776 7775
7777 group = group->next; 7776 group = group->next;
7778 } while (group != sd->groups); 7777 } while (group != sd->groups);
7779 printk(KERN_CONT "\n"); 7778 printk(KERN_CONT "\n");
7780 7779
7781 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 7780 if (!cpumask_equal(sched_domain_span(sd), groupmask))
7782 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 7781 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
7783 7782
7784 if (sd->parent && 7783 if (sd->parent &&
7785 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 7784 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
7786 printk(KERN_ERR "ERROR: parent span is not a superset " 7785 printk(KERN_ERR "ERROR: parent span is not a superset "
7787 "of domain->span\n"); 7786 "of domain->span\n");
7788 return 0; 7787 return 0;
7789 } 7788 }
7790 7789
7791 static void sched_domain_debug(struct sched_domain *sd, int cpu) 7790 static void sched_domain_debug(struct sched_domain *sd, int cpu)
7792 { 7791 {
7793 cpumask_var_t groupmask; 7792 cpumask_var_t groupmask;
7794 int level = 0; 7793 int level = 0;
7795 7794
7796 if (!sd) { 7795 if (!sd) {
7797 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7796 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7798 return; 7797 return;
7799 } 7798 }
7800 7799
7801 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 7800 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7802 7801
7803 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 7802 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
7804 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 7803 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7805 return; 7804 return;
7806 } 7805 }
7807 7806
7808 for (;;) { 7807 for (;;) {
7809 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 7808 if (sched_domain_debug_one(sd, cpu, level, groupmask))
7810 break; 7809 break;
7811 level++; 7810 level++;
7812 sd = sd->parent; 7811 sd = sd->parent;
7813 if (!sd) 7812 if (!sd)
7814 break; 7813 break;
7815 } 7814 }
7816 free_cpumask_var(groupmask); 7815 free_cpumask_var(groupmask);
7817 } 7816 }
7818 #else /* !CONFIG_SCHED_DEBUG */ 7817 #else /* !CONFIG_SCHED_DEBUG */
7819 # define sched_domain_debug(sd, cpu) do { } while (0) 7818 # define sched_domain_debug(sd, cpu) do { } while (0)
7820 #endif /* CONFIG_SCHED_DEBUG */ 7819 #endif /* CONFIG_SCHED_DEBUG */
7821 7820
7822 static int sd_degenerate(struct sched_domain *sd) 7821 static int sd_degenerate(struct sched_domain *sd)
7823 { 7822 {
7824 if (cpumask_weight(sched_domain_span(sd)) == 1) 7823 if (cpumask_weight(sched_domain_span(sd)) == 1)
7825 return 1; 7824 return 1;
7826 7825
7827 /* Following flags need at least 2 groups */ 7826 /* Following flags need at least 2 groups */
7828 if (sd->flags & (SD_LOAD_BALANCE | 7827 if (sd->flags & (SD_LOAD_BALANCE |
7829 SD_BALANCE_NEWIDLE | 7828 SD_BALANCE_NEWIDLE |
7830 SD_BALANCE_FORK | 7829 SD_BALANCE_FORK |
7831 SD_BALANCE_EXEC | 7830 SD_BALANCE_EXEC |
7832 SD_SHARE_CPUPOWER | 7831 SD_SHARE_CPUPOWER |
7833 SD_SHARE_PKG_RESOURCES)) { 7832 SD_SHARE_PKG_RESOURCES)) {
7834 if (sd->groups != sd->groups->next) 7833 if (sd->groups != sd->groups->next)
7835 return 0; 7834 return 0;
7836 } 7835 }
7837 7836
7838 /* Following flags don't use groups */ 7837 /* Following flags don't use groups */
7839 if (sd->flags & (SD_WAKE_AFFINE)) 7838 if (sd->flags & (SD_WAKE_AFFINE))
7840 return 0; 7839 return 0;
7841 7840
7842 return 1; 7841 return 1;
7843 } 7842 }
7844 7843
7845 static int 7844 static int
7846 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 7845 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7847 { 7846 {
7848 unsigned long cflags = sd->flags, pflags = parent->flags; 7847 unsigned long cflags = sd->flags, pflags = parent->flags;
7849 7848
7850 if (sd_degenerate(parent)) 7849 if (sd_degenerate(parent))
7851 return 1; 7850 return 1;
7852 7851
7853 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7852 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7854 return 0; 7853 return 0;
7855 7854
7856 /* Flags needing groups don't count if only 1 group in parent */ 7855 /* Flags needing groups don't count if only 1 group in parent */
7857 if (parent->groups == parent->groups->next) { 7856 if (parent->groups == parent->groups->next) {
7858 pflags &= ~(SD_LOAD_BALANCE | 7857 pflags &= ~(SD_LOAD_BALANCE |
7859 SD_BALANCE_NEWIDLE | 7858 SD_BALANCE_NEWIDLE |
7860 SD_BALANCE_FORK | 7859 SD_BALANCE_FORK |
7861 SD_BALANCE_EXEC | 7860 SD_BALANCE_EXEC |
7862 SD_SHARE_CPUPOWER | 7861 SD_SHARE_CPUPOWER |
7863 SD_SHARE_PKG_RESOURCES); 7862 SD_SHARE_PKG_RESOURCES);
7864 if (nr_node_ids == 1) 7863 if (nr_node_ids == 1)
7865 pflags &= ~SD_SERIALIZE; 7864 pflags &= ~SD_SERIALIZE;
7866 } 7865 }
7867 if (~cflags & pflags) 7866 if (~cflags & pflags)
7868 return 0; 7867 return 0;
7869 7868
7870 return 1; 7869 return 1;
7871 } 7870 }
7872 7871
7873 static void free_rootdomain(struct root_domain *rd) 7872 static void free_rootdomain(struct root_domain *rd)
7874 { 7873 {
7875 cpupri_cleanup(&rd->cpupri); 7874 cpupri_cleanup(&rd->cpupri);
7876 7875
7877 free_cpumask_var(rd->rto_mask); 7876 free_cpumask_var(rd->rto_mask);
7878 free_cpumask_var(rd->online); 7877 free_cpumask_var(rd->online);
7879 free_cpumask_var(rd->span); 7878 free_cpumask_var(rd->span);
7880 kfree(rd); 7879 kfree(rd);
7881 } 7880 }
7882 7881
7883 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7882 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7884 { 7883 {
7885 struct root_domain *old_rd = NULL; 7884 struct root_domain *old_rd = NULL;
7886 unsigned long flags; 7885 unsigned long flags;
7887 7886
7888 spin_lock_irqsave(&rq->lock, flags); 7887 spin_lock_irqsave(&rq->lock, flags);
7889 7888
7890 if (rq->rd) { 7889 if (rq->rd) {
7891 old_rd = rq->rd; 7890 old_rd = rq->rd;
7892 7891
7893 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7892 if (cpumask_test_cpu(rq->cpu, old_rd->online))
7894 set_rq_offline(rq); 7893 set_rq_offline(rq);
7895 7894
7896 cpumask_clear_cpu(rq->cpu, old_rd->span); 7895 cpumask_clear_cpu(rq->cpu, old_rd->span);
7897 7896
7898 /* 7897 /*
7899 * If we don't want to free the old_rd yet then 7898 * If we don't want to free the old_rd yet then
7900 * set old_rd to NULL to skip the freeing later 7899 * set old_rd to NULL to skip the freeing later
7901 * in this function: 7900 * in this function:
7902 */ 7901 */
7903 if (!atomic_dec_and_test(&old_rd->refcount)) 7902 if (!atomic_dec_and_test(&old_rd->refcount))
7904 old_rd = NULL; 7903 old_rd = NULL;
7905 } 7904 }
7906 7905
7907 atomic_inc(&rd->refcount); 7906 atomic_inc(&rd->refcount);
7908 rq->rd = rd; 7907 rq->rd = rd;
7909 7908
7910 cpumask_set_cpu(rq->cpu, rd->span); 7909 cpumask_set_cpu(rq->cpu, rd->span);
7911 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 7910 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7912 set_rq_online(rq); 7911 set_rq_online(rq);
7913 7912
7914 spin_unlock_irqrestore(&rq->lock, flags); 7913 spin_unlock_irqrestore(&rq->lock, flags);
7915 7914
7916 if (old_rd) 7915 if (old_rd)
7917 free_rootdomain(old_rd); 7916 free_rootdomain(old_rd);
7918 } 7917 }
7919 7918
7920 static int init_rootdomain(struct root_domain *rd, bool bootmem) 7919 static int init_rootdomain(struct root_domain *rd, bool bootmem)
7921 { 7920 {
7922 gfp_t gfp = GFP_KERNEL; 7921 gfp_t gfp = GFP_KERNEL;
7923 7922
7924 memset(rd, 0, sizeof(*rd)); 7923 memset(rd, 0, sizeof(*rd));
7925 7924
7926 if (bootmem) 7925 if (bootmem)
7927 gfp = GFP_NOWAIT; 7926 gfp = GFP_NOWAIT;
7928 7927
7929 if (!alloc_cpumask_var(&rd->span, gfp)) 7928 if (!alloc_cpumask_var(&rd->span, gfp))
7930 goto out; 7929 goto out;
7931 if (!alloc_cpumask_var(&rd->online, gfp)) 7930 if (!alloc_cpumask_var(&rd->online, gfp))
7932 goto free_span; 7931 goto free_span;
7933 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 7932 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7934 goto free_online; 7933 goto free_online;
7935 7934
7936 if (cpupri_init(&rd->cpupri, bootmem) != 0) 7935 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7937 goto free_rto_mask; 7936 goto free_rto_mask;
7938 return 0; 7937 return 0;
7939 7938
7940 free_rto_mask: 7939 free_rto_mask:
7941 free_cpumask_var(rd->rto_mask); 7940 free_cpumask_var(rd->rto_mask);
7942 free_online: 7941 free_online:
7943 free_cpumask_var(rd->online); 7942 free_cpumask_var(rd->online);
7944 free_span: 7943 free_span:
7945 free_cpumask_var(rd->span); 7944 free_cpumask_var(rd->span);
7946 out: 7945 out:
7947 return -ENOMEM; 7946 return -ENOMEM;
7948 } 7947 }
7949 7948
7950 static void init_defrootdomain(void) 7949 static void init_defrootdomain(void)
7951 { 7950 {
7952 init_rootdomain(&def_root_domain, true); 7951 init_rootdomain(&def_root_domain, true);
7953 7952
7954 atomic_set(&def_root_domain.refcount, 1); 7953 atomic_set(&def_root_domain.refcount, 1);
7955 } 7954 }
7956 7955
7957 static struct root_domain *alloc_rootdomain(void) 7956 static struct root_domain *alloc_rootdomain(void)
7958 { 7957 {
7959 struct root_domain *rd; 7958 struct root_domain *rd;
7960 7959
7961 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 7960 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7962 if (!rd) 7961 if (!rd)
7963 return NULL; 7962 return NULL;
7964 7963
7965 if (init_rootdomain(rd, false) != 0) { 7964 if (init_rootdomain(rd, false) != 0) {
7966 kfree(rd); 7965 kfree(rd);
7967 return NULL; 7966 return NULL;
7968 } 7967 }
7969 7968
7970 return rd; 7969 return rd;
7971 } 7970 }
7972 7971
7973 /* 7972 /*
7974 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 7973 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7975 * hold the hotplug lock. 7974 * hold the hotplug lock.
7976 */ 7975 */
7977 static void 7976 static void
7978 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 7977 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7979 { 7978 {
7980 struct rq *rq = cpu_rq(cpu); 7979 struct rq *rq = cpu_rq(cpu);
7981 struct sched_domain *tmp; 7980 struct sched_domain *tmp;
7982 7981
7983 /* Remove the sched domains which do not contribute to scheduling. */ 7982 /* Remove the sched domains which do not contribute to scheduling. */
7984 for (tmp = sd; tmp; ) { 7983 for (tmp = sd; tmp; ) {
7985 struct sched_domain *parent = tmp->parent; 7984 struct sched_domain *parent = tmp->parent;
7986 if (!parent) 7985 if (!parent)
7987 break; 7986 break;
7988 7987
7989 if (sd_parent_degenerate(tmp, parent)) { 7988 if (sd_parent_degenerate(tmp, parent)) {
7990 tmp->parent = parent->parent; 7989 tmp->parent = parent->parent;
7991 if (parent->parent) 7990 if (parent->parent)
7992 parent->parent->child = tmp; 7991 parent->parent->child = tmp;
7993 } else 7992 } else
7994 tmp = tmp->parent; 7993 tmp = tmp->parent;
7995 } 7994 }
7996 7995
7997 if (sd && sd_degenerate(sd)) { 7996 if (sd && sd_degenerate(sd)) {
7998 sd = sd->parent; 7997 sd = sd->parent;
7999 if (sd) 7998 if (sd)
8000 sd->child = NULL; 7999 sd->child = NULL;
8001 } 8000 }
8002 8001
8003 sched_domain_debug(sd, cpu); 8002 sched_domain_debug(sd, cpu);
8004 8003
8005 rq_attach_root(rq, rd); 8004 rq_attach_root(rq, rd);
8006 rcu_assign_pointer(rq->sd, sd); 8005 rcu_assign_pointer(rq->sd, sd);
8007 } 8006 }
8008 8007
8009 /* cpus with isolated domains */ 8008 /* cpus with isolated domains */
8010 static cpumask_var_t cpu_isolated_map; 8009 static cpumask_var_t cpu_isolated_map;
8011 8010
8012 /* Set up the mask of cpus configured for isolated domains */ 8011 /* Set up the mask of cpus configured for isolated domains */
8013 static int __init isolated_cpu_setup(char *str) 8012 static int __init isolated_cpu_setup(char *str)
8014 { 8013 {
8015 cpulist_parse(str, cpu_isolated_map); 8014 cpulist_parse(str, cpu_isolated_map);
8016 return 1; 8015 return 1;
8017 } 8016 }
8018 8017
8019 __setup("isolcpus=", isolated_cpu_setup); 8018 __setup("isolcpus=", isolated_cpu_setup);
8020 8019
8021 /* 8020 /*
8022 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 8021 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
8023 * to a function which identifies what group (along with the sched group) a CPU 8022 * to a function which identifies what group (along with the sched group) a CPU
8024 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 8023 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
8025 * (because we keep track of groups covered with a struct cpumask). 8024 * (because we keep track of groups covered with a struct cpumask).
8026 * 8025 *
8027 * init_sched_build_groups will build a circular linked list of the groups 8026 * init_sched_build_groups will build a circular linked list of the groups
8028 * covered by the given span, and will set each group's ->cpumask correctly, 8027 * covered by the given span, and will set each group's ->cpumask correctly,
8029 * and ->cpu_power to 0. 8028 * and ->cpu_power to 0.
8030 */ 8029 */
8031 static void 8030 static void
8032 init_sched_build_groups(const struct cpumask *span, 8031 init_sched_build_groups(const struct cpumask *span,
8033 const struct cpumask *cpu_map, 8032 const struct cpumask *cpu_map,
8034 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 8033 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
8035 struct sched_group **sg, 8034 struct sched_group **sg,
8036 struct cpumask *tmpmask), 8035 struct cpumask *tmpmask),
8037 struct cpumask *covered, struct cpumask *tmpmask) 8036 struct cpumask *covered, struct cpumask *tmpmask)
8038 { 8037 {
8039 struct sched_group *first = NULL, *last = NULL; 8038 struct sched_group *first = NULL, *last = NULL;
8040 int i; 8039 int i;
8041 8040
8042 cpumask_clear(covered); 8041 cpumask_clear(covered);
8043 8042
8044 for_each_cpu(i, span) { 8043 for_each_cpu(i, span) {
8045 struct sched_group *sg; 8044 struct sched_group *sg;
8046 int group = group_fn(i, cpu_map, &sg, tmpmask); 8045 int group = group_fn(i, cpu_map, &sg, tmpmask);
8047 int j; 8046 int j;
8048 8047
8049 if (cpumask_test_cpu(i, covered)) 8048 if (cpumask_test_cpu(i, covered))
8050 continue; 8049 continue;
8051 8050
8052 cpumask_clear(sched_group_cpus(sg)); 8051 cpumask_clear(sched_group_cpus(sg));
8053 sg->cpu_power = 0; 8052 sg->cpu_power = 0;
8054 8053
8055 for_each_cpu(j, span) { 8054 for_each_cpu(j, span) {
8056 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8055 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
8057 continue; 8056 continue;
8058 8057
8059 cpumask_set_cpu(j, covered); 8058 cpumask_set_cpu(j, covered);
8060 cpumask_set_cpu(j, sched_group_cpus(sg)); 8059 cpumask_set_cpu(j, sched_group_cpus(sg));
8061 } 8060 }
8062 if (!first) 8061 if (!first)
8063 first = sg; 8062 first = sg;
8064 if (last) 8063 if (last)
8065 last->next = sg; 8064 last->next = sg;
8066 last = sg; 8065 last = sg;
8067 } 8066 }
8068 last->next = first; 8067 last->next = first;
8069 } 8068 }
8070 8069
8071 #define SD_NODES_PER_DOMAIN 16 8070 #define SD_NODES_PER_DOMAIN 16
8072 8071
8073 #ifdef CONFIG_NUMA 8072 #ifdef CONFIG_NUMA
8074 8073
8075 /** 8074 /**
8076 * find_next_best_node - find the next node to include in a sched_domain 8075 * find_next_best_node - find the next node to include in a sched_domain
8077 * @node: node whose sched_domain we're building 8076 * @node: node whose sched_domain we're building
8078 * @used_nodes: nodes already in the sched_domain 8077 * @used_nodes: nodes already in the sched_domain
8079 * 8078 *
8080 * Find the next node to include in a given scheduling domain. Simply 8079 * Find the next node to include in a given scheduling domain. Simply
8081 * finds the closest node not already in the @used_nodes map. 8080 * finds the closest node not already in the @used_nodes map.
8082 * 8081 *
8083 * Should use nodemask_t. 8082 * Should use nodemask_t.
8084 */ 8083 */
8085 static int find_next_best_node(int node, nodemask_t *used_nodes) 8084 static int find_next_best_node(int node, nodemask_t *used_nodes)
8086 { 8085 {
8087 int i, n, val, min_val, best_node = 0; 8086 int i, n, val, min_val, best_node = 0;
8088 8087
8089 min_val = INT_MAX; 8088 min_val = INT_MAX;
8090 8089
8091 for (i = 0; i < nr_node_ids; i++) { 8090 for (i = 0; i < nr_node_ids; i++) {
8092 /* Start at @node */ 8091 /* Start at @node */
8093 n = (node + i) % nr_node_ids; 8092 n = (node + i) % nr_node_ids;
8094 8093
8095 if (!nr_cpus_node(n)) 8094 if (!nr_cpus_node(n))
8096 continue; 8095 continue;
8097 8096
8098 /* Skip already used nodes */ 8097 /* Skip already used nodes */
8099 if (node_isset(n, *used_nodes)) 8098 if (node_isset(n, *used_nodes))
8100 continue; 8099 continue;
8101 8100
8102 /* Simple min distance search */ 8101 /* Simple min distance search */
8103 val = node_distance(node, n); 8102 val = node_distance(node, n);
8104 8103
8105 if (val < min_val) { 8104 if (val < min_val) {
8106 min_val = val; 8105 min_val = val;
8107 best_node = n; 8106 best_node = n;
8108 } 8107 }
8109 } 8108 }
8110 8109
8111 node_set(best_node, *used_nodes); 8110 node_set(best_node, *used_nodes);
8112 return best_node; 8111 return best_node;
8113 } 8112 }
8114 8113
8115 /** 8114 /**
8116 * sched_domain_node_span - get a cpumask for a node's sched_domain 8115 * sched_domain_node_span - get a cpumask for a node's sched_domain
8117 * @node: node whose cpumask we're constructing 8116 * @node: node whose cpumask we're constructing
8118 * @span: resulting cpumask 8117 * @span: resulting cpumask
8119 * 8118 *
8120 * Given a node, construct a good cpumask for its sched_domain to span. It 8119 * Given a node, construct a good cpumask for its sched_domain to span. It
8121 * should be one that prevents unnecessary balancing, but also spreads tasks 8120 * should be one that prevents unnecessary balancing, but also spreads tasks
8122 * out optimally. 8121 * out optimally.
8123 */ 8122 */
8124 static void sched_domain_node_span(int node, struct cpumask *span) 8123 static void sched_domain_node_span(int node, struct cpumask *span)
8125 { 8124 {
8126 nodemask_t used_nodes; 8125 nodemask_t used_nodes;
8127 int i; 8126 int i;
8128 8127
8129 cpumask_clear(span); 8128 cpumask_clear(span);
8130 nodes_clear(used_nodes); 8129 nodes_clear(used_nodes);
8131 8130
8132 cpumask_or(span, span, cpumask_of_node(node)); 8131 cpumask_or(span, span, cpumask_of_node(node));
8133 node_set(node, used_nodes); 8132 node_set(node, used_nodes);
8134 8133
8135 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 8134 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
8136 int next_node = find_next_best_node(node, &used_nodes); 8135 int next_node = find_next_best_node(node, &used_nodes);
8137 8136
8138 cpumask_or(span, span, cpumask_of_node(next_node)); 8137 cpumask_or(span, span, cpumask_of_node(next_node));
8139 } 8138 }
8140 } 8139 }
8141 #endif /* CONFIG_NUMA */ 8140 #endif /* CONFIG_NUMA */
8142 8141
8143 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 8142 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
8144 8143
8145 /* 8144 /*
8146 * The cpus mask in sched_group and sched_domain hangs off the end. 8145 * The cpus mask in sched_group and sched_domain hangs off the end.
8147 * 8146 *
8148 * ( See the comments in include/linux/sched.h:struct sched_group 8147 * ( See the comments in include/linux/sched.h:struct sched_group
8149 * and struct sched_domain. ) 8148 * and struct sched_domain. )
8150 */ 8149 */
8151 struct static_sched_group { 8150 struct static_sched_group {
8152 struct sched_group sg; 8151 struct sched_group sg;
8153 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 8152 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
8154 }; 8153 };
8155 8154
8156 struct static_sched_domain { 8155 struct static_sched_domain {
8157 struct sched_domain sd; 8156 struct sched_domain sd;
8158 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8157 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8159 }; 8158 };
8160 8159
8161 struct s_data { 8160 struct s_data {
8162 #ifdef CONFIG_NUMA 8161 #ifdef CONFIG_NUMA
8163 int sd_allnodes; 8162 int sd_allnodes;
8164 cpumask_var_t domainspan; 8163 cpumask_var_t domainspan;
8165 cpumask_var_t covered; 8164 cpumask_var_t covered;
8166 cpumask_var_t notcovered; 8165 cpumask_var_t notcovered;
8167 #endif 8166 #endif
8168 cpumask_var_t nodemask; 8167 cpumask_var_t nodemask;
8169 cpumask_var_t this_sibling_map; 8168 cpumask_var_t this_sibling_map;
8170 cpumask_var_t this_core_map; 8169 cpumask_var_t this_core_map;
8171 cpumask_var_t send_covered; 8170 cpumask_var_t send_covered;
8172 cpumask_var_t tmpmask; 8171 cpumask_var_t tmpmask;
8173 struct sched_group **sched_group_nodes; 8172 struct sched_group **sched_group_nodes;
8174 struct root_domain *rd; 8173 struct root_domain *rd;
8175 }; 8174 };
8176 8175
8177 enum s_alloc { 8176 enum s_alloc {
8178 sa_sched_groups = 0, 8177 sa_sched_groups = 0,
8179 sa_rootdomain, 8178 sa_rootdomain,
8180 sa_tmpmask, 8179 sa_tmpmask,
8181 sa_send_covered, 8180 sa_send_covered,
8182 sa_this_core_map, 8181 sa_this_core_map,
8183 sa_this_sibling_map, 8182 sa_this_sibling_map,
8184 sa_nodemask, 8183 sa_nodemask,
8185 sa_sched_group_nodes, 8184 sa_sched_group_nodes,
8186 #ifdef CONFIG_NUMA 8185 #ifdef CONFIG_NUMA
8187 sa_notcovered, 8186 sa_notcovered,
8188 sa_covered, 8187 sa_covered,
8189 sa_domainspan, 8188 sa_domainspan,
8190 #endif 8189 #endif
8191 sa_none, 8190 sa_none,
8192 }; 8191 };
8193 8192
8194 /* 8193 /*
8195 * SMT sched-domains: 8194 * SMT sched-domains:
8196 */ 8195 */
8197 #ifdef CONFIG_SCHED_SMT 8196 #ifdef CONFIG_SCHED_SMT
8198 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8197 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8199 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8198 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
8200 8199
8201 static int 8200 static int
8202 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8201 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8203 struct sched_group **sg, struct cpumask *unused) 8202 struct sched_group **sg, struct cpumask *unused)
8204 { 8203 {
8205 if (sg) 8204 if (sg)
8206 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8205 *sg = &per_cpu(sched_group_cpus, cpu).sg;
8207 return cpu; 8206 return cpu;
8208 } 8207 }
8209 #endif /* CONFIG_SCHED_SMT */ 8208 #endif /* CONFIG_SCHED_SMT */
8210 8209
8211 /* 8210 /*
8212 * multi-core sched-domains: 8211 * multi-core sched-domains:
8213 */ 8212 */
8214 #ifdef CONFIG_SCHED_MC 8213 #ifdef CONFIG_SCHED_MC
8215 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 8214 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
8216 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 8215 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
8217 #endif /* CONFIG_SCHED_MC */ 8216 #endif /* CONFIG_SCHED_MC */
8218 8217
8219 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 8218 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
8220 static int 8219 static int
8221 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 8220 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8222 struct sched_group **sg, struct cpumask *mask) 8221 struct sched_group **sg, struct cpumask *mask)
8223 { 8222 {
8224 int group; 8223 int group;
8225 8224
8226 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 8225 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8227 group = cpumask_first(mask); 8226 group = cpumask_first(mask);
8228 if (sg) 8227 if (sg)
8229 *sg = &per_cpu(sched_group_core, group).sg; 8228 *sg = &per_cpu(sched_group_core, group).sg;
8230 return group; 8229 return group;
8231 } 8230 }
8232 #elif defined(CONFIG_SCHED_MC) 8231 #elif defined(CONFIG_SCHED_MC)
8233 static int 8232 static int
8234 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 8233 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8235 struct sched_group **sg, struct cpumask *unused) 8234 struct sched_group **sg, struct cpumask *unused)
8236 { 8235 {
8237 if (sg) 8236 if (sg)
8238 *sg = &per_cpu(sched_group_core, cpu).sg; 8237 *sg = &per_cpu(sched_group_core, cpu).sg;
8239 return cpu; 8238 return cpu;
8240 } 8239 }
8241 #endif 8240 #endif
8242 8241
8243 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 8242 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
8244 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 8243 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
8245 8244
8246 static int 8245 static int
8247 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 8246 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
8248 struct sched_group **sg, struct cpumask *mask) 8247 struct sched_group **sg, struct cpumask *mask)
8249 { 8248 {
8250 int group; 8249 int group;
8251 #ifdef CONFIG_SCHED_MC 8250 #ifdef CONFIG_SCHED_MC
8252 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 8251 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
8253 group = cpumask_first(mask); 8252 group = cpumask_first(mask);
8254 #elif defined(CONFIG_SCHED_SMT) 8253 #elif defined(CONFIG_SCHED_SMT)
8255 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 8254 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8256 group = cpumask_first(mask); 8255 group = cpumask_first(mask);
8257 #else 8256 #else
8258 group = cpu; 8257 group = cpu;
8259 #endif 8258 #endif
8260 if (sg) 8259 if (sg)
8261 *sg = &per_cpu(sched_group_phys, group).sg; 8260 *sg = &per_cpu(sched_group_phys, group).sg;
8262 return group; 8261 return group;
8263 } 8262 }
8264 8263
8265 #ifdef CONFIG_NUMA 8264 #ifdef CONFIG_NUMA
8266 /* 8265 /*
8267 * init_sched_build_groups() can't handle what we want to do with node 8266 * init_sched_build_groups() can't handle what we want to do with node
8268 * groups, so roll our own. Now each node has its own list of groups which 8267 * groups, so roll our own. Now each node has its own list of groups which
8269 * gets dynamically allocated. 8268 * gets dynamically allocated.
8270 */ 8269 */
8271 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 8270 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
8272 static struct sched_group ***sched_group_nodes_bycpu; 8271 static struct sched_group ***sched_group_nodes_bycpu;
8273 8272
8274 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 8273 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
8275 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 8274 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
8276 8275
8277 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 8276 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
8278 struct sched_group **sg, 8277 struct sched_group **sg,
8279 struct cpumask *nodemask) 8278 struct cpumask *nodemask)
8280 { 8279 {
8281 int group; 8280 int group;
8282 8281
8283 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 8282 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
8284 group = cpumask_first(nodemask); 8283 group = cpumask_first(nodemask);
8285 8284
8286 if (sg) 8285 if (sg)
8287 *sg = &per_cpu(sched_group_allnodes, group).sg; 8286 *sg = &per_cpu(sched_group_allnodes, group).sg;
8288 return group; 8287 return group;
8289 } 8288 }
8290 8289
8291 static void init_numa_sched_groups_power(struct sched_group *group_head) 8290 static void init_numa_sched_groups_power(struct sched_group *group_head)
8292 { 8291 {
8293 struct sched_group *sg = group_head; 8292 struct sched_group *sg = group_head;
8294 int j; 8293 int j;
8295 8294
8296 if (!sg) 8295 if (!sg)
8297 return; 8296 return;
8298 do { 8297 do {
8299 for_each_cpu(j, sched_group_cpus(sg)) { 8298 for_each_cpu(j, sched_group_cpus(sg)) {
8300 struct sched_domain *sd; 8299 struct sched_domain *sd;
8301 8300
8302 sd = &per_cpu(phys_domains, j).sd; 8301 sd = &per_cpu(phys_domains, j).sd;
8303 if (j != group_first_cpu(sd->groups)) { 8302 if (j != group_first_cpu(sd->groups)) {
8304 /* 8303 /*
8305 * Only add "power" once for each 8304 * Only add "power" once for each
8306 * physical package. 8305 * physical package.
8307 */ 8306 */
8308 continue; 8307 continue;
8309 } 8308 }
8310 8309
8311 sg->cpu_power += sd->groups->cpu_power; 8310 sg->cpu_power += sd->groups->cpu_power;
8312 } 8311 }
8313 sg = sg->next; 8312 sg = sg->next;
8314 } while (sg != group_head); 8313 } while (sg != group_head);
8315 } 8314 }
8316 8315
8317 static int build_numa_sched_groups(struct s_data *d, 8316 static int build_numa_sched_groups(struct s_data *d,
8318 const struct cpumask *cpu_map, int num) 8317 const struct cpumask *cpu_map, int num)
8319 { 8318 {
8320 struct sched_domain *sd; 8319 struct sched_domain *sd;
8321 struct sched_group *sg, *prev; 8320 struct sched_group *sg, *prev;
8322 int n, j; 8321 int n, j;
8323 8322
8324 cpumask_clear(d->covered); 8323 cpumask_clear(d->covered);
8325 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 8324 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8326 if (cpumask_empty(d->nodemask)) { 8325 if (cpumask_empty(d->nodemask)) {
8327 d->sched_group_nodes[num] = NULL; 8326 d->sched_group_nodes[num] = NULL;
8328 goto out; 8327 goto out;
8329 } 8328 }
8330 8329
8331 sched_domain_node_span(num, d->domainspan); 8330 sched_domain_node_span(num, d->domainspan);
8332 cpumask_and(d->domainspan, d->domainspan, cpu_map); 8331 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8333 8332
8334 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8333 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8335 GFP_KERNEL, num); 8334 GFP_KERNEL, num);
8336 if (!sg) { 8335 if (!sg) {
8337 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 8336 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8338 num); 8337 num);
8339 return -ENOMEM; 8338 return -ENOMEM;
8340 } 8339 }
8341 d->sched_group_nodes[num] = sg; 8340 d->sched_group_nodes[num] = sg;
8342 8341
8343 for_each_cpu(j, d->nodemask) { 8342 for_each_cpu(j, d->nodemask) {
8344 sd = &per_cpu(node_domains, j).sd; 8343 sd = &per_cpu(node_domains, j).sd;
8345 sd->groups = sg; 8344 sd->groups = sg;
8346 } 8345 }
8347 8346
8348 sg->cpu_power = 0; 8347 sg->cpu_power = 0;
8349 cpumask_copy(sched_group_cpus(sg), d->nodemask); 8348 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8350 sg->next = sg; 8349 sg->next = sg;
8351 cpumask_or(d->covered, d->covered, d->nodemask); 8350 cpumask_or(d->covered, d->covered, d->nodemask);
8352 8351
8353 prev = sg; 8352 prev = sg;
8354 for (j = 0; j < nr_node_ids; j++) { 8353 for (j = 0; j < nr_node_ids; j++) {
8355 n = (num + j) % nr_node_ids; 8354 n = (num + j) % nr_node_ids;
8356 cpumask_complement(d->notcovered, d->covered); 8355 cpumask_complement(d->notcovered, d->covered);
8357 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 8356 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8358 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 8357 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8359 if (cpumask_empty(d->tmpmask)) 8358 if (cpumask_empty(d->tmpmask))
8360 break; 8359 break;
8361 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 8360 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8362 if (cpumask_empty(d->tmpmask)) 8361 if (cpumask_empty(d->tmpmask))
8363 continue; 8362 continue;
8364 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8363 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8365 GFP_KERNEL, num); 8364 GFP_KERNEL, num);
8366 if (!sg) { 8365 if (!sg) {
8367 printk(KERN_WARNING 8366 printk(KERN_WARNING
8368 "Can not alloc domain group for node %d\n", j); 8367 "Can not alloc domain group for node %d\n", j);
8369 return -ENOMEM; 8368 return -ENOMEM;
8370 } 8369 }
8371 sg->cpu_power = 0; 8370 sg->cpu_power = 0;
8372 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 8371 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8373 sg->next = prev->next; 8372 sg->next = prev->next;
8374 cpumask_or(d->covered, d->covered, d->tmpmask); 8373 cpumask_or(d->covered, d->covered, d->tmpmask);
8375 prev->next = sg; 8374 prev->next = sg;
8376 prev = sg; 8375 prev = sg;
8377 } 8376 }
8378 out: 8377 out:
8379 return 0; 8378 return 0;
8380 } 8379 }
8381 #endif /* CONFIG_NUMA */ 8380 #endif /* CONFIG_NUMA */
8382 8381
8383 #ifdef CONFIG_NUMA 8382 #ifdef CONFIG_NUMA
8384 /* Free memory allocated for various sched_group structures */ 8383 /* Free memory allocated for various sched_group structures */
8385 static void free_sched_groups(const struct cpumask *cpu_map, 8384 static void free_sched_groups(const struct cpumask *cpu_map,
8386 struct cpumask *nodemask) 8385 struct cpumask *nodemask)
8387 { 8386 {
8388 int cpu, i; 8387 int cpu, i;
8389 8388
8390 for_each_cpu(cpu, cpu_map) { 8389 for_each_cpu(cpu, cpu_map) {
8391 struct sched_group **sched_group_nodes 8390 struct sched_group **sched_group_nodes
8392 = sched_group_nodes_bycpu[cpu]; 8391 = sched_group_nodes_bycpu[cpu];
8393 8392
8394 if (!sched_group_nodes) 8393 if (!sched_group_nodes)
8395 continue; 8394 continue;
8396 8395
8397 for (i = 0; i < nr_node_ids; i++) { 8396 for (i = 0; i < nr_node_ids; i++) {
8398 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 8397 struct sched_group *oldsg, *sg = sched_group_nodes[i];
8399 8398
8400 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8399 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
8401 if (cpumask_empty(nodemask)) 8400 if (cpumask_empty(nodemask))
8402 continue; 8401 continue;
8403 8402
8404 if (sg == NULL) 8403 if (sg == NULL)
8405 continue; 8404 continue;
8406 sg = sg->next; 8405 sg = sg->next;
8407 next_sg: 8406 next_sg:
8408 oldsg = sg; 8407 oldsg = sg;
8409 sg = sg->next; 8408 sg = sg->next;
8410 kfree(oldsg); 8409 kfree(oldsg);
8411 if (oldsg != sched_group_nodes[i]) 8410 if (oldsg != sched_group_nodes[i])
8412 goto next_sg; 8411 goto next_sg;
8413 } 8412 }
8414 kfree(sched_group_nodes); 8413 kfree(sched_group_nodes);
8415 sched_group_nodes_bycpu[cpu] = NULL; 8414 sched_group_nodes_bycpu[cpu] = NULL;
8416 } 8415 }
8417 } 8416 }
8418 #else /* !CONFIG_NUMA */ 8417 #else /* !CONFIG_NUMA */
8419 static void free_sched_groups(const struct cpumask *cpu_map, 8418 static void free_sched_groups(const struct cpumask *cpu_map,
8420 struct cpumask *nodemask) 8419 struct cpumask *nodemask)
8421 { 8420 {
8422 } 8421 }
8423 #endif /* CONFIG_NUMA */ 8422 #endif /* CONFIG_NUMA */
8424 8423
8425 /* 8424 /*
8426 * Initialize sched groups cpu_power. 8425 * Initialize sched groups cpu_power.
8427 * 8426 *
8428 * cpu_power indicates the capacity of a sched group, which is used while 8427 * cpu_power indicates the capacity of a sched group, which is used while
8429 * distributing the load between different sched groups in a sched domain. 8428 * distributing the load between different sched groups in a sched domain.
8430 * Typically cpu_power for all the groups in a sched domain will be the same unless 8429 * Typically cpu_power for all the groups in a sched domain will be the same unless
8431 * there are asymmetries in the topology. If there are asymmetries, the group 8430 * there are asymmetries in the topology. If there are asymmetries, the group
8432 * having more cpu_power will pick up more load compared to the group having 8431 * having more cpu_power will pick up more load compared to the group having
8433 * less cpu_power. 8432 * less cpu_power.
8434 */ 8433 */
8435 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8434 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8436 { 8435 {
8437 struct sched_domain *child; 8436 struct sched_domain *child;
8438 struct sched_group *group; 8437 struct sched_group *group;
8439 long power; 8438 long power;
8440 int weight; 8439 int weight;
8441 8440
8442 WARN_ON(!sd || !sd->groups); 8441 WARN_ON(!sd || !sd->groups);
8443 8442
8444 if (cpu != group_first_cpu(sd->groups)) 8443 if (cpu != group_first_cpu(sd->groups))
8445 return; 8444 return;
8446 8445
8447 child = sd->child; 8446 child = sd->child;
8448 8447
8449 sd->groups->cpu_power = 0; 8448 sd->groups->cpu_power = 0;
8450 8449
8451 if (!child) { 8450 if (!child) {
8452 power = SCHED_LOAD_SCALE; 8451 power = SCHED_LOAD_SCALE;
8453 weight = cpumask_weight(sched_domain_span(sd)); 8452 weight = cpumask_weight(sched_domain_span(sd));
8454 /* 8453 /*
8455 * SMT siblings share the power of a single core. 8454 * SMT siblings share the power of a single core.
8456 * Usually multiple threads get a better yield out of 8455 * Usually multiple threads get a better yield out of
8457 * that one core than a single thread would have; 8456 * that one core than a single thread would have;
8458 * reflect that in sd->smt_gain. 8457 * reflect that in sd->smt_gain.
8459 */ 8458 */
8460 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 8459 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8461 power *= sd->smt_gain; 8460 power *= sd->smt_gain;
8462 power /= weight; 8461 power /= weight;
8463 power >>= SCHED_LOAD_SHIFT; 8462 power >>= SCHED_LOAD_SHIFT;
8464 } 8463 }
8465 sd->groups->cpu_power += power; 8464 sd->groups->cpu_power += power;
8466 return; 8465 return;
8467 } 8466 }
8468 8467
8469 /* 8468 /*
8470 * Add cpu_power of each child group to this group's cpu_power. 8469 * Add cpu_power of each child group to this group's cpu_power.
8471 */ 8470 */
8472 group = child->groups; 8471 group = child->groups;
8473 do { 8472 do {
8474 sd->groups->cpu_power += group->cpu_power; 8473 sd->groups->cpu_power += group->cpu_power;
8475 group = group->next; 8474 group = group->next;
8476 } while (group != child->groups); 8475 } while (group != child->groups);
8477 } 8476 }
8478 8477
8479 /* 8478 /*
8480 * Initializers for sched domains 8479 * Initializers for sched domains
8481 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 8480 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
8482 */ 8481 */
8483 8482
8484 #ifdef CONFIG_SCHED_DEBUG 8483 #ifdef CONFIG_SCHED_DEBUG
8485 # define SD_INIT_NAME(sd, type) sd->name = #type 8484 # define SD_INIT_NAME(sd, type) sd->name = #type
8486 #else 8485 #else
8487 # define SD_INIT_NAME(sd, type) do { } while (0) 8486 # define SD_INIT_NAME(sd, type) do { } while (0)
8488 #endif 8487 #endif
8489 8488
8490 #define SD_INIT(sd, type) sd_init_##type(sd) 8489 #define SD_INIT(sd, type) sd_init_##type(sd)
8491 8490
8492 #define SD_INIT_FUNC(type) \ 8491 #define SD_INIT_FUNC(type) \
8493 static noinline void sd_init_##type(struct sched_domain *sd) \ 8492 static noinline void sd_init_##type(struct sched_domain *sd) \
8494 { \ 8493 { \
8495 memset(sd, 0, sizeof(*sd)); \ 8494 memset(sd, 0, sizeof(*sd)); \
8496 *sd = SD_##type##_INIT; \ 8495 *sd = SD_##type##_INIT; \
8497 sd->level = SD_LV_##type; \ 8496 sd->level = SD_LV_##type; \
8498 SD_INIT_NAME(sd, type); \ 8497 SD_INIT_NAME(sd, type); \
8499 } 8498 }
8500 8499
8501 SD_INIT_FUNC(CPU) 8500 SD_INIT_FUNC(CPU)
8502 #ifdef CONFIG_NUMA 8501 #ifdef CONFIG_NUMA
8503 SD_INIT_FUNC(ALLNODES) 8502 SD_INIT_FUNC(ALLNODES)
8504 SD_INIT_FUNC(NODE) 8503 SD_INIT_FUNC(NODE)
8505 #endif 8504 #endif
8506 #ifdef CONFIG_SCHED_SMT 8505 #ifdef CONFIG_SCHED_SMT
8507 SD_INIT_FUNC(SIBLING) 8506 SD_INIT_FUNC(SIBLING)
8508 #endif 8507 #endif
8509 #ifdef CONFIG_SCHED_MC 8508 #ifdef CONFIG_SCHED_MC
8510 SD_INIT_FUNC(MC) 8509 SD_INIT_FUNC(MC)
8511 #endif 8510 #endif
8512 8511
8513 static int default_relax_domain_level = -1; 8512 static int default_relax_domain_level = -1;
8514 8513
8515 static int __init setup_relax_domain_level(char *str) 8514 static int __init setup_relax_domain_level(char *str)
8516 { 8515 {
8517 unsigned long val; 8516 unsigned long val;
8518 8517
8519 val = simple_strtoul(str, NULL, 0); 8518 val = simple_strtoul(str, NULL, 0);
8520 if (val < SD_LV_MAX) 8519 if (val < SD_LV_MAX)
8521 default_relax_domain_level = val; 8520 default_relax_domain_level = val;
8522 8521
8523 return 1; 8522 return 1;
8524 } 8523 }
8525 __setup("relax_domain_level=", setup_relax_domain_level); 8524 __setup("relax_domain_level=", setup_relax_domain_level);
8526 8525
8527 static void set_domain_attribute(struct sched_domain *sd, 8526 static void set_domain_attribute(struct sched_domain *sd,
8528 struct sched_domain_attr *attr) 8527 struct sched_domain_attr *attr)
8529 { 8528 {
8530 int request; 8529 int request;
8531 8530
8532 if (!attr || attr->relax_domain_level < 0) { 8531 if (!attr || attr->relax_domain_level < 0) {
8533 if (default_relax_domain_level < 0) 8532 if (default_relax_domain_level < 0)
8534 return; 8533 return;
8535 else 8534 else
8536 request = default_relax_domain_level; 8535 request = default_relax_domain_level;
8537 } else 8536 } else
8538 request = attr->relax_domain_level; 8537 request = attr->relax_domain_level;
8539 if (request < sd->level) { 8538 if (request < sd->level) {
8540 /* turn off idle balance on this domain */ 8539 /* turn off idle balance on this domain */
8541 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 8540 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8542 } else { 8541 } else {
8543 /* turn on idle balance on this domain */ 8542 /* turn on idle balance on this domain */
8544 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 8543 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8545 } 8544 }
8546 } 8545 }
8547 8546
8548 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 8547 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8549 const struct cpumask *cpu_map) 8548 const struct cpumask *cpu_map)
8550 { 8549 {
8551 switch (what) { 8550 switch (what) {
8552 case sa_sched_groups: 8551 case sa_sched_groups:
8553 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 8552 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8554 d->sched_group_nodes = NULL; 8553 d->sched_group_nodes = NULL;
8555 case sa_rootdomain: 8554 case sa_rootdomain:
8556 free_rootdomain(d->rd); /* fall through */ 8555 free_rootdomain(d->rd); /* fall through */
8557 case sa_tmpmask: 8556 case sa_tmpmask:
8558 free_cpumask_var(d->tmpmask); /* fall through */ 8557 free_cpumask_var(d->tmpmask); /* fall through */
8559 case sa_send_covered: 8558 case sa_send_covered:
8560 free_cpumask_var(d->send_covered); /* fall through */ 8559 free_cpumask_var(d->send_covered); /* fall through */
8561 case sa_this_core_map: 8560 case sa_this_core_map:
8562 free_cpumask_var(d->this_core_map); /* fall through */ 8561 free_cpumask_var(d->this_core_map); /* fall through */
8563 case sa_this_sibling_map: 8562 case sa_this_sibling_map:
8564 free_cpumask_var(d->this_sibling_map); /* fall through */ 8563 free_cpumask_var(d->this_sibling_map); /* fall through */
8565 case sa_nodemask: 8564 case sa_nodemask:
8566 free_cpumask_var(d->nodemask); /* fall through */ 8565 free_cpumask_var(d->nodemask); /* fall through */
8567 case sa_sched_group_nodes: 8566 case sa_sched_group_nodes:
8568 #ifdef CONFIG_NUMA 8567 #ifdef CONFIG_NUMA
8569 kfree(d->sched_group_nodes); /* fall through */ 8568 kfree(d->sched_group_nodes); /* fall through */
8570 case sa_notcovered: 8569 case sa_notcovered:
8571 free_cpumask_var(d->notcovered); /* fall through */ 8570 free_cpumask_var(d->notcovered); /* fall through */
8572 case sa_covered: 8571 case sa_covered:
8573 free_cpumask_var(d->covered); /* fall through */ 8572 free_cpumask_var(d->covered); /* fall through */
8574 case sa_domainspan: 8573 case sa_domainspan:
8575 free_cpumask_var(d->domainspan); /* fall through */ 8574 free_cpumask_var(d->domainspan); /* fall through */
8576 #endif 8575 #endif
8577 case sa_none: 8576 case sa_none:
8578 break; 8577 break;
8579 } 8578 }
8580 } 8579 }
8581 8580
8582 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 8581 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8583 const struct cpumask *cpu_map) 8582 const struct cpumask *cpu_map)
8584 { 8583 {
8585 #ifdef CONFIG_NUMA 8584 #ifdef CONFIG_NUMA
8586 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 8585 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8587 return sa_none; 8586 return sa_none;
8588 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 8587 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8589 return sa_domainspan; 8588 return sa_domainspan;
8590 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 8589 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8591 return sa_covered; 8590 return sa_covered;
8592 /* Allocate the per-node list of sched groups */ 8591 /* Allocate the per-node list of sched groups */
8593 d->sched_group_nodes = kcalloc(nr_node_ids, 8592 d->sched_group_nodes = kcalloc(nr_node_ids,
8594 sizeof(struct sched_group *), GFP_KERNEL); 8593 sizeof(struct sched_group *), GFP_KERNEL);
8595 if (!d->sched_group_nodes) { 8594 if (!d->sched_group_nodes) {
8596 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8595 printk(KERN_WARNING "Can not alloc sched group node list\n");
8597 return sa_notcovered; 8596 return sa_notcovered;
8598 } 8597 }
8599 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 8598 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8600 #endif 8599 #endif
8601 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 8600 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8602 return sa_sched_group_nodes; 8601 return sa_sched_group_nodes;
8603 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 8602 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8604 return sa_nodemask; 8603 return sa_nodemask;
8605 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 8604 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8606 return sa_this_sibling_map; 8605 return sa_this_sibling_map;
8607 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 8606 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8608 return sa_this_core_map; 8607 return sa_this_core_map;
8609 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 8608 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8610 return sa_send_covered; 8609 return sa_send_covered;
8611 d->rd = alloc_rootdomain(); 8610 d->rd = alloc_rootdomain();
8612 if (!d->rd) { 8611 if (!d->rd) {
8613 printk(KERN_WARNING "Cannot alloc root domain\n"); 8612 printk(KERN_WARNING "Cannot alloc root domain\n");
8614 return sa_tmpmask; 8613 return sa_tmpmask;
8615 } 8614 }
8616 return sa_rootdomain; 8615 return sa_rootdomain;
8617 } 8616 }
8618 8617
8619 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 8618 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8620 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 8619 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8621 { 8620 {
8622 struct sched_domain *sd = NULL; 8621 struct sched_domain *sd = NULL;
8623 #ifdef CONFIG_NUMA 8622 #ifdef CONFIG_NUMA
8624 struct sched_domain *parent; 8623 struct sched_domain *parent;
8625 8624
8626 d->sd_allnodes = 0; 8625 d->sd_allnodes = 0;
8627 if (cpumask_weight(cpu_map) > 8626 if (cpumask_weight(cpu_map) >
8628 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 8627 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8629 sd = &per_cpu(allnodes_domains, i).sd; 8628 sd = &per_cpu(allnodes_domains, i).sd;
8630 SD_INIT(sd, ALLNODES); 8629 SD_INIT(sd, ALLNODES);
8631 set_domain_attribute(sd, attr); 8630 set_domain_attribute(sd, attr);
8632 cpumask_copy(sched_domain_span(sd), cpu_map); 8631 cpumask_copy(sched_domain_span(sd), cpu_map);
8633 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 8632 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8634 d->sd_allnodes = 1; 8633 d->sd_allnodes = 1;
8635 } 8634 }
8636 parent = sd; 8635 parent = sd;
8637 8636
8638 sd = &per_cpu(node_domains, i).sd; 8637 sd = &per_cpu(node_domains, i).sd;
8639 SD_INIT(sd, NODE); 8638 SD_INIT(sd, NODE);
8640 set_domain_attribute(sd, attr); 8639 set_domain_attribute(sd, attr);
8641 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8640 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8642 sd->parent = parent; 8641 sd->parent = parent;
8643 if (parent) 8642 if (parent)
8644 parent->child = sd; 8643 parent->child = sd;
8645 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 8644 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8646 #endif 8645 #endif
8647 return sd; 8646 return sd;
8648 } 8647 }
8649 8648
8650 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 8649 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8651 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8650 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8652 struct sched_domain *parent, int i) 8651 struct sched_domain *parent, int i)
8653 { 8652 {
8654 struct sched_domain *sd; 8653 struct sched_domain *sd;
8655 sd = &per_cpu(phys_domains, i).sd; 8654 sd = &per_cpu(phys_domains, i).sd;
8656 SD_INIT(sd, CPU); 8655 SD_INIT(sd, CPU);
8657 set_domain_attribute(sd, attr); 8656 set_domain_attribute(sd, attr);
8658 cpumask_copy(sched_domain_span(sd), d->nodemask); 8657 cpumask_copy(sched_domain_span(sd), d->nodemask);
8659 sd->parent = parent; 8658 sd->parent = parent;
8660 if (parent) 8659 if (parent)
8661 parent->child = sd; 8660 parent->child = sd;
8662 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 8661 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8663 return sd; 8662 return sd;
8664 } 8663 }
8665 8664
8666 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 8665 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8667 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8666 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8668 struct sched_domain *parent, int i) 8667 struct sched_domain *parent, int i)
8669 { 8668 {
8670 struct sched_domain *sd = parent; 8669 struct sched_domain *sd = parent;
8671 #ifdef CONFIG_SCHED_MC 8670 #ifdef CONFIG_SCHED_MC
8672 sd = &per_cpu(core_domains, i).sd; 8671 sd = &per_cpu(core_domains, i).sd;
8673 SD_INIT(sd, MC); 8672 SD_INIT(sd, MC);
8674 set_domain_attribute(sd, attr); 8673 set_domain_attribute(sd, attr);
8675 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 8674 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8676 sd->parent = parent; 8675 sd->parent = parent;
8677 parent->child = sd; 8676 parent->child = sd;
8678 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 8677 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8679 #endif 8678 #endif
8680 return sd; 8679 return sd;
8681 } 8680 }
8682 8681
8683 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 8682 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8684 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8683 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8685 struct sched_domain *parent, int i) 8684 struct sched_domain *parent, int i)
8686 { 8685 {
8687 struct sched_domain *sd = parent; 8686 struct sched_domain *sd = parent;
8688 #ifdef CONFIG_SCHED_SMT 8687 #ifdef CONFIG_SCHED_SMT
8689 sd = &per_cpu(cpu_domains, i).sd; 8688 sd = &per_cpu(cpu_domains, i).sd;
8690 SD_INIT(sd, SIBLING); 8689 SD_INIT(sd, SIBLING);
8691 set_domain_attribute(sd, attr); 8690 set_domain_attribute(sd, attr);
8692 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 8691 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8693 sd->parent = parent; 8692 sd->parent = parent;
8694 parent->child = sd; 8693 parent->child = sd;
8695 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 8694 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8696 #endif 8695 #endif
8697 return sd; 8696 return sd;
8698 } 8697 }
8699 8698
8700 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 8699 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8701 const struct cpumask *cpu_map, int cpu) 8700 const struct cpumask *cpu_map, int cpu)
8702 { 8701 {
8703 switch (l) { 8702 switch (l) {
8704 #ifdef CONFIG_SCHED_SMT 8703 #ifdef CONFIG_SCHED_SMT
8705 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 8704 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8706 cpumask_and(d->this_sibling_map, cpu_map, 8705 cpumask_and(d->this_sibling_map, cpu_map,
8707 topology_thread_cpumask(cpu)); 8706 topology_thread_cpumask(cpu));
8708 if (cpu == cpumask_first(d->this_sibling_map)) 8707 if (cpu == cpumask_first(d->this_sibling_map))
8709 init_sched_build_groups(d->this_sibling_map, cpu_map, 8708 init_sched_build_groups(d->this_sibling_map, cpu_map,
8710 &cpu_to_cpu_group, 8709 &cpu_to_cpu_group,
8711 d->send_covered, d->tmpmask); 8710 d->send_covered, d->tmpmask);
8712 break; 8711 break;
8713 #endif 8712 #endif
8714 #ifdef CONFIG_SCHED_MC 8713 #ifdef CONFIG_SCHED_MC
8715 case SD_LV_MC: /* set up multi-core groups */ 8714 case SD_LV_MC: /* set up multi-core groups */
8716 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 8715 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8717 if (cpu == cpumask_first(d->this_core_map)) 8716 if (cpu == cpumask_first(d->this_core_map))
8718 init_sched_build_groups(d->this_core_map, cpu_map, 8717 init_sched_build_groups(d->this_core_map, cpu_map,
8719 &cpu_to_core_group, 8718 &cpu_to_core_group,
8720 d->send_covered, d->tmpmask); 8719 d->send_covered, d->tmpmask);
8721 break; 8720 break;
8722 #endif 8721 #endif
8723 case SD_LV_CPU: /* set up physical groups */ 8722 case SD_LV_CPU: /* set up physical groups */
8724 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 8723 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8725 if (!cpumask_empty(d->nodemask)) 8724 if (!cpumask_empty(d->nodemask))
8726 init_sched_build_groups(d->nodemask, cpu_map, 8725 init_sched_build_groups(d->nodemask, cpu_map,
8727 &cpu_to_phys_group, 8726 &cpu_to_phys_group,
8728 d->send_covered, d->tmpmask); 8727 d->send_covered, d->tmpmask);
8729 break; 8728 break;
8730 #ifdef CONFIG_NUMA 8729 #ifdef CONFIG_NUMA
8731 case SD_LV_ALLNODES: 8730 case SD_LV_ALLNODES:
8732 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 8731 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8733 d->send_covered, d->tmpmask); 8732 d->send_covered, d->tmpmask);
8734 break; 8733 break;
8735 #endif 8734 #endif
8736 default: 8735 default:
8737 break; 8736 break;
8738 } 8737 }
8739 } 8738 }
8740 8739
8741 /* 8740 /*
8742 * Build sched domains for a given set of cpus and attach the sched domains 8741 * Build sched domains for a given set of cpus and attach the sched domains
8743 * to the individual cpus 8742 * to the individual cpus
8744 */ 8743 */
8745 static int __build_sched_domains(const struct cpumask *cpu_map, 8744 static int __build_sched_domains(const struct cpumask *cpu_map,
8746 struct sched_domain_attr *attr) 8745 struct sched_domain_attr *attr)
8747 { 8746 {
8748 enum s_alloc alloc_state = sa_none; 8747 enum s_alloc alloc_state = sa_none;
8749 struct s_data d; 8748 struct s_data d;
8750 struct sched_domain *sd; 8749 struct sched_domain *sd;
8751 int i; 8750 int i;
8752 #ifdef CONFIG_NUMA 8751 #ifdef CONFIG_NUMA
8753 d.sd_allnodes = 0; 8752 d.sd_allnodes = 0;
8754 #endif 8753 #endif
8755 8754
8756 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 8755 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8757 if (alloc_state != sa_rootdomain) 8756 if (alloc_state != sa_rootdomain)
8758 goto error; 8757 goto error;
8759 alloc_state = sa_sched_groups; 8758 alloc_state = sa_sched_groups;
8760 8759
8761 /* 8760 /*
8762 * Set up domains for cpus specified by the cpu_map. 8761 * Set up domains for cpus specified by the cpu_map.
8763 */ 8762 */
8764 for_each_cpu(i, cpu_map) { 8763 for_each_cpu(i, cpu_map) {
8765 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 8764 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8766 cpu_map); 8765 cpu_map);
8767 8766
8768 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 8767 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8769 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 8768 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8770 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 8769 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8771 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 8770 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8772 } 8771 }
8773 8772
8774 for_each_cpu(i, cpu_map) { 8773 for_each_cpu(i, cpu_map) {
8775 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 8774 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8776 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 8775 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8777 } 8776 }
8778 8777
8779 /* Set up physical groups */ 8778 /* Set up physical groups */
8780 for (i = 0; i < nr_node_ids; i++) 8779 for (i = 0; i < nr_node_ids; i++)
8781 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 8780 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8782 8781
8783 #ifdef CONFIG_NUMA 8782 #ifdef CONFIG_NUMA
8784 /* Set up node groups */ 8783 /* Set up node groups */
8785 if (d.sd_allnodes) 8784 if (d.sd_allnodes)
8786 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 8785 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8787 8786
8788 for (i = 0; i < nr_node_ids; i++) 8787 for (i = 0; i < nr_node_ids; i++)
8789 if (build_numa_sched_groups(&d, cpu_map, i)) 8788 if (build_numa_sched_groups(&d, cpu_map, i))
8790 goto error; 8789 goto error;
8791 #endif 8790 #endif
8792 8791
8793 /* Calculate CPU power for physical packages and nodes */ 8792 /* Calculate CPU power for physical packages and nodes */
8794 #ifdef CONFIG_SCHED_SMT 8793 #ifdef CONFIG_SCHED_SMT
8795 for_each_cpu(i, cpu_map) { 8794 for_each_cpu(i, cpu_map) {
8796 sd = &per_cpu(cpu_domains, i).sd; 8795 sd = &per_cpu(cpu_domains, i).sd;
8797 init_sched_groups_power(i, sd); 8796 init_sched_groups_power(i, sd);
8798 } 8797 }
8799 #endif 8798 #endif
8800 #ifdef CONFIG_SCHED_MC 8799 #ifdef CONFIG_SCHED_MC
8801 for_each_cpu(i, cpu_map) { 8800 for_each_cpu(i, cpu_map) {
8802 sd = &per_cpu(core_domains, i).sd; 8801 sd = &per_cpu(core_domains, i).sd;
8803 init_sched_groups_power(i, sd); 8802 init_sched_groups_power(i, sd);
8804 } 8803 }
8805 #endif 8804 #endif
8806 8805
8807 for_each_cpu(i, cpu_map) { 8806 for_each_cpu(i, cpu_map) {
8808 sd = &per_cpu(phys_domains, i).sd; 8807 sd = &per_cpu(phys_domains, i).sd;
8809 init_sched_groups_power(i, sd); 8808 init_sched_groups_power(i, sd);
8810 } 8809 }
8811 8810
8812 #ifdef CONFIG_NUMA 8811 #ifdef CONFIG_NUMA
8813 for (i = 0; i < nr_node_ids; i++) 8812 for (i = 0; i < nr_node_ids; i++)
8814 init_numa_sched_groups_power(d.sched_group_nodes[i]); 8813 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8815 8814
8816 if (d.sd_allnodes) { 8815 if (d.sd_allnodes) {
8817 struct sched_group *sg; 8816 struct sched_group *sg;
8818 8817
8819 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8818 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8820 d.tmpmask); 8819 d.tmpmask);
8821 init_numa_sched_groups_power(sg); 8820 init_numa_sched_groups_power(sg);
8822 } 8821 }
8823 #endif 8822 #endif
8824 8823
8825 /* Attach the domains */ 8824 /* Attach the domains */
8826 for_each_cpu(i, cpu_map) { 8825 for_each_cpu(i, cpu_map) {
8827 #ifdef CONFIG_SCHED_SMT 8826 #ifdef CONFIG_SCHED_SMT
8828 sd = &per_cpu(cpu_domains, i).sd; 8827 sd = &per_cpu(cpu_domains, i).sd;
8829 #elif defined(CONFIG_SCHED_MC) 8828 #elif defined(CONFIG_SCHED_MC)
8830 sd = &per_cpu(core_domains, i).sd; 8829 sd = &per_cpu(core_domains, i).sd;
8831 #else 8830 #else
8832 sd = &per_cpu(phys_domains, i).sd; 8831 sd = &per_cpu(phys_domains, i).sd;
8833 #endif 8832 #endif
8834 cpu_attach_domain(sd, d.rd, i); 8833 cpu_attach_domain(sd, d.rd, i);
8835 } 8834 }
8836 8835
8837 d.sched_group_nodes = NULL; /* don't free this we still need it */ 8836 d.sched_group_nodes = NULL; /* don't free this we still need it */
8838 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 8837 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8839 return 0; 8838 return 0;
8840 8839
8841 error: 8840 error:
8842 __free_domain_allocs(&d, alloc_state, cpu_map); 8841 __free_domain_allocs(&d, alloc_state, cpu_map);
8843 return -ENOMEM; 8842 return -ENOMEM;
8844 } 8843 }
8845 8844
8846 static int build_sched_domains(const struct cpumask *cpu_map) 8845 static int build_sched_domains(const struct cpumask *cpu_map)
8847 { 8846 {
8848 return __build_sched_domains(cpu_map, NULL); 8847 return __build_sched_domains(cpu_map, NULL);
8849 } 8848 }
8850 8849
8851 static struct cpumask *doms_cur; /* current sched domains */ 8850 static struct cpumask *doms_cur; /* current sched domains */
8852 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8851 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8853 static struct sched_domain_attr *dattr_cur; 8852 static struct sched_domain_attr *dattr_cur;
8854 /* attributes of custom domains in 'doms_cur' */ 8853 /* attributes of custom domains in 'doms_cur' */
8855 8854
8856 /* 8855 /*
8857 * Special case: If a kmalloc of a doms_cur partition (array of 8856 * Special case: If a kmalloc of a doms_cur partition (array of
8858 * cpumask) fails, then fallback to a single sched domain, 8857 * cpumask) fails, then fallback to a single sched domain,
8859 * as determined by the single cpumask fallback_doms. 8858 * as determined by the single cpumask fallback_doms.
8860 */ 8859 */
8861 static cpumask_var_t fallback_doms; 8860 static cpumask_var_t fallback_doms;
8862 8861
8863 /* 8862 /*
8864 * arch_update_cpu_topology lets virtualized architectures update the 8863 * arch_update_cpu_topology lets virtualized architectures update the
8865 * cpu core maps. It is supposed to return 1 if the topology changed 8864 * cpu core maps. It is supposed to return 1 if the topology changed
8866 * or 0 if it stayed the same. 8865 * or 0 if it stayed the same.
8867 */ 8866 */
8868 int __attribute__((weak)) arch_update_cpu_topology(void) 8867 int __attribute__((weak)) arch_update_cpu_topology(void)
8869 { 8868 {
8870 return 0; 8869 return 0;
8871 } 8870 }
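The weak default above always reports an unchanged topology. As a minimal sketch of what an architecture override might look like (hypothetical; a real port would query firmware or a hypervisor for the current core maps), the only contract is to return 1 when the maps changed since the last call and 0 otherwise, so that the repartitioning code further down knows whether to rebuild every domain:

/*
 * Hypothetical override of the weak arch_update_cpu_topology() above.
 * topology_generation stands in for whatever mechanism the (imaginary)
 * arch code uses to record that the core maps changed.
 */
static int topology_generation;
static int last_seen_generation;

int arch_update_cpu_topology(void)
{
	if (topology_generation == last_seen_generation)
		return 0;		/* core maps unchanged */

	last_seen_generation = topology_generation;
	return 1;		/* core maps changed: rebuild all domains */
}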
8872 8871
8873 /* 8872 /*
8874 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8873 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8875 * For now this just excludes isolated cpus, but could be used to 8874 * For now this just excludes isolated cpus, but could be used to
8876 * exclude other special cases in the future. 8875 * exclude other special cases in the future.
8877 */ 8876 */
8878 static int arch_init_sched_domains(const struct cpumask *cpu_map) 8877 static int arch_init_sched_domains(const struct cpumask *cpu_map)
8879 { 8878 {
8880 int err; 8879 int err;
8881 8880
8882 arch_update_cpu_topology(); 8881 arch_update_cpu_topology();
8883 ndoms_cur = 1; 8882 ndoms_cur = 1;
8884 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8883 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
8885 if (!doms_cur) 8884 if (!doms_cur)
8886 doms_cur = fallback_doms; 8885 doms_cur = fallback_doms;
8887 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 8886 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
8888 dattr_cur = NULL; 8887 dattr_cur = NULL;
8889 err = build_sched_domains(doms_cur); 8888 err = build_sched_domains(doms_cur);
8890 register_sched_domain_sysctl(); 8889 register_sched_domain_sysctl();
8891 8890
8892 return err; 8891 return err;
8893 } 8892 }
8894 8893
8895 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 8894 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
8896 struct cpumask *tmpmask) 8895 struct cpumask *tmpmask)
8897 { 8896 {
8898 free_sched_groups(cpu_map, tmpmask); 8897 free_sched_groups(cpu_map, tmpmask);
8899 } 8898 }
8900 8899
8901 /* 8900 /*
8902 * Detach sched domains from a group of cpus specified in cpu_map 8901 * Detach sched domains from a group of cpus specified in cpu_map
8903 * These cpus will now be attached to the NULL domain 8902 * These cpus will now be attached to the NULL domain
8904 */ 8903 */
8905 static void detach_destroy_domains(const struct cpumask *cpu_map) 8904 static void detach_destroy_domains(const struct cpumask *cpu_map)
8906 { 8905 {
8907 /* Static to save stack; safe because the hotplug lock is held. */ 8906 /* Static to save stack; safe because the hotplug lock is held. */
8908 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 8907 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
8909 int i; 8908 int i;
8910 8909
8911 for_each_cpu(i, cpu_map) 8910 for_each_cpu(i, cpu_map)
8912 cpu_attach_domain(NULL, &def_root_domain, i); 8911 cpu_attach_domain(NULL, &def_root_domain, i);
8913 synchronize_sched(); 8912 synchronize_sched();
8914 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 8913 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
8915 } 8914 }
8916 8915
8917 /* handle null as "default" */ 8916 /* handle null as "default" */
8918 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 8917 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8919 struct sched_domain_attr *new, int idx_new) 8918 struct sched_domain_attr *new, int idx_new)
8920 { 8919 {
8921 struct sched_domain_attr tmp; 8920 struct sched_domain_attr tmp;
8922 8921
8923 /* fast path */ 8922 /* fast path */
8924 if (!new && !cur) 8923 if (!new && !cur)
8925 return 1; 8924 return 1;
8926 8925
8927 tmp = SD_ATTR_INIT; 8926 tmp = SD_ATTR_INIT;
8928 return !memcmp(cur ? (cur + idx_cur) : &tmp, 8927 return !memcmp(cur ? (cur + idx_cur) : &tmp,
8929 new ? (new + idx_new) : &tmp, 8928 new ? (new + idx_new) : &tmp,
8930 sizeof(struct sched_domain_attr)); 8929 sizeof(struct sched_domain_attr));
8931 } 8930 }
8932 8931
8933 /* 8932 /*
8934 * Partition sched domains as specified by the 'ndoms_new' 8933 * Partition sched domains as specified by the 'ndoms_new'
8935 * cpumasks in the array doms_new[] of cpumasks. This compares 8934 * cpumasks in the array doms_new[] of cpumasks. This compares
8936 * doms_new[] to the current sched domain partitioning, doms_cur[]. 8935 * doms_new[] to the current sched domain partitioning, doms_cur[].
8937 * It destroys each deleted domain and builds each new domain. 8936 * It destroys each deleted domain and builds each new domain.
8938 * 8937 *
8939 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 8938 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
8940 * The masks don't intersect (don't overlap). We should set up one 8939 * The masks don't intersect (don't overlap). We should set up one
8941 * sched domain for each mask. CPUs not in any of the cpumasks will 8940 * sched domain for each mask. CPUs not in any of the cpumasks will
8942 * not be load balanced. If the same cpumask appears both in the 8941 * not be load balanced. If the same cpumask appears both in the
8943 * current 'doms_cur' domains and in the new 'doms_new', we can leave 8942 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8944 * it as it is. 8943 * it as it is.
8945 * 8944 *
8946 * The passed in 'doms_new' should be kmalloc'd. This routine takes 8945 * The passed in 'doms_new' should be kmalloc'd. This routine takes
8947 * ownership of it and will kfree it when done with it. If the caller 8946 * ownership of it and will kfree it when done with it. If the caller
8948 * failed the kmalloc call, then it can pass in doms_new == NULL && 8947 * failed the kmalloc call, then it can pass in doms_new == NULL &&
8949 * ndoms_new == 1, and partition_sched_domains() will fall back to 8948 * ndoms_new == 1, and partition_sched_domains() will fall back to
8950 * the single partition 'fallback_doms'; this also forces the domains 8949 * the single partition 'fallback_doms'; this also forces the domains
8951 * to be rebuilt. 8950 * to be rebuilt.
8952 * 8951 *
8953 * If doms_new == NULL it will be replaced with cpu_online_mask. 8952 * If doms_new == NULL it will be replaced with cpu_online_mask.
8954 * ndoms_new == 0 is a special case for destroying existing domains, 8953 * ndoms_new == 0 is a special case for destroying existing domains,
8955 * and it will not create the default domain. 8954 * and it will not create the default domain.
8956 * 8955 *
8957 * Call with hotplug lock held 8956 * Call with hotplug lock held
8958 */ 8957 */
8959 /* FIXME: Change to struct cpumask *doms_new[] */ 8958 /* FIXME: Change to struct cpumask *doms_new[] */
8960 void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, 8959 void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8961 struct sched_domain_attr *dattr_new) 8960 struct sched_domain_attr *dattr_new)
8962 { 8961 {
8963 int i, j, n; 8962 int i, j, n;
8964 int new_topology; 8963 int new_topology;
8965 8964
8966 mutex_lock(&sched_domains_mutex); 8965 mutex_lock(&sched_domains_mutex);
8967 8966
8968 /* always unregister in case we don't destroy any domains */ 8967 /* always unregister in case we don't destroy any domains */
8969 unregister_sched_domain_sysctl(); 8968 unregister_sched_domain_sysctl();
8970 8969
8971 /* Let architecture update cpu core mappings. */ 8970 /* Let architecture update cpu core mappings. */
8972 new_topology = arch_update_cpu_topology(); 8971 new_topology = arch_update_cpu_topology();
8973 8972
8974 n = doms_new ? ndoms_new : 0; 8973 n = doms_new ? ndoms_new : 0;
8975 8974
8976 /* Destroy deleted domains */ 8975 /* Destroy deleted domains */
8977 for (i = 0; i < ndoms_cur; i++) { 8976 for (i = 0; i < ndoms_cur; i++) {
8978 for (j = 0; j < n && !new_topology; j++) { 8977 for (j = 0; j < n && !new_topology; j++) {
8979 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 8978 if (cpumask_equal(&doms_cur[i], &doms_new[j])
8980 && dattrs_equal(dattr_cur, i, dattr_new, j)) 8979 && dattrs_equal(dattr_cur, i, dattr_new, j))
8981 goto match1; 8980 goto match1;
8982 } 8981 }
8983 /* no match - a current sched domain not in new doms_new[] */ 8982 /* no match - a current sched domain not in new doms_new[] */
8984 detach_destroy_domains(doms_cur + i); 8983 detach_destroy_domains(doms_cur + i);
8985 match1: 8984 match1:
8986 ; 8985 ;
8987 } 8986 }
8988 8987
8989 if (doms_new == NULL) { 8988 if (doms_new == NULL) {
8990 ndoms_cur = 0; 8989 ndoms_cur = 0;
8991 doms_new = fallback_doms; 8990 doms_new = fallback_doms;
8992 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 8991 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
8993 WARN_ON_ONCE(dattr_new); 8992 WARN_ON_ONCE(dattr_new);
8994 } 8993 }
8995 8994
8996 /* Build new domains */ 8995 /* Build new domains */
8997 for (i = 0; i < ndoms_new; i++) { 8996 for (i = 0; i < ndoms_new; i++) {
8998 for (j = 0; j < ndoms_cur && !new_topology; j++) { 8997 for (j = 0; j < ndoms_cur && !new_topology; j++) {
8999 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 8998 if (cpumask_equal(&doms_new[i], &doms_cur[j])
9000 && dattrs_equal(dattr_new, i, dattr_cur, j)) 8999 && dattrs_equal(dattr_new, i, dattr_cur, j))
9001 goto match2; 9000 goto match2;
9002 } 9001 }
9003 /* no match - add a new doms_new */ 9002 /* no match - add a new doms_new */
9004 __build_sched_domains(doms_new + i, 9003 __build_sched_domains(doms_new + i,
9005 dattr_new ? dattr_new + i : NULL); 9004 dattr_new ? dattr_new + i : NULL);
9006 match2: 9005 match2:
9007 ; 9006 ;
9008 } 9007 }
9009 9008
9010 /* Remember the new sched domains */ 9009 /* Remember the new sched domains */
9011 if (doms_cur != fallback_doms) 9010 if (doms_cur != fallback_doms)
9012 kfree(doms_cur); 9011 kfree(doms_cur);
9013 kfree(dattr_cur); /* kfree(NULL) is safe */ 9012 kfree(dattr_cur); /* kfree(NULL) is safe */
9014 doms_cur = doms_new; 9013 doms_cur = doms_new;
9015 dattr_cur = dattr_new; 9014 dattr_cur = dattr_new;
9016 ndoms_cur = ndoms_new; 9015 ndoms_cur = ndoms_new;
9017 9016
9018 register_sched_domain_sysctl(); 9017 register_sched_domain_sysctl();
9019 9018
9020 mutex_unlock(&sched_domains_mutex); 9019 mutex_unlock(&sched_domains_mutex);
9021 } 9020 }
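To make the ownership and fallback rules described above concrete, here is a usage sketch; the caller is hypothetical and follows the file's current flat-array convention flagged in the FIXME. The array is kmalloc'd because partition_sched_domains() keeps the pointer as the new doms_cur and kfrees it on the next repartition, while passing NULL with ndoms_new == 1 requests the single fallback partition:

/* Hypothetical caller: split the online CPUs into two balance partitions. */
static void example_repartition(void)
{
	struct cpumask *doms = kmalloc(2 * cpumask_size(), GFP_KERNEL);

	if (!doms) {
		/* allocation failed: fall back to one default partition */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_clear(&doms[0]);
	cpumask_set_cpu(0, &doms[0]);			/* CPU 0 balanced alone */
	cpumask_andnot(&doms[1], cpu_online_mask, &doms[0]);

	get_online_cpus();				/* hotplug lock must be held */
	partition_sched_domains(2, doms, NULL);		/* takes ownership of doms */
	put_online_cpus();
}

Note that the caller must not kfree doms afterwards; the next repartition (or a NULL/fallback request) releases it.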
9022 9021
9023 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 9022 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
9024 static void arch_reinit_sched_domains(void) 9023 static void arch_reinit_sched_domains(void)
9025 { 9024 {
9026 get_online_cpus(); 9025 get_online_cpus();
9027 9026
9028 /* Destroy domains first to force the rebuild */ 9027 /* Destroy domains first to force the rebuild */
9029 partition_sched_domains(0, NULL, NULL); 9028 partition_sched_domains(0, NULL, NULL);
9030 9029
9031 rebuild_sched_domains(); 9030 rebuild_sched_domains();
9032 put_online_cpus(); 9031 put_online_cpus();
9033 } 9032 }
9034 9033
9035 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 9034 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9036 { 9035 {
9037 unsigned int level = 0; 9036 unsigned int level = 0;
9038 9037
9039 if (sscanf(buf, "%u", &level) != 1) 9038 if (sscanf(buf, "%u", &level) != 1)
9040 return -EINVAL; 9039 return -EINVAL;
9041 9040
9042 /* 9041 /*
9043 * level is always positive, so don't check for 9042 * level is always positive, so don't check for
9044 * level < POWERSAVINGS_BALANCE_NONE which is 0 9043 * level < POWERSAVINGS_BALANCE_NONE which is 0
9045 * What happens on a 0 or 1 byte write? Do we 9044 * What happens on a 0 or 1 byte write? Do we
9046 * need to check count as well? 9045 * need to check count as well?
9047 */ 9046 */
9048 9047
9049 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 9048 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
9050 return -EINVAL; 9049 return -EINVAL;
9051 9050
9052 if (smt) 9051 if (smt)
9053 sched_smt_power_savings = level; 9052 sched_smt_power_savings = level;
9054 else 9053 else
9055 sched_mc_power_savings = level; 9054 sched_mc_power_savings = level;
9056 9055
9057 arch_reinit_sched_domains(); 9056 arch_reinit_sched_domains();
9058 9057
9059 return count; 9058 return count;
9060 } 9059 }
9061 9060
9062 #ifdef CONFIG_SCHED_MC 9061 #ifdef CONFIG_SCHED_MC
9063 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 9062 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
9064 char *page) 9063 char *page)
9065 { 9064 {
9066 return sprintf(page, "%u\n", sched_mc_power_savings); 9065 return sprintf(page, "%u\n", sched_mc_power_savings);
9067 } 9066 }
9068 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 9067 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
9069 const char *buf, size_t count) 9068 const char *buf, size_t count)
9070 { 9069 {
9071 return sched_power_savings_store(buf, count, 0); 9070 return sched_power_savings_store(buf, count, 0);
9072 } 9071 }
9073 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 9072 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9074 sched_mc_power_savings_show, 9073 sched_mc_power_savings_show,
9075 sched_mc_power_savings_store); 9074 sched_mc_power_savings_store);
9076 #endif 9075 #endif
9077 9076
9078 #ifdef CONFIG_SCHED_SMT 9077 #ifdef CONFIG_SCHED_SMT
9079 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 9078 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
9080 char *page) 9079 char *page)
9081 { 9080 {
9082 return sprintf(page, "%u\n", sched_smt_power_savings); 9081 return sprintf(page, "%u\n", sched_smt_power_savings);
9083 } 9082 }
9084 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 9083 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
9085 const char *buf, size_t count) 9084 const char *buf, size_t count)
9086 { 9085 {
9087 return sched_power_savings_store(buf, count, 1); 9086 return sched_power_savings_store(buf, count, 1);
9088 } 9087 }
9089 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 9088 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
9090 sched_smt_power_savings_show, 9089 sched_smt_power_savings_show,
9091 sched_smt_power_savings_store); 9090 sched_smt_power_savings_store);
9092 #endif 9091 #endif
9093 9092
9094 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 9093 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
9095 { 9094 {
9096 int err = 0; 9095 int err = 0;
9097 9096
9098 #ifdef CONFIG_SCHED_SMT 9097 #ifdef CONFIG_SCHED_SMT
9099 if (smt_capable()) 9098 if (smt_capable())
9100 err = sysfs_create_file(&cls->kset.kobj, 9099 err = sysfs_create_file(&cls->kset.kobj,
9101 &attr_sched_smt_power_savings.attr); 9100 &attr_sched_smt_power_savings.attr);
9102 #endif 9101 #endif
9103 #ifdef CONFIG_SCHED_MC 9102 #ifdef CONFIG_SCHED_MC
9104 if (!err && mc_capable()) 9103 if (!err && mc_capable())
9105 err = sysfs_create_file(&cls->kset.kobj, 9104 err = sysfs_create_file(&cls->kset.kobj,
9106 &attr_sched_mc_power_savings.attr); 9105 &attr_sched_mc_power_savings.attr);
9107 #endif 9106 #endif
9108 return err; 9107 return err;
9109 } 9108 }
9110 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 9109 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
9111 9110
9112 #ifndef CONFIG_CPUSETS 9111 #ifndef CONFIG_CPUSETS
9113 /* 9112 /*
9114 * Add online and remove offline CPUs from the scheduler domains. 9113 * Add online and remove offline CPUs from the scheduler domains.
9115 * When cpusets are enabled they take over this function. 9114 * When cpusets are enabled they take over this function.
9116 */ 9115 */
9117 static int update_sched_domains(struct notifier_block *nfb, 9116 static int update_sched_domains(struct notifier_block *nfb,
9118 unsigned long action, void *hcpu) 9117 unsigned long action, void *hcpu)
9119 { 9118 {
9120 switch (action) { 9119 switch (action) {
9121 case CPU_ONLINE: 9120 case CPU_ONLINE:
9122 case CPU_ONLINE_FROZEN: 9121 case CPU_ONLINE_FROZEN:
9123 case CPU_DEAD: 9122 case CPU_DEAD:
9124 case CPU_DEAD_FROZEN: 9123 case CPU_DEAD_FROZEN:
9125 partition_sched_domains(1, NULL, NULL); 9124 partition_sched_domains(1, NULL, NULL);
9126 return NOTIFY_OK; 9125 return NOTIFY_OK;
9127 9126
9128 default: 9127 default:
9129 return NOTIFY_DONE; 9128 return NOTIFY_DONE;
9130 } 9129 }
9131 } 9130 }
9132 #endif 9131 #endif
9133 9132
9134 static int update_runtime(struct notifier_block *nfb, 9133 static int update_runtime(struct notifier_block *nfb,
9135 unsigned long action, void *hcpu) 9134 unsigned long action, void *hcpu)
9136 { 9135 {
9137 int cpu = (int)(long)hcpu; 9136 int cpu = (int)(long)hcpu;
9138 9137
9139 switch (action) { 9138 switch (action) {
9140 case CPU_DOWN_PREPARE: 9139 case CPU_DOWN_PREPARE:
9141 case CPU_DOWN_PREPARE_FROZEN: 9140 case CPU_DOWN_PREPARE_FROZEN:
9142 disable_runtime(cpu_rq(cpu)); 9141 disable_runtime(cpu_rq(cpu));
9143 return NOTIFY_OK; 9142 return NOTIFY_OK;
9144 9143
9145 case CPU_DOWN_FAILED: 9144 case CPU_DOWN_FAILED:
9146 case CPU_DOWN_FAILED_FROZEN: 9145 case CPU_DOWN_FAILED_FROZEN:
9147 case CPU_ONLINE: 9146 case CPU_ONLINE:
9148 case CPU_ONLINE_FROZEN: 9147 case CPU_ONLINE_FROZEN:
9149 enable_runtime(cpu_rq(cpu)); 9148 enable_runtime(cpu_rq(cpu));
9150 return NOTIFY_OK; 9149 return NOTIFY_OK;
9151 9150
9152 default: 9151 default:
9153 return NOTIFY_DONE; 9152 return NOTIFY_DONE;
9154 } 9153 }
9155 } 9154 }
9156 9155
9157 void __init sched_init_smp(void) 9156 void __init sched_init_smp(void)
9158 { 9157 {
9159 cpumask_var_t non_isolated_cpus; 9158 cpumask_var_t non_isolated_cpus;
9160 9159
9161 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9160 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9162 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 9161 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9163 9162
9164 #if defined(CONFIG_NUMA) 9163 #if defined(CONFIG_NUMA)
9165 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9164 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
9166 GFP_KERNEL); 9165 GFP_KERNEL);
9167 BUG_ON(sched_group_nodes_bycpu == NULL); 9166 BUG_ON(sched_group_nodes_bycpu == NULL);
9168 #endif 9167 #endif
9169 get_online_cpus(); 9168 get_online_cpus();
9170 mutex_lock(&sched_domains_mutex); 9169 mutex_lock(&sched_domains_mutex);
9171 arch_init_sched_domains(cpu_online_mask); 9170 arch_init_sched_domains(cpu_online_mask);
9172 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9171 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9173 if (cpumask_empty(non_isolated_cpus)) 9172 if (cpumask_empty(non_isolated_cpus))
9174 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9173 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
9175 mutex_unlock(&sched_domains_mutex); 9174 mutex_unlock(&sched_domains_mutex);
9176 put_online_cpus(); 9175 put_online_cpus();
9177 9176
9178 #ifndef CONFIG_CPUSETS 9177 #ifndef CONFIG_CPUSETS
9179 /* XXX: Theoretical race here - CPU may be hotplugged now */ 9178 /* XXX: Theoretical race here - CPU may be hotplugged now */
9180 hotcpu_notifier(update_sched_domains, 0); 9179 hotcpu_notifier(update_sched_domains, 0);
9181 #endif 9180 #endif
9182 9181
9183 /* RT runtime code needs to handle some hotplug events */ 9182 /* RT runtime code needs to handle some hotplug events */
9184 hotcpu_notifier(update_runtime, 0); 9183 hotcpu_notifier(update_runtime, 0);
9185 9184
9186 init_hrtick(); 9185 init_hrtick();
9187 9186
9188 /* Move init over to a non-isolated CPU */ 9187 /* Move init over to a non-isolated CPU */
9189 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 9188 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
9190 BUG(); 9189 BUG();
9191 sched_init_granularity(); 9190 sched_init_granularity();
9192 free_cpumask_var(non_isolated_cpus); 9191 free_cpumask_var(non_isolated_cpus);
9193 9192
9194 init_sched_rt_class(); 9193 init_sched_rt_class();
9195 } 9194 }
9196 #else 9195 #else
9197 void __init sched_init_smp(void) 9196 void __init sched_init_smp(void)
9198 { 9197 {
9199 sched_init_granularity(); 9198 sched_init_granularity();
9200 } 9199 }
9201 #endif /* CONFIG_SMP */ 9200 #endif /* CONFIG_SMP */
9202 9201
9203 const_debug unsigned int sysctl_timer_migration = 1; 9202 const_debug unsigned int sysctl_timer_migration = 1;
9204 9203
9205 int in_sched_functions(unsigned long addr) 9204 int in_sched_functions(unsigned long addr)
9206 { 9205 {
9207 return in_lock_functions(addr) || 9206 return in_lock_functions(addr) ||
9208 (addr >= (unsigned long)__sched_text_start 9207 (addr >= (unsigned long)__sched_text_start
9209 && addr < (unsigned long)__sched_text_end); 9208 && addr < (unsigned long)__sched_text_end);
9210 } 9209 }
9211 9210
9212 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 9211 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
9213 { 9212 {
9214 cfs_rq->tasks_timeline = RB_ROOT; 9213 cfs_rq->tasks_timeline = RB_ROOT;
9215 INIT_LIST_HEAD(&cfs_rq->tasks); 9214 INIT_LIST_HEAD(&cfs_rq->tasks);
9216 #ifdef CONFIG_FAIR_GROUP_SCHED 9215 #ifdef CONFIG_FAIR_GROUP_SCHED
9217 cfs_rq->rq = rq; 9216 cfs_rq->rq = rq;
9218 #endif 9217 #endif
9219 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 9218 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9220 } 9219 }
9221 9220
9222 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 9221 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9223 { 9222 {
9224 struct rt_prio_array *array; 9223 struct rt_prio_array *array;
9225 int i; 9224 int i;
9226 9225
9227 array = &rt_rq->active; 9226 array = &rt_rq->active;
9228 for (i = 0; i < MAX_RT_PRIO; i++) { 9227 for (i = 0; i < MAX_RT_PRIO; i++) {
9229 INIT_LIST_HEAD(array->queue + i); 9228 INIT_LIST_HEAD(array->queue + i);
9230 __clear_bit(i, array->bitmap); 9229 __clear_bit(i, array->bitmap);
9231 } 9230 }
9232 /* delimiter for bitsearch: */ 9231 /* delimiter for bitsearch: */
9233 __set_bit(MAX_RT_PRIO, array->bitmap); 9232 __set_bit(MAX_RT_PRIO, array->bitmap);
9234 9233
9235 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 9234 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
9236 rt_rq->highest_prio.curr = MAX_RT_PRIO; 9235 rt_rq->highest_prio.curr = MAX_RT_PRIO;
9237 #ifdef CONFIG_SMP 9236 #ifdef CONFIG_SMP
9238 rt_rq->highest_prio.next = MAX_RT_PRIO; 9237 rt_rq->highest_prio.next = MAX_RT_PRIO;
9239 #endif 9238 #endif
9240 #endif 9239 #endif
9241 #ifdef CONFIG_SMP 9240 #ifdef CONFIG_SMP
9242 rt_rq->rt_nr_migratory = 0; 9241 rt_rq->rt_nr_migratory = 0;
9243 rt_rq->overloaded = 0; 9242 rt_rq->overloaded = 0;
9244 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9243 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9245 #endif 9244 #endif
9246 9245
9247 rt_rq->rt_time = 0; 9246 rt_rq->rt_time = 0;
9248 rt_rq->rt_throttled = 0; 9247 rt_rq->rt_throttled = 0;
9249 rt_rq->rt_runtime = 0; 9248 rt_rq->rt_runtime = 0;
9250 spin_lock_init(&rt_rq->rt_runtime_lock); 9249 spin_lock_init(&rt_rq->rt_runtime_lock);
9251 9250
9252 #ifdef CONFIG_RT_GROUP_SCHED 9251 #ifdef CONFIG_RT_GROUP_SCHED
9253 rt_rq->rt_nr_boosted = 0; 9252 rt_rq->rt_nr_boosted = 0;
9254 rt_rq->rq = rq; 9253 rt_rq->rq = rq;
9255 #endif 9254 #endif
9256 } 9255 }
9257 9256
9258 #ifdef CONFIG_FAIR_GROUP_SCHED 9257 #ifdef CONFIG_FAIR_GROUP_SCHED
9259 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 9258 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9260 struct sched_entity *se, int cpu, int add, 9259 struct sched_entity *se, int cpu, int add,
9261 struct sched_entity *parent) 9260 struct sched_entity *parent)
9262 { 9261 {
9263 struct rq *rq = cpu_rq(cpu); 9262 struct rq *rq = cpu_rq(cpu);
9264 tg->cfs_rq[cpu] = cfs_rq; 9263 tg->cfs_rq[cpu] = cfs_rq;
9265 init_cfs_rq(cfs_rq, rq); 9264 init_cfs_rq(cfs_rq, rq);
9266 cfs_rq->tg = tg; 9265 cfs_rq->tg = tg;
9267 if (add) 9266 if (add)
9268 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 9267 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
9269 9268
9270 tg->se[cpu] = se; 9269 tg->se[cpu] = se;
9271 /* se could be NULL for init_task_group */ 9270 /* se could be NULL for init_task_group */
9272 if (!se) 9271 if (!se)
9273 return; 9272 return;
9274 9273
9275 if (!parent) 9274 if (!parent)
9276 se->cfs_rq = &rq->cfs; 9275 se->cfs_rq = &rq->cfs;
9277 else 9276 else
9278 se->cfs_rq = parent->my_q; 9277 se->cfs_rq = parent->my_q;
9279 9278
9280 se->my_q = cfs_rq; 9279 se->my_q = cfs_rq;
9281 se->load.weight = tg->shares; 9280 se->load.weight = tg->shares;
9282 se->load.inv_weight = 0; 9281 se->load.inv_weight = 0;
9283 se->parent = parent; 9282 se->parent = parent;
9284 } 9283 }
9285 #endif 9284 #endif
9286 9285
9287 #ifdef CONFIG_RT_GROUP_SCHED 9286 #ifdef CONFIG_RT_GROUP_SCHED
9288 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 9287 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9289 struct sched_rt_entity *rt_se, int cpu, int add, 9288 struct sched_rt_entity *rt_se, int cpu, int add,
9290 struct sched_rt_entity *parent) 9289 struct sched_rt_entity *parent)
9291 { 9290 {
9292 struct rq *rq = cpu_rq(cpu); 9291 struct rq *rq = cpu_rq(cpu);
9293 9292
9294 tg->rt_rq[cpu] = rt_rq; 9293 tg->rt_rq[cpu] = rt_rq;
9295 init_rt_rq(rt_rq, rq); 9294 init_rt_rq(rt_rq, rq);
9296 rt_rq->tg = tg; 9295 rt_rq->tg = tg;
9297 rt_rq->rt_se = rt_se; 9296 rt_rq->rt_se = rt_se;
9298 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 9297 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9299 if (add) 9298 if (add)
9300 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 9299 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
9301 9300
9302 tg->rt_se[cpu] = rt_se; 9301 tg->rt_se[cpu] = rt_se;
9303 if (!rt_se) 9302 if (!rt_se)
9304 return; 9303 return;
9305 9304
9306 if (!parent) 9305 if (!parent)
9307 rt_se->rt_rq = &rq->rt; 9306 rt_se->rt_rq = &rq->rt;
9308 else 9307 else
9309 rt_se->rt_rq = parent->my_q; 9308 rt_se->rt_rq = parent->my_q;
9310 9309
9311 rt_se->my_q = rt_rq; 9310 rt_se->my_q = rt_rq;
9312 rt_se->parent = parent; 9311 rt_se->parent = parent;
9313 INIT_LIST_HEAD(&rt_se->run_list); 9312 INIT_LIST_HEAD(&rt_se->run_list);
9314 } 9313 }
9315 #endif 9314 #endif
9316 9315
9317 void __init sched_init(void) 9316 void __init sched_init(void)
9318 { 9317 {
9319 int i, j; 9318 int i, j;
9320 unsigned long alloc_size = 0, ptr; 9319 unsigned long alloc_size = 0, ptr;
9321 9320
9322 #ifdef CONFIG_FAIR_GROUP_SCHED 9321 #ifdef CONFIG_FAIR_GROUP_SCHED
9323 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 9322 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9324 #endif 9323 #endif
9325 #ifdef CONFIG_RT_GROUP_SCHED 9324 #ifdef CONFIG_RT_GROUP_SCHED
9326 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 9325 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9327 #endif 9326 #endif
9328 #ifdef CONFIG_USER_SCHED 9327 #ifdef CONFIG_USER_SCHED
9329 alloc_size *= 2; 9328 alloc_size *= 2;
9330 #endif 9329 #endif
9331 #ifdef CONFIG_CPUMASK_OFFSTACK 9330 #ifdef CONFIG_CPUMASK_OFFSTACK
9332 alloc_size += num_possible_cpus() * cpumask_size(); 9331 alloc_size += num_possible_cpus() * cpumask_size();
9333 #endif 9332 #endif
9334 /* 9333 /*
9335 * As sched_init() is called before page_alloc is fully set up, 9334 * As sched_init() is called before page_alloc is fully set up,
9336 * we allocate with kzalloc(GFP_NOWAIT). 9335 * we allocate with kzalloc(GFP_NOWAIT).
9337 */ 9336 */
9338 if (alloc_size) { 9337 if (alloc_size) {
9339 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9338 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9340 9339
9341 #ifdef CONFIG_FAIR_GROUP_SCHED 9340 #ifdef CONFIG_FAIR_GROUP_SCHED
9342 init_task_group.se = (struct sched_entity **)ptr; 9341 init_task_group.se = (struct sched_entity **)ptr;
9343 ptr += nr_cpu_ids * sizeof(void **); 9342 ptr += nr_cpu_ids * sizeof(void **);
9344 9343
9345 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 9344 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9346 ptr += nr_cpu_ids * sizeof(void **); 9345 ptr += nr_cpu_ids * sizeof(void **);
9347 9346
9348 #ifdef CONFIG_USER_SCHED 9347 #ifdef CONFIG_USER_SCHED
9349 root_task_group.se = (struct sched_entity **)ptr; 9348 root_task_group.se = (struct sched_entity **)ptr;
9350 ptr += nr_cpu_ids * sizeof(void **); 9349 ptr += nr_cpu_ids * sizeof(void **);
9351 9350
9352 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 9351 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9353 ptr += nr_cpu_ids * sizeof(void **); 9352 ptr += nr_cpu_ids * sizeof(void **);
9354 #endif /* CONFIG_USER_SCHED */ 9353 #endif /* CONFIG_USER_SCHED */
9355 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9354 #endif /* CONFIG_FAIR_GROUP_SCHED */
9356 #ifdef CONFIG_RT_GROUP_SCHED 9355 #ifdef CONFIG_RT_GROUP_SCHED
9357 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 9356 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
9358 ptr += nr_cpu_ids * sizeof(void **); 9357 ptr += nr_cpu_ids * sizeof(void **);
9359 9358
9360 init_task_group.rt_rq = (struct rt_rq **)ptr; 9359 init_task_group.rt_rq = (struct rt_rq **)ptr;
9361 ptr += nr_cpu_ids * sizeof(void **); 9360 ptr += nr_cpu_ids * sizeof(void **);
9362 9361
9363 #ifdef CONFIG_USER_SCHED 9362 #ifdef CONFIG_USER_SCHED
9364 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 9363 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9365 ptr += nr_cpu_ids * sizeof(void **); 9364 ptr += nr_cpu_ids * sizeof(void **);
9366 9365
9367 root_task_group.rt_rq = (struct rt_rq **)ptr; 9366 root_task_group.rt_rq = (struct rt_rq **)ptr;
9368 ptr += nr_cpu_ids * sizeof(void **); 9367 ptr += nr_cpu_ids * sizeof(void **);
9369 #endif /* CONFIG_USER_SCHED */ 9368 #endif /* CONFIG_USER_SCHED */
9370 #endif /* CONFIG_RT_GROUP_SCHED */ 9369 #endif /* CONFIG_RT_GROUP_SCHED */
9371 #ifdef CONFIG_CPUMASK_OFFSTACK 9370 #ifdef CONFIG_CPUMASK_OFFSTACK
9372 for_each_possible_cpu(i) { 9371 for_each_possible_cpu(i) {
9373 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 9372 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
9374 ptr += cpumask_size(); 9373 ptr += cpumask_size();
9375 } 9374 }
9376 #endif /* CONFIG_CPUMASK_OFFSTACK */ 9375 #endif /* CONFIG_CPUMASK_OFFSTACK */
9377 } 9376 }
9378 9377
9379 #ifdef CONFIG_SMP 9378 #ifdef CONFIG_SMP
9380 init_defrootdomain(); 9379 init_defrootdomain();
9381 #endif 9380 #endif
9382 9381
9383 init_rt_bandwidth(&def_rt_bandwidth, 9382 init_rt_bandwidth(&def_rt_bandwidth,
9384 global_rt_period(), global_rt_runtime()); 9383 global_rt_period(), global_rt_runtime());
9385 9384
9386 #ifdef CONFIG_RT_GROUP_SCHED 9385 #ifdef CONFIG_RT_GROUP_SCHED
9387 init_rt_bandwidth(&init_task_group.rt_bandwidth, 9386 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9388 global_rt_period(), global_rt_runtime()); 9387 global_rt_period(), global_rt_runtime());
9389 #ifdef CONFIG_USER_SCHED 9388 #ifdef CONFIG_USER_SCHED
9390 init_rt_bandwidth(&root_task_group.rt_bandwidth, 9389 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9391 global_rt_period(), RUNTIME_INF); 9390 global_rt_period(), RUNTIME_INF);
9392 #endif /* CONFIG_USER_SCHED */ 9391 #endif /* CONFIG_USER_SCHED */
9393 #endif /* CONFIG_RT_GROUP_SCHED */ 9392 #endif /* CONFIG_RT_GROUP_SCHED */
9394 9393
9395 #ifdef CONFIG_GROUP_SCHED 9394 #ifdef CONFIG_GROUP_SCHED
9396 list_add(&init_task_group.list, &task_groups); 9395 list_add(&init_task_group.list, &task_groups);
9397 INIT_LIST_HEAD(&init_task_group.children); 9396 INIT_LIST_HEAD(&init_task_group.children);
9398 9397
9399 #ifdef CONFIG_USER_SCHED 9398 #ifdef CONFIG_USER_SCHED
9400 INIT_LIST_HEAD(&root_task_group.children); 9399 INIT_LIST_HEAD(&root_task_group.children);
9401 init_task_group.parent = &root_task_group; 9400 init_task_group.parent = &root_task_group;
9402 list_add(&init_task_group.siblings, &root_task_group.children); 9401 list_add(&init_task_group.siblings, &root_task_group.children);
9403 #endif /* CONFIG_USER_SCHED */ 9402 #endif /* CONFIG_USER_SCHED */
9404 #endif /* CONFIG_GROUP_SCHED */ 9403 #endif /* CONFIG_GROUP_SCHED */
9405 9404
9406 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 9405 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9407 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 9406 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9408 __alignof__(unsigned long)); 9407 __alignof__(unsigned long));
9409 #endif 9408 #endif
9410 for_each_possible_cpu(i) { 9409 for_each_possible_cpu(i) {
9411 struct rq *rq; 9410 struct rq *rq;
9412 9411
9413 rq = cpu_rq(i); 9412 rq = cpu_rq(i);
9414 spin_lock_init(&rq->lock); 9413 spin_lock_init(&rq->lock);
9415 rq->nr_running = 0; 9414 rq->nr_running = 0;
9416 rq->calc_load_active = 0; 9415 rq->calc_load_active = 0;
9417 rq->calc_load_update = jiffies + LOAD_FREQ; 9416 rq->calc_load_update = jiffies + LOAD_FREQ;
9418 init_cfs_rq(&rq->cfs, rq); 9417 init_cfs_rq(&rq->cfs, rq);
9419 init_rt_rq(&rq->rt, rq); 9418 init_rt_rq(&rq->rt, rq);
9420 #ifdef CONFIG_FAIR_GROUP_SCHED 9419 #ifdef CONFIG_FAIR_GROUP_SCHED
9421 init_task_group.shares = init_task_group_load; 9420 init_task_group.shares = init_task_group_load;
9422 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 9421 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9423 #ifdef CONFIG_CGROUP_SCHED 9422 #ifdef CONFIG_CGROUP_SCHED
9424 /* 9423 /*
9425 * How much cpu bandwidth does init_task_group get? 9424 * How much cpu bandwidth does init_task_group get?
9426 * 9425 *
9427 * In the case of task-groups formed through the cgroup filesystem, it 9426 * In the case of task-groups formed through the cgroup filesystem, it
9428 * gets 100% of the cpu resources in the system. This overall 9427 * gets 100% of the cpu resources in the system. This overall
9429 * system cpu resource is divided among the tasks of 9428 * system cpu resource is divided among the tasks of
9430 * init_task_group and its child task-groups in a fair manner, 9429 * init_task_group and its child task-groups in a fair manner,
9431 * based on each entity's (task or task-group's) weight 9430 * based on each entity's (task or task-group's) weight
9432 * (se->load.weight). 9431 * (se->load.weight).
9433 * 9432 *
9434 * In other words, if init_task_group has 10 tasks (of weight 9433 * In other words, if init_task_group has 10 tasks (of weight
9435 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9434 * 1024) and two child groups A0 and A1 (of weight 1024 each),
9436 * then A0's share of the cpu resource is: 9435 * then A0's share of the cpu resource is:
9437 * 9436 *
9438 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9437 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
9439 * 9438 *
9440 * We achieve this by letting init_task_group's tasks sit 9439 * We achieve this by letting init_task_group's tasks sit
9441 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9440 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9442 */ 9441 */
9443 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 9442 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9444 #elif defined CONFIG_USER_SCHED 9443 #elif defined CONFIG_USER_SCHED
9445 root_task_group.shares = NICE_0_LOAD; 9444 root_task_group.shares = NICE_0_LOAD;
9446 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); 9445 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9447 /* 9446 /*
9448 * In the case of task-groups formed through the user id of tasks, 9447 * In the case of task-groups formed through the user id of tasks,
9449 * init_task_group represents tasks belonging to root user. 9448 * init_task_group represents tasks belonging to root user.
9450 * Hence it forms a sibling of all subsequent groups formed. 9449 * Hence it forms a sibling of all subsequent groups formed.
9451 * In this case, init_task_group gets only a fraction of overall 9450 * In this case, init_task_group gets only a fraction of overall
9452 * system cpu resource, based on the weight assigned to root 9451 * system cpu resource, based on the weight assigned to root
9453 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9452 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9454 * by letting tasks of init_task_group sit in a separate cfs_rq 9453 * by letting tasks of init_task_group sit in a separate cfs_rq
9455 * (init_tg_cfs_rq) and having one entity represent this group of 9454 * (init_tg_cfs_rq) and having one entity represent this group of
9456 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9455 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9457 */ 9456 */
9458 init_tg_cfs_entry(&init_task_group, 9457 init_tg_cfs_entry(&init_task_group,
9459 &per_cpu(init_tg_cfs_rq, i), 9458 &per_cpu(init_tg_cfs_rq, i),
9460 &per_cpu(init_sched_entity, i), i, 1, 9459 &per_cpu(init_sched_entity, i), i, 1,
9461 root_task_group.se[i]); 9460 root_task_group.se[i]);
9462 9461
9463 #endif 9462 #endif
9464 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9463 #endif /* CONFIG_FAIR_GROUP_SCHED */
9465 9464
9466 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 9465 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
9467 #ifdef CONFIG_RT_GROUP_SCHED 9466 #ifdef CONFIG_RT_GROUP_SCHED
9468 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 9467 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9469 #ifdef CONFIG_CGROUP_SCHED 9468 #ifdef CONFIG_CGROUP_SCHED
9470 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 9469 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9471 #elif defined CONFIG_USER_SCHED 9470 #elif defined CONFIG_USER_SCHED
9472 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9471 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9473 init_tg_rt_entry(&init_task_group, 9472 init_tg_rt_entry(&init_task_group,
9474 &per_cpu(init_rt_rq, i), 9473 &per_cpu(init_rt_rq, i),
9475 &per_cpu(init_sched_rt_entity, i), i, 1, 9474 &per_cpu(init_sched_rt_entity, i), i, 1,
9476 root_task_group.rt_se[i]); 9475 root_task_group.rt_se[i]);
9477 #endif 9476 #endif
9478 #endif 9477 #endif
9479 9478
9480 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 9479 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
9481 rq->cpu_load[j] = 0; 9480 rq->cpu_load[j] = 0;
9482 #ifdef CONFIG_SMP 9481 #ifdef CONFIG_SMP
9483 rq->sd = NULL; 9482 rq->sd = NULL;
9484 rq->rd = NULL; 9483 rq->rd = NULL;
9485 rq->post_schedule = 0; 9484 rq->post_schedule = 0;
9486 rq->active_balance = 0; 9485 rq->active_balance = 0;
9487 rq->next_balance = jiffies; 9486 rq->next_balance = jiffies;
9488 rq->push_cpu = 0; 9487 rq->push_cpu = 0;
9489 rq->cpu = i; 9488 rq->cpu = i;
9490 rq->online = 0; 9489 rq->online = 0;
9491 rq->migration_thread = NULL; 9490 rq->migration_thread = NULL;
9492 INIT_LIST_HEAD(&rq->migration_queue); 9491 INIT_LIST_HEAD(&rq->migration_queue);
9493 rq_attach_root(rq, &def_root_domain); 9492 rq_attach_root(rq, &def_root_domain);
9494 #endif 9493 #endif
9495 init_rq_hrtick(rq); 9494 init_rq_hrtick(rq);
9496 atomic_set(&rq->nr_iowait, 0); 9495 atomic_set(&rq->nr_iowait, 0);
9497 } 9496 }
9498 9497
9499 set_load_weight(&init_task); 9498 set_load_weight(&init_task);
9500 9499
9501 #ifdef CONFIG_PREEMPT_NOTIFIERS 9500 #ifdef CONFIG_PREEMPT_NOTIFIERS
9502 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 9501 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
9503 #endif 9502 #endif
9504 9503
9505 #ifdef CONFIG_SMP 9504 #ifdef CONFIG_SMP
9506 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 9505 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
9507 #endif 9506 #endif
9508 9507
9509 #ifdef CONFIG_RT_MUTEXES 9508 #ifdef CONFIG_RT_MUTEXES
9510 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9509 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
9511 #endif 9510 #endif
9512 9511
9513 /* 9512 /*
9514 * The boot idle thread does lazy MMU switching as well: 9513 * The boot idle thread does lazy MMU switching as well:
9515 */ 9514 */
9516 atomic_inc(&init_mm.mm_count); 9515 atomic_inc(&init_mm.mm_count);
9517 enter_lazy_tlb(&init_mm, current); 9516 enter_lazy_tlb(&init_mm, current);
9518 9517
9519 /* 9518 /*
9520 * Make us the idle thread. Technically, schedule() should not be 9519 * Make us the idle thread. Technically, schedule() should not be
9521 * called from this thread, however somewhere below it might be, 9520 * called from this thread, however somewhere below it might be,
9522 * but because we are the idle thread, we just pick up running again 9521 * but because we are the idle thread, we just pick up running again
9523 * when this runqueue becomes "idle". 9522 * when this runqueue becomes "idle".
9524 */ 9523 */
9525 init_idle(current, smp_processor_id()); 9524 init_idle(current, smp_processor_id());
9526 9525
9527 calc_load_update = jiffies + LOAD_FREQ; 9526 calc_load_update = jiffies + LOAD_FREQ;
9528 9527
9529 /* 9528 /*
9530 * During early bootup we pretend to be a normal task: 9529 * During early bootup we pretend to be a normal task:
9531 */ 9530 */
9532 current->sched_class = &fair_sched_class; 9531 current->sched_class = &fair_sched_class;
9533 9532
9534 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9533 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9535 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9534 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9536 #ifdef CONFIG_SMP 9535 #ifdef CONFIG_SMP
9537 #ifdef CONFIG_NO_HZ 9536 #ifdef CONFIG_NO_HZ
9538 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9537 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9539 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9538 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9540 #endif 9539 #endif
9541 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9542 #endif /* SMP */ 9541 #endif /* SMP */
9543 9542
9544 perf_event_init(); 9543 perf_event_init();
9545 9544
9546 scheduler_running = 1; 9545 scheduler_running = 1;
9547 } 9546 }
9548 9547
9549 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9548 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9550 static inline int preempt_count_equals(int preempt_offset) 9549 static inline int preempt_count_equals(int preempt_offset)
9551 { 9550 {
9552 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9551 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9553 9552
9554 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9553 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9555 } 9554 }
9556 9555
9557 void __might_sleep(char *file, int line, int preempt_offset) 9556 void __might_sleep(char *file, int line, int preempt_offset)
9558 { 9557 {
9559 #ifdef in_atomic 9558 #ifdef in_atomic
9560 static unsigned long prev_jiffy; /* ratelimiting */ 9559 static unsigned long prev_jiffy; /* ratelimiting */
9561 9560
9562 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 9561 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9563 system_state != SYSTEM_RUNNING || oops_in_progress) 9562 system_state != SYSTEM_RUNNING || oops_in_progress)
9564 return; 9563 return;
9565 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9564 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9566 return; 9565 return;
9567 prev_jiffy = jiffies; 9566 prev_jiffy = jiffies;
9568 9567
9569 printk(KERN_ERR 9568 printk(KERN_ERR
9570 "BUG: sleeping function called from invalid context at %s:%d\n", 9569 "BUG: sleeping function called from invalid context at %s:%d\n",
9571 file, line); 9570 file, line);
9572 printk(KERN_ERR 9571 printk(KERN_ERR
9573 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 9572 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
9574 in_atomic(), irqs_disabled(), 9573 in_atomic(), irqs_disabled(),
9575 current->pid, current->comm); 9574 current->pid, current->comm);
9576 9575
9577 debug_show_held_locks(current); 9576 debug_show_held_locks(current);
9578 if (irqs_disabled()) 9577 if (irqs_disabled())
9579 print_irqtrace_events(current); 9578 print_irqtrace_events(current);
9580 dump_stack(); 9579 dump_stack();
9581 #endif 9580 #endif
9582 } 9581 }
9583 EXPORT_SYMBOL(__might_sleep); 9582 EXPORT_SYMBOL(__might_sleep);
9584 #endif 9583 #endif
9585 9584
9586 #ifdef CONFIG_MAGIC_SYSRQ 9585 #ifdef CONFIG_MAGIC_SYSRQ
9587 static void normalize_task(struct rq *rq, struct task_struct *p) 9586 static void normalize_task(struct rq *rq, struct task_struct *p)
9588 { 9587 {
9589 int on_rq; 9588 int on_rq;
9590 9589
9591 update_rq_clock(rq); 9590 update_rq_clock(rq);
9592 on_rq = p->se.on_rq; 9591 on_rq = p->se.on_rq;
9593 if (on_rq) 9592 if (on_rq)
9594 deactivate_task(rq, p, 0); 9593 deactivate_task(rq, p, 0);
9595 __setscheduler(rq, p, SCHED_NORMAL, 0); 9594 __setscheduler(rq, p, SCHED_NORMAL, 0);
9596 if (on_rq) { 9595 if (on_rq) {
9597 activate_task(rq, p, 0); 9596 activate_task(rq, p, 0);
9598 resched_task(rq->curr); 9597 resched_task(rq->curr);
9599 } 9598 }
9600 } 9599 }
9601 9600
9602 void normalize_rt_tasks(void) 9601 void normalize_rt_tasks(void)
9603 { 9602 {
9604 struct task_struct *g, *p; 9603 struct task_struct *g, *p;
9605 unsigned long flags; 9604 unsigned long flags;
9606 struct rq *rq; 9605 struct rq *rq;
9607 9606
9608 read_lock_irqsave(&tasklist_lock, flags); 9607 read_lock_irqsave(&tasklist_lock, flags);
9609 do_each_thread(g, p) { 9608 do_each_thread(g, p) {
9610 /* 9609 /*
9611 * Only normalize user tasks: 9610 * Only normalize user tasks:
9612 */ 9611 */
9613 if (!p->mm) 9612 if (!p->mm)
9614 continue; 9613 continue;
9615 9614
9616 p->se.exec_start = 0; 9615 p->se.exec_start = 0;
9617 #ifdef CONFIG_SCHEDSTATS 9616 #ifdef CONFIG_SCHEDSTATS
9618 p->se.wait_start = 0; 9617 p->se.wait_start = 0;
9619 p->se.sleep_start = 0; 9618 p->se.sleep_start = 0;
9620 p->se.block_start = 0; 9619 p->se.block_start = 0;
9621 #endif 9620 #endif
9622 9621
9623 if (!rt_task(p)) { 9622 if (!rt_task(p)) {
9624 /* 9623 /*
9625 * Renice negative nice level userspace 9624 * Renice negative nice level userspace
9626 * tasks back to 0: 9625 * tasks back to 0:
9627 */ 9626 */
9628 if (TASK_NICE(p) < 0 && p->mm) 9627 if (TASK_NICE(p) < 0 && p->mm)
9629 set_user_nice(p, 0); 9628 set_user_nice(p, 0);
9630 continue; 9629 continue;
9631 } 9630 }
9632 9631
9633 spin_lock(&p->pi_lock); 9632 spin_lock(&p->pi_lock);
9634 rq = __task_rq_lock(p); 9633 rq = __task_rq_lock(p);
9635 9634
9636 normalize_task(rq, p); 9635 normalize_task(rq, p);
9637 9636
9638 __task_rq_unlock(rq); 9637 __task_rq_unlock(rq);
9639 spin_unlock(&p->pi_lock); 9638 spin_unlock(&p->pi_lock);
9640 } while_each_thread(g, p); 9639 } while_each_thread(g, p);
9641 9640
9642 read_unlock_irqrestore(&tasklist_lock, flags); 9641 read_unlock_irqrestore(&tasklist_lock, flags);
9643 } 9642 }
9644 9643
9645 #endif /* CONFIG_MAGIC_SYSRQ */ 9644 #endif /* CONFIG_MAGIC_SYSRQ */
9646 9645
9647 #ifdef CONFIG_IA64 9646 #ifdef CONFIG_IA64
9648 /* 9647 /*
9649 * These functions are only useful for the IA64 MCA handling. 9648 * These functions are only useful for the IA64 MCA handling.
9650 * 9649 *
9651 * They can only be called when the whole system has been 9650 * They can only be called when the whole system has been
9652 * stopped - every CPU needs to be quiescent, and no scheduling 9651 * stopped - every CPU needs to be quiescent, and no scheduling
9653 * activity can take place. Using them for anything else would 9652 * activity can take place. Using them for anything else would
9654 * be a serious bug, and as a result, they aren't even visible 9653 * be a serious bug, and as a result, they aren't even visible
9655 * under any other configuration. 9654 * under any other configuration.
9656 */ 9655 */
9657 9656
9658 /** 9657 /**
9659 * curr_task - return the current task for a given cpu. 9658 * curr_task - return the current task for a given cpu.
9660 * @cpu: the processor in question. 9659 * @cpu: the processor in question.
9661 * 9660 *
9662 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9661 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9663 */ 9662 */
9664 struct task_struct *curr_task(int cpu) 9663 struct task_struct *curr_task(int cpu)
9665 { 9664 {
9666 return cpu_curr(cpu); 9665 return cpu_curr(cpu);
9667 } 9666 }
9668 9667
9669 /** 9668 /**
9670 * set_curr_task - set the current task for a given cpu. 9669 * set_curr_task - set the current task for a given cpu.
9671 * @cpu: the processor in question. 9670 * @cpu: the processor in question.
9672 * @p: the task pointer to set. 9671 * @p: the task pointer to set.
9673 * 9672 *
9674 * Description: This function must only be used when non-maskable interrupts 9673 * Description: This function must only be used when non-maskable interrupts
9675 * are serviced on a separate stack. It allows the architecture to switch the 9674 * are serviced on a separate stack. It allows the architecture to switch the
9676 * notion of the current task on a cpu in a non-blocking manner. This function 9675 * notion of the current task on a cpu in a non-blocking manner. This function
9677 * must be called with all CPUs synchronized and interrupts disabled, and the 9676 * must be called with all CPUs synchronized and interrupts disabled, and the
9678 * caller must save the original value of the current task (see 9677 * caller must save the original value of the current task (see
9679 * curr_task() above) and restore that value before reenabling interrupts and 9678 * curr_task() above) and restore that value before reenabling interrupts and
9680 * re-starting the system. 9679 * re-starting the system.
9681 * 9680 *
9682 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9681 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9683 */ 9682 */
9684 void set_curr_task(int cpu, struct task_struct *p) 9683 void set_curr_task(int cpu, struct task_struct *p)
9685 { 9684 {
9686 cpu_curr(cpu) = p; 9685 cpu_curr(cpu) = p;
9687 } 9686 }
9688 9687
9689 #endif 9688 #endif
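The save/restore protocol spelled out above can be summarized in a short sketch (a hypothetical MCA-style caller; as the comments stress, this is only legal while every CPU is quiescent and interrupts are disabled):

/* Hypothetical stopped-system caller following the documented protocol. */
static void run_handler_as_current(int cpu, struct task_struct *handler)
{
	struct task_struct *saved = curr_task(cpu);

	set_curr_task(cpu, handler);
	/* ... service the non-maskable event on its separate stack ... */
	set_curr_task(cpu, saved);	/* restore before re-enabling interrupts */
}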
9690 9689
9691 #ifdef CONFIG_FAIR_GROUP_SCHED 9690 #ifdef CONFIG_FAIR_GROUP_SCHED
9692 static void free_fair_sched_group(struct task_group *tg) 9691 static void free_fair_sched_group(struct task_group *tg)
9693 { 9692 {
9694 int i; 9693 int i;
9695 9694
9696 for_each_possible_cpu(i) { 9695 for_each_possible_cpu(i) {
9697 if (tg->cfs_rq) 9696 if (tg->cfs_rq)
9698 kfree(tg->cfs_rq[i]); 9697 kfree(tg->cfs_rq[i]);
9699 if (tg->se) 9698 if (tg->se)
9700 kfree(tg->se[i]); 9699 kfree(tg->se[i]);
9701 } 9700 }
9702 9701
9703 kfree(tg->cfs_rq); 9702 kfree(tg->cfs_rq);
9704 kfree(tg->se); 9703 kfree(tg->se);
9705 } 9704 }
9706 9705
9707 static 9706 static
9708 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9707 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9709 { 9708 {
9710 struct cfs_rq *cfs_rq; 9709 struct cfs_rq *cfs_rq;
9711 struct sched_entity *se; 9710 struct sched_entity *se;
9712 struct rq *rq; 9711 struct rq *rq;
9713 int i; 9712 int i;
9714 9713
9715 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 9714 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9716 if (!tg->cfs_rq) 9715 if (!tg->cfs_rq)
9717 goto err; 9716 goto err;
9718 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 9717 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9719 if (!tg->se) 9718 if (!tg->se)
9720 goto err; 9719 goto err;
9721 9720
9722 tg->shares = NICE_0_LOAD; 9721 tg->shares = NICE_0_LOAD;
9723 9722
9724 for_each_possible_cpu(i) { 9723 for_each_possible_cpu(i) {
9725 rq = cpu_rq(i); 9724 rq = cpu_rq(i);
9726 9725
9727 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 9726 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9728 GFP_KERNEL, cpu_to_node(i)); 9727 GFP_KERNEL, cpu_to_node(i));
9729 if (!cfs_rq) 9728 if (!cfs_rq)
9730 goto err; 9729 goto err;
9731 9730
9732 se = kzalloc_node(sizeof(struct sched_entity), 9731 se = kzalloc_node(sizeof(struct sched_entity),
9733 GFP_KERNEL, cpu_to_node(i)); 9732 GFP_KERNEL, cpu_to_node(i));
9734 if (!se) 9733 if (!se)
9735 goto err; 9734 goto err;
9736 9735
9737 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9736 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9738 } 9737 }
9739 9738
9740 return 1; 9739 return 1;
9741 9740
9742 err: 9741 err:
9743 return 0; 9742 return 0;
9744 } 9743 }
9745 9744
9746 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 9745 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9747 { 9746 {
9748 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 9747 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
9749 &cpu_rq(cpu)->leaf_cfs_rq_list); 9748 &cpu_rq(cpu)->leaf_cfs_rq_list);
9750 } 9749 }
9751 9750
9752 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 9751 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9753 { 9752 {
9754 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 9753 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
9755 } 9754 }
9756 #else /* !CONFIG_FAIR_GROUP_SCHED */ 9755 #else /* !CONFIG_FAIR_GROUP_SCHED */
9757 static inline void free_fair_sched_group(struct task_group *tg) 9756 static inline void free_fair_sched_group(struct task_group *tg)
9758 { 9757 {
9759 } 9758 }
9760 9759
9761 static inline 9760 static inline
9762 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9761 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9763 { 9762 {
9764 return 1; 9763 return 1;
9765 } 9764 }
9766 9765
9767 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 9766 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9768 { 9767 {
9769 } 9768 }
9770 9769
9771 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 9770 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9772 { 9771 {
9773 } 9772 }
9774 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9773 #endif /* CONFIG_FAIR_GROUP_SCHED */
9775 9774
9776 #ifdef CONFIG_RT_GROUP_SCHED 9775 #ifdef CONFIG_RT_GROUP_SCHED
9777 static void free_rt_sched_group(struct task_group *tg) 9776 static void free_rt_sched_group(struct task_group *tg)
9778 { 9777 {
9779 int i; 9778 int i;
9780 9779
9781 destroy_rt_bandwidth(&tg->rt_bandwidth); 9780 destroy_rt_bandwidth(&tg->rt_bandwidth);
9782 9781
9783 for_each_possible_cpu(i) { 9782 for_each_possible_cpu(i) {
9784 if (tg->rt_rq) 9783 if (tg->rt_rq)
9785 kfree(tg->rt_rq[i]); 9784 kfree(tg->rt_rq[i]);
9786 if (tg->rt_se) 9785 if (tg->rt_se)
9787 kfree(tg->rt_se[i]); 9786 kfree(tg->rt_se[i]);
9788 } 9787 }
9789 9788
9790 kfree(tg->rt_rq); 9789 kfree(tg->rt_rq);
9791 kfree(tg->rt_se); 9790 kfree(tg->rt_se);
9792 } 9791 }
9793 9792
9794 static 9793 static
9795 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 9794 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9796 { 9795 {
9797 struct rt_rq *rt_rq; 9796 struct rt_rq *rt_rq;
9798 struct sched_rt_entity *rt_se; 9797 struct sched_rt_entity *rt_se;
9799 struct rq *rq; 9798 struct rq *rq;
9800 int i; 9799 int i;
9801 9800
9802 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 9801 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9803 if (!tg->rt_rq) 9802 if (!tg->rt_rq)
9804 goto err; 9803 goto err;
9805 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 9804 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9806 if (!tg->rt_se) 9805 if (!tg->rt_se)
9807 goto err; 9806 goto err;
9808 9807
9809 init_rt_bandwidth(&tg->rt_bandwidth, 9808 init_rt_bandwidth(&tg->rt_bandwidth,
9810 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 9809 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9811 9810
9812 for_each_possible_cpu(i) { 9811 for_each_possible_cpu(i) {
9813 rq = cpu_rq(i); 9812 rq = cpu_rq(i);
9814 9813
9815 rt_rq = kzalloc_node(sizeof(struct rt_rq), 9814 rt_rq = kzalloc_node(sizeof(struct rt_rq),
9816 GFP_KERNEL, cpu_to_node(i)); 9815 GFP_KERNEL, cpu_to_node(i));
9817 if (!rt_rq) 9816 if (!rt_rq)
9818 goto err; 9817 goto err;
9819 9818
9820 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9819 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9821 GFP_KERNEL, cpu_to_node(i)); 9820 GFP_KERNEL, cpu_to_node(i));
9822 if (!rt_se) 9821 if (!rt_se)
9823 goto err; 9822 goto err;
9824 9823
9825 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9824 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9826 } 9825 }
9827 9826
9828 return 1; 9827 return 1;
9829 9828
9830 err: 9829 err:
9831 return 0; 9830 return 0;
9832 } 9831 }
9833 9832
9834 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 9833 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9835 { 9834 {
9836 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 9835 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
9837 &cpu_rq(cpu)->leaf_rt_rq_list); 9836 &cpu_rq(cpu)->leaf_rt_rq_list);
9838 } 9837 }
9839 9838
9840 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 9839 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9841 { 9840 {
9842 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 9841 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
9843 } 9842 }
9844 #else /* !CONFIG_RT_GROUP_SCHED */ 9843 #else /* !CONFIG_RT_GROUP_SCHED */
9845 static inline void free_rt_sched_group(struct task_group *tg) 9844 static inline void free_rt_sched_group(struct task_group *tg)
9846 { 9845 {
9847 } 9846 }
9848 9847
9849 static inline 9848 static inline
9850 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 9849 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9851 { 9850 {
9852 return 1; 9851 return 1;
9853 } 9852 }
9854 9853
9855 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 9854 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9856 { 9855 {
9857 } 9856 }
9858 9857
9859 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 9858 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9860 { 9859 {
9861 } 9860 }
9862 #endif /* CONFIG_RT_GROUP_SCHED */ 9861 #endif /* CONFIG_RT_GROUP_SCHED */
9863 9862
9864 #ifdef CONFIG_GROUP_SCHED 9863 #ifdef CONFIG_GROUP_SCHED
9865 static void free_sched_group(struct task_group *tg) 9864 static void free_sched_group(struct task_group *tg)
9866 { 9865 {
9867 free_fair_sched_group(tg); 9866 free_fair_sched_group(tg);
9868 free_rt_sched_group(tg); 9867 free_rt_sched_group(tg);
9869 kfree(tg); 9868 kfree(tg);
9870 } 9869 }
9871 9870
9872 /* allocate runqueue etc for a new task group */ 9871 /* allocate runqueue etc for a new task group */
9873 struct task_group *sched_create_group(struct task_group *parent) 9872 struct task_group *sched_create_group(struct task_group *parent)
9874 { 9873 {
9875 struct task_group *tg; 9874 struct task_group *tg;
9876 unsigned long flags; 9875 unsigned long flags;
9877 int i; 9876 int i;
9878 9877
9879 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 9878 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
9880 if (!tg) 9879 if (!tg)
9881 return ERR_PTR(-ENOMEM); 9880 return ERR_PTR(-ENOMEM);
9882 9881
9883 if (!alloc_fair_sched_group(tg, parent)) 9882 if (!alloc_fair_sched_group(tg, parent))
9884 goto err; 9883 goto err;
9885 9884
9886 if (!alloc_rt_sched_group(tg, parent)) 9885 if (!alloc_rt_sched_group(tg, parent))
9887 goto err; 9886 goto err;
9888 9887
9889 spin_lock_irqsave(&task_group_lock, flags); 9888 spin_lock_irqsave(&task_group_lock, flags);
9890 for_each_possible_cpu(i) { 9889 for_each_possible_cpu(i) {
9891 register_fair_sched_group(tg, i); 9890 register_fair_sched_group(tg, i);
9892 register_rt_sched_group(tg, i); 9891 register_rt_sched_group(tg, i);
9893 } 9892 }
9894 list_add_rcu(&tg->list, &task_groups); 9893 list_add_rcu(&tg->list, &task_groups);
9895 9894
9896 WARN_ON(!parent); /* root should already exist */ 9895 WARN_ON(!parent); /* root should already exist */
9897 9896
9898 tg->parent = parent; 9897 tg->parent = parent;
9899 INIT_LIST_HEAD(&tg->children); 9898 INIT_LIST_HEAD(&tg->children);
9900 list_add_rcu(&tg->siblings, &parent->children); 9899 list_add_rcu(&tg->siblings, &parent->children);
9901 spin_unlock_irqrestore(&task_group_lock, flags); 9900 spin_unlock_irqrestore(&task_group_lock, flags);
9902 9901
9903 return tg; 9902 return tg;
9904 9903
9905 err: 9904 err:
9906 free_sched_group(tg); 9905 free_sched_group(tg);
9907 return ERR_PTR(-ENOMEM); 9906 return ERR_PTR(-ENOMEM);
9908 } 9907 }
9909 9908
9910 /* rcu callback to free various structures associated with a task group */ 9909 /* rcu callback to free various structures associated with a task group */
9911 static void free_sched_group_rcu(struct rcu_head *rhp) 9910 static void free_sched_group_rcu(struct rcu_head *rhp)
9912 { 9911 {
9913 /* now it should be safe to free those cfs_rqs */ 9912 /* now it should be safe to free those cfs_rqs */
9914 free_sched_group(container_of(rhp, struct task_group, rcu)); 9913 free_sched_group(container_of(rhp, struct task_group, rcu));
9915 } 9914 }
9916 9915
9917 /* Destroy runqueue etc associated with a task group */ 9916 /* Destroy runqueue etc associated with a task group */
9918 void sched_destroy_group(struct task_group *tg) 9917 void sched_destroy_group(struct task_group *tg)
9919 { 9918 {
9920 unsigned long flags; 9919 unsigned long flags;
9921 int i; 9920 int i;
9922 9921
9923 spin_lock_irqsave(&task_group_lock, flags); 9922 spin_lock_irqsave(&task_group_lock, flags);
9924 for_each_possible_cpu(i) { 9923 for_each_possible_cpu(i) {
9925 unregister_fair_sched_group(tg, i); 9924 unregister_fair_sched_group(tg, i);
9926 unregister_rt_sched_group(tg, i); 9925 unregister_rt_sched_group(tg, i);
9927 } 9926 }
9928 list_del_rcu(&tg->list); 9927 list_del_rcu(&tg->list);
9929 list_del_rcu(&tg->siblings); 9928 list_del_rcu(&tg->siblings);
9930 spin_unlock_irqrestore(&task_group_lock, flags); 9929 spin_unlock_irqrestore(&task_group_lock, flags);
9931 9930
9932 /* wait for possible concurrent references to cfs_rqs complete */ 9931 /* wait for possible concurrent references to cfs_rqs complete */
9933 call_rcu(&tg->rcu, free_sched_group_rcu); 9932 call_rcu(&tg->rcu, free_sched_group_rcu);
9934 } 9933 }
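For illustration, not part of this diff: sched_destroy_group() unlinks the group from the RCU-protected lists under task_group_lock and then defers the actual kfree() to an RCU callback, so lock-free walkers of task_groups never touch freed memory. A minimal sketch of the same deferred-free pattern, with made-up foo/foo_list/foo_lock names standing in for the scheduler structures:

        #include <linux/kernel.h>
        #include <linux/rculist.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>
        #include <linux/spinlock.h>

        struct foo {
                struct list_head        list;
                struct rcu_head         rcu;
        };

        static LIST_HEAD(foo_list);
        static DEFINE_SPINLOCK(foo_lock);

        static void foo_add(struct foo *f)
        {
                spin_lock(&foo_lock);
                list_add_rcu(&f->list, &foo_list);
                spin_unlock(&foo_lock);
        }

        /* RCU callback: runs only after every reader that could have seen
         * the unlinked object has left its read-side critical section. */
        static void foo_free_rcu(struct rcu_head *rhp)
        {
                kfree(container_of(rhp, struct foo, rcu));
        }

        static void foo_destroy(struct foo *f)
        {
                spin_lock(&foo_lock);
                list_del_rcu(&f->list);          /* readers may still hold f ... */
                spin_unlock(&foo_lock);
                call_rcu(&f->rcu, foo_free_rcu); /* ... so free only after a grace period */
        }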
9935 9934
9936 /* change task's runqueue when it moves between groups. 9935 /* change task's runqueue when it moves between groups.
9937 * The caller of this function should have put the task in its new group 9936 * The caller of this function should have put the task in its new group
9938 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 9937 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
9939 * reflect its new group. 9938 * reflect its new group.
9940 */ 9939 */
9941 void sched_move_task(struct task_struct *tsk) 9940 void sched_move_task(struct task_struct *tsk)
9942 { 9941 {
9943 int on_rq, running; 9942 int on_rq, running;
9944 unsigned long flags; 9943 unsigned long flags;
9945 struct rq *rq; 9944 struct rq *rq;
9946 9945
9947 rq = task_rq_lock(tsk, &flags); 9946 rq = task_rq_lock(tsk, &flags);
9948 9947
9949 update_rq_clock(rq); 9948 update_rq_clock(rq);
9950 9949
9951 running = task_current(rq, tsk); 9950 running = task_current(rq, tsk);
9952 on_rq = tsk->se.on_rq; 9951 on_rq = tsk->se.on_rq;
9953 9952
9954 if (on_rq) 9953 if (on_rq)
9955 dequeue_task(rq, tsk, 0); 9954 dequeue_task(rq, tsk, 0);
9956 if (unlikely(running)) 9955 if (unlikely(running))
9957 tsk->sched_class->put_prev_task(rq, tsk); 9956 tsk->sched_class->put_prev_task(rq, tsk);
9958 9957
9959 set_task_rq(tsk, task_cpu(tsk)); 9958 set_task_rq(tsk, task_cpu(tsk));
9960 9959
9961 #ifdef CONFIG_FAIR_GROUP_SCHED 9960 #ifdef CONFIG_FAIR_GROUP_SCHED
9962 if (tsk->sched_class->moved_group) 9961 if (tsk->sched_class->moved_group)
9963 tsk->sched_class->moved_group(tsk); 9962 tsk->sched_class->moved_group(tsk);
9964 #endif 9963 #endif
9965 9964
9966 if (unlikely(running)) 9965 if (unlikely(running))
9967 tsk->sched_class->set_curr_task(rq); 9966 tsk->sched_class->set_curr_task(rq);
9968 if (on_rq) 9967 if (on_rq)
9969 enqueue_task(rq, tsk, 0); 9968 enqueue_task(rq, tsk, 0);
9970 9969
9971 task_rq_unlock(rq, &flags); 9970 task_rq_unlock(rq, &flags);
9972 } 9971 }
9973 #endif /* CONFIG_GROUP_SCHED */ 9972 #endif /* CONFIG_GROUP_SCHED */
9974 9973
9975 #ifdef CONFIG_FAIR_GROUP_SCHED 9974 #ifdef CONFIG_FAIR_GROUP_SCHED
9976 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 9975 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
9977 { 9976 {
9978 struct cfs_rq *cfs_rq = se->cfs_rq; 9977 struct cfs_rq *cfs_rq = se->cfs_rq;
9979 int on_rq; 9978 int on_rq;
9980 9979
9981 on_rq = se->on_rq; 9980 on_rq = se->on_rq;
9982 if (on_rq) 9981 if (on_rq)
9983 dequeue_entity(cfs_rq, se, 0); 9982 dequeue_entity(cfs_rq, se, 0);
9984 9983
9985 se->load.weight = shares; 9984 se->load.weight = shares;
9986 se->load.inv_weight = 0; 9985 se->load.inv_weight = 0;
9987 9986
9988 if (on_rq) 9987 if (on_rq)
9989 enqueue_entity(cfs_rq, se, 0); 9988 enqueue_entity(cfs_rq, se, 0);
9990 } 9989 }
9991 9990
9992 static void set_se_shares(struct sched_entity *se, unsigned long shares) 9991 static void set_se_shares(struct sched_entity *se, unsigned long shares)
9993 { 9992 {
9994 struct cfs_rq *cfs_rq = se->cfs_rq; 9993 struct cfs_rq *cfs_rq = se->cfs_rq;
9995 struct rq *rq = cfs_rq->rq; 9994 struct rq *rq = cfs_rq->rq;
9996 unsigned long flags; 9995 unsigned long flags;
9997 9996
9998 spin_lock_irqsave(&rq->lock, flags); 9997 spin_lock_irqsave(&rq->lock, flags);
9999 __set_se_shares(se, shares); 9998 __set_se_shares(se, shares);
10000 spin_unlock_irqrestore(&rq->lock, flags); 9999 spin_unlock_irqrestore(&rq->lock, flags);
10001 } 10000 }
10002 10001
10003 static DEFINE_MUTEX(shares_mutex); 10002 static DEFINE_MUTEX(shares_mutex);
10004 10003
10005 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 10004 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10006 { 10005 {
10007 int i; 10006 int i;
10008 unsigned long flags; 10007 unsigned long flags;
10009 10008
10010 /* 10009 /*
10011 * We can't change the weight of the root cgroup. 10010 * We can't change the weight of the root cgroup.
10012 */ 10011 */
10013 if (!tg->se[0]) 10012 if (!tg->se[0])
10014 return -EINVAL; 10013 return -EINVAL;
10015 10014
10016 if (shares < MIN_SHARES) 10015 if (shares < MIN_SHARES)
10017 shares = MIN_SHARES; 10016 shares = MIN_SHARES;
10018 else if (shares > MAX_SHARES) 10017 else if (shares > MAX_SHARES)
10019 shares = MAX_SHARES; 10018 shares = MAX_SHARES;
10020 10019
10021 mutex_lock(&shares_mutex); 10020 mutex_lock(&shares_mutex);
10022 if (tg->shares == shares) 10021 if (tg->shares == shares)
10023 goto done; 10022 goto done;
10024 10023
10025 spin_lock_irqsave(&task_group_lock, flags); 10024 spin_lock_irqsave(&task_group_lock, flags);
10026 for_each_possible_cpu(i) 10025 for_each_possible_cpu(i)
10027 unregister_fair_sched_group(tg, i); 10026 unregister_fair_sched_group(tg, i);
10028 list_del_rcu(&tg->siblings); 10027 list_del_rcu(&tg->siblings);
10029 spin_unlock_irqrestore(&task_group_lock, flags); 10028 spin_unlock_irqrestore(&task_group_lock, flags);
10030 10029
10031 /* wait for any ongoing reference to this group to finish */ 10030 /* wait for any ongoing reference to this group to finish */
10032 synchronize_sched(); 10031 synchronize_sched();
10033 10032
10034 /* 10033 /*
10035 * Now we are free to modify the group's share on each cpu 10034 * Now we are free to modify the group's share on each cpu
10036 * w/o tripping rebalance_share or load_balance_fair. 10035 * w/o tripping rebalance_share or load_balance_fair.
10037 */ 10036 */
10038 tg->shares = shares; 10037 tg->shares = shares;
10039 for_each_possible_cpu(i) { 10038 for_each_possible_cpu(i) {
10040 /* 10039 /*
10041 * force a rebalance 10040 * force a rebalance
10042 */ 10041 */
10043 cfs_rq_set_shares(tg->cfs_rq[i], 0); 10042 cfs_rq_set_shares(tg->cfs_rq[i], 0);
10044 set_se_shares(tg->se[i], shares); 10043 set_se_shares(tg->se[i], shares);
10045 } 10044 }
10046 10045
10047 /* 10046 /*
10048 * Enable load balance activity on this group, by inserting it back on 10047 * Enable load balance activity on this group, by inserting it back on
10049 * each cpu's rq->leaf_cfs_rq_list. 10048 * each cpu's rq->leaf_cfs_rq_list.
10050 */ 10049 */
10051 spin_lock_irqsave(&task_group_lock, flags); 10050 spin_lock_irqsave(&task_group_lock, flags);
10052 for_each_possible_cpu(i) 10051 for_each_possible_cpu(i)
10053 register_fair_sched_group(tg, i); 10052 register_fair_sched_group(tg, i);
10054 list_add_rcu(&tg->siblings, &tg->parent->children); 10053 list_add_rcu(&tg->siblings, &tg->parent->children);
10055 spin_unlock_irqrestore(&task_group_lock, flags); 10054 spin_unlock_irqrestore(&task_group_lock, flags);
10056 done: 10055 done:
10057 mutex_unlock(&shares_mutex); 10056 mutex_unlock(&shares_mutex);
10058 return 0; 10057 return 0;
10059 } 10058 }
10060 10059
10061 unsigned long sched_group_shares(struct task_group *tg) 10060 unsigned long sched_group_shares(struct task_group *tg)
10062 { 10061 {
10063 return tg->shares; 10062 return tg->shares;
10064 } 10063 }
10065 #endif 10064 #endif
10066 10065
10067 #ifdef CONFIG_RT_GROUP_SCHED 10066 #ifdef CONFIG_RT_GROUP_SCHED
10068 /* 10067 /*
10069 * Ensure that the real time constraints are schedulable. 10068 * Ensure that the real time constraints are schedulable.
10070 */ 10069 */
10071 static DEFINE_MUTEX(rt_constraints_mutex); 10070 static DEFINE_MUTEX(rt_constraints_mutex);
10072 10071
10073 static unsigned long to_ratio(u64 period, u64 runtime) 10072 static unsigned long to_ratio(u64 period, u64 runtime)
10074 { 10073 {
10075 if (runtime == RUNTIME_INF) 10074 if (runtime == RUNTIME_INF)
10076 return 1ULL << 20; 10075 return 1ULL << 20;
10077 10076
10078 return div64_u64(runtime << 20, period); 10077 return div64_u64(runtime << 20, period);
10079 } 10078 }
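to_ratio() converts a (period, runtime) pair into a 20-bit fixed-point fraction of the period, so allocations with different period lengths can be compared and summed directly. A standalone userspace sketch of the same arithmetic (to_ratio_demo is a made-up name; the values mirror the common defaults of a 1 s period and 950 ms runtime):

        #include <stdint.h>
        #include <stdio.h>

        /* Same math as to_ratio(): runtime/period scaled by 2^20. */
        static unsigned long to_ratio_demo(uint64_t period_ns, uint64_t runtime_ns)
        {
                return (unsigned long)((runtime_ns << 20) / period_ns);
        }

        int main(void)
        {
                /* 950 ms of a 1 s period -> 0.95 * 2^20, i.e. 996147 */
                printf("%lu\n", to_ratio_demo(1000000000ULL, 950000000ULL));
                /* the whole period -> 1 << 20 = 1048576, the same value
                 * to_ratio() returns for RUNTIME_INF */
                printf("%lu\n", to_ratio_demo(1000000000ULL, 1000000000ULL));
                return 0;
        }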
10080 10079
10081 /* Must be called with tasklist_lock held */ 10080 /* Must be called with tasklist_lock held */
10082 static inline int tg_has_rt_tasks(struct task_group *tg) 10081 static inline int tg_has_rt_tasks(struct task_group *tg)
10083 { 10082 {
10084 struct task_struct *g, *p; 10083 struct task_struct *g, *p;
10085 10084
10086 do_each_thread(g, p) { 10085 do_each_thread(g, p) {
10087 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 10086 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
10088 return 1; 10087 return 1;
10089 } while_each_thread(g, p); 10088 } while_each_thread(g, p);
10090 10089
10091 return 0; 10090 return 0;
10092 } 10091 }
10093 10092
10094 struct rt_schedulable_data { 10093 struct rt_schedulable_data {
10095 struct task_group *tg; 10094 struct task_group *tg;
10096 u64 rt_period; 10095 u64 rt_period;
10097 u64 rt_runtime; 10096 u64 rt_runtime;
10098 }; 10097 };
10099 10098
10100 static int tg_schedulable(struct task_group *tg, void *data) 10099 static int tg_schedulable(struct task_group *tg, void *data)
10101 { 10100 {
10102 struct rt_schedulable_data *d = data; 10101 struct rt_schedulable_data *d = data;
10103 struct task_group *child; 10102 struct task_group *child;
10104 unsigned long total, sum = 0; 10103 unsigned long total, sum = 0;
10105 u64 period, runtime; 10104 u64 period, runtime;
10106 10105
10107 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 10106 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10108 runtime = tg->rt_bandwidth.rt_runtime; 10107 runtime = tg->rt_bandwidth.rt_runtime;
10109 10108
10110 if (tg == d->tg) { 10109 if (tg == d->tg) {
10111 period = d->rt_period; 10110 period = d->rt_period;
10112 runtime = d->rt_runtime; 10111 runtime = d->rt_runtime;
10113 } 10112 }
10114 10113
10115 #ifdef CONFIG_USER_SCHED 10114 #ifdef CONFIG_USER_SCHED
10116 if (tg == &root_task_group) { 10115 if (tg == &root_task_group) {
10117 period = global_rt_period(); 10116 period = global_rt_period();
10118 runtime = global_rt_runtime(); 10117 runtime = global_rt_runtime();
10119 } 10118 }
10120 #endif 10119 #endif
10121 10120
10122 /* 10121 /*
10123 * Cannot have more runtime than the period. 10122 * Cannot have more runtime than the period.
10124 */ 10123 */
10125 if (runtime > period && runtime != RUNTIME_INF) 10124 if (runtime > period && runtime != RUNTIME_INF)
10126 return -EINVAL; 10125 return -EINVAL;
10127 10126
10128 /* 10127 /*
10129 * Ensure we don't starve existing RT tasks. 10128 * Ensure we don't starve existing RT tasks.
10130 */ 10129 */
10131 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 10130 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
10132 return -EBUSY; 10131 return -EBUSY;
10133 10132
10134 total = to_ratio(period, runtime); 10133 total = to_ratio(period, runtime);
10135 10134
10136 /* 10135 /*
10137 * Nobody can have more than the global setting allows. 10136 * Nobody can have more than the global setting allows.
10138 */ 10137 */
10139 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 10138 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
10140 return -EINVAL; 10139 return -EINVAL;
10141 10140
10142 /* 10141 /*
10143 * The sum of our children's runtime should not exceed our own. 10142 * The sum of our children's runtime should not exceed our own.
10144 */ 10143 */
10145 list_for_each_entry_rcu(child, &tg->children, siblings) { 10144 list_for_each_entry_rcu(child, &tg->children, siblings) {
10146 period = ktime_to_ns(child->rt_bandwidth.rt_period); 10145 period = ktime_to_ns(child->rt_bandwidth.rt_period);
10147 runtime = child->rt_bandwidth.rt_runtime; 10146 runtime = child->rt_bandwidth.rt_runtime;
10148 10147
10149 if (child == d->tg) { 10148 if (child == d->tg) {
10150 period = d->rt_period; 10149 period = d->rt_period;
10151 runtime = d->rt_runtime; 10150 runtime = d->rt_runtime;
10152 } 10151 }
10153 10152
10154 sum += to_ratio(period, runtime); 10153 sum += to_ratio(period, runtime);
10155 } 10154 }
10156 10155
10157 if (sum > total) 10156 if (sum > total)
10158 return -EINVAL; 10157 return -EINVAL;
10159 10158
10160 return 0; 10159 return 0;
10161 } 10160 }
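Worked example of the two capacity checks above (illustrative numbers only): with a 1 s period, a group whose own runtime is 500 ms gets total = to_ratio(1 s, 500 ms) = 0.5 * 2^20 = 524288. If two of its children each request 300 ms per second, their combined sum is 2 * 314572 = 629144, which exceeds 524288, so the configuration is rejected with -EINVAL; the group's own total is likewise capped against to_ratio(global_rt_period(), global_rt_runtime()) one level further up.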
10162 10161
10163 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 10162 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
10164 { 10163 {
10165 struct rt_schedulable_data data = { 10164 struct rt_schedulable_data data = {
10166 .tg = tg, 10165 .tg = tg,
10167 .rt_period = period, 10166 .rt_period = period,
10168 .rt_runtime = runtime, 10167 .rt_runtime = runtime,
10169 }; 10168 };
10170 10169
10171 return walk_tg_tree(tg_schedulable, tg_nop, &data); 10170 return walk_tg_tree(tg_schedulable, tg_nop, &data);
10172 } 10171 }
10173 10172
10174 static int tg_set_bandwidth(struct task_group *tg, 10173 static int tg_set_bandwidth(struct task_group *tg,
10175 u64 rt_period, u64 rt_runtime) 10174 u64 rt_period, u64 rt_runtime)
10176 { 10175 {
10177 int i, err = 0; 10176 int i, err = 0;
10178 10177
10179 mutex_lock(&rt_constraints_mutex); 10178 mutex_lock(&rt_constraints_mutex);
10180 read_lock(&tasklist_lock); 10179 read_lock(&tasklist_lock);
10181 err = __rt_schedulable(tg, rt_period, rt_runtime); 10180 err = __rt_schedulable(tg, rt_period, rt_runtime);
10182 if (err) 10181 if (err)
10183 goto unlock; 10182 goto unlock;
10184 10183
10185 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10184 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10186 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10185 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10187 tg->rt_bandwidth.rt_runtime = rt_runtime; 10186 tg->rt_bandwidth.rt_runtime = rt_runtime;
10188 10187
10189 for_each_possible_cpu(i) { 10188 for_each_possible_cpu(i) {
10190 struct rt_rq *rt_rq = tg->rt_rq[i]; 10189 struct rt_rq *rt_rq = tg->rt_rq[i];
10191 10190
10192 spin_lock(&rt_rq->rt_runtime_lock); 10191 spin_lock(&rt_rq->rt_runtime_lock);
10193 rt_rq->rt_runtime = rt_runtime; 10192 rt_rq->rt_runtime = rt_runtime;
10194 spin_unlock(&rt_rq->rt_runtime_lock); 10193 spin_unlock(&rt_rq->rt_runtime_lock);
10195 } 10194 }
10196 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10195 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10197 unlock: 10196 unlock:
10198 read_unlock(&tasklist_lock); 10197 read_unlock(&tasklist_lock);
10199 mutex_unlock(&rt_constraints_mutex); 10198 mutex_unlock(&rt_constraints_mutex);
10200 10199
10201 return err; 10200 return err;
10202 } 10201 }
10203 10202
10204 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 10203 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
10205 { 10204 {
10206 u64 rt_runtime, rt_period; 10205 u64 rt_runtime, rt_period;
10207 10206
10208 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 10207 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10209 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 10208 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
10210 if (rt_runtime_us < 0) 10209 if (rt_runtime_us < 0)
10211 rt_runtime = RUNTIME_INF; 10210 rt_runtime = RUNTIME_INF;
10212 10211
10213 return tg_set_bandwidth(tg, rt_period, rt_runtime); 10212 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10214 } 10213 }
10215 10214
10216 long sched_group_rt_runtime(struct task_group *tg) 10215 long sched_group_rt_runtime(struct task_group *tg)
10217 { 10216 {
10218 u64 rt_runtime_us; 10217 u64 rt_runtime_us;
10219 10218
10220 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 10219 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
10221 return -1; 10220 return -1;
10222 10221
10223 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 10222 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
10224 do_div(rt_runtime_us, NSEC_PER_USEC); 10223 do_div(rt_runtime_us, NSEC_PER_USEC);
10225 return rt_runtime_us; 10224 return rt_runtime_us;
10226 } 10225 }
10227 10226
10228 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 10227 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
10229 { 10228 {
10230 u64 rt_runtime, rt_period; 10229 u64 rt_runtime, rt_period;
10231 10230
10232 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 10231 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
10233 rt_runtime = tg->rt_bandwidth.rt_runtime; 10232 rt_runtime = tg->rt_bandwidth.rt_runtime;
10234 10233
10235 if (rt_period == 0) 10234 if (rt_period == 0)
10236 return -EINVAL; 10235 return -EINVAL;
10237 10236
10238 return tg_set_bandwidth(tg, rt_period, rt_runtime); 10237 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10239 } 10238 }
10240 10239
10241 long sched_group_rt_period(struct task_group *tg) 10240 long sched_group_rt_period(struct task_group *tg)
10242 { 10241 {
10243 u64 rt_period_us; 10242 u64 rt_period_us;
10244 10243
10245 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 10244 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
10246 do_div(rt_period_us, NSEC_PER_USEC); 10245 do_div(rt_period_us, NSEC_PER_USEC);
10247 return rt_period_us; 10246 return rt_period_us;
10248 } 10247 }
10249 10248
10250 static int sched_rt_global_constraints(void) 10249 static int sched_rt_global_constraints(void)
10251 { 10250 {
10252 u64 runtime, period; 10251 u64 runtime, period;
10253 int ret = 0; 10252 int ret = 0;
10254 10253
10255 if (sysctl_sched_rt_period <= 0) 10254 if (sysctl_sched_rt_period <= 0)
10256 return -EINVAL; 10255 return -EINVAL;
10257 10256
10258 runtime = global_rt_runtime(); 10257 runtime = global_rt_runtime();
10259 period = global_rt_period(); 10258 period = global_rt_period();
10260 10259
10261 /* 10260 /*
10262 * Sanity check on the sysctl variables. 10261 * Sanity check on the sysctl variables.
10263 */ 10262 */
10264 if (runtime > period && runtime != RUNTIME_INF) 10263 if (runtime > period && runtime != RUNTIME_INF)
10265 return -EINVAL; 10264 return -EINVAL;
10266 10265
10267 mutex_lock(&rt_constraints_mutex); 10266 mutex_lock(&rt_constraints_mutex);
10268 read_lock(&tasklist_lock); 10267 read_lock(&tasklist_lock);
10269 ret = __rt_schedulable(NULL, 0, 0); 10268 ret = __rt_schedulable(NULL, 0, 0);
10270 read_unlock(&tasklist_lock); 10269 read_unlock(&tasklist_lock);
10271 mutex_unlock(&rt_constraints_mutex); 10270 mutex_unlock(&rt_constraints_mutex);
10272 10271
10273 return ret; 10272 return ret;
10274 } 10273 }
10275 10274
10276 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 10275 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
10277 { 10276 {
10278 /* Don't accept realtime tasks when there is no way for them to run */ 10277 /* Don't accept realtime tasks when there is no way for them to run */
10279 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 10278 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
10280 return 0; 10279 return 0;
10281 10280
10282 return 1; 10281 return 1;
10283 } 10282 }
10284 10283
10285 #else /* !CONFIG_RT_GROUP_SCHED */ 10284 #else /* !CONFIG_RT_GROUP_SCHED */
10286 static int sched_rt_global_constraints(void) 10285 static int sched_rt_global_constraints(void)
10287 { 10286 {
10288 unsigned long flags; 10287 unsigned long flags;
10289 int i; 10288 int i;
10290 10289
10291 if (sysctl_sched_rt_period <= 0) 10290 if (sysctl_sched_rt_period <= 0)
10292 return -EINVAL; 10291 return -EINVAL;
10293 10292
10294 /* 10293 /*
10295 * There's always some RT tasks in the root group 10294 * There's always some RT tasks in the root group
10296 * -- migration, kstopmachine etc.. 10295 * -- migration, kstopmachine etc..
10297 */ 10296 */
10298 if (sysctl_sched_rt_runtime == 0) 10297 if (sysctl_sched_rt_runtime == 0)
10299 return -EBUSY; 10298 return -EBUSY;
10300 10299
10301 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10300 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10302 for_each_possible_cpu(i) { 10301 for_each_possible_cpu(i) {
10303 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10302 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10304 10303
10305 spin_lock(&rt_rq->rt_runtime_lock); 10304 spin_lock(&rt_rq->rt_runtime_lock);
10306 rt_rq->rt_runtime = global_rt_runtime(); 10305 rt_rq->rt_runtime = global_rt_runtime();
10307 spin_unlock(&rt_rq->rt_runtime_lock); 10306 spin_unlock(&rt_rq->rt_runtime_lock);
10308 } 10307 }
10309 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10308 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10310 10309
10311 return 0; 10310 return 0;
10312 } 10311 }
10313 #endif /* CONFIG_RT_GROUP_SCHED */ 10312 #endif /* CONFIG_RT_GROUP_SCHED */
10314 10313
10315 int sched_rt_handler(struct ctl_table *table, int write, 10314 int sched_rt_handler(struct ctl_table *table, int write,
10316 void __user *buffer, size_t *lenp, 10315 void __user *buffer, size_t *lenp,
10317 loff_t *ppos) 10316 loff_t *ppos)
10318 { 10317 {
10319 int ret; 10318 int ret;
10320 int old_period, old_runtime; 10319 int old_period, old_runtime;
10321 static DEFINE_MUTEX(mutex); 10320 static DEFINE_MUTEX(mutex);
10322 10321
10323 mutex_lock(&mutex); 10322 mutex_lock(&mutex);
10324 old_period = sysctl_sched_rt_period; 10323 old_period = sysctl_sched_rt_period;
10325 old_runtime = sysctl_sched_rt_runtime; 10324 old_runtime = sysctl_sched_rt_runtime;
10326 10325
10327 ret = proc_dointvec(table, write, buffer, lenp, ppos); 10326 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10328 10327
10329 if (!ret && write) { 10328 if (!ret && write) {
10330 ret = sched_rt_global_constraints(); 10329 ret = sched_rt_global_constraints();
10331 if (ret) { 10330 if (ret) {
10332 sysctl_sched_rt_period = old_period; 10331 sysctl_sched_rt_period = old_period;
10333 sysctl_sched_rt_runtime = old_runtime; 10332 sysctl_sched_rt_runtime = old_runtime;
10334 } else { 10333 } else {
10335 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 10334 def_rt_bandwidth.rt_runtime = global_rt_runtime();
10336 def_rt_bandwidth.rt_period = 10335 def_rt_bandwidth.rt_period =
10337 ns_to_ktime(global_rt_period()); 10336 ns_to_ktime(global_rt_period());
10338 } 10337 }
10339 } 10338 }
10340 mutex_unlock(&mutex); 10339 mutex_unlock(&mutex);
10341 10340
10342 return ret; 10341 return ret;
10343 } 10342 }
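sched_rt_handler() is reached only through a /proc/sys ctl_table entry. A sketch of what such an entry could look like; only the proc-facing fields are filled in, and the table name and exact field values are illustrative rather than copied from kernel/sysctl.c:

        static struct ctl_table demo_sched_table[] = {
                {
                        .procname       = "sched_rt_period_us",
                        .data           = &sysctl_sched_rt_period,
                        .maxlen         = sizeof(unsigned int),
                        .mode           = 0644,
                        .proc_handler   = sched_rt_handler,
                },
                { }     /* zero-filled sentinel terminates the table */
        };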
10344 10343
10345 #ifdef CONFIG_CGROUP_SCHED 10344 #ifdef CONFIG_CGROUP_SCHED
10346 10345
10347 /* return corresponding task_group object of a cgroup */ 10346 /* return corresponding task_group object of a cgroup */
10348 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 10347 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
10349 { 10348 {
10350 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 10349 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
10351 struct task_group, css); 10350 struct task_group, css);
10352 } 10351 }
10353 10352
10354 static struct cgroup_subsys_state * 10353 static struct cgroup_subsys_state *
10355 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 10354 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
10356 { 10355 {
10357 struct task_group *tg, *parent; 10356 struct task_group *tg, *parent;
10358 10357
10359 if (!cgrp->parent) { 10358 if (!cgrp->parent) {
10360 /* This is early initialization for the top cgroup */ 10359 /* This is early initialization for the top cgroup */
10361 return &init_task_group.css; 10360 return &init_task_group.css;
10362 } 10361 }
10363 10362
10364 parent = cgroup_tg(cgrp->parent); 10363 parent = cgroup_tg(cgrp->parent);
10365 tg = sched_create_group(parent); 10364 tg = sched_create_group(parent);
10366 if (IS_ERR(tg)) 10365 if (IS_ERR(tg))
10367 return ERR_PTR(-ENOMEM); 10366 return ERR_PTR(-ENOMEM);
10368 10367
10369 return &tg->css; 10368 return &tg->css;
10370 } 10369 }
10371 10370
10372 static void 10371 static void
10373 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10372 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10374 { 10373 {
10375 struct task_group *tg = cgroup_tg(cgrp); 10374 struct task_group *tg = cgroup_tg(cgrp);
10376 10375
10377 sched_destroy_group(tg); 10376 sched_destroy_group(tg);
10378 } 10377 }
10379 10378
10380 static int 10379 static int
10381 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 10380 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10382 { 10381 {
10383 #ifdef CONFIG_RT_GROUP_SCHED 10382 #ifdef CONFIG_RT_GROUP_SCHED
10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10383 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
10385 return -EINVAL; 10384 return -EINVAL;
10386 #else 10385 #else
10387 /* We don't support RT-tasks being in separate groups */ 10386 /* We don't support RT-tasks being in separate groups */
10388 if (tsk->sched_class != &fair_sched_class) 10387 if (tsk->sched_class != &fair_sched_class)
10389 return -EINVAL; 10388 return -EINVAL;
10390 #endif 10389 #endif
10391 return 0; 10390 return 0;
10392 } 10391 }
10393 10392
10394 static int 10393 static int
10395 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10394 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10396 struct task_struct *tsk, bool threadgroup) 10395 struct task_struct *tsk, bool threadgroup)
10397 { 10396 {
10398 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 10397 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10399 if (retval) 10398 if (retval)
10400 return retval; 10399 return retval;
10401 if (threadgroup) { 10400 if (threadgroup) {
10402 struct task_struct *c; 10401 struct task_struct *c;
10403 rcu_read_lock(); 10402 rcu_read_lock();
10404 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 10403 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10405 retval = cpu_cgroup_can_attach_task(cgrp, c); 10404 retval = cpu_cgroup_can_attach_task(cgrp, c);
10406 if (retval) { 10405 if (retval) {
10407 rcu_read_unlock(); 10406 rcu_read_unlock();
10408 return retval; 10407 return retval;
10409 } 10408 }
10410 } 10409 }
10411 rcu_read_unlock(); 10410 rcu_read_unlock();
10412 } 10411 }
10413 return 0; 10412 return 0;
10414 } 10413 }
10415 10414
10416 static void 10415 static void
10417 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10416 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10418 struct cgroup *old_cont, struct task_struct *tsk, 10417 struct cgroup *old_cont, struct task_struct *tsk,
10419 bool threadgroup) 10418 bool threadgroup)
10420 { 10419 {
10421 sched_move_task(tsk); 10420 sched_move_task(tsk);
10422 if (threadgroup) { 10421 if (threadgroup) {
10423 struct task_struct *c; 10422 struct task_struct *c;
10424 rcu_read_lock(); 10423 rcu_read_lock();
10425 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 10424 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10426 sched_move_task(c); 10425 sched_move_task(c);
10427 } 10426 }
10428 rcu_read_unlock(); 10427 rcu_read_unlock();
10429 } 10428 }
10430 } 10429 }
10431 10430
10432 #ifdef CONFIG_FAIR_GROUP_SCHED 10431 #ifdef CONFIG_FAIR_GROUP_SCHED
10433 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 10432 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
10434 u64 shareval) 10433 u64 shareval)
10435 { 10434 {
10436 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 10435 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
10437 } 10436 }
10438 10437
10439 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 10438 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
10440 { 10439 {
10441 struct task_group *tg = cgroup_tg(cgrp); 10440 struct task_group *tg = cgroup_tg(cgrp);
10442 10441
10443 return (u64) tg->shares; 10442 return (u64) tg->shares;
10444 } 10443 }
10445 #endif /* CONFIG_FAIR_GROUP_SCHED */ 10444 #endif /* CONFIG_FAIR_GROUP_SCHED */
10446 10445
10447 #ifdef CONFIG_RT_GROUP_SCHED 10446 #ifdef CONFIG_RT_GROUP_SCHED
10448 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 10447 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
10449 s64 val) 10448 s64 val)
10450 { 10449 {
10451 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 10450 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
10452 } 10451 }
10453 10452
10454 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 10453 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
10455 { 10454 {
10456 return sched_group_rt_runtime(cgroup_tg(cgrp)); 10455 return sched_group_rt_runtime(cgroup_tg(cgrp));
10457 } 10456 }
10458 10457
10459 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 10458 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
10460 u64 rt_period_us) 10459 u64 rt_period_us)
10461 { 10460 {
10462 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 10461 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
10463 } 10462 }
10464 10463
10465 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 10464 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
10466 { 10465 {
10467 return sched_group_rt_period(cgroup_tg(cgrp)); 10466 return sched_group_rt_period(cgroup_tg(cgrp));
10468 } 10467 }
10469 #endif /* CONFIG_RT_GROUP_SCHED */ 10468 #endif /* CONFIG_RT_GROUP_SCHED */
10470 10469
10471 static struct cftype cpu_files[] = { 10470 static struct cftype cpu_files[] = {
10472 #ifdef CONFIG_FAIR_GROUP_SCHED 10471 #ifdef CONFIG_FAIR_GROUP_SCHED
10473 { 10472 {
10474 .name = "shares", 10473 .name = "shares",
10475 .read_u64 = cpu_shares_read_u64, 10474 .read_u64 = cpu_shares_read_u64,
10476 .write_u64 = cpu_shares_write_u64, 10475 .write_u64 = cpu_shares_write_u64,
10477 }, 10476 },
10478 #endif 10477 #endif
10479 #ifdef CONFIG_RT_GROUP_SCHED 10478 #ifdef CONFIG_RT_GROUP_SCHED
10480 { 10479 {
10481 .name = "rt_runtime_us", 10480 .name = "rt_runtime_us",
10482 .read_s64 = cpu_rt_runtime_read, 10481 .read_s64 = cpu_rt_runtime_read,
10483 .write_s64 = cpu_rt_runtime_write, 10482 .write_s64 = cpu_rt_runtime_write,
10484 }, 10483 },
10485 { 10484 {
10486 .name = "rt_period_us", 10485 .name = "rt_period_us",
10487 .read_u64 = cpu_rt_period_read_uint, 10486 .read_u64 = cpu_rt_period_read_uint,
10488 .write_u64 = cpu_rt_period_write_uint, 10487 .write_u64 = cpu_rt_period_write_uint,
10489 }, 10488 },
10490 #endif 10489 #endif
10491 }; 10490 };
10492 10491
10493 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 10492 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
10494 { 10493 {
10495 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 10494 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
10496 } 10495 }
10497 10496
10498 struct cgroup_subsys cpu_cgroup_subsys = { 10497 struct cgroup_subsys cpu_cgroup_subsys = {
10499 .name = "cpu", 10498 .name = "cpu",
10500 .create = cpu_cgroup_create, 10499 .create = cpu_cgroup_create,
10501 .destroy = cpu_cgroup_destroy, 10500 .destroy = cpu_cgroup_destroy,
10502 .can_attach = cpu_cgroup_can_attach, 10501 .can_attach = cpu_cgroup_can_attach,
10503 .attach = cpu_cgroup_attach, 10502 .attach = cpu_cgroup_attach,
10504 .populate = cpu_cgroup_populate, 10503 .populate = cpu_cgroup_populate,
10505 .subsys_id = cpu_cgroup_subsys_id, 10504 .subsys_id = cpu_cgroup_subsys_id,
10506 .early_init = 1, 10505 .early_init = 1,
10507 }; 10506 };
10508 10507
10509 #endif /* CONFIG_CGROUP_SCHED */ 10508 #endif /* CONFIG_CGROUP_SCHED */
10510 10509
10511 #ifdef CONFIG_CGROUP_CPUACCT 10510 #ifdef CONFIG_CGROUP_CPUACCT
10512 10511
10513 /* 10512 /*
10514 * CPU accounting code for task groups. 10513 * CPU accounting code for task groups.
10515 * 10514 *
10516 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 10515 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
10517 * (balbir@in.ibm.com). 10516 * (balbir@in.ibm.com).
10518 */ 10517 */
10519 10518
10520 /* track cpu usage of a group of tasks and its child groups */ 10519 /* track cpu usage of a group of tasks and its child groups */
10521 struct cpuacct { 10520 struct cpuacct {
10522 struct cgroup_subsys_state css; 10521 struct cgroup_subsys_state css;
10523 /* cpuusage holds pointer to a u64-type object on every cpu */ 10522 /* cpuusage holds pointer to a u64-type object on every cpu */
10524 u64 *cpuusage; 10523 u64 *cpuusage;
10525 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 10524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10526 struct cpuacct *parent; 10525 struct cpuacct *parent;
10527 }; 10526 };
10528 10527
10529 struct cgroup_subsys cpuacct_subsys; 10528 struct cgroup_subsys cpuacct_subsys;
10530 10529
10531 /* return cpu accounting group corresponding to this container */ 10530 /* return cpu accounting group corresponding to this container */
10532 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 10531 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
10533 { 10532 {
10534 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 10533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
10535 struct cpuacct, css); 10534 struct cpuacct, css);
10536 } 10535 }
10537 10536
10538 /* return cpu accounting group to which this task belongs */ 10537 /* return cpu accounting group to which this task belongs */
10539 static inline struct cpuacct *task_ca(struct task_struct *tsk) 10538 static inline struct cpuacct *task_ca(struct task_struct *tsk)
10540 { 10539 {
10541 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 10540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
10542 struct cpuacct, css); 10541 struct cpuacct, css);
10543 } 10542 }
10544 10543
10545 /* create a new cpu accounting group */ 10544 /* create a new cpu accounting group */
10546 static struct cgroup_subsys_state *cpuacct_create( 10545 static struct cgroup_subsys_state *cpuacct_create(
10547 struct cgroup_subsys *ss, struct cgroup *cgrp) 10546 struct cgroup_subsys *ss, struct cgroup *cgrp)
10548 { 10547 {
10549 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10550 int i; 10549 int i;
10551 10550
10552 if (!ca) 10551 if (!ca)
10553 goto out; 10552 goto out;
10554 10553
10555 ca->cpuusage = alloc_percpu(u64); 10554 ca->cpuusage = alloc_percpu(u64);
10556 if (!ca->cpuusage) 10555 if (!ca->cpuusage)
10557 goto out_free_ca; 10556 goto out_free_ca;
10558 10557
10559 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 10558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10560 if (percpu_counter_init(&ca->cpustat[i], 0)) 10559 if (percpu_counter_init(&ca->cpustat[i], 0))
10561 goto out_free_counters; 10560 goto out_free_counters;
10562 10561
10563 if (cgrp->parent) 10562 if (cgrp->parent)
10564 ca->parent = cgroup_ca(cgrp->parent); 10563 ca->parent = cgroup_ca(cgrp->parent);
10565 10564
10566 return &ca->css; 10565 return &ca->css;
10567 10566
10568 out_free_counters: 10567 out_free_counters:
10569 while (--i >= 0) 10568 while (--i >= 0)
10570 percpu_counter_destroy(&ca->cpustat[i]); 10569 percpu_counter_destroy(&ca->cpustat[i]);
10571 free_percpu(ca->cpuusage); 10570 free_percpu(ca->cpuusage);
10572 out_free_ca: 10571 out_free_ca:
10573 kfree(ca); 10572 kfree(ca);
10574 out: 10573 out:
10575 return ERR_PTR(-ENOMEM); 10574 return ERR_PTR(-ENOMEM);
10576 } 10575 }
10577 10576
10578 /* destroy an existing cpu accounting group */ 10577 /* destroy an existing cpu accounting group */
10579 static void 10578 static void
10580 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10579 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10581 { 10580 {
10582 struct cpuacct *ca = cgroup_ca(cgrp); 10581 struct cpuacct *ca = cgroup_ca(cgrp);
10583 int i; 10582 int i;
10584 10583
10585 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 10584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10586 percpu_counter_destroy(&ca->cpustat[i]); 10585 percpu_counter_destroy(&ca->cpustat[i]);
10587 free_percpu(ca->cpuusage); 10586 free_percpu(ca->cpuusage);
10588 kfree(ca); 10587 kfree(ca);
10589 } 10588 }
10590 10589
10591 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 10590 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10592 { 10591 {
10593 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10592 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10594 u64 data; 10593 u64 data;
10595 10594
10596 #ifndef CONFIG_64BIT 10595 #ifndef CONFIG_64BIT
10597 /* 10596 /*
10598 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10597 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10599 */ 10598 */
10600 spin_lock_irq(&cpu_rq(cpu)->lock); 10599 spin_lock_irq(&cpu_rq(cpu)->lock);
10601 data = *cpuusage; 10600 data = *cpuusage;
10602 spin_unlock_irq(&cpu_rq(cpu)->lock); 10601 spin_unlock_irq(&cpu_rq(cpu)->lock);
10603 #else 10602 #else
10604 data = *cpuusage; 10603 data = *cpuusage;
10605 #endif 10604 #endif
10606 10605
10607 return data; 10606 return data;
10608 } 10607 }
10609 10608
10610 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 10609 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10611 { 10610 {
10612 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10611 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10613 10612
10614 #ifndef CONFIG_64BIT 10613 #ifndef CONFIG_64BIT
10615 /* 10614 /*
10616 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10615 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10617 */ 10616 */
10618 spin_lock_irq(&cpu_rq(cpu)->lock); 10617 spin_lock_irq(&cpu_rq(cpu)->lock);
10619 *cpuusage = val; 10618 *cpuusage = val;
10620 spin_unlock_irq(&cpu_rq(cpu)->lock); 10619 spin_unlock_irq(&cpu_rq(cpu)->lock);
10621 #else 10620 #else
10622 *cpuusage = val; 10621 *cpuusage = val;
10623 #endif 10622 #endif
10624 } 10623 }
10625 10624
10626 /* return total cpu usage (in nanoseconds) of a group */ 10625 /* return total cpu usage (in nanoseconds) of a group */
10627 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 10626 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
10628 { 10627 {
10629 struct cpuacct *ca = cgroup_ca(cgrp); 10628 struct cpuacct *ca = cgroup_ca(cgrp);
10630 u64 totalcpuusage = 0; 10629 u64 totalcpuusage = 0;
10631 int i; 10630 int i;
10632 10631
10633 for_each_present_cpu(i) 10632 for_each_present_cpu(i)
10634 totalcpuusage += cpuacct_cpuusage_read(ca, i); 10633 totalcpuusage += cpuacct_cpuusage_read(ca, i);
10635 10634
10636 return totalcpuusage; 10635 return totalcpuusage;
10637 } 10636 }
10638 10637
10639 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 10638 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
10640 u64 reset) 10639 u64 reset)
10641 { 10640 {
10642 struct cpuacct *ca = cgroup_ca(cgrp); 10641 struct cpuacct *ca = cgroup_ca(cgrp);
10643 int err = 0; 10642 int err = 0;
10644 int i; 10643 int i;
10645 10644
10646 if (reset) { 10645 if (reset) {
10647 err = -EINVAL; 10646 err = -EINVAL;
10648 goto out; 10647 goto out;
10649 } 10648 }
10650 10649
10651 for_each_present_cpu(i) 10650 for_each_present_cpu(i)
10652 cpuacct_cpuusage_write(ca, i, 0); 10651 cpuacct_cpuusage_write(ca, i, 0);
10653 10652
10654 out: 10653 out:
10655 return err; 10654 return err;
10656 } 10655 }
10657 10656
10658 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 10657 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10659 struct seq_file *m) 10658 struct seq_file *m)
10660 { 10659 {
10661 struct cpuacct *ca = cgroup_ca(cgroup); 10660 struct cpuacct *ca = cgroup_ca(cgroup);
10662 u64 percpu; 10661 u64 percpu;
10663 int i; 10662 int i;
10664 10663
10665 for_each_present_cpu(i) { 10664 for_each_present_cpu(i) {
10666 percpu = cpuacct_cpuusage_read(ca, i); 10665 percpu = cpuacct_cpuusage_read(ca, i);
10667 seq_printf(m, "%llu ", (unsigned long long) percpu); 10666 seq_printf(m, "%llu ", (unsigned long long) percpu);
10668 } 10667 }
10669 seq_printf(m, "\n"); 10668 seq_printf(m, "\n");
10670 return 0; 10669 return 0;
10671 } 10670 }
10672 10671
10673 static const char *cpuacct_stat_desc[] = { 10672 static const char *cpuacct_stat_desc[] = {
10674 [CPUACCT_STAT_USER] = "user", 10673 [CPUACCT_STAT_USER] = "user",
10675 [CPUACCT_STAT_SYSTEM] = "system", 10674 [CPUACCT_STAT_SYSTEM] = "system",
10676 }; 10675 };
10677 10676
10678 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 10677 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10679 struct cgroup_map_cb *cb) 10678 struct cgroup_map_cb *cb)
10680 { 10679 {
10681 struct cpuacct *ca = cgroup_ca(cgrp); 10680 struct cpuacct *ca = cgroup_ca(cgrp);
10682 int i; 10681 int i;
10683 10682
10684 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 10683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10685 s64 val = percpu_counter_read(&ca->cpustat[i]); 10684 s64 val = percpu_counter_read(&ca->cpustat[i]);
10686 val = cputime64_to_clock_t(val); 10685 val = cputime64_to_clock_t(val);
10687 cb->fill(cb, cpuacct_stat_desc[i], val); 10686 cb->fill(cb, cpuacct_stat_desc[i], val);
10688 } 10687 }
10689 return 0; 10688 return 0;
10690 } 10689 }
10691 10690
10692 static struct cftype files[] = { 10691 static struct cftype files[] = {
10693 { 10692 {
10694 .name = "usage", 10693 .name = "usage",
10695 .read_u64 = cpuusage_read, 10694 .read_u64 = cpuusage_read,
10696 .write_u64 = cpuusage_write, 10695 .write_u64 = cpuusage_write,
10697 }, 10696 },
10698 { 10697 {
10699 .name = "usage_percpu", 10698 .name = "usage_percpu",
10700 .read_seq_string = cpuacct_percpu_seq_read, 10699 .read_seq_string = cpuacct_percpu_seq_read,
10701 }, 10700 },
10702 { 10701 {
10703 .name = "stat", 10702 .name = "stat",
10704 .read_map = cpuacct_stats_show, 10703 .read_map = cpuacct_stats_show,
10705 }, 10704 },
10706 }; 10705 };
10707 10706
10708 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10707 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
10709 { 10708 {
10710 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 10709 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
10711 } 10710 }
10712 10711
10713 /* 10712 /*
10714 * charge this task's execution time to its accounting group. 10713 * charge this task's execution time to its accounting group.
10715 * 10714 *
10716 * called with rq->lock held. 10715 * called with rq->lock held.
10717 */ 10716 */
10718 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 10717 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10719 { 10718 {
10720 struct cpuacct *ca; 10719 struct cpuacct *ca;
10721 int cpu; 10720 int cpu;
10722 10721
10723 if (unlikely(!cpuacct_subsys.active)) 10722 if (unlikely(!cpuacct_subsys.active))
10724 return; 10723 return;
10725 10724
10726 cpu = task_cpu(tsk); 10725 cpu = task_cpu(tsk);
10727 10726
10728 rcu_read_lock(); 10727 rcu_read_lock();
10729 10728
10730 ca = task_ca(tsk); 10729 ca = task_ca(tsk);
10731 10730
10732 for (; ca; ca = ca->parent) { 10731 for (; ca; ca = ca->parent) {
10733 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10734 *cpuusage += cputime; 10733 *cpuusage += cputime;
10735 } 10734 }
10736 10735
10737 rcu_read_unlock(); 10736 rcu_read_unlock();
10738 } 10737 }
10739 10738
10740 /* 10739 /*
10741 * Charge the system/user time to the task's accounting group. 10740 * Charge the system/user time to the task's accounting group.
10742 */ 10741 */
10743 static void cpuacct_update_stats(struct task_struct *tsk, 10742 static void cpuacct_update_stats(struct task_struct *tsk,
10744 enum cpuacct_stat_index idx, cputime_t val) 10743 enum cpuacct_stat_index idx, cputime_t val)
10745 { 10744 {
10746 struct cpuacct *ca; 10745 struct cpuacct *ca;
10747 10746
10748 if (unlikely(!cpuacct_subsys.active)) 10747 if (unlikely(!cpuacct_subsys.active))
10749 return; 10748 return;
10750 10749
10751 rcu_read_lock(); 10750 rcu_read_lock();
10752 ca = task_ca(tsk); 10751 ca = task_ca(tsk);
10753 10752
10754 do { 10753 do {
10755 percpu_counter_add(&ca->cpustat[idx], val); 10754 percpu_counter_add(&ca->cpustat[idx], val);
10756 ca = ca->parent; 10755 ca = ca->parent;
10757 } while (ca); 10756 } while (ca);
10758 rcu_read_unlock(); 10757 rcu_read_unlock();
10759 } 10758 }
10760 10759
10761 struct cgroup_subsys cpuacct_subsys = { 10760 struct cgroup_subsys cpuacct_subsys = {
10762 .name = "cpuacct", 10761 .name = "cpuacct",
10763 .create = cpuacct_create, 10762 .create = cpuacct_create,
10764 .destroy = cpuacct_destroy, 10763 .destroy = cpuacct_destroy,
10765 .populate = cpuacct_populate, 10764 .populate = cpuacct_populate,
10766 .subsys_id = cpuacct_subsys_id, 10765 .subsys_id = cpuacct_subsys_id,
10767 }; 10766 };
10768 #endif /* CONFIG_CGROUP_CPUACCT */ 10767 #endif /* CONFIG_CGROUP_CPUACCT */
10769 10768
10770 #ifndef CONFIG_SMP 10769 #ifndef CONFIG_SMP
10771 10770
10772 int rcu_expedited_torture_stats(char *page) 10771 int rcu_expedited_torture_stats(char *page)
10773 { 10772 {
10774 return 0; 10773 return 0;
10775 } 10774 }
10776 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10775 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10777 10776
10778 void synchronize_sched_expedited(void) 10777 void synchronize_sched_expedited(void)
10779 { 10778 {
10780 } 10779 }
10781 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10780 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10782 10781
10783 #else /* #ifndef CONFIG_SMP */ 10782 #else /* #ifndef CONFIG_SMP */
10784 10783
10785 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 10784 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10786 static DEFINE_MUTEX(rcu_sched_expedited_mutex); 10785 static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10787 10786
10788 #define RCU_EXPEDITED_STATE_POST -2 10787 #define RCU_EXPEDITED_STATE_POST -2
10789 #define RCU_EXPEDITED_STATE_IDLE -1 10788 #define RCU_EXPEDITED_STATE_IDLE -1
10790 10789
10791 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10790 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10792 10791
10793 int rcu_expedited_torture_stats(char *page) 10792 int rcu_expedited_torture_stats(char *page)
10794 { 10793 {
10795 int cnt = 0; 10794 int cnt = 0;
10796 int cpu; 10795 int cpu;
10797 10796
10798 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 10797 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10799 for_each_online_cpu(cpu) { 10798 for_each_online_cpu(cpu) {
10800 cnt += sprintf(&page[cnt], " %d:%d", 10799 cnt += sprintf(&page[cnt], " %d:%d",
10801 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 10800 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10802 } 10801 }
10803 cnt += sprintf(&page[cnt], "\n"); 10802 cnt += sprintf(&page[cnt], "\n");
10804 return cnt; 10803 return cnt;
10805 } 10804 }
10806 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10805 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10807 10806
10808 static long synchronize_sched_expedited_count; 10807 static long synchronize_sched_expedited_count;
10809 10808
10810 /* 10809 /*
10811 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 10810 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10812 * approach to force grace period to end quickly. This consumes 10811 * approach to force grace period to end quickly. This consumes
10813 * significant time on all CPUs, and is thus not recommended for 10812 * significant time on all CPUs, and is thus not recommended for
10814 * any sort of common-case code. 10813 * any sort of common-case code.
10815 * 10814 *
10816 * Note that it is illegal to call this function while holding any 10815 * Note that it is illegal to call this function while holding any
10817 * lock that is acquired by a CPU-hotplug notifier. Failing to 10816 * lock that is acquired by a CPU-hotplug notifier. Failing to
10818 * observe this restriction will result in deadlock. 10817 * observe this restriction will result in deadlock.
10819 */ 10818 */
10820 void synchronize_sched_expedited(void) 10819 void synchronize_sched_expedited(void)
10821 { 10820 {
10822 int cpu; 10821 int cpu;
10823 unsigned long flags; 10822 unsigned long flags;
10824 bool need_full_sync = 0; 10823 bool need_full_sync = 0;
10825 struct rq *rq; 10824 struct rq *rq;
10826 struct migration_req *req; 10825 struct migration_req *req;
10827 long snap; 10826 long snap;
10828 int trycount = 0; 10827 int trycount = 0;
10829 10828
10830 smp_mb(); /* ensure prior mod happens before capturing snap. */ 10829 smp_mb(); /* ensure prior mod happens before capturing snap. */
10831 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 10830 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10832 get_online_cpus(); 10831 get_online_cpus();
10833 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 10832 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10834 put_online_cpus(); 10833 put_online_cpus();
10835 if (trycount++ < 10) 10834 if (trycount++ < 10)
10836 udelay(trycount * num_online_cpus()); 10835 udelay(trycount * num_online_cpus());
10837 else { 10836 else {
10838 synchronize_sched(); 10837 synchronize_sched();
10839 return; 10838 return;
10840 } 10839 }
10841 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 10840 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10842 smp_mb(); /* ensure test happens before caller kfree */ 10841 smp_mb(); /* ensure test happens before caller kfree */
10843 return; 10842 return;
10844 } 10843 }
10845 get_online_cpus(); 10844 get_online_cpus();
10846 } 10845 }
10847 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 10846 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10848 for_each_online_cpu(cpu) { 10847 for_each_online_cpu(cpu) {
10849 rq = cpu_rq(cpu); 10848 rq = cpu_rq(cpu);
10850 req = &per_cpu(rcu_migration_req, cpu); 10849 req = &per_cpu(rcu_migration_req, cpu);
10851 init_completion(&req->done); 10850 init_completion(&req->done);
10852 req->task = NULL; 10851 req->task = NULL;
10853 req->dest_cpu = RCU_MIGRATION_NEED_QS; 10852 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10854 spin_lock_irqsave(&rq->lock, flags); 10853 spin_lock_irqsave(&rq->lock, flags);
10855 list_add(&req->list, &rq->migration_queue); 10854 list_add(&req->list, &rq->migration_queue);
10856 spin_unlock_irqrestore(&rq->lock, flags); 10855 spin_unlock_irqrestore(&rq->lock, flags);
10857 wake_up_process(rq->migration_thread); 10856 wake_up_process(rq->migration_thread);
10858 } 10857 }
10859 for_each_online_cpu(cpu) { 10858 for_each_online_cpu(cpu) {
10860 rcu_expedited_state = cpu; 10859 rcu_expedited_state = cpu;
10861 req = &per_cpu(rcu_migration_req, cpu); 10860 req = &per_cpu(rcu_migration_req, cpu);
10862 rq = cpu_rq(cpu); 10861 rq = cpu_rq(cpu);
10863 wait_for_completion(&req->done); 10862 wait_for_completion(&req->done);
10864 spin_lock_irqsave(&rq->lock, flags); 10863 spin_lock_irqsave(&rq->lock, flags);
10865 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 10864 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10866 need_full_sync = 1; 10865 need_full_sync = 1;
10867 req->dest_cpu = RCU_MIGRATION_IDLE; 10866 req->dest_cpu = RCU_MIGRATION_IDLE;
10868 spin_unlock_irqrestore(&rq->lock, flags); 10867 spin_unlock_irqrestore(&rq->lock, flags);
10869 } 10868 }
10870 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10869 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10871 mutex_unlock(&rcu_sched_expedited_mutex); 10870 mutex_unlock(&rcu_sched_expedited_mutex);
10872 put_online_cpus(); 10871 put_online_cpus();
10873 if (need_full_sync) 10872 if (need_full_sync)
10874 synchronize_sched(); 10873 synchronize_sched();
10875 } 10874 }
10876 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10875 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10877 10876
10878 #endif /* #else #ifndef CONFIG_SMP */ 10877 #endif /* #else #ifndef CONFIG_SMP */
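
The function above gives callers a faster-completing alternative to synchronize_sched() by pushing a quiescent state through each CPU's migration thread. A minimal caller sketch, assuming a hypothetical RCU-protected list (struct foo, foo_lock and foo_retire() are invented for illustration), showing the usual unlink, wait, then free pattern:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical element of an RCU-protected list. */
struct foo {
        struct list_head        list;
        int                     key;
};

static LIST_HEAD(foo_list);             /* walked under rcu_read_lock() */
static DEFINE_SPINLOCK(foo_lock);       /* serialises updaters */

static void foo_retire(struct foo *f)
{
        spin_lock(&foo_lock);
        list_del_rcu(&f->list);
        spin_unlock(&foo_lock);

        /*
         * Expedited grace period: all pre-existing preempt-disabled
         * readers have finished when this returns.  Per the comment
         * above, no lock taken by a CPU-hotplug notifier may be held
         * here.
         */
        synchronize_sched_expedited();

        kfree(f);
}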
10879 10878
kernel/slow-work.c
1 /* Worker thread pool for slow items, such as filesystem lookups or mkdirs 1 /* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 * 10 *
11 * See Documentation/slow-work.txt 11 * See Documentation/slow-work.txt
12 */ 12 */
13 13
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/slow-work.h> 15 #include <linux/slow-work.h>
16 #include <linux/kthread.h> 16 #include <linux/kthread.h>
17 #include <linux/freezer.h> 17 #include <linux/freezer.h>
18 #include <linux/wait.h> 18 #include <linux/wait.h>
19 19
20 #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20 #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
21 * things to do */ 21 * things to do */
22 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after 22 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */ 23 * OOM */
24 24
25 static void slow_work_cull_timeout(unsigned long); 25 static void slow_work_cull_timeout(unsigned long);
26 static void slow_work_oom_timeout(unsigned long); 26 static void slow_work_oom_timeout(unsigned long);
27 27
28 #ifdef CONFIG_SYSCTL 28 #ifdef CONFIG_SYSCTL
29 static int slow_work_min_threads_sysctl(struct ctl_table *, int, 29 static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
31 31
32 static int slow_work_max_threads_sysctl(struct ctl_table *, int , 32 static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 33 void __user *, size_t *, loff_t *);
34 #endif 34 #endif
35 35
36 /* 36 /*
37 * The pool of threads has at least min threads in it as long as someone is 37 * The pool of threads has at least min threads in it as long as someone is
38 * using the facility, and may have as many as max. 38 * using the facility, and may have as many as max.
39 * 39 *
40 * A portion of the pool may be processing very slow operations. 40 * A portion of the pool may be processing very slow operations.
41 */ 41 */
42 static unsigned slow_work_min_threads = 2; 42 static unsigned slow_work_min_threads = 2;
43 static unsigned slow_work_max_threads = 4; 43 static unsigned slow_work_max_threads = 4;
44 static unsigned vslow_work_proportion = 50; /* % of threads that may process 44 static unsigned vslow_work_proportion = 50; /* % of threads that may process
45 * very slow work */ 45 * very slow work */
46 46
47 #ifdef CONFIG_SYSCTL 47 #ifdef CONFIG_SYSCTL
48 static const int slow_work_min_min_threads = 2; 48 static const int slow_work_min_min_threads = 2;
49 static int slow_work_max_max_threads = 255; 49 static int slow_work_max_max_threads = 255;
50 static const int slow_work_min_vslow = 1; 50 static const int slow_work_min_vslow = 1;
51 static const int slow_work_max_vslow = 99; 51 static const int slow_work_max_vslow = 99;
52 52
53 ctl_table slow_work_sysctls[] = { 53 ctl_table slow_work_sysctls[] = {
54 { 54 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads", 55 .procname = "min-threads",
57 .data = &slow_work_min_threads, 56 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned), 57 .maxlen = sizeof(unsigned),
59 .mode = 0644, 58 .mode = 0644,
60 .proc_handler = slow_work_min_threads_sysctl, 59 .proc_handler = slow_work_min_threads_sysctl,
61 .extra1 = (void *) &slow_work_min_min_threads, 60 .extra1 = (void *) &slow_work_min_min_threads,
62 .extra2 = &slow_work_max_threads, 61 .extra2 = &slow_work_max_threads,
63 }, 62 },
64 { 63 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads", 64 .procname = "max-threads",
67 .data = &slow_work_max_threads, 65 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned), 66 .maxlen = sizeof(unsigned),
69 .mode = 0644, 67 .mode = 0644,
70 .proc_handler = slow_work_max_threads_sysctl, 68 .proc_handler = slow_work_max_threads_sysctl,
71 .extra1 = &slow_work_min_threads, 69 .extra1 = &slow_work_min_threads,
72 .extra2 = (void *) &slow_work_max_max_threads, 70 .extra2 = (void *) &slow_work_max_max_threads,
73 }, 71 },
74 { 72 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage", 73 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion, 74 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned), 75 .maxlen = sizeof(unsigned),
79 .mode = 0644, 76 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax, 77 .proc_handler = &proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow, 78 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow, 79 .extra2 = (void *) &slow_work_max_vslow,
83 }, 80 },
84 { .ctl_name = 0 } 81 {}
85 }; 82 };
86 #endif 83 #endif
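
As the removals above show, the .ctl_name member disappears from each entry and the table sentinel shrinks from { .ctl_name = 0 } to {}; the tunables themselves are reached through /proc/sys. A small user-space sketch, assuming the table is registered under kernel/slow-work (the path is an assumption of this example, not something the diff states), reading the current pool limits:

/* Read the slow-work pool limits via /proc/sys.  Paths assume the
 * table above is registered as kernel/slow-work. */
#include <stdio.h>

static long read_sysctl(const char *path)
{
        FILE *f = fopen(path, "r");
        long val = -1;

        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        printf("min-threads: %ld\n",
               read_sysctl("/proc/sys/kernel/slow-work/min-threads"));
        printf("max-threads: %ld\n",
               read_sysctl("/proc/sys/kernel/slow-work/max-threads"));
        printf("vslow-percentage: %ld\n",
               read_sysctl("/proc/sys/kernel/slow-work/vslow-percentage"));
        return 0;
}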
87 84
88 /* 85 /*
89 * The active state of the thread pool 86 * The active state of the thread pool
90 */ 87 */
91 static atomic_t slow_work_thread_count; 88 static atomic_t slow_work_thread_count;
92 static atomic_t vslow_work_executing_count; 89 static atomic_t vslow_work_executing_count;
93 90
94 static bool slow_work_may_not_start_new_thread; 91 static bool slow_work_may_not_start_new_thread;
95 static bool slow_work_cull; /* cull a thread due to lack of activity */ 92 static bool slow_work_cull; /* cull a thread due to lack of activity */
96 static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); 93 static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
97 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); 94 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98 static struct slow_work slow_work_new_thread; /* new thread starter */ 95 static struct slow_work slow_work_new_thread; /* new thread starter */
99 96
100 /* 97 /*
101 * The queues of work items and the lock governing access to them. These are 98 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 99 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 100 * as the number of threads bears no relation to the number of CPUs.
104 * 101 *
105 * There are two queues of work items: one for slow work items, and one for 102 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 103 * very slow work items.
107 */ 104 */
108 static LIST_HEAD(slow_work_queue); 105 static LIST_HEAD(slow_work_queue);
109 static LIST_HEAD(vslow_work_queue); 106 static LIST_HEAD(vslow_work_queue);
110 static DEFINE_SPINLOCK(slow_work_queue_lock); 107 static DEFINE_SPINLOCK(slow_work_queue_lock);
111 108
112 /* 109 /*
113 * The thread controls. A variable used to signal to the threads that they 110 * The thread controls. A variable used to signal to the threads that they
114 * should exit when the queue is empty, a waitqueue used by the threads to wait 111 * should exit when the queue is empty, a waitqueue used by the threads to wait
115 * for signals, and a completion set by the last thread to exit. 112 * for signals, and a completion set by the last thread to exit.
116 */ 113 */
117 static bool slow_work_threads_should_exit; 114 static bool slow_work_threads_should_exit;
118 static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); 115 static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
119 static DECLARE_COMPLETION(slow_work_last_thread_exited); 116 static DECLARE_COMPLETION(slow_work_last_thread_exited);
120 117
121 /* 118 /*
122 * The number of users of the thread pool and its lock. Whilst this is zero we 119 * The number of users of the thread pool and its lock. Whilst this is zero we
123 * have no threads hanging around, and when this reaches zero, we wait for all 120 * have no threads hanging around, and when this reaches zero, we wait for all
124 * active or queued work items to complete and kill all the threads we do have. 121 * active or queued work items to complete and kill all the threads we do have.
125 */ 122 */
126 static int slow_work_user_count; 123 static int slow_work_user_count;
127 static DEFINE_MUTEX(slow_work_user_lock); 124 static DEFINE_MUTEX(slow_work_user_lock);
128 125
129 /* 126 /*
130 * Calculate the maximum number of active threads in the pool that are 127 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 128 * permitted to process very slow work items.
132 * 129 *
133 * The answer is rounded up to at least 1, but may not equal or exceed the 130 * The answer is rounded up to at least 1, but may not equal or exceed the
134 * maximum number of the threads in the pool. This means we always have at 131 * maximum number of the threads in the pool. This means we always have at
135 * least one thread that can process slow work items, and we always have at 132 * least one thread that can process slow work items, and we always have at
136 * least one thread that won't get tied up doing so. 133 * least one thread that won't get tied up doing so.
137 */ 134 */
138 static unsigned slow_work_calc_vsmax(void) 135 static unsigned slow_work_calc_vsmax(void)
139 { 136 {
140 unsigned vsmax; 137 unsigned vsmax;
141 138
142 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; 139 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
143 vsmax /= 100; 140 vsmax /= 100;
144 vsmax = max(vsmax, 1U); 141 vsmax = max(vsmax, 1U);
145 return min(vsmax, slow_work_max_threads - 1); 142 return min(vsmax, slow_work_max_threads - 1);
146 } 143 }
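
The clamping described in the comment above is easy to verify by hand; the following stand-alone transcription of the same arithmetic (sample inputs chosen around the defaults of 4 maximum threads and 50%) prints 1, 2 and 3:

/* Mirror of slow_work_calc_vsmax(): the share of the pool allowed to
 * run very slow items, clamped to the range [1, max_threads - 1]. */
#include <stdio.h>

static unsigned calc_vsmax(unsigned thread_count, unsigned proportion,
                           unsigned max_threads)
{
        unsigned vsmax = thread_count * proportion / 100;

        if (vsmax < 1)
                vsmax = 1;
        if (vsmax > max_threads - 1)
                vsmax = max_threads - 1;
        return vsmax;
}

int main(void)
{
        /* 2 threads at 50% -> 1; 4 threads at 50% -> 2; a large pool
         * is still capped at max_threads - 1 = 3. */
        printf("%u %u %u\n",
               calc_vsmax(2, 50, 4),
               calc_vsmax(4, 50, 4),
               calc_vsmax(255, 50, 4));
        return 0;
}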
147 144
148 /* 145 /*
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 146 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 147 * it, false if there was nothing to do.
151 */ 148 */
152 static bool slow_work_execute(void) 149 static bool slow_work_execute(void)
153 { 150 {
154 struct slow_work *work = NULL; 151 struct slow_work *work = NULL;
155 unsigned vsmax; 152 unsigned vsmax;
156 bool very_slow; 153 bool very_slow;
157 154
158 vsmax = slow_work_calc_vsmax(); 155 vsmax = slow_work_calc_vsmax();
159 156
160 /* see if we can schedule a new thread to be started if we're not 157 /* see if we can schedule a new thread to be started if we're not
161 * keeping up with the work */ 158 * keeping up with the work */
162 if (!waitqueue_active(&slow_work_thread_wq) && 159 if (!waitqueue_active(&slow_work_thread_wq) &&
163 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && 160 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
164 atomic_read(&slow_work_thread_count) < slow_work_max_threads && 161 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
165 !slow_work_may_not_start_new_thread) 162 !slow_work_may_not_start_new_thread)
166 slow_work_enqueue(&slow_work_new_thread); 163 slow_work_enqueue(&slow_work_new_thread);
167 164
168 /* find something to execute */ 165 /* find something to execute */
169 spin_lock_irq(&slow_work_queue_lock); 166 spin_lock_irq(&slow_work_queue_lock);
170 if (!list_empty(&vslow_work_queue) && 167 if (!list_empty(&vslow_work_queue) &&
171 atomic_read(&vslow_work_executing_count) < vsmax) { 168 atomic_read(&vslow_work_executing_count) < vsmax) {
172 work = list_entry(vslow_work_queue.next, 169 work = list_entry(vslow_work_queue.next,
173 struct slow_work, link); 170 struct slow_work, link);
174 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) 171 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
175 BUG(); 172 BUG();
176 list_del_init(&work->link); 173 list_del_init(&work->link);
177 atomic_inc(&vslow_work_executing_count); 174 atomic_inc(&vslow_work_executing_count);
178 very_slow = true; 175 very_slow = true;
179 } else if (!list_empty(&slow_work_queue)) { 176 } else if (!list_empty(&slow_work_queue)) {
180 work = list_entry(slow_work_queue.next, 177 work = list_entry(slow_work_queue.next,
181 struct slow_work, link); 178 struct slow_work, link);
182 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) 179 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
183 BUG(); 180 BUG();
184 list_del_init(&work->link); 181 list_del_init(&work->link);
185 very_slow = false; 182 very_slow = false;
186 } else { 183 } else {
187 very_slow = false; /* avoid the compiler warning */ 184 very_slow = false; /* avoid the compiler warning */
188 } 185 }
189 spin_unlock_irq(&slow_work_queue_lock); 186 spin_unlock_irq(&slow_work_queue_lock);
190 187
191 if (!work) 188 if (!work)
192 return false; 189 return false;
193 190
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 191 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 192 BUG();
196 193
197 work->ops->execute(work); 194 work->ops->execute(work);
198 195
199 if (very_slow) 196 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 197 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 198 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 199
203 /* if someone tried to enqueue the item whilst we were executing it, 200 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 201 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 202 * execute it simultaneously
206 * 203 *
207 * there is, however, a race between us testing the pending flag and 204 * there is, however, a race between us testing the pending flag and
208 * getting the spinlock, and between the enqueuer setting the pending 205 * getting the spinlock, and between the enqueuer setting the pending
209 * flag and getting the spinlock, so we use a deferral bit to tell us 206 * flag and getting the spinlock, so we use a deferral bit to tell us
210 * if the enqueuer got there first 207 * if the enqueuer got there first
211 */ 208 */
212 if (test_bit(SLOW_WORK_PENDING, &work->flags)) { 209 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
213 spin_lock_irq(&slow_work_queue_lock); 210 spin_lock_irq(&slow_work_queue_lock);
214 211
215 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && 212 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
216 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) 213 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
217 goto auto_requeue; 214 goto auto_requeue;
218 215
219 spin_unlock_irq(&slow_work_queue_lock); 216 spin_unlock_irq(&slow_work_queue_lock);
220 } 217 }
221 218
222 work->ops->put_ref(work); 219 work->ops->put_ref(work);
223 return true; 220 return true;
224 221
225 auto_requeue: 222 auto_requeue:
226 /* we must complete the enqueue operation 223 /* we must complete the enqueue operation
227 * - we transfer our ref on the item back to the appropriate queue 224 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 225 * - don't wake another thread up as we're awake already
229 */ 226 */
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 227 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 228 list_add_tail(&work->link, &vslow_work_queue);
232 else 229 else
233 list_add_tail(&work->link, &slow_work_queue); 230 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 231 spin_unlock_irq(&slow_work_queue_lock);
235 return true; 232 return true;
236 } 233 }
237 234
238 /** 235 /**
239 * slow_work_enqueue - Schedule a slow work item for processing 236 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 237 * @work: The work item to queue
241 * 238 *
242 * Schedule a slow work item for processing. If the item is already undergoing 239 * Schedule a slow work item for processing. If the item is already undergoing
243 * execution, this guarantees not to re-enter the execution routine until the 240 * execution, this guarantees not to re-enter the execution routine until the
244 * first execution finishes. 241 * first execution finishes.
245 * 242 *
246 * The item is pinned by this function as it retains a reference to it, managed 243 * The item is pinned by this function as it retains a reference to it, managed
247 * through the item operations. The item is unpinned once it has been 244 * through the item operations. The item is unpinned once it has been
248 * executed. 245 * executed.
249 * 246 *
250 * An item may hog the thread that is running it for a relatively large amount 247 * An item may hog the thread that is running it for a relatively large amount
251 * of time, sufficient, for example, to perform several lookup, mkdir, create 248 * of time, sufficient, for example, to perform several lookup, mkdir, create
252 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. 249 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
253 * 250 *
254 * Conversely, if a number of items are awaiting processing, it may take some 251 * Conversely, if a number of items are awaiting processing, it may take some
255 * time before any given item is given attention. The number of threads in the 252 * time before any given item is given attention. The number of threads in the
256 * pool may be increased to deal with demand, but only up to a limit. 253 * pool may be increased to deal with demand, but only up to a limit.
257 * 254 *
258 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in 255 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
259 * the very slow queue, from which only a portion of the threads will be 256 * the very slow queue, from which only a portion of the threads will be
260 * allowed to pick items to execute. This ensures that very slow items won't 257 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 258 * overly block ones that are just ordinarily slow.
262 * 259 *
263 * Returns 0 if successful, -EAGAIN if not. 260 * Returns 0 if successful, -EAGAIN if not.
264 */ 261 */
265 int slow_work_enqueue(struct slow_work *work) 262 int slow_work_enqueue(struct slow_work *work)
266 { 263 {
267 unsigned long flags; 264 unsigned long flags;
268 265
269 BUG_ON(slow_work_user_count <= 0); 266 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 267 BUG_ON(!work);
271 BUG_ON(!work->ops); 268 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref); 269 BUG_ON(!work->ops->get_ref);
273 270
274 /* when honouring an enqueue request, we only promise that we will run 271 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 272 * the work function in the future; we do not promise to run it once
276 * per enqueue request 273 * per enqueue request
277 * 274 *
278 * we use the PENDING bit to merge together repeat requests without 275 * we use the PENDING bit to merge together repeat requests without
279 * having to disable IRQs and take the spinlock, whilst still 276 * having to disable IRQs and take the spinlock, whilst still
280 * maintaining our promise 277 * maintaining our promise
281 */ 278 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 279 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 280 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 281
285 /* we promise that we will not attempt to execute the work 282 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 283 * function in more than one thread simultaneously
287 * 284 *
288 * this, however, leaves us with a problem if we're asked to 285 * this, however, leaves us with a problem if we're asked to
289 * enqueue the work whilst someone is executing the work 286 * enqueue the work whilst someone is executing the work
290 * function as simply queueing the work immediately means that 287 * function as simply queueing the work immediately means that
291 * another thread may try executing it whilst it is already 288 * another thread may try executing it whilst it is already
292 * under execution 289 * under execution
293 * 290 *
294 * to deal with this, we set the ENQ_DEFERRED bit instead of 291 * to deal with this, we set the ENQ_DEFERRED bit instead of
295 * enqueueing, and the thread currently executing the work 292 * enqueueing, and the thread currently executing the work
296 * function will enqueue the work item when the work function 293 * function will enqueue the work item when the work function
297 * returns and it has cleared the EXECUTING bit 294 * returns and it has cleared the EXECUTING bit
298 */ 295 */
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 296 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 297 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 298 } else {
302 if (work->ops->get_ref(work) < 0) 299 if (work->ops->get_ref(work) < 0)
303 goto cant_get_ref; 300 goto cant_get_ref;
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 301 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
305 list_add_tail(&work->link, &vslow_work_queue); 302 list_add_tail(&work->link, &vslow_work_queue);
306 else 303 else
307 list_add_tail(&work->link, &slow_work_queue); 304 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 305 wake_up(&slow_work_thread_wq);
309 } 306 }
310 307
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 308 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 309 }
313 return 0; 310 return 0;
314 311
315 cant_get_ref: 312 cant_get_ref:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 313 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 314 return -EAGAIN;
318 } 315 }
319 EXPORT_SYMBOL(slow_work_enqueue); 316 EXPORT_SYMBOL(slow_work_enqueue);
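
Putting the item-side API together: a hypothetical user (my_item, my_ops and the helpers below are invented for illustration) supplies get_ref/put_ref/execute operations, initialises the item once, and enqueues it; repeat enqueues while the item is pending or executing are merged as the comments above describe. A sketch, assuming slow_work_register_user() has already been called:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/slow-work.h>

/* Hypothetical slow-work user: one refcounted work item. */
struct my_item {
        struct slow_work        work;
        struct kref             ref;
};

static int my_get_ref(struct slow_work *work)
{
        kref_get(&container_of(work, struct my_item, work)->ref);
        return 0;                       /* 0 means the reference was obtained */
}

static void my_release(struct kref *ref)
{
        kfree(container_of(ref, struct my_item, ref));
}

static void my_put_ref(struct slow_work *work)
{
        kref_put(&container_of(work, struct my_item, work)->ref, my_release);
}

static void my_execute(struct slow_work *work)
{
        /* may sleep, do I/O and take mutexes, as the comments permit */
}

static const struct slow_work_ops my_ops = {
        .get_ref        = my_get_ref,
        .put_ref        = my_put_ref,
        .execute        = my_execute,
};

static struct my_item *my_item_alloc(void)
{
        struct my_item *item = kzalloc(sizeof(*item), GFP_KERNEL);

        if (item) {
                kref_init(&item->ref);
                slow_work_init(&item->work, &my_ops);
        }
        return item;
}

static int my_kick(struct my_item *item)
{
        return slow_work_enqueue(&item->work);  /* 0, or -EAGAIN if get_ref fails */
}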
320 317
321 /* 318 /*
322 * Schedule a cull of the thread pool at some time in the near future 319 * Schedule a cull of the thread pool at some time in the near future
323 */ 320 */
324 static void slow_work_schedule_cull(void) 321 static void slow_work_schedule_cull(void)
325 { 322 {
326 mod_timer(&slow_work_cull_timer, 323 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); 324 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328 } 325 }
329 326
330 /* 327 /*
331 * Worker thread culling algorithm 328 * Worker thread culling algorithm
332 */ 329 */
333 static bool slow_work_cull_thread(void) 330 static bool slow_work_cull_thread(void)
334 { 331 {
335 unsigned long flags; 332 unsigned long flags;
336 bool do_cull = false; 333 bool do_cull = false;
337 334
338 spin_lock_irqsave(&slow_work_queue_lock, flags); 335 spin_lock_irqsave(&slow_work_queue_lock, flags);
339 336
340 if (slow_work_cull) { 337 if (slow_work_cull) {
341 slow_work_cull = false; 338 slow_work_cull = false;
342 339
343 if (list_empty(&slow_work_queue) && 340 if (list_empty(&slow_work_queue) &&
344 list_empty(&vslow_work_queue) && 341 list_empty(&vslow_work_queue) &&
345 atomic_read(&slow_work_thread_count) > 342 atomic_read(&slow_work_thread_count) >
346 slow_work_min_threads) { 343 slow_work_min_threads) {
347 slow_work_schedule_cull(); 344 slow_work_schedule_cull();
348 do_cull = true; 345 do_cull = true;
349 } 346 }
350 } 347 }
351 348
352 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 349 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
353 return do_cull; 350 return do_cull;
354 } 351 }
355 352
356 /* 353 /*
357 * Determine if there is slow work available for dispatch 354 * Determine if there is slow work available for dispatch
358 */ 355 */
359 static inline bool slow_work_available(int vsmax) 356 static inline bool slow_work_available(int vsmax)
360 { 357 {
361 return !list_empty(&slow_work_queue) || 358 return !list_empty(&slow_work_queue) ||
362 (!list_empty(&vslow_work_queue) && 359 (!list_empty(&vslow_work_queue) &&
363 atomic_read(&vslow_work_executing_count) < vsmax); 360 atomic_read(&vslow_work_executing_count) < vsmax);
364 } 361 }
365 362
366 /* 363 /*
367 * Worker thread dispatcher 364 * Worker thread dispatcher
368 */ 365 */
369 static int slow_work_thread(void *_data) 366 static int slow_work_thread(void *_data)
370 { 367 {
371 int vsmax; 368 int vsmax;
372 369
373 DEFINE_WAIT(wait); 370 DEFINE_WAIT(wait);
374 371
375 set_freezable(); 372 set_freezable();
376 set_user_nice(current, -5); 373 set_user_nice(current, -5);
377 374
378 for (;;) { 375 for (;;) {
379 vsmax = vslow_work_proportion; 376 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 377 vsmax *= atomic_read(&slow_work_thread_count);
381 vsmax /= 100; 378 vsmax /= 100;
382 379
383 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, 380 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
384 TASK_INTERRUPTIBLE); 381 TASK_INTERRUPTIBLE);
385 if (!freezing(current) && 382 if (!freezing(current) &&
386 !slow_work_threads_should_exit && 383 !slow_work_threads_should_exit &&
387 !slow_work_available(vsmax) && 384 !slow_work_available(vsmax) &&
388 !slow_work_cull) 385 !slow_work_cull)
389 schedule(); 386 schedule();
390 finish_wait(&slow_work_thread_wq, &wait); 387 finish_wait(&slow_work_thread_wq, &wait);
391 388
392 try_to_freeze(); 389 try_to_freeze();
393 390
394 vsmax = vslow_work_proportion; 391 vsmax = vslow_work_proportion;
395 vsmax *= atomic_read(&slow_work_thread_count); 392 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 393 vsmax /= 100;
397 394
398 if (slow_work_available(vsmax) && slow_work_execute()) { 395 if (slow_work_available(vsmax) && slow_work_execute()) {
399 cond_resched(); 396 cond_resched();
400 if (list_empty(&slow_work_queue) && 397 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 398 list_empty(&vslow_work_queue) &&
402 atomic_read(&slow_work_thread_count) > 399 atomic_read(&slow_work_thread_count) >
403 slow_work_min_threads) 400 slow_work_min_threads)
404 slow_work_schedule_cull(); 401 slow_work_schedule_cull();
405 continue; 402 continue;
406 } 403 }
407 404
408 if (slow_work_threads_should_exit) 405 if (slow_work_threads_should_exit)
409 break; 406 break;
410 407
411 if (slow_work_cull && slow_work_cull_thread()) 408 if (slow_work_cull && slow_work_cull_thread())
412 break; 409 break;
413 } 410 }
414 411
415 if (atomic_dec_and_test(&slow_work_thread_count)) 412 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 413 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 414 return 0;
418 } 415 }
419 416
420 /* 417 /*
421 * Handle thread cull timer expiration 418 * Handle thread cull timer expiration
422 */ 419 */
423 static void slow_work_cull_timeout(unsigned long data) 420 static void slow_work_cull_timeout(unsigned long data)
424 { 421 {
425 slow_work_cull = true; 422 slow_work_cull = true;
426 wake_up(&slow_work_thread_wq); 423 wake_up(&slow_work_thread_wq);
427 } 424 }
428 425
429 /* 426 /*
430 * Get a reference on slow work thread starter 427 * Get a reference on slow work thread starter
431 */ 428 */
432 static int slow_work_new_thread_get_ref(struct slow_work *work) 429 static int slow_work_new_thread_get_ref(struct slow_work *work)
433 { 430 {
434 return 0; 431 return 0;
435 } 432 }
436 433
437 /* 434 /*
438 * Drop a reference on slow work thread starter 435 * Drop a reference on slow work thread starter
439 */ 436 */
440 static void slow_work_new_thread_put_ref(struct slow_work *work) 437 static void slow_work_new_thread_put_ref(struct slow_work *work)
441 { 438 {
442 } 439 }
443 440
444 /* 441 /*
445 * Start a new slow work thread 442 * Start a new slow work thread
446 */ 443 */
447 static void slow_work_new_thread_execute(struct slow_work *work) 444 static void slow_work_new_thread_execute(struct slow_work *work)
448 { 445 {
449 struct task_struct *p; 446 struct task_struct *p;
450 447
451 if (slow_work_threads_should_exit) 448 if (slow_work_threads_should_exit)
452 return; 449 return;
453 450
454 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) 451 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
455 return; 452 return;
456 453
457 if (!mutex_trylock(&slow_work_user_lock)) 454 if (!mutex_trylock(&slow_work_user_lock))
458 return; 455 return;
459 456
460 slow_work_may_not_start_new_thread = true; 457 slow_work_may_not_start_new_thread = true;
461 atomic_inc(&slow_work_thread_count); 458 atomic_inc(&slow_work_thread_count);
462 p = kthread_run(slow_work_thread, NULL, "kslowd"); 459 p = kthread_run(slow_work_thread, NULL, "kslowd");
463 if (IS_ERR(p)) { 460 if (IS_ERR(p)) {
464 printk(KERN_DEBUG "Slow work thread pool: OOM\n"); 461 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
465 if (atomic_dec_and_test(&slow_work_thread_count)) 462 if (atomic_dec_and_test(&slow_work_thread_count))
466 BUG(); /* we're running on a slow work thread... */ 463 BUG(); /* we're running on a slow work thread... */
467 mod_timer(&slow_work_oom_timer, 464 mod_timer(&slow_work_oom_timer,
468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); 465 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
469 } else { 466 } else {
470 /* ratelimit the starting of new threads */ 467 /* ratelimit the starting of new threads */
471 mod_timer(&slow_work_oom_timer, jiffies + 1); 468 mod_timer(&slow_work_oom_timer, jiffies + 1);
472 } 469 }
473 470
474 mutex_unlock(&slow_work_user_lock); 471 mutex_unlock(&slow_work_user_lock);
475 } 472 }
476 473
477 static const struct slow_work_ops slow_work_new_thread_ops = { 474 static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 475 .get_ref = slow_work_new_thread_get_ref,
479 .put_ref = slow_work_new_thread_put_ref, 476 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 477 .execute = slow_work_new_thread_execute,
481 }; 478 };
482 479
483 /* 480 /*
484 * post-OOM new thread start suppression expiration 481 * post-OOM new thread start suppression expiration
485 */ 482 */
486 static void slow_work_oom_timeout(unsigned long data) 483 static void slow_work_oom_timeout(unsigned long data)
487 { 484 {
488 slow_work_may_not_start_new_thread = false; 485 slow_work_may_not_start_new_thread = false;
489 } 486 }
490 487
491 #ifdef CONFIG_SYSCTL 488 #ifdef CONFIG_SYSCTL
492 /* 489 /*
493 * Handle adjustment of the minimum number of threads 490 * Handle adjustment of the minimum number of threads
494 */ 491 */
495 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 492 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 void __user *buffer, 493 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 494 size_t *lenp, loff_t *ppos)
498 { 495 {
499 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 496 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 497 int n;
501 498
502 if (ret == 0) { 499 if (ret == 0) {
503 mutex_lock(&slow_work_user_lock); 500 mutex_lock(&slow_work_user_lock);
504 if (slow_work_user_count > 0) { 501 if (slow_work_user_count > 0) {
505 /* see if we need to start or stop threads */ 502 /* see if we need to start or stop threads */
506 n = atomic_read(&slow_work_thread_count) - 503 n = atomic_read(&slow_work_thread_count) -
507 slow_work_min_threads; 504 slow_work_min_threads;
508 505
509 if (n < 0 && !slow_work_may_not_start_new_thread) 506 if (n < 0 && !slow_work_may_not_start_new_thread)
510 slow_work_enqueue(&slow_work_new_thread); 507 slow_work_enqueue(&slow_work_new_thread);
511 else if (n > 0) 508 else if (n > 0)
512 slow_work_schedule_cull(); 509 slow_work_schedule_cull();
513 } 510 }
514 mutex_unlock(&slow_work_user_lock); 511 mutex_unlock(&slow_work_user_lock);
515 } 512 }
516 513
517 return ret; 514 return ret;
518 } 515 }
519 516
520 /* 517 /*
521 * Handle adjustment of the maximum number of threads 518 * Handle adjustment of the maximum number of threads
522 */ 519 */
523 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 520 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 void __user *buffer, 521 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 522 size_t *lenp, loff_t *ppos)
526 { 523 {
527 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 524 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 525 int n;
529 526
530 if (ret == 0) { 527 if (ret == 0) {
531 mutex_lock(&slow_work_user_lock); 528 mutex_lock(&slow_work_user_lock);
532 if (slow_work_user_count > 0) { 529 if (slow_work_user_count > 0) {
533 /* see if we need to stop threads */ 530 /* see if we need to stop threads */
534 n = slow_work_max_threads - 531 n = slow_work_max_threads -
535 atomic_read(&slow_work_thread_count); 532 atomic_read(&slow_work_thread_count);
536 533
537 if (n < 0) 534 if (n < 0)
538 slow_work_schedule_cull(); 535 slow_work_schedule_cull();
539 } 536 }
540 mutex_unlock(&slow_work_user_lock); 537 mutex_unlock(&slow_work_user_lock);
541 } 538 }
542 539
543 return ret; 540 return ret;
544 } 541 }
545 #endif /* CONFIG_SYSCTL */ 542 #endif /* CONFIG_SYSCTL */
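
Because both handlers funnel through proc_dointvec_minmax() and then start or cull threads as required, tuning the pool is a plain write to the corresponding /proc/sys file. A small user-space example (same registration-path assumption as above; the write needs sufficient privilege):

/* Raise the slow-work pool ceiling; the path and the need for
 * privilege are assumptions of this example. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/slow-work/max-threads", "w");

        if (!f) {
                perror("max-threads");
                return 1;
        }
        fprintf(f, "8\n");
        if (fclose(f) != 0) {
                perror("max-threads");
                return 1;
        }
        return 0;
}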
546 543
547 /** 544 /**
548 * slow_work_register_user - Register a user of the facility 545 * slow_work_register_user - Register a user of the facility
549 * 546 *
550 * Register a user of the facility, starting up the initial threads if there 547 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 548 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 549 * an error if not.
553 */ 550 */
554 int slow_work_register_user(void) 551 int slow_work_register_user(void)
555 { 552 {
556 struct task_struct *p; 553 struct task_struct *p;
557 int loop; 554 int loop;
558 555
559 mutex_lock(&slow_work_user_lock); 556 mutex_lock(&slow_work_user_lock);
560 557
561 if (slow_work_user_count == 0) { 558 if (slow_work_user_count == 0) {
562 printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); 559 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
563 init_completion(&slow_work_last_thread_exited); 560 init_completion(&slow_work_last_thread_exited);
564 561
565 slow_work_threads_should_exit = false; 562 slow_work_threads_should_exit = false;
566 slow_work_init(&slow_work_new_thread, 563 slow_work_init(&slow_work_new_thread,
567 &slow_work_new_thread_ops); 564 &slow_work_new_thread_ops);
568 slow_work_may_not_start_new_thread = false; 565 slow_work_may_not_start_new_thread = false;
569 slow_work_cull = false; 566 slow_work_cull = false;
570 567
571 /* start the minimum number of threads */ 568 /* start the minimum number of threads */
572 for (loop = 0; loop < slow_work_min_threads; loop++) { 569 for (loop = 0; loop < slow_work_min_threads; loop++) {
573 atomic_inc(&slow_work_thread_count); 570 atomic_inc(&slow_work_thread_count);
574 p = kthread_run(slow_work_thread, NULL, "kslowd"); 571 p = kthread_run(slow_work_thread, NULL, "kslowd");
575 if (IS_ERR(p)) 572 if (IS_ERR(p))
576 goto error; 573 goto error;
577 } 574 }
578 printk(KERN_NOTICE "Slow work thread pool: Ready\n"); 575 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
579 } 576 }
580 577
581 slow_work_user_count++; 578 slow_work_user_count++;
582 mutex_unlock(&slow_work_user_lock); 579 mutex_unlock(&slow_work_user_lock);
583 return 0; 580 return 0;
584 581
585 error: 582 error:
586 if (atomic_dec_and_test(&slow_work_thread_count)) 583 if (atomic_dec_and_test(&slow_work_thread_count))
587 complete(&slow_work_last_thread_exited); 584 complete(&slow_work_last_thread_exited);
588 if (loop > 0) { 585 if (loop > 0) {
589 printk(KERN_ERR "Slow work thread pool:" 586 printk(KERN_ERR "Slow work thread pool:"
590 " Aborting startup on ENOMEM\n"); 587 " Aborting startup on ENOMEM\n");
591 slow_work_threads_should_exit = true; 588 slow_work_threads_should_exit = true;
592 wake_up_all(&slow_work_thread_wq); 589 wake_up_all(&slow_work_thread_wq);
593 wait_for_completion(&slow_work_last_thread_exited); 590 wait_for_completion(&slow_work_last_thread_exited);
594 printk(KERN_ERR "Slow work thread pool: Aborted\n"); 591 printk(KERN_ERR "Slow work thread pool: Aborted\n");
595 } 592 }
596 mutex_unlock(&slow_work_user_lock); 593 mutex_unlock(&slow_work_user_lock);
597 return PTR_ERR(p); 594 return PTR_ERR(p);
598 } 595 }
599 EXPORT_SYMBOL(slow_work_register_user); 596 EXPORT_SYMBOL(slow_work_register_user);
600 597
601 /** 598 /**
602 * slow_work_unregister_user - Unregister a user of the facility 599 * slow_work_unregister_user - Unregister a user of the facility
603 * 600 *
604 * Unregister a user of the facility, killing all the threads if this was the 601 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 602 * last one.
606 */ 603 */
607 void slow_work_unregister_user(void) 604 void slow_work_unregister_user(void)
608 { 605 {
609 mutex_lock(&slow_work_user_lock); 606 mutex_lock(&slow_work_user_lock);
610 607
611 BUG_ON(slow_work_user_count <= 0); 608 BUG_ON(slow_work_user_count <= 0);
612 609
613 slow_work_user_count--; 610 slow_work_user_count--;
614 if (slow_work_user_count == 0) { 611 if (slow_work_user_count == 0) {
615 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); 612 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
616 slow_work_threads_should_exit = true; 613 slow_work_threads_should_exit = true;
617 del_timer_sync(&slow_work_cull_timer); 614 del_timer_sync(&slow_work_cull_timer);
618 del_timer_sync(&slow_work_oom_timer); 615 del_timer_sync(&slow_work_oom_timer);
619 wake_up_all(&slow_work_thread_wq); 616 wake_up_all(&slow_work_thread_wq);
620 wait_for_completion(&slow_work_last_thread_exited); 617 wait_for_completion(&slow_work_last_thread_exited);
621 printk(KERN_NOTICE "Slow work thread pool:" 618 printk(KERN_NOTICE "Slow work thread pool:"
622 " Shut down complete\n"); 619 " Shut down complete\n");
623 } 620 }
624 621
625 mutex_unlock(&slow_work_user_lock); 622 mutex_unlock(&slow_work_user_lock);
626 } 623 }
627 EXPORT_SYMBOL(slow_work_unregister_user); 624 EXPORT_SYMBOL(slow_work_unregister_user);
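
A client of the facility normally brackets its use with the two calls above; a hypothetical module skeleton (names invented) might look like:

#include <linux/module.h>
#include <linux/slow-work.h>

/* Hypothetical client module bracketing its use of the facility. */
static int __init my_client_init(void)
{
        return slow_work_register_user();
}

static void __exit my_client_exit(void)
{
        /* all of this module's items must have finished executing */
        slow_work_unregister_user();
}

module_init(my_client_init);
module_exit(my_client_exit);
MODULE_LICENSE("GPL");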
628 625
629 /* 626 /*
630 * Initialise the slow work facility 627 * Initialise the slow work facility
631 */ 628 */
632 static int __init init_slow_work(void) 629 static int __init init_slow_work(void)
633 { 630 {
634 unsigned nr_cpus = num_possible_cpus(); 631 unsigned nr_cpus = num_possible_cpus();
635 632
636 if (slow_work_max_threads < nr_cpus) 633 if (slow_work_max_threads < nr_cpus)
637 slow_work_max_threads = nr_cpus; 634 slow_work_max_threads = nr_cpus;
638 #ifdef CONFIG_SYSCTL 635 #ifdef CONFIG_SYSCTL
639 if (slow_work_max_max_threads < nr_cpus * 2) 636 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 637 slow_work_max_max_threads = nr_cpus * 2;
641 #endif 638 #endif
642 return 0; 639 return 0;
643 } 640 }
644 641
645 subsys_initcall(init_slow_work); 642 subsys_initcall(init_slow_work);
646 643
kernel/utsname_sysctl.c
1 /* 1 /*
2 * Copyright (C) 2007 2 * Copyright (C) 2007
3 * 3 *
4 * Author: Eric Biederman <ebiederm@xmision.com> 4 * Author: Eric Biederman <ebiederm@xmision.com>
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as 7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the 8 * published by the Free Software Foundation, version 2 of the
9 * License. 9 * License.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/uts.h> 13 #include <linux/uts.h>
14 #include <linux/utsname.h> 14 #include <linux/utsname.h>
15 #include <linux/sysctl.h> 15 #include <linux/sysctl.h>
16 16
17 static void *get_uts(ctl_table *table, int write) 17 static void *get_uts(ctl_table *table, int write)
18 { 18 {
19 char *which = table->data; 19 char *which = table->data;
20 struct uts_namespace *uts_ns; 20 struct uts_namespace *uts_ns;
21 21
22 uts_ns = current->nsproxy->uts_ns; 22 uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns; 23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24 24
25 if (!write) 25 if (!write)
26 down_read(&uts_sem); 26 down_read(&uts_sem);
27 else 27 else
28 down_write(&uts_sem); 28 down_write(&uts_sem);
29 return which; 29 return which;
30 } 30 }
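
get_uts() relies on a small pointer trick: table->data points at a field inside init_uts_ns, subtracting the base of init_uts_ns yields that field's offset, and adding the offset to the current task's uts namespace gives the per-namespace copy. A stand-alone user-space illustration of the same rebasing idiom (struct name_block is invented for the demonstration):

/* Rebase a pointer to a field of one struct instance onto another
 * instance of the same type, as get_uts() does with init_uts_ns. */
#include <stdio.h>

struct name_block {
        char sysname[16];
        char nodename[16];
};

int main(void)
{
        struct name_block init_ns = { "Linux", "default" };
        struct name_block my_ns   = { "Linux", "container-42" };

        char *which = init_ns.nodename;         /* like table->data */

        /* offset within init_ns, re-applied to my_ns */
        which = (which - (char *)&init_ns) + (char *)&my_ns;

        printf("%s\n", which);                  /* prints "container-42" */
        return 0;
}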
31 31
32 static void put_uts(ctl_table *table, int write, void *which) 32 static void put_uts(ctl_table *table, int write, void *which)
33 { 33 {
34 if (!write) 34 if (!write)
35 up_read(&uts_sem); 35 up_read(&uts_sem);
36 else 36 else
37 up_write(&uts_sem); 37 up_write(&uts_sem);
38 } 38 }
39 39
40 #ifdef CONFIG_PROC_SYSCTL 40 #ifdef CONFIG_PROC_SYSCTL
41 /* 41 /*
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45 static int proc_do_uts_string(ctl_table *table, int write, 45 static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47 { 47 {
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55 } 55 }
56 #else 56 #else
57 #define proc_do_uts_string NULL 57 #define proc_do_uts_string NULL
58 #endif 58 #endif
59 59
60
61 #ifdef CONFIG_SYSCTL_SYSCALL
62 /* The generic string strategy routine: */
63 static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66 {
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 put_uts(table, write, uts_table.data);
74 return r;
75 }
76 #else
77 #define sysctl_uts_string NULL
78 #endif
79
80 static struct ctl_table uts_kern_table[] = { 60 static struct ctl_table uts_kern_table[] = {
81 { 61 {
82 .ctl_name = KERN_OSTYPE,
83 .procname = "ostype", 62 .procname = "ostype",
84 .data = init_uts_ns.name.sysname, 63 .data = init_uts_ns.name.sysname,
85 .maxlen = sizeof(init_uts_ns.name.sysname), 64 .maxlen = sizeof(init_uts_ns.name.sysname),
86 .mode = 0444, 65 .mode = 0444,
87 .proc_handler = proc_do_uts_string, 66 .proc_handler = proc_do_uts_string,
88 .strategy = sysctl_uts_string,
89 }, 67 },
90 { 68 {
91 .ctl_name = KERN_OSRELEASE,
92 .procname = "osrelease", 69 .procname = "osrelease",
93 .data = init_uts_ns.name.release, 70 .data = init_uts_ns.name.release,
94 .maxlen = sizeof(init_uts_ns.name.release), 71 .maxlen = sizeof(init_uts_ns.name.release),
95 .mode = 0444, 72 .mode = 0444,
96 .proc_handler = proc_do_uts_string, 73 .proc_handler = proc_do_uts_string,
97 .strategy = sysctl_uts_string,
98 }, 74 },
99 { 75 {
100 .ctl_name = KERN_VERSION,
101 .procname = "version", 76 .procname = "version",
102 .data = init_uts_ns.name.version, 77 .data = init_uts_ns.name.version,
103 .maxlen = sizeof(init_uts_ns.name.version), 78 .maxlen = sizeof(init_uts_ns.name.version),
104 .mode = 0444, 79 .mode = 0444,
105 .proc_handler = proc_do_uts_string, 80 .proc_handler = proc_do_uts_string,
106 .strategy = sysctl_uts_string,
107 }, 81 },
108 { 82 {
109 .ctl_name = KERN_NODENAME,
110 .procname = "hostname", 83 .procname = "hostname",
111 .data = init_uts_ns.name.nodename, 84 .data = init_uts_ns.name.nodename,
112 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
113 .mode = 0644, 86 .mode = 0644,
114 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
115 .strategy = sysctl_uts_string,
116 }, 88 },
117 { 89 {
118 .ctl_name = KERN_DOMAINNAME,
119 .procname = "domainname", 90 .procname = "domainname",
120 .data = init_uts_ns.name.domainname, 91 .data = init_uts_ns.name.domainname,
121 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
122 .mode = 0644, 93 .mode = 0644,
123 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
124 .strategy = sysctl_uts_string,
125 }, 95 },
126 {} 96 {}
127 }; 97 };
128 98
129 static struct ctl_table uts_root_table[] = { 99 static struct ctl_table uts_root_table[] = {
130 { 100 {
131 .ctl_name = CTL_KERN,
132 .procname = "kernel", 101 .procname = "kernel",
133 .mode = 0555, 102 .mode = 0555,
134 .child = uts_kern_table, 103 .child = uts_kern_table,
135 }, 104 },
136 {} 105 {}
137 }; 106 };
138 107
139 static int __init utsname_sysctl_init(void) 108 static int __init utsname_sysctl_init(void)
140 { 109 {
141 register_sysctl_table(uts_root_table); 110 register_sysctl_table(uts_root_table);
142 return 0; 111 return 0;
143 } 112 }
144 113
145 __initcall(utsname_sysctl_init); 114 __initcall(utsname_sysctl_init);
146 115
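
With .ctl_name and .strategy gone from the tables above, the UTS strings are reached by path alone; a small runnable check that reads each entry registered under kernel/:

/* Read the UTS strings exposed by uts_kern_table under /proc/sys/kernel/. */
#include <stdio.h>

static void show(const char *name)
{
        char path[64], buf[128];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
        f = fopen(path, "r");
        if (!f)
                return;
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", name, buf);
        fclose(f);
}

int main(void)
{
        show("ostype");
        show("osrelease");
        show("version");
        show("hostname");
        show("domainname");
        return 0;
}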