Commit ee79d1bdb6a10499e53f80b1e8d14110215178ba

Authored by Heiko Carstens
Committed by Ingo Molnar
1 parent c1dfdc7597

sched: let arch_update_cpu_topology indicate if topology changed

Change arch_update_cpu_topology() so that it returns 1 if the cpu topology
changed and 0 if it didn't. This will be useful for the next patch, which adds
a call to this function in partition_sched_domains().

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
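
A minimal sketch (not part of this commit) of how a caller such as
partition_sched_domains() could use the new return value; the helper
name below is hypothetical, the actual follow-up patch is not shown here:

    int new_topology;

    /* 1 if the cpu topology changed since the last call, 0 otherwise */
    new_topology = arch_update_cpu_topology();

    if (new_topology)
            force_domain_rebuild(); /* hypothetical helper: cached
                                     * sched domains are stale */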

Showing 3 changed files with 11 additions and 4 deletions

arch/s390/kernel/topology.c
1 /* 1 /*
2 * Copyright IBM Corp. 2007 2 * Copyright IBM Corp. 2007
3 * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> 3 * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
4 */ 4 */
5 5
6 #include <linux/kernel.h> 6 #include <linux/kernel.h>
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/init.h> 8 #include <linux/init.h>
9 #include <linux/device.h> 9 #include <linux/device.h>
10 #include <linux/bootmem.h> 10 #include <linux/bootmem.h>
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/workqueue.h> 12 #include <linux/workqueue.h>
13 #include <linux/cpu.h> 13 #include <linux/cpu.h>
14 #include <linux/smp.h> 14 #include <linux/smp.h>
15 #include <asm/delay.h> 15 #include <asm/delay.h>
16 #include <asm/s390_ext.h> 16 #include <asm/s390_ext.h>
17 #include <asm/sysinfo.h> 17 #include <asm/sysinfo.h>
18 18
19 #define CPU_BITS 64 19 #define CPU_BITS 64
20 #define NR_MAG 6 20 #define NR_MAG 6
21 21
22 #define PTF_HORIZONTAL (0UL) 22 #define PTF_HORIZONTAL (0UL)
23 #define PTF_VERTICAL (1UL) 23 #define PTF_VERTICAL (1UL)
24 #define PTF_CHECK (2UL) 24 #define PTF_CHECK (2UL)
25 25
26 struct tl_cpu { 26 struct tl_cpu {
27 unsigned char reserved0[4]; 27 unsigned char reserved0[4];
28 unsigned char :6; 28 unsigned char :6;
29 unsigned char pp:2; 29 unsigned char pp:2;
30 unsigned char reserved1; 30 unsigned char reserved1;
31 unsigned short origin; 31 unsigned short origin;
32 unsigned long mask[CPU_BITS / BITS_PER_LONG]; 32 unsigned long mask[CPU_BITS / BITS_PER_LONG];
33 }; 33 };
34 34
35 struct tl_container { 35 struct tl_container {
36 unsigned char reserved[8]; 36 unsigned char reserved[8];
37 }; 37 };
38 38
39 union tl_entry { 39 union tl_entry {
40 unsigned char nl; 40 unsigned char nl;
41 struct tl_cpu cpu; 41 struct tl_cpu cpu;
42 struct tl_container container; 42 struct tl_container container;
43 }; 43 };
44 44
45 struct tl_info { 45 struct tl_info {
46 unsigned char reserved0[2]; 46 unsigned char reserved0[2];
47 unsigned short length; 47 unsigned short length;
48 unsigned char mag[NR_MAG]; 48 unsigned char mag[NR_MAG];
49 unsigned char reserved1; 49 unsigned char reserved1;
50 unsigned char mnest; 50 unsigned char mnest;
51 unsigned char reserved2[4]; 51 unsigned char reserved2[4];
52 union tl_entry tle[0]; 52 union tl_entry tle[0];
53 }; 53 };
54 54
55 struct core_info { 55 struct core_info {
56 struct core_info *next; 56 struct core_info *next;
57 cpumask_t mask; 57 cpumask_t mask;
58 }; 58 };
59 59
60 static void topology_work_fn(struct work_struct *work); 60 static void topology_work_fn(struct work_struct *work);
61 static struct tl_info *tl_info; 61 static struct tl_info *tl_info;
62 static struct core_info core_info; 62 static struct core_info core_info;
63 static int machine_has_topology; 63 static int machine_has_topology;
64 static int machine_has_topology_irq; 64 static int machine_has_topology_irq;
65 static struct timer_list topology_timer; 65 static struct timer_list topology_timer;
66 static void set_topology_timer(void); 66 static void set_topology_timer(void);
67 static DECLARE_WORK(topology_work, topology_work_fn); 67 static DECLARE_WORK(topology_work, topology_work_fn);
68 /* topology_lock protects the core linked list */ 68 /* topology_lock protects the core linked list */
69 static DEFINE_SPINLOCK(topology_lock); 69 static DEFINE_SPINLOCK(topology_lock);
70 70
71 cpumask_t cpu_core_map[NR_CPUS]; 71 cpumask_t cpu_core_map[NR_CPUS];
72 72
73 cpumask_t cpu_coregroup_map(unsigned int cpu) 73 cpumask_t cpu_coregroup_map(unsigned int cpu)
74 { 74 {
75 struct core_info *core = &core_info; 75 struct core_info *core = &core_info;
76 unsigned long flags; 76 unsigned long flags;
77 cpumask_t mask; 77 cpumask_t mask;
78 78
79 cpus_clear(mask); 79 cpus_clear(mask);
80 if (!machine_has_topology) 80 if (!machine_has_topology)
81 return cpu_present_map; 81 return cpu_present_map;
82 spin_lock_irqsave(&topology_lock, flags); 82 spin_lock_irqsave(&topology_lock, flags);
83 while (core) { 83 while (core) {
84 if (cpu_isset(cpu, core->mask)) { 84 if (cpu_isset(cpu, core->mask)) {
85 mask = core->mask; 85 mask = core->mask;
86 break; 86 break;
87 } 87 }
88 core = core->next; 88 core = core->next;
89 } 89 }
90 spin_unlock_irqrestore(&topology_lock, flags); 90 spin_unlock_irqrestore(&topology_lock, flags);
91 if (cpus_empty(mask)) 91 if (cpus_empty(mask))
92 mask = cpumask_of_cpu(cpu); 92 mask = cpumask_of_cpu(cpu);
93 return mask; 93 return mask;
94 } 94 }
95 95
96 static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core) 96 static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
97 { 97 {
98 unsigned int cpu; 98 unsigned int cpu;
99 99
100 for (cpu = find_first_bit(&tl_cpu->mask[0], CPU_BITS); 100 for (cpu = find_first_bit(&tl_cpu->mask[0], CPU_BITS);
101 cpu < CPU_BITS; 101 cpu < CPU_BITS;
102 cpu = find_next_bit(&tl_cpu->mask[0], CPU_BITS, cpu + 1)) 102 cpu = find_next_bit(&tl_cpu->mask[0], CPU_BITS, cpu + 1))
103 { 103 {
104 unsigned int rcpu, lcpu; 104 unsigned int rcpu, lcpu;
105 105
106 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin; 106 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
107 for_each_present_cpu(lcpu) { 107 for_each_present_cpu(lcpu) {
108 if (__cpu_logical_map[lcpu] == rcpu) { 108 if (__cpu_logical_map[lcpu] == rcpu) {
109 cpu_set(lcpu, core->mask); 109 cpu_set(lcpu, core->mask);
110 smp_cpu_polarization[lcpu] = tl_cpu->pp; 110 smp_cpu_polarization[lcpu] = tl_cpu->pp;
111 } 111 }
112 } 112 }
113 } 113 }
114 } 114 }
115 115
116 static void clear_cores(void) 116 static void clear_cores(void)
117 { 117 {
118 struct core_info *core = &core_info; 118 struct core_info *core = &core_info;
119 119
120 while (core) { 120 while (core) {
121 cpus_clear(core->mask); 121 cpus_clear(core->mask);
122 core = core->next; 122 core = core->next;
123 } 123 }
124 } 124 }
125 125
126 static union tl_entry *next_tle(union tl_entry *tle) 126 static union tl_entry *next_tle(union tl_entry *tle)
127 { 127 {
128 if (tle->nl) 128 if (tle->nl)
129 return (union tl_entry *)((struct tl_container *)tle + 1); 129 return (union tl_entry *)((struct tl_container *)tle + 1);
130 else 130 else
131 return (union tl_entry *)((struct tl_cpu *)tle + 1); 131 return (union tl_entry *)((struct tl_cpu *)tle + 1);
132 } 132 }
133 133
134 static void tl_to_cores(struct tl_info *info) 134 static void tl_to_cores(struct tl_info *info)
135 { 135 {
136 union tl_entry *tle, *end; 136 union tl_entry *tle, *end;
137 struct core_info *core = &core_info; 137 struct core_info *core = &core_info;
138 138
139 spin_lock_irq(&topology_lock); 139 spin_lock_irq(&topology_lock);
140 clear_cores(); 140 clear_cores();
141 tle = info->tle; 141 tle = info->tle;
142 end = (union tl_entry *)((unsigned long)info + info->length); 142 end = (union tl_entry *)((unsigned long)info + info->length);
143 while (tle < end) { 143 while (tle < end) {
144 switch (tle->nl) { 144 switch (tle->nl) {
145 case 5: 145 case 5:
146 case 4: 146 case 4:
147 case 3: 147 case 3:
148 case 2: 148 case 2:
149 break; 149 break;
150 case 1: 150 case 1:
151 core = core->next; 151 core = core->next;
152 break; 152 break;
153 case 0: 153 case 0:
154 add_cpus_to_core(&tle->cpu, core); 154 add_cpus_to_core(&tle->cpu, core);
155 break; 155 break;
156 default: 156 default:
157 clear_cores(); 157 clear_cores();
158 machine_has_topology = 0; 158 machine_has_topology = 0;
159 return; 159 return;
160 } 160 }
161 tle = next_tle(tle); 161 tle = next_tle(tle);
162 } 162 }
163 spin_unlock_irq(&topology_lock); 163 spin_unlock_irq(&topology_lock);
164 } 164 }
165 165
166 static void topology_update_polarization_simple(void) 166 static void topology_update_polarization_simple(void)
167 { 167 {
168 int cpu; 168 int cpu;
169 169
170 mutex_lock(&smp_cpu_state_mutex); 170 mutex_lock(&smp_cpu_state_mutex);
171 for_each_present_cpu(cpu) 171 for_each_present_cpu(cpu)
172 smp_cpu_polarization[cpu] = POLARIZATION_HRZ; 172 smp_cpu_polarization[cpu] = POLARIZATION_HRZ;
173 mutex_unlock(&smp_cpu_state_mutex); 173 mutex_unlock(&smp_cpu_state_mutex);
174 } 174 }
175 175
176 static int ptf(unsigned long fc) 176 static int ptf(unsigned long fc)
177 { 177 {
178 int rc; 178 int rc;
179 179
180 asm volatile( 180 asm volatile(
181 " .insn rre,0xb9a20000,%1,%1\n" 181 " .insn rre,0xb9a20000,%1,%1\n"
182 " ipm %0\n" 182 " ipm %0\n"
183 " srl %0,28\n" 183 " srl %0,28\n"
184 : "=d" (rc) 184 : "=d" (rc)
185 : "d" (fc) : "cc"); 185 : "d" (fc) : "cc");
186 return rc; 186 return rc;
187 } 187 }
188 188
189 int topology_set_cpu_management(int fc) 189 int topology_set_cpu_management(int fc)
190 { 190 {
191 int cpu; 191 int cpu;
192 int rc; 192 int rc;
193 193
194 if (!machine_has_topology) 194 if (!machine_has_topology)
195 return -EOPNOTSUPP; 195 return -EOPNOTSUPP;
196 if (fc) 196 if (fc)
197 rc = ptf(PTF_VERTICAL); 197 rc = ptf(PTF_VERTICAL);
198 else 198 else
199 rc = ptf(PTF_HORIZONTAL); 199 rc = ptf(PTF_HORIZONTAL);
200 if (rc) 200 if (rc)
201 return -EBUSY; 201 return -EBUSY;
202 for_each_present_cpu(cpu) 202 for_each_present_cpu(cpu)
203 smp_cpu_polarization[cpu] = POLARIZATION_UNKNWN; 203 smp_cpu_polarization[cpu] = POLARIZATION_UNKNWN;
204 return rc; 204 return rc;
205 } 205 }
206 206
207 static void update_cpu_core_map(void) 207 static void update_cpu_core_map(void)
208 { 208 {
209 int cpu; 209 int cpu;
210 210
211 for_each_present_cpu(cpu) 211 for_each_present_cpu(cpu)
212 cpu_core_map[cpu] = cpu_coregroup_map(cpu); 212 cpu_core_map[cpu] = cpu_coregroup_map(cpu);
213 } 213 }
214 214
215 void arch_update_cpu_topology(void) 215 int arch_update_cpu_topology(void)
216 { 216 {
217 struct tl_info *info = tl_info; 217 struct tl_info *info = tl_info;
218 struct sys_device *sysdev; 218 struct sys_device *sysdev;
219 int cpu; 219 int cpu;
220 220
221 if (!machine_has_topology) { 221 if (!machine_has_topology) {
222 update_cpu_core_map(); 222 update_cpu_core_map();
223 topology_update_polarization_simple(); 223 topology_update_polarization_simple();
224 return; 224 return 0;
225 } 225 }
226 stsi(info, 15, 1, 2); 226 stsi(info, 15, 1, 2);
227 tl_to_cores(info); 227 tl_to_cores(info);
228 update_cpu_core_map(); 228 update_cpu_core_map();
229 for_each_online_cpu(cpu) { 229 for_each_online_cpu(cpu) {
230 sysdev = get_cpu_sysdev(cpu); 230 sysdev = get_cpu_sysdev(cpu);
231 kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); 231 kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
232 } 232 }
233 return 1;
233 } 234 }
234 235
235 static void topology_work_fn(struct work_struct *work) 236 static void topology_work_fn(struct work_struct *work)
236 { 237 {
237 arch_reinit_sched_domains(); 238 arch_reinit_sched_domains();
238 } 239 }
239 240
240 void topology_schedule_update(void) 241 void topology_schedule_update(void)
241 { 242 {
242 schedule_work(&topology_work); 243 schedule_work(&topology_work);
243 } 244 }
244 245
245 static void topology_timer_fn(unsigned long ignored) 246 static void topology_timer_fn(unsigned long ignored)
246 { 247 {
247 if (ptf(PTF_CHECK)) 248 if (ptf(PTF_CHECK))
248 topology_schedule_update(); 249 topology_schedule_update();
249 set_topology_timer(); 250 set_topology_timer();
250 } 251 }
251 252
252 static void set_topology_timer(void) 253 static void set_topology_timer(void)
253 { 254 {
254 topology_timer.function = topology_timer_fn; 255 topology_timer.function = topology_timer_fn;
255 topology_timer.data = 0; 256 topology_timer.data = 0;
256 topology_timer.expires = jiffies + 60 * HZ; 257 topology_timer.expires = jiffies + 60 * HZ;
257 add_timer(&topology_timer); 258 add_timer(&topology_timer);
258 } 259 }
259 260
260 static void topology_interrupt(__u16 code) 261 static void topology_interrupt(__u16 code)
261 { 262 {
262 schedule_work(&topology_work); 263 schedule_work(&topology_work);
263 } 264 }
264 265
265 static int __init init_topology_update(void) 266 static int __init init_topology_update(void)
266 { 267 {
267 int rc; 268 int rc;
268 269
269 rc = 0; 270 rc = 0;
270 if (!machine_has_topology) { 271 if (!machine_has_topology) {
271 topology_update_polarization_simple(); 272 topology_update_polarization_simple();
272 goto out; 273 goto out;
273 } 274 }
274 init_timer_deferrable(&topology_timer); 275 init_timer_deferrable(&topology_timer);
275 if (machine_has_topology_irq) { 276 if (machine_has_topology_irq) {
276 rc = register_external_interrupt(0x2005, topology_interrupt); 277 rc = register_external_interrupt(0x2005, topology_interrupt);
277 if (rc) 278 if (rc)
278 goto out; 279 goto out;
279 ctl_set_bit(0, 8); 280 ctl_set_bit(0, 8);
280 } 281 }
281 else 282 else
282 set_topology_timer(); 283 set_topology_timer();
283 out: 284 out:
284 update_cpu_core_map(); 285 update_cpu_core_map();
285 return rc; 286 return rc;
286 } 287 }
287 __initcall(init_topology_update); 288 __initcall(init_topology_update);
288 289
289 void __init s390_init_cpu_topology(void) 290 void __init s390_init_cpu_topology(void)
290 { 291 {
291 unsigned long long facility_bits; 292 unsigned long long facility_bits;
292 struct tl_info *info; 293 struct tl_info *info;
293 struct core_info *core; 294 struct core_info *core;
294 int nr_cores; 295 int nr_cores;
295 int i; 296 int i;
296 297
297 if (stfle(&facility_bits, 1) <= 0) 298 if (stfle(&facility_bits, 1) <= 0)
298 return; 299 return;
299 if (!(facility_bits & (1ULL << 52)) || !(facility_bits & (1ULL << 61))) 300 if (!(facility_bits & (1ULL << 52)) || !(facility_bits & (1ULL << 61)))
300 return; 301 return;
301 machine_has_topology = 1; 302 machine_has_topology = 1;
302 303
303 if (facility_bits & (1ULL << 51)) 304 if (facility_bits & (1ULL << 51))
304 machine_has_topology_irq = 1; 305 machine_has_topology_irq = 1;
305 306
306 tl_info = alloc_bootmem_pages(PAGE_SIZE); 307 tl_info = alloc_bootmem_pages(PAGE_SIZE);
307 info = tl_info; 308 info = tl_info;
308 stsi(info, 15, 1, 2); 309 stsi(info, 15, 1, 2);
309 310
310 nr_cores = info->mag[NR_MAG - 2]; 311 nr_cores = info->mag[NR_MAG - 2];
311 for (i = 0; i < info->mnest - 2; i++) 312 for (i = 0; i < info->mnest - 2; i++)
312 nr_cores *= info->mag[NR_MAG - 3 - i]; 313 nr_cores *= info->mag[NR_MAG - 3 - i];
313 314
314 printk(KERN_INFO "CPU topology:"); 315 printk(KERN_INFO "CPU topology:");
315 for (i = 0; i < NR_MAG; i++) 316 for (i = 0; i < NR_MAG; i++)
316 printk(" %d", info->mag[i]); 317 printk(" %d", info->mag[i]);
317 printk(" / %d\n", info->mnest); 318 printk(" / %d\n", info->mnest);
318 319
319 core = &core_info; 320 core = &core_info;
320 for (i = 0; i < nr_cores; i++) { 321 for (i = 0; i < nr_cores; i++) {
321 core->next = alloc_bootmem(sizeof(struct core_info)); 322 core->next = alloc_bootmem(sizeof(struct core_info));
322 core = core->next; 323 core = core->next;
323 if (!core) 324 if (!core)
324 goto error; 325 goto error;
325 } 326 }
326 return; 327 return;
327 error: 328 error:
328 machine_has_topology = 0; 329 machine_has_topology = 0;
329 machine_has_topology_irq = 0; 330 machine_has_topology_irq = 0;
330 } 331 }
331 332
include/linux/topology.h
1 /* 1 /*
2 * include/linux/topology.h 2 * include/linux/topology.h
3 * 3 *
4 * Written by: Matthew Dobson, IBM Corporation 4 * Written by: Matthew Dobson, IBM Corporation
5 * 5 *
6 * Copyright (C) 2002, IBM Corp. 6 * Copyright (C) 2002, IBM Corp.
7 * 7 *
8 * All rights reserved. 8 * All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or modify 10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by 11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or 12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version. 13 * (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, but 15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of 16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 17 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
18 * NON INFRINGEMENT. See the GNU General Public License for more 18 * NON INFRINGEMENT. See the GNU General Public License for more
19 * details. 19 * details.
20 * 20 *
21 * You should have received a copy of the GNU General Public License 21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software 22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 * 24 *
25 * Send feedback to <colpatch@us.ibm.com> 25 * Send feedback to <colpatch@us.ibm.com>
26 */ 26 */
27 #ifndef _LINUX_TOPOLOGY_H 27 #ifndef _LINUX_TOPOLOGY_H
28 #define _LINUX_TOPOLOGY_H 28 #define _LINUX_TOPOLOGY_H
29 29
30 #include <linux/cpumask.h> 30 #include <linux/cpumask.h>
31 #include <linux/bitops.h> 31 #include <linux/bitops.h>
32 #include <linux/mmzone.h> 32 #include <linux/mmzone.h>
33 #include <linux/smp.h> 33 #include <linux/smp.h>
34 #include <asm/topology.h> 34 #include <asm/topology.h>
35 35
36 #ifndef node_has_online_mem 36 #ifndef node_has_online_mem
37 #define node_has_online_mem(nid) (1) 37 #define node_has_online_mem(nid) (1)
38 #endif 38 #endif
39 39
40 #ifndef nr_cpus_node 40 #ifndef nr_cpus_node
41 #define nr_cpus_node(node) \ 41 #define nr_cpus_node(node) \
42 ({ \ 42 ({ \
43 node_to_cpumask_ptr(__tmp__, node); \ 43 node_to_cpumask_ptr(__tmp__, node); \
44 cpus_weight(*__tmp__); \ 44 cpus_weight(*__tmp__); \
45 }) 45 })
46 #endif 46 #endif
47 47
48 #define for_each_node_with_cpus(node) \ 48 #define for_each_node_with_cpus(node) \
49 for_each_online_node(node) \ 49 for_each_online_node(node) \
50 if (nr_cpus_node(node)) 50 if (nr_cpus_node(node))
51 51
52 void arch_update_cpu_topology(void); 52 int arch_update_cpu_topology(void);
53 53
54 /* Conform to ACPI 2.0 SLIT distance definitions */ 54 /* Conform to ACPI 2.0 SLIT distance definitions */
55 #define LOCAL_DISTANCE 10 55 #define LOCAL_DISTANCE 10
56 #define REMOTE_DISTANCE 20 56 #define REMOTE_DISTANCE 20
57 #ifndef node_distance 57 #ifndef node_distance
58 #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) 58 #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
59 #endif 59 #endif
60 #ifndef RECLAIM_DISTANCE 60 #ifndef RECLAIM_DISTANCE
61 /* 61 /*
62 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE 62 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
63 * (in whatever arch specific measurement units returned by node_distance()) 63 * (in whatever arch specific measurement units returned by node_distance())
64 * then switch on zone reclaim on boot. 64 * then switch on zone reclaim on boot.
65 */ 65 */
66 #define RECLAIM_DISTANCE 20 66 #define RECLAIM_DISTANCE 20
67 #endif 67 #endif
68 #ifndef PENALTY_FOR_NODE_WITH_CPUS 68 #ifndef PENALTY_FOR_NODE_WITH_CPUS
69 #define PENALTY_FOR_NODE_WITH_CPUS (1) 69 #define PENALTY_FOR_NODE_WITH_CPUS (1)
70 #endif 70 #endif
71 71
72 /* 72 /*
73 * Below are the 3 major initializers used in building sched_domains: 73 * Below are the 3 major initializers used in building sched_domains:
74 * SD_SIBLING_INIT, for SMT domains 74 * SD_SIBLING_INIT, for SMT domains
75 * SD_CPU_INIT, for SMP domains 75 * SD_CPU_INIT, for SMP domains
76 * SD_NODE_INIT, for NUMA domains 76 * SD_NODE_INIT, for NUMA domains
77 * 77 *
78 * Any architecture that cares to do any tuning to these values should do so 78 * Any architecture that cares to do any tuning to these values should do so
79 * by defining their own arch-specific initializer in include/asm/topology.h. 79 * by defining their own arch-specific initializer in include/asm/topology.h.
80 * A definition there will automagically override these default initializers 80 * A definition there will automagically override these default initializers
81 * and allow arch-specific performance tuning of sched_domains. 81 * and allow arch-specific performance tuning of sched_domains.
82 * (Only non-zero and non-null fields need be specified.) 82 * (Only non-zero and non-null fields need be specified.)
83 */ 83 */
84 84
85 #ifdef CONFIG_SCHED_SMT 85 #ifdef CONFIG_SCHED_SMT
86 /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, 86 /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is,
87 * so can't we drop this in favor of CONFIG_SCHED_SMT? 87 * so can't we drop this in favor of CONFIG_SCHED_SMT?
88 */ 88 */
89 #define ARCH_HAS_SCHED_WAKE_IDLE 89 #define ARCH_HAS_SCHED_WAKE_IDLE
90 /* Common values for SMT siblings */ 90 /* Common values for SMT siblings */
91 #ifndef SD_SIBLING_INIT 91 #ifndef SD_SIBLING_INIT
92 #define SD_SIBLING_INIT (struct sched_domain) { \ 92 #define SD_SIBLING_INIT (struct sched_domain) { \
93 .min_interval = 1, \ 93 .min_interval = 1, \
94 .max_interval = 2, \ 94 .max_interval = 2, \
95 .busy_factor = 64, \ 95 .busy_factor = 64, \
96 .imbalance_pct = 110, \ 96 .imbalance_pct = 110, \
97 .flags = SD_LOAD_BALANCE \ 97 .flags = SD_LOAD_BALANCE \
98 | SD_BALANCE_NEWIDLE \ 98 | SD_BALANCE_NEWIDLE \
99 | SD_BALANCE_FORK \ 99 | SD_BALANCE_FORK \
100 | SD_BALANCE_EXEC \ 100 | SD_BALANCE_EXEC \
101 | SD_WAKE_AFFINE \ 101 | SD_WAKE_AFFINE \
102 | SD_WAKE_BALANCE \ 102 | SD_WAKE_BALANCE \
103 | SD_SHARE_CPUPOWER, \ 103 | SD_SHARE_CPUPOWER, \
104 .last_balance = jiffies, \ 104 .last_balance = jiffies, \
105 .balance_interval = 1, \ 105 .balance_interval = 1, \
106 } 106 }
107 #endif 107 #endif
108 #endif /* CONFIG_SCHED_SMT */ 108 #endif /* CONFIG_SCHED_SMT */
109 109
110 #ifdef CONFIG_SCHED_MC 110 #ifdef CONFIG_SCHED_MC
111 /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ 111 /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
112 #ifndef SD_MC_INIT 112 #ifndef SD_MC_INIT
113 #define SD_MC_INIT (struct sched_domain) { \ 113 #define SD_MC_INIT (struct sched_domain) { \
114 .min_interval = 1, \ 114 .min_interval = 1, \
115 .max_interval = 4, \ 115 .max_interval = 4, \
116 .busy_factor = 64, \ 116 .busy_factor = 64, \
117 .imbalance_pct = 125, \ 117 .imbalance_pct = 125, \
118 .cache_nice_tries = 1, \ 118 .cache_nice_tries = 1, \
119 .busy_idx = 2, \ 119 .busy_idx = 2, \
120 .wake_idx = 1, \ 120 .wake_idx = 1, \
121 .forkexec_idx = 1, \ 121 .forkexec_idx = 1, \
122 .flags = SD_LOAD_BALANCE \ 122 .flags = SD_LOAD_BALANCE \
123 | SD_BALANCE_FORK \ 123 | SD_BALANCE_FORK \
124 | SD_BALANCE_EXEC \ 124 | SD_BALANCE_EXEC \
125 | SD_WAKE_AFFINE \ 125 | SD_WAKE_AFFINE \
126 | SD_WAKE_BALANCE \ 126 | SD_WAKE_BALANCE \
127 | SD_SHARE_PKG_RESOURCES\ 127 | SD_SHARE_PKG_RESOURCES\
128 | BALANCE_FOR_MC_POWER, \ 128 | BALANCE_FOR_MC_POWER, \
129 .last_balance = jiffies, \ 129 .last_balance = jiffies, \
130 .balance_interval = 1, \ 130 .balance_interval = 1, \
131 } 131 }
132 #endif 132 #endif
133 #endif /* CONFIG_SCHED_MC */ 133 #endif /* CONFIG_SCHED_MC */
134 134
135 /* Common values for CPUs */ 135 /* Common values for CPUs */
136 #ifndef SD_CPU_INIT 136 #ifndef SD_CPU_INIT
137 #define SD_CPU_INIT (struct sched_domain) { \ 137 #define SD_CPU_INIT (struct sched_domain) { \
138 .min_interval = 1, \ 138 .min_interval = 1, \
139 .max_interval = 4, \ 139 .max_interval = 4, \
140 .busy_factor = 64, \ 140 .busy_factor = 64, \
141 .imbalance_pct = 125, \ 141 .imbalance_pct = 125, \
142 .cache_nice_tries = 1, \ 142 .cache_nice_tries = 1, \
143 .busy_idx = 2, \ 143 .busy_idx = 2, \
144 .idle_idx = 1, \ 144 .idle_idx = 1, \
145 .newidle_idx = 2, \ 145 .newidle_idx = 2, \
146 .wake_idx = 1, \ 146 .wake_idx = 1, \
147 .forkexec_idx = 1, \ 147 .forkexec_idx = 1, \
148 .flags = SD_LOAD_BALANCE \ 148 .flags = SD_LOAD_BALANCE \
149 | SD_BALANCE_EXEC \ 149 | SD_BALANCE_EXEC \
150 | SD_BALANCE_FORK \ 150 | SD_BALANCE_FORK \
151 | SD_WAKE_AFFINE \ 151 | SD_WAKE_AFFINE \
152 | SD_WAKE_BALANCE \ 152 | SD_WAKE_BALANCE \
153 | BALANCE_FOR_PKG_POWER,\ 153 | BALANCE_FOR_PKG_POWER,\
154 .last_balance = jiffies, \ 154 .last_balance = jiffies, \
155 .balance_interval = 1, \ 155 .balance_interval = 1, \
156 } 156 }
157 #endif 157 #endif
158 158
159 /* sched_domains SD_ALLNODES_INIT for NUMA machines */ 159 /* sched_domains SD_ALLNODES_INIT for NUMA machines */
160 #define SD_ALLNODES_INIT (struct sched_domain) { \ 160 #define SD_ALLNODES_INIT (struct sched_domain) { \
161 .min_interval = 64, \ 161 .min_interval = 64, \
162 .max_interval = 64*num_online_cpus(), \ 162 .max_interval = 64*num_online_cpus(), \
163 .busy_factor = 128, \ 163 .busy_factor = 128, \
164 .imbalance_pct = 133, \ 164 .imbalance_pct = 133, \
165 .cache_nice_tries = 1, \ 165 .cache_nice_tries = 1, \
166 .busy_idx = 3, \ 166 .busy_idx = 3, \
167 .idle_idx = 3, \ 167 .idle_idx = 3, \
168 .flags = SD_LOAD_BALANCE \ 168 .flags = SD_LOAD_BALANCE \
169 | SD_BALANCE_NEWIDLE \ 169 | SD_BALANCE_NEWIDLE \
170 | SD_WAKE_AFFINE \ 170 | SD_WAKE_AFFINE \
171 | SD_SERIALIZE, \ 171 | SD_SERIALIZE, \
172 .last_balance = jiffies, \ 172 .last_balance = jiffies, \
173 .balance_interval = 64, \ 173 .balance_interval = 64, \
174 } 174 }
175 175
176 #ifdef CONFIG_NUMA 176 #ifdef CONFIG_NUMA
177 #ifndef SD_NODE_INIT 177 #ifndef SD_NODE_INIT
178 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 178 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
179 #endif 179 #endif
180 #endif /* CONFIG_NUMA */ 180 #endif /* CONFIG_NUMA */
181 181
182 #ifndef topology_physical_package_id 182 #ifndef topology_physical_package_id
183 #define topology_physical_package_id(cpu) ((void)(cpu), -1) 183 #define topology_physical_package_id(cpu) ((void)(cpu), -1)
184 #endif 184 #endif
185 #ifndef topology_core_id 185 #ifndef topology_core_id
186 #define topology_core_id(cpu) ((void)(cpu), 0) 186 #define topology_core_id(cpu) ((void)(cpu), 0)
187 #endif 187 #endif
188 #ifndef topology_thread_siblings 188 #ifndef topology_thread_siblings
189 #define topology_thread_siblings(cpu) cpumask_of_cpu(cpu) 189 #define topology_thread_siblings(cpu) cpumask_of_cpu(cpu)
190 #endif 190 #endif
191 #ifndef topology_core_siblings 191 #ifndef topology_core_siblings
192 #define topology_core_siblings(cpu) cpumask_of_cpu(cpu) 192 #define topology_core_siblings(cpu) cpumask_of_cpu(cpu)
193 #endif 193 #endif
194 194
195 #endif /* _LINUX_TOPOLOGY_H */ 195 #endif /* _LINUX_TOPOLOGY_H */
196 196
kernel/sched.c
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <linux/smp_lock.h> 35 #include <linux/smp_lock.h>
36 #include <asm/mmu_context.h> 36 #include <asm/mmu_context.h>
37 #include <linux/interrupt.h> 37 #include <linux/interrupt.h>
38 #include <linux/capability.h> 38 #include <linux/capability.h>
39 #include <linux/completion.h> 39 #include <linux/completion.h>
40 #include <linux/kernel_stat.h> 40 #include <linux/kernel_stat.h>
41 #include <linux/debug_locks.h> 41 #include <linux/debug_locks.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/kthread.h> 57 #include <linux/kthread.h>
58 #include <linux/proc_fs.h> 58 #include <linux/proc_fs.h>
59 #include <linux/seq_file.h> 59 #include <linux/seq_file.h>
60 #include <linux/sysctl.h> 60 #include <linux/sysctl.h>
61 #include <linux/syscalls.h> 61 #include <linux/syscalls.h>
62 #include <linux/times.h> 62 #include <linux/times.h>
63 #include <linux/tsacct_kern.h> 63 #include <linux/tsacct_kern.h>
64 #include <linux/kprobes.h> 64 #include <linux/kprobes.h>
65 #include <linux/delayacct.h> 65 #include <linux/delayacct.h>
66 #include <linux/reciprocal_div.h> 66 #include <linux/reciprocal_div.h>
67 #include <linux/unistd.h> 67 #include <linux/unistd.h>
68 #include <linux/pagemap.h> 68 #include <linux/pagemap.h>
69 #include <linux/hrtimer.h> 69 #include <linux/hrtimer.h>
70 #include <linux/tick.h> 70 #include <linux/tick.h>
71 #include <linux/bootmem.h> 71 #include <linux/bootmem.h>
72 #include <linux/debugfs.h> 72 #include <linux/debugfs.h>
73 #include <linux/ctype.h> 73 #include <linux/ctype.h>
74 #include <linux/ftrace.h> 74 #include <linux/ftrace.h>
75 #include <trace/sched.h> 75 #include <trace/sched.h>
76 76
77 #include <asm/tlb.h> 77 #include <asm/tlb.h>
78 #include <asm/irq_regs.h> 78 #include <asm/irq_regs.h>
79 79
80 #include "sched_cpupri.h" 80 #include "sched_cpupri.h"
81 81
82 /* 82 /*
83 * Convert user-nice values [ -20 ... 0 ... 19 ] 83 * Convert user-nice values [ -20 ... 0 ... 19 ]
84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
85 * and back. 85 * and back.
86 */ 86 */
87 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 87 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
88 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 88 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
89 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 89 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
90 90
91 /* 91 /*
92 * 'User priority' is the nice value converted to something we 92 * 'User priority' is the nice value converted to something we
93 * can work with better when scaling various scheduler parameters, 93 * can work with better when scaling various scheduler parameters,
94 * it's a [ 0 ... 39 ] range. 94 * it's a [ 0 ... 39 ] range.
95 */ 95 */
96 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 96 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
97 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 97 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
98 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 98 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
99 99
100 /* 100 /*
101 * Helpers for converting nanosecond timing to jiffy resolution 101 * Helpers for converting nanosecond timing to jiffy resolution
102 */ 102 */
103 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 103 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
104 104
105 #define NICE_0_LOAD SCHED_LOAD_SCALE 105 #define NICE_0_LOAD SCHED_LOAD_SCALE
106 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 106 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
107 107
108 /* 108 /*
109 * These are the 'tuning knobs' of the scheduler: 109 * These are the 'tuning knobs' of the scheduler:
110 * 110 *
111 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 111 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
112 * Timeslices get refilled after they expire. 112 * Timeslices get refilled after they expire.
113 */ 113 */
114 #define DEF_TIMESLICE (100 * HZ / 1000) 114 #define DEF_TIMESLICE (100 * HZ / 1000)
115 115
116 /* 116 /*
117 * single value that denotes runtime == period, ie unlimited time. 117 * single value that denotes runtime == period, ie unlimited time.
118 */ 118 */
119 #define RUNTIME_INF ((u64)~0ULL) 119 #define RUNTIME_INF ((u64)~0ULL)
120 120
121 #ifdef CONFIG_SMP 121 #ifdef CONFIG_SMP
122 /* 122 /*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
124 * Since cpu_power is a 'constant', we can use a reciprocal divide. 124 * Since cpu_power is a 'constant', we can use a reciprocal divide.
125 */ 125 */
126 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) 126 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
127 { 127 {
128 return reciprocal_divide(load, sg->reciprocal_cpu_power); 128 return reciprocal_divide(load, sg->reciprocal_cpu_power);
129 } 129 }
130 130
131 /* 131 /*
132 * Each time a sched group cpu_power is changed, 132 * Each time a sched group cpu_power is changed,
133 * we must compute its reciprocal value 133 * we must compute its reciprocal value
134 */ 134 */
135 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) 135 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
136 { 136 {
137 sg->__cpu_power += val; 137 sg->__cpu_power += val;
138 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); 138 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
139 } 139 }
140 #endif 140 #endif
141 141
142 static inline int rt_policy(int policy) 142 static inline int rt_policy(int policy)
143 { 143 {
144 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 144 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
145 return 1; 145 return 1;
146 return 0; 146 return 0;
147 } 147 }
148 148
149 static inline int task_has_rt_policy(struct task_struct *p) 149 static inline int task_has_rt_policy(struct task_struct *p)
150 { 150 {
151 return rt_policy(p->policy); 151 return rt_policy(p->policy);
152 } 152 }
153 153
154 /* 154 /*
155 * This is the priority-queue data structure of the RT scheduling class: 155 * This is the priority-queue data structure of the RT scheduling class:
156 */ 156 */
157 struct rt_prio_array { 157 struct rt_prio_array {
158 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 158 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
159 struct list_head queue[MAX_RT_PRIO]; 159 struct list_head queue[MAX_RT_PRIO];
160 }; 160 };
161 161
162 struct rt_bandwidth { 162 struct rt_bandwidth {
163 /* nests inside the rq lock: */ 163 /* nests inside the rq lock: */
164 spinlock_t rt_runtime_lock; 164 spinlock_t rt_runtime_lock;
165 ktime_t rt_period; 165 ktime_t rt_period;
166 u64 rt_runtime; 166 u64 rt_runtime;
167 struct hrtimer rt_period_timer; 167 struct hrtimer rt_period_timer;
168 }; 168 };
169 169
170 static struct rt_bandwidth def_rt_bandwidth; 170 static struct rt_bandwidth def_rt_bandwidth;
171 171
172 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 172 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
173 173
174 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 174 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
175 { 175 {
176 struct rt_bandwidth *rt_b = 176 struct rt_bandwidth *rt_b =
177 container_of(timer, struct rt_bandwidth, rt_period_timer); 177 container_of(timer, struct rt_bandwidth, rt_period_timer);
178 ktime_t now; 178 ktime_t now;
179 int overrun; 179 int overrun;
180 int idle = 0; 180 int idle = 0;
181 181
182 for (;;) { 182 for (;;) {
183 now = hrtimer_cb_get_time(timer); 183 now = hrtimer_cb_get_time(timer);
184 overrun = hrtimer_forward(timer, now, rt_b->rt_period); 184 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
185 185
186 if (!overrun) 186 if (!overrun)
187 break; 187 break;
188 188
189 idle = do_sched_rt_period_timer(rt_b, overrun); 189 idle = do_sched_rt_period_timer(rt_b, overrun);
190 } 190 }
191 191
192 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 192 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
193 } 193 }
194 194
195 static 195 static
196 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 196 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
197 { 197 {
198 rt_b->rt_period = ns_to_ktime(period); 198 rt_b->rt_period = ns_to_ktime(period);
199 rt_b->rt_runtime = runtime; 199 rt_b->rt_runtime = runtime;
200 200
201 spin_lock_init(&rt_b->rt_runtime_lock); 201 spin_lock_init(&rt_b->rt_runtime_lock);
202 202
203 hrtimer_init(&rt_b->rt_period_timer, 203 hrtimer_init(&rt_b->rt_period_timer,
204 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.function = sched_rt_period_timer; 205 rt_b->rt_period_timer.function = sched_rt_period_timer;
206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; 206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207 } 207 }
208 208
209 static inline int rt_bandwidth_enabled(void) 209 static inline int rt_bandwidth_enabled(void)
210 { 210 {
211 return sysctl_sched_rt_runtime >= 0; 211 return sysctl_sched_rt_runtime >= 0;
212 } 212 }
213 213
214 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 214 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
215 { 215 {
216 ktime_t now; 216 ktime_t now;
217 217
218 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 218 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
219 return; 219 return;
220 220
221 if (hrtimer_active(&rt_b->rt_period_timer)) 221 if (hrtimer_active(&rt_b->rt_period_timer))
222 return; 222 return;
223 223
224 spin_lock(&rt_b->rt_runtime_lock); 224 spin_lock(&rt_b->rt_runtime_lock);
225 for (;;) { 225 for (;;) {
226 if (hrtimer_active(&rt_b->rt_period_timer)) 226 if (hrtimer_active(&rt_b->rt_period_timer))
227 break; 227 break;
228 228
229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
231 hrtimer_start_expires(&rt_b->rt_period_timer, 231 hrtimer_start_expires(&rt_b->rt_period_timer,
232 HRTIMER_MODE_ABS); 232 HRTIMER_MODE_ABS);
233 } 233 }
234 spin_unlock(&rt_b->rt_runtime_lock); 234 spin_unlock(&rt_b->rt_runtime_lock);
235 } 235 }
236 236
237 #ifdef CONFIG_RT_GROUP_SCHED 237 #ifdef CONFIG_RT_GROUP_SCHED
238 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 238 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
239 { 239 {
240 hrtimer_cancel(&rt_b->rt_period_timer); 240 hrtimer_cancel(&rt_b->rt_period_timer);
241 } 241 }
242 #endif 242 #endif
243 243
244 /* 244 /*
245 * sched_domains_mutex serializes calls to arch_init_sched_domains, 245 * sched_domains_mutex serializes calls to arch_init_sched_domains,
246 * detach_destroy_domains and partition_sched_domains. 246 * detach_destroy_domains and partition_sched_domains.
247 */ 247 */
248 static DEFINE_MUTEX(sched_domains_mutex); 248 static DEFINE_MUTEX(sched_domains_mutex);
249 249
250 #ifdef CONFIG_GROUP_SCHED 250 #ifdef CONFIG_GROUP_SCHED
251 251
252 #include <linux/cgroup.h> 252 #include <linux/cgroup.h>
253 253
254 struct cfs_rq; 254 struct cfs_rq;
255 255
256 static LIST_HEAD(task_groups); 256 static LIST_HEAD(task_groups);
257 257
258 /* task group related information */ 258 /* task group related information */
259 struct task_group { 259 struct task_group {
260 #ifdef CONFIG_CGROUP_SCHED 260 #ifdef CONFIG_CGROUP_SCHED
261 struct cgroup_subsys_state css; 261 struct cgroup_subsys_state css;
262 #endif 262 #endif
263 263
264 #ifdef CONFIG_USER_SCHED 264 #ifdef CONFIG_USER_SCHED
265 uid_t uid; 265 uid_t uid;
266 #endif 266 #endif
267 267
268 #ifdef CONFIG_FAIR_GROUP_SCHED 268 #ifdef CONFIG_FAIR_GROUP_SCHED
269 /* schedulable entities of this group on each cpu */ 269 /* schedulable entities of this group on each cpu */
270 struct sched_entity **se; 270 struct sched_entity **se;
271 /* runqueue "owned" by this group on each cpu */ 271 /* runqueue "owned" by this group on each cpu */
272 struct cfs_rq **cfs_rq; 272 struct cfs_rq **cfs_rq;
273 unsigned long shares; 273 unsigned long shares;
274 #endif 274 #endif
275 275
276 #ifdef CONFIG_RT_GROUP_SCHED 276 #ifdef CONFIG_RT_GROUP_SCHED
277 struct sched_rt_entity **rt_se; 277 struct sched_rt_entity **rt_se;
278 struct rt_rq **rt_rq; 278 struct rt_rq **rt_rq;
279 279
280 struct rt_bandwidth rt_bandwidth; 280 struct rt_bandwidth rt_bandwidth;
281 #endif 281 #endif
282 282
283 struct rcu_head rcu; 283 struct rcu_head rcu;
284 struct list_head list; 284 struct list_head list;
285 285
286 struct task_group *parent; 286 struct task_group *parent;
287 struct list_head siblings; 287 struct list_head siblings;
288 struct list_head children; 288 struct list_head children;
289 }; 289 };
290 290
291 #ifdef CONFIG_USER_SCHED 291 #ifdef CONFIG_USER_SCHED
292 292
293 /* Helper function to pass uid information to create_sched_user() */ 293 /* Helper function to pass uid information to create_sched_user() */
294 void set_tg_uid(struct user_struct *user) 294 void set_tg_uid(struct user_struct *user)
295 { 295 {
296 user->tg->uid = user->uid; 296 user->tg->uid = user->uid;
297 } 297 }
298 298
299 /* 299 /*
300 * Root task group. 300 * Root task group.
301 * Every UID task group (including init_task_group aka UID-0) will 301 * Every UID task group (including init_task_group aka UID-0) will
302 * be a child to this group. 302 * be a child to this group.
303 */ 303 */
304 struct task_group root_task_group; 304 struct task_group root_task_group;
305 305
306 #ifdef CONFIG_FAIR_GROUP_SCHED 306 #ifdef CONFIG_FAIR_GROUP_SCHED
307 /* Default task group's sched entity on each cpu */ 307 /* Default task group's sched entity on each cpu */
308 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 308 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
309 /* Default task group's cfs_rq on each cpu */ 309 /* Default task group's cfs_rq on each cpu */
310 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 310 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
311 #endif /* CONFIG_FAIR_GROUP_SCHED */ 311 #endif /* CONFIG_FAIR_GROUP_SCHED */
312 312
313 #ifdef CONFIG_RT_GROUP_SCHED 313 #ifdef CONFIG_RT_GROUP_SCHED
314 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 314 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
315 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 315 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
316 #endif /* CONFIG_RT_GROUP_SCHED */ 316 #endif /* CONFIG_RT_GROUP_SCHED */
317 #else /* !CONFIG_USER_SCHED */ 317 #else /* !CONFIG_USER_SCHED */
318 #define root_task_group init_task_group 318 #define root_task_group init_task_group
319 #endif /* CONFIG_USER_SCHED */ 319 #endif /* CONFIG_USER_SCHED */
320 320
321 /* task_group_lock serializes add/remove of task groups and also changes to 321 /* task_group_lock serializes add/remove of task groups and also changes to
322 * a task group's cpu shares. 322 * a task group's cpu shares.
323 */ 323 */
324 static DEFINE_SPINLOCK(task_group_lock); 324 static DEFINE_SPINLOCK(task_group_lock);
325 325
326 #ifdef CONFIG_FAIR_GROUP_SCHED 326 #ifdef CONFIG_FAIR_GROUP_SCHED
327 #ifdef CONFIG_USER_SCHED 327 #ifdef CONFIG_USER_SCHED
328 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 328 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
329 #else /* !CONFIG_USER_SCHED */ 329 #else /* !CONFIG_USER_SCHED */
330 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD 330 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
331 #endif /* CONFIG_USER_SCHED */ 331 #endif /* CONFIG_USER_SCHED */
332 332
333 /* 333 /*
334 * A weight of 0 or 1 can cause arithmetics problems. 334 * A weight of 0 or 1 can cause arithmetics problems.
335 * A weight of a cfs_rq is the sum of weights of which entities 335 * A weight of a cfs_rq is the sum of weights of which entities
336 * are queued on this cfs_rq, so a weight of a entity should not be 336 * are queued on this cfs_rq, so a weight of a entity should not be
337 * too large, so as the shares value of a task group. 337 * too large, so as the shares value of a task group.
338 * (The default weight is 1024 - so there's no practical 338 * (The default weight is 1024 - so there's no practical
339 * limitation from this.) 339 * limitation from this.)
340 */ 340 */
341 #define MIN_SHARES 2 341 #define MIN_SHARES 2
342 #define MAX_SHARES (1UL << 18) 342 #define MAX_SHARES (1UL << 18)
343 343
344 static int init_task_group_load = INIT_TASK_GROUP_LOAD; 344 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
345 #endif 345 #endif
346 346
347 /* Default task group. 347 /* Default task group.
348 * Every task in system belong to this group at bootup. 348 * Every task in system belong to this group at bootup.
349 */ 349 */
350 struct task_group init_task_group; 350 struct task_group init_task_group;
351 351
352 /* return group to which a task belongs */ 352 /* return group to which a task belongs */
353 static inline struct task_group *task_group(struct task_struct *p) 353 static inline struct task_group *task_group(struct task_struct *p)
354 { 354 {
355 struct task_group *tg; 355 struct task_group *tg;
356 356
357 #ifdef CONFIG_USER_SCHED 357 #ifdef CONFIG_USER_SCHED
358 tg = p->user->tg; 358 tg = p->user->tg;
359 #elif defined(CONFIG_CGROUP_SCHED) 359 #elif defined(CONFIG_CGROUP_SCHED)
360 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 360 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
361 struct task_group, css); 361 struct task_group, css);
362 #else 362 #else
363 tg = &init_task_group; 363 tg = &init_task_group;
364 #endif 364 #endif
365 return tg; 365 return tg;
366 } 366 }
367 367
368 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 368 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
369 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 369 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
370 { 370 {
371 #ifdef CONFIG_FAIR_GROUP_SCHED 371 #ifdef CONFIG_FAIR_GROUP_SCHED
372 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 372 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
373 p->se.parent = task_group(p)->se[cpu]; 373 p->se.parent = task_group(p)->se[cpu];
374 #endif 374 #endif
375 375
376 #ifdef CONFIG_RT_GROUP_SCHED 376 #ifdef CONFIG_RT_GROUP_SCHED
377 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 377 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
378 p->rt.parent = task_group(p)->rt_se[cpu]; 378 p->rt.parent = task_group(p)->rt_se[cpu];
379 #endif 379 #endif
380 } 380 }
381 381
382 #else 382 #else
383 383
384 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 384 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
385 static inline struct task_group *task_group(struct task_struct *p) 385 static inline struct task_group *task_group(struct task_struct *p)
386 { 386 {
387 return NULL; 387 return NULL;
388 } 388 }
389 389
390 #endif /* CONFIG_GROUP_SCHED */ 390 #endif /* CONFIG_GROUP_SCHED */
391 391
392 /* CFS-related fields in a runqueue */ 392 /* CFS-related fields in a runqueue */
393 struct cfs_rq { 393 struct cfs_rq {
394 struct load_weight load; 394 struct load_weight load;
395 unsigned long nr_running; 395 unsigned long nr_running;
396 396
397 u64 exec_clock; 397 u64 exec_clock;
398 u64 min_vruntime; 398 u64 min_vruntime;
399 399
400 struct rb_root tasks_timeline; 400 struct rb_root tasks_timeline;
401 struct rb_node *rb_leftmost; 401 struct rb_node *rb_leftmost;
402 402
403 struct list_head tasks; 403 struct list_head tasks;
404 struct list_head *balance_iterator; 404 struct list_head *balance_iterator;
405 405
406 /* 406 /*
407 * 'curr' points to currently running entity on this cfs_rq. 407 * 'curr' points to currently running entity on this cfs_rq.
408 * It is set to NULL otherwise (i.e when none are currently running). 408 * It is set to NULL otherwise (i.e when none are currently running).
409 */ 409 */
410 struct sched_entity *curr, *next, *last; 410 struct sched_entity *curr, *next, *last;
411 411
412 unsigned int nr_spread_over; 412 unsigned int nr_spread_over;
413 413
414 #ifdef CONFIG_FAIR_GROUP_SCHED 414 #ifdef CONFIG_FAIR_GROUP_SCHED
415 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 415 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
416 416
417 /* 417 /*
418 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 418 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
419 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 419 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
420 * (like users, containers etc.) 420 * (like users, containers etc.)
421 * 421 *
422 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 422 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
423 * list is used during load balance. 423 * list is used during load balance.
424 */ 424 */
425 struct list_head leaf_cfs_rq_list; 425 struct list_head leaf_cfs_rq_list;
426 struct task_group *tg; /* group that "owns" this runqueue */ 426 struct task_group *tg; /* group that "owns" this runqueue */
427 427
428 #ifdef CONFIG_SMP 428 #ifdef CONFIG_SMP
429 /* 429 /*
430 * the part of load.weight contributed by tasks 430 * the part of load.weight contributed by tasks
431 */ 431 */
432 unsigned long task_weight; 432 unsigned long task_weight;
433 433
434 /* 434 /*
435 * h_load = weight * f(tg) 435 * h_load = weight * f(tg)
436 * 436 *
437 * Where f(tg) is the recursive weight fraction assigned to 437 * Where f(tg) is the recursive weight fraction assigned to
438 * this group. 438 * this group.
439 */ 439 */
440 unsigned long h_load; 440 unsigned long h_load;
441 441
442 /* 442 /*
443 * this cpu's part of tg->shares 443 * this cpu's part of tg->shares
444 */ 444 */
445 unsigned long shares; 445 unsigned long shares;
446 446
447 /* 447 /*
448 * load.weight at the time we set shares 448 * load.weight at the time we set shares
449 */ 449 */
450 unsigned long rq_weight; 450 unsigned long rq_weight;
451 #endif 451 #endif
452 #endif 452 #endif
453 }; 453 };
454 454
455 /* Real-Time classes' related field in a runqueue: */ 455 /* Real-Time classes' related field in a runqueue: */
456 struct rt_rq { 456 struct rt_rq {
457 struct rt_prio_array active; 457 struct rt_prio_array active;
458 unsigned long rt_nr_running; 458 unsigned long rt_nr_running;
459 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 459 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
460 int highest_prio; /* highest queued rt task prio */ 460 int highest_prio; /* highest queued rt task prio */
461 #endif 461 #endif
462 #ifdef CONFIG_SMP 462 #ifdef CONFIG_SMP
463 unsigned long rt_nr_migratory; 463 unsigned long rt_nr_migratory;
464 int overloaded; 464 int overloaded;
465 #endif 465 #endif
466 int rt_throttled; 466 int rt_throttled;
467 u64 rt_time; 467 u64 rt_time;
468 u64 rt_runtime; 468 u64 rt_runtime;
469 /* Nests inside the rq lock: */ 469 /* Nests inside the rq lock: */
470 spinlock_t rt_runtime_lock; 470 spinlock_t rt_runtime_lock;
471 471
472 #ifdef CONFIG_RT_GROUP_SCHED 472 #ifdef CONFIG_RT_GROUP_SCHED
473 unsigned long rt_nr_boosted; 473 unsigned long rt_nr_boosted;
474 474
475 struct rq *rq; 475 struct rq *rq;
476 struct list_head leaf_rt_rq_list; 476 struct list_head leaf_rt_rq_list;
477 struct task_group *tg; 477 struct task_group *tg;
478 struct sched_rt_entity *rt_se; 478 struct sched_rt_entity *rt_se;
479 #endif 479 #endif
480 }; 480 };
481 481
482 #ifdef CONFIG_SMP 482 #ifdef CONFIG_SMP
483 483
484 /* 484 /*
485 * We add the notion of a root-domain which will be used to define per-domain 485 * We add the notion of a root-domain which will be used to define per-domain
486 * variables. Each exclusive cpuset essentially defines an island domain by 486 * variables. Each exclusive cpuset essentially defines an island domain by
487 * fully partitioning the member cpus from any other cpuset. Whenever a new 487 * fully partitioning the member cpus from any other cpuset. Whenever a new
488 * exclusive cpuset is created, we also create and attach a new root-domain 488 * exclusive cpuset is created, we also create and attach a new root-domain
489 * object. 489 * object.
490 * 490 *
491 */ 491 */
492 struct root_domain { 492 struct root_domain {
493 atomic_t refcount; 493 atomic_t refcount;
494 cpumask_t span; 494 cpumask_t span;
495 cpumask_t online; 495 cpumask_t online;
496 496
497 /* 497 /*
498 * The "RT overload" flag: it gets set if a CPU has more than 498 * The "RT overload" flag: it gets set if a CPU has more than
499 * one runnable RT task. 499 * one runnable RT task.
500 */ 500 */
501 cpumask_t rto_mask; 501 cpumask_t rto_mask;
502 atomic_t rto_count; 502 atomic_t rto_count;
503 #ifdef CONFIG_SMP 503 #ifdef CONFIG_SMP
504 struct cpupri cpupri; 504 struct cpupri cpupri;
505 #endif 505 #endif
506 }; 506 };
507 507
508 /* 508 /*
509 * By default the system creates a single root-domain with all cpus as 509 * By default the system creates a single root-domain with all cpus as
510 * members (mimicking the global state we have today). 510 * members (mimicking the global state we have today).
511 */ 511 */
512 static struct root_domain def_root_domain; 512 static struct root_domain def_root_domain;
513 513
514 #endif 514 #endif
515 515
516 /* 516 /*
517 * This is the main, per-CPU runqueue data structure. 517 * This is the main, per-CPU runqueue data structure.
518 * 518 *
519 * Locking rule: those places that want to lock multiple runqueues 519 * Locking rule: those places that want to lock multiple runqueues
520 * (such as the load balancing or the thread migration code) must 520 * (such as the load balancing or the thread migration code) must
521 * order their lock-acquire operations by ascending &runqueue. 521 * order their lock-acquire operations by ascending &runqueue.
522 */ 522 */
523 struct rq { 523 struct rq {
524 /* runqueue lock: */ 524 /* runqueue lock: */
525 spinlock_t lock; 525 spinlock_t lock;
526 526
527 /* 527 /*
528 * nr_running and cpu_load should be in the same cacheline because 528 * nr_running and cpu_load should be in the same cacheline because
529 * remote CPUs use both these fields when doing load calculation. 529 * remote CPUs use both these fields when doing load calculation.
530 */ 530 */
531 unsigned long nr_running; 531 unsigned long nr_running;
532 #define CPU_LOAD_IDX_MAX 5 532 #define CPU_LOAD_IDX_MAX 5
533 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 533 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
534 unsigned char idle_at_tick; 534 unsigned char idle_at_tick;
535 #ifdef CONFIG_NO_HZ 535 #ifdef CONFIG_NO_HZ
536 unsigned long last_tick_seen; 536 unsigned long last_tick_seen;
537 unsigned char in_nohz_recently; 537 unsigned char in_nohz_recently;
538 #endif 538 #endif
539 /* capture load from *all* tasks on this cpu: */ 539 /* capture load from *all* tasks on this cpu: */
540 struct load_weight load; 540 struct load_weight load;
541 unsigned long nr_load_updates; 541 unsigned long nr_load_updates;
542 u64 nr_switches; 542 u64 nr_switches;
543 543
544 struct cfs_rq cfs; 544 struct cfs_rq cfs;
545 struct rt_rq rt; 545 struct rt_rq rt;
546 546
547 #ifdef CONFIG_FAIR_GROUP_SCHED 547 #ifdef CONFIG_FAIR_GROUP_SCHED
548 /* list of leaf cfs_rq on this cpu: */ 548 /* list of leaf cfs_rq on this cpu: */
549 struct list_head leaf_cfs_rq_list; 549 struct list_head leaf_cfs_rq_list;
550 #endif 550 #endif
551 #ifdef CONFIG_RT_GROUP_SCHED 551 #ifdef CONFIG_RT_GROUP_SCHED
552 struct list_head leaf_rt_rq_list; 552 struct list_head leaf_rt_rq_list;
553 #endif 553 #endif
554 554
555 /* 555 /*
556 * This is part of a global counter where only the total sum 556 * This is part of a global counter where only the total sum
557 * over all CPUs matters. A task can increase this counter on 557 * over all CPUs matters. A task can increase this counter on
558 * one CPU and if it got migrated afterwards it may decrease 558 * one CPU and if it got migrated afterwards it may decrease
559 * it on another CPU. Always updated under the runqueue lock: 559 * it on another CPU. Always updated under the runqueue lock:
560 */ 560 */
561 unsigned long nr_uninterruptible; 561 unsigned long nr_uninterruptible;
562 562
563 struct task_struct *curr, *idle; 563 struct task_struct *curr, *idle;
564 unsigned long next_balance; 564 unsigned long next_balance;
565 struct mm_struct *prev_mm; 565 struct mm_struct *prev_mm;
566 566
567 u64 clock; 567 u64 clock;
568 568
569 atomic_t nr_iowait; 569 atomic_t nr_iowait;
570 570
571 #ifdef CONFIG_SMP 571 #ifdef CONFIG_SMP
572 struct root_domain *rd; 572 struct root_domain *rd;
573 struct sched_domain *sd; 573 struct sched_domain *sd;
574 574
575 /* For active balancing */ 575 /* For active balancing */
576 int active_balance; 576 int active_balance;
577 int push_cpu; 577 int push_cpu;
578 /* cpu of this runqueue: */ 578 /* cpu of this runqueue: */
579 int cpu; 579 int cpu;
580 int online; 580 int online;
581 581
582 unsigned long avg_load_per_task; 582 unsigned long avg_load_per_task;
583 583
584 struct task_struct *migration_thread; 584 struct task_struct *migration_thread;
585 struct list_head migration_queue; 585 struct list_head migration_queue;
586 #endif 586 #endif
587 587
588 #ifdef CONFIG_SCHED_HRTICK 588 #ifdef CONFIG_SCHED_HRTICK
589 #ifdef CONFIG_SMP 589 #ifdef CONFIG_SMP
590 int hrtick_csd_pending; 590 int hrtick_csd_pending;
591 struct call_single_data hrtick_csd; 591 struct call_single_data hrtick_csd;
592 #endif 592 #endif
593 struct hrtimer hrtick_timer; 593 struct hrtimer hrtick_timer;
594 #endif 594 #endif
595 595
596 #ifdef CONFIG_SCHEDSTATS 596 #ifdef CONFIG_SCHEDSTATS
597 /* latency stats */ 597 /* latency stats */
598 struct sched_info rq_sched_info; 598 struct sched_info rq_sched_info;
599 599
600 /* sys_sched_yield() stats */ 600 /* sys_sched_yield() stats */
601 unsigned int yld_exp_empty; 601 unsigned int yld_exp_empty;
602 unsigned int yld_act_empty; 602 unsigned int yld_act_empty;
603 unsigned int yld_both_empty; 603 unsigned int yld_both_empty;
604 unsigned int yld_count; 604 unsigned int yld_count;
605 605
606 /* schedule() stats */ 606 /* schedule() stats */
607 unsigned int sched_switch; 607 unsigned int sched_switch;
608 unsigned int sched_count; 608 unsigned int sched_count;
609 unsigned int sched_goidle; 609 unsigned int sched_goidle;
610 610
611 /* try_to_wake_up() stats */ 611 /* try_to_wake_up() stats */
612 unsigned int ttwu_count; 612 unsigned int ttwu_count;
613 unsigned int ttwu_local; 613 unsigned int ttwu_local;
614 614
615 /* BKL stats */ 615 /* BKL stats */
616 unsigned int bkl_count; 616 unsigned int bkl_count;
617 #endif 617 #endif
618 }; 618 };
619 619
620 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 620 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
621 621
622 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 622 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
623 { 623 {
624 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 624 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
625 } 625 }
626 626
627 static inline int cpu_of(struct rq *rq) 627 static inline int cpu_of(struct rq *rq)
628 { 628 {
629 #ifdef CONFIG_SMP 629 #ifdef CONFIG_SMP
630 return rq->cpu; 630 return rq->cpu;
631 #else 631 #else
632 return 0; 632 return 0;
633 #endif 633 #endif
634 } 634 }
635 635
636 /* 636 /*
637 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 637 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
638 * See detach_destroy_domains: synchronize_sched for details. 638 * See detach_destroy_domains: synchronize_sched for details.
639 * 639 *
640 * The domain tree of any CPU may only be accessed from within 640 * The domain tree of any CPU may only be accessed from within
641 * preempt-disabled sections. 641 * preempt-disabled sections.
642 */ 642 */
643 #define for_each_domain(cpu, __sd) \ 643 #define for_each_domain(cpu, __sd) \
644 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 644 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
645 645
646 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 646 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
647 #define this_rq() (&__get_cpu_var(runqueues)) 647 #define this_rq() (&__get_cpu_var(runqueues))
648 #define task_rq(p) cpu_rq(task_cpu(p)) 648 #define task_rq(p) cpu_rq(task_cpu(p))
649 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 649 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
650 650
651 static inline void update_rq_clock(struct rq *rq) 651 static inline void update_rq_clock(struct rq *rq)
652 { 652 {
653 rq->clock = sched_clock_cpu(cpu_of(rq)); 653 rq->clock = sched_clock_cpu(cpu_of(rq));
654 } 654 }
655 655
656 /* 656 /*
657 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 657 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
658 */ 658 */
659 #ifdef CONFIG_SCHED_DEBUG 659 #ifdef CONFIG_SCHED_DEBUG
660 # define const_debug __read_mostly 660 # define const_debug __read_mostly
661 #else 661 #else
662 # define const_debug static const 662 # define const_debug static const
663 #endif 663 #endif
664 664
665 /** 665 /**
666 * runqueue_is_locked 666 * runqueue_is_locked
667 * 667 *
668 * Returns true if the current cpu runqueue is locked. 668 * Returns true if the current cpu runqueue is locked.
669 * This interface allows printk to be called with the runqueue lock 669 * This interface allows printk to be called with the runqueue lock
670 * held and know whether or not it is OK to wake up the klogd. 670 * held and know whether or not it is OK to wake up the klogd.
671 */ 671 */
672 int runqueue_is_locked(void) 672 int runqueue_is_locked(void)
673 { 673 {
674 int cpu = get_cpu(); 674 int cpu = get_cpu();
675 struct rq *rq = cpu_rq(cpu); 675 struct rq *rq = cpu_rq(cpu);
676 int ret; 676 int ret;
677 677
678 ret = spin_is_locked(&rq->lock); 678 ret = spin_is_locked(&rq->lock);
679 put_cpu(); 679 put_cpu();
680 return ret; 680 return ret;
681 } 681 }
682 682
683 /* 683 /*
684 * Debugging: various feature bits 684 * Debugging: various feature bits
685 */ 685 */
686 686
687 #define SCHED_FEAT(name, enabled) \ 687 #define SCHED_FEAT(name, enabled) \
688 __SCHED_FEAT_##name , 688 __SCHED_FEAT_##name ,
689 689
690 enum { 690 enum {
691 #include "sched_features.h" 691 #include "sched_features.h"
692 }; 692 };
693 693
694 #undef SCHED_FEAT 694 #undef SCHED_FEAT
695 695
696 #define SCHED_FEAT(name, enabled) \ 696 #define SCHED_FEAT(name, enabled) \
697 (1UL << __SCHED_FEAT_##name) * enabled | 697 (1UL << __SCHED_FEAT_##name) * enabled |
698 698
699 const_debug unsigned int sysctl_sched_features = 699 const_debug unsigned int sysctl_sched_features =
700 #include "sched_features.h" 700 #include "sched_features.h"
701 0; 701 0;
702 702
703 #undef SCHED_FEAT 703 #undef SCHED_FEAT
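
The two passes above (a third follows under CONFIG_SCHED_DEBUG to build a name table) are the classic X-macro pattern: the same list of entries is expanded under different SCHED_FEAT definitions, once into an enum of bit indices and once into a default bitmask. Below is a minimal, standalone sketch of the same idea; the FEATURES list and __FEAT_ names are hypothetical stand-ins for the sched_features.h include, which the kernel re-reads between #define/#undef pairs instead of passing a macro argument.

    /* Standalone sketch of the X-macro pattern used above. */
    #include <stdio.h>

    #define FEATURES(F) \
            F(HRTICK, 0) \
            F(LB_BIAS, 1)

    /* Pass 1: an enum of feature indices. */
    #define F_ENUM(name, enabled) __FEAT_##name,
    enum { FEATURES(F_ENUM) };
    #undef F_ENUM

    /* Pass 2: a default bitmask, one bit per enabled feature. */
    #define F_BIT(name, enabled) (1UL << __FEAT_##name) * (enabled) |
    static const unsigned long default_features = FEATURES(F_BIT) 0;
    #undef F_BIT

    int main(void)
    {
            printf("defaults: %#lx\n", default_features); /* prints 0x2 */
            return 0;
    }

The trailing "| 0" in the real code plays the same role as here: it terminates the dangling "|" left by the last expansion.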
704 704
705 #ifdef CONFIG_SCHED_DEBUG 705 #ifdef CONFIG_SCHED_DEBUG
706 #define SCHED_FEAT(name, enabled) \ 706 #define SCHED_FEAT(name, enabled) \
707 #name , 707 #name ,
708 708
709 static __read_mostly char *sched_feat_names[] = { 709 static __read_mostly char *sched_feat_names[] = {
710 #include "sched_features.h" 710 #include "sched_features.h"
711 NULL 711 NULL
712 }; 712 };
713 713
714 #undef SCHED_FEAT 714 #undef SCHED_FEAT
715 715
716 static int sched_feat_show(struct seq_file *m, void *v) 716 static int sched_feat_show(struct seq_file *m, void *v)
717 { 717 {
718 int i; 718 int i;
719 719
720 for (i = 0; sched_feat_names[i]; i++) { 720 for (i = 0; sched_feat_names[i]; i++) {
721 if (!(sysctl_sched_features & (1UL << i))) 721 if (!(sysctl_sched_features & (1UL << i)))
722 seq_puts(m, "NO_"); 722 seq_puts(m, "NO_");
723 seq_printf(m, "%s ", sched_feat_names[i]); 723 seq_printf(m, "%s ", sched_feat_names[i]);
724 } 724 }
725 seq_puts(m, "\n"); 725 seq_puts(m, "\n");
726 726
727 return 0; 727 return 0;
728 } 728 }
729 729
730 static ssize_t 730 static ssize_t
731 sched_feat_write(struct file *filp, const char __user *ubuf, 731 sched_feat_write(struct file *filp, const char __user *ubuf,
732 size_t cnt, loff_t *ppos) 732 size_t cnt, loff_t *ppos)
733 { 733 {
734 char buf[64]; 734 char buf[64];
735 char *cmp = buf; 735 char *cmp = buf;
736 int neg = 0; 736 int neg = 0;
737 int i; 737 int i;
738 738
739 if (cnt > 63) 739 if (cnt > 63)
740 cnt = 63; 740 cnt = 63;
741 741
742 if (copy_from_user(&buf, ubuf, cnt)) 742 if (copy_from_user(&buf, ubuf, cnt))
743 return -EFAULT; 743 return -EFAULT;
744 744
745 buf[cnt] = 0; 745 buf[cnt] = 0;
746 746
747 if (strncmp(buf, "NO_", 3) == 0) { 747 if (strncmp(buf, "NO_", 3) == 0) {
748 neg = 1; 748 neg = 1;
749 cmp += 3; 749 cmp += 3;
750 } 750 }
751 751
752 for (i = 0; sched_feat_names[i]; i++) { 752 for (i = 0; sched_feat_names[i]; i++) {
753 int len = strlen(sched_feat_names[i]); 753 int len = strlen(sched_feat_names[i]);
754 754
755 if (strncmp(cmp, sched_feat_names[i], len) == 0) { 755 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
756 if (neg) 756 if (neg)
757 sysctl_sched_features &= ~(1UL << i); 757 sysctl_sched_features &= ~(1UL << i);
758 else 758 else
759 sysctl_sched_features |= (1UL << i); 759 sysctl_sched_features |= (1UL << i);
760 break; 760 break;
761 } 761 }
762 } 762 }
763 763
764 if (!sched_feat_names[i]) 764 if (!sched_feat_names[i])
765 return -EINVAL; 765 return -EINVAL;
766 766
767 filp->f_pos += cnt; 767 filp->f_pos += cnt;
768 768
769 return cnt; 769 return cnt;
770 } 770 }
771 771
772 static int sched_feat_open(struct inode *inode, struct file *filp) 772 static int sched_feat_open(struct inode *inode, struct file *filp)
773 { 773 {
774 return single_open(filp, sched_feat_show, NULL); 774 return single_open(filp, sched_feat_show, NULL);
775 } 775 }
776 776
777 static struct file_operations sched_feat_fops = { 777 static struct file_operations sched_feat_fops = {
778 .open = sched_feat_open, 778 .open = sched_feat_open,
779 .write = sched_feat_write, 779 .write = sched_feat_write,
780 .read = seq_read, 780 .read = seq_read,
781 .llseek = seq_lseek, 781 .llseek = seq_lseek,
782 .release = single_release, 782 .release = single_release,
783 }; 783 };
784 784
785 static __init int sched_init_debug(void) 785 static __init int sched_init_debug(void)
786 { 786 {
787 debugfs_create_file("sched_features", 0644, NULL, NULL, 787 debugfs_create_file("sched_features", 0644, NULL, NULL,
788 &sched_feat_fops); 788 &sched_feat_fops);
789 789
790 return 0; 790 return 0;
791 } 791 }
792 late_initcall(sched_init_debug); 792 late_initcall(sched_init_debug);
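
Once sched_init_debug() has run, the feature bits are tunable from userspace. Assuming debugfs is mounted at the conventional /sys/kernel/debug, reading the file lists every feature with disabled ones prefixed NO_, and writing a name toggles the matching bit:

    cat /sys/kernel/debug/sched_features
    echo NO_HRTICK > /sys/kernel/debug/sched_features

A write that matches no known feature name is rejected with -EINVAL, as the loop at the end of sched_feat_write() above shows.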
793 793
794 #endif 794 #endif
795 795
796 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 796 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
797 797
798 /* 798 /*
799 * Number of tasks to iterate in a single balance run. 799 * Number of tasks to iterate in a single balance run.
800 * Limited because this is done with IRQs disabled. 800 * Limited because this is done with IRQs disabled.
801 */ 801 */
802 const_debug unsigned int sysctl_sched_nr_migrate = 32; 802 const_debug unsigned int sysctl_sched_nr_migrate = 32;
803 803
804 /* 804 /*
805 * ratelimit for updating the group shares. 805 * ratelimit for updating the group shares.
806 * default: 0.25ms 806 * default: 0.25ms
807 */ 807 */
808 unsigned int sysctl_sched_shares_ratelimit = 250000; 808 unsigned int sysctl_sched_shares_ratelimit = 250000;
809 809
810 /* 810 /*
811 * Inject some fuzziness into changing the per-cpu group shares; 811 * Inject some fuzziness into changing the per-cpu group shares;
812 * this avoids remote rq-locks at the expense of fairness. 812 * this avoids remote rq-locks at the expense of fairness.
813 * default: 4 813 * default: 4
814 */ 814 */
815 unsigned int sysctl_sched_shares_thresh = 4; 815 unsigned int sysctl_sched_shares_thresh = 4;
816 816
817 /* 817 /*
818 * period over which we measure -rt task cpu usage in us. 818 * period over which we measure -rt task cpu usage in us.
819 * default: 1s 819 * default: 1s
820 */ 820 */
821 unsigned int sysctl_sched_rt_period = 1000000; 821 unsigned int sysctl_sched_rt_period = 1000000;
822 822
823 static __read_mostly int scheduler_running; 823 static __read_mostly int scheduler_running;
824 824
825 /* 825 /*
826 * part of the period that we allow rt tasks to run in us. 826 * part of the period that we allow rt tasks to run in us.
827 * default: 0.95s 827 * default: 0.95s
828 */ 828 */
829 int sysctl_sched_rt_runtime = 950000; 829 int sysctl_sched_rt_runtime = 950000;
830 830
831 static inline u64 global_rt_period(void) 831 static inline u64 global_rt_period(void)
832 { 832 {
833 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 833 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
834 } 834 }
835 835
836 static inline u64 global_rt_runtime(void) 836 static inline u64 global_rt_runtime(void)
837 { 837 {
838 if (sysctl_sched_rt_runtime < 0) 838 if (sysctl_sched_rt_runtime < 0)
839 return RUNTIME_INF; 839 return RUNTIME_INF;
840 840
841 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 841 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
842 } 842 }
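
A quick worked check of the defaults above: a period of 1000000us is one second, a runtime of 950000us is 0.95s, so realtime tasks may consume at most 95% of every period, leaving 50ms of each second for everything else; a negative sysctl_sched_rt_runtime disables the cap entirely via RUNTIME_INF. A standalone sketch of the arithmetic:

    /* Verify the default RT bandwidth numbers (standalone sketch). */
    #include <assert.h>

    #define NSEC_PER_USEC 1000ULL

    int main(void)
    {
            unsigned long long period  = 1000000ULL * NSEC_PER_USEC; /* 1s    */
            unsigned long long runtime =  950000ULL * NSEC_PER_USEC; /* 0.95s */

            /* RT tasks get at most 95% of every period... */
            assert(runtime * 100 == period * 95);
            /* ...leaving 50ms per second for non-RT tasks. */
            assert(period - runtime == 50000000ULL);
            return 0;
    }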
843 843
844 #ifndef prepare_arch_switch 844 #ifndef prepare_arch_switch
845 # define prepare_arch_switch(next) do { } while (0) 845 # define prepare_arch_switch(next) do { } while (0)
846 #endif 846 #endif
847 #ifndef finish_arch_switch 847 #ifndef finish_arch_switch
848 # define finish_arch_switch(prev) do { } while (0) 848 # define finish_arch_switch(prev) do { } while (0)
849 #endif 849 #endif
850 850
851 static inline int task_current(struct rq *rq, struct task_struct *p) 851 static inline int task_current(struct rq *rq, struct task_struct *p)
852 { 852 {
853 return rq->curr == p; 853 return rq->curr == p;
854 } 854 }
855 855
856 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 856 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
857 static inline int task_running(struct rq *rq, struct task_struct *p) 857 static inline int task_running(struct rq *rq, struct task_struct *p)
858 { 858 {
859 return task_current(rq, p); 859 return task_current(rq, p);
860 } 860 }
861 861
862 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 862 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
863 { 863 {
864 } 864 }
865 865
866 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 866 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
867 { 867 {
868 #ifdef CONFIG_DEBUG_SPINLOCK 868 #ifdef CONFIG_DEBUG_SPINLOCK
869 /* this is a valid case when another task releases the spinlock */ 869 /* this is a valid case when another task releases the spinlock */
870 rq->lock.owner = current; 870 rq->lock.owner = current;
871 #endif 871 #endif
872 /* 872 /*
873 * If we are tracking spinlock dependencies then we have to 873 * If we are tracking spinlock dependencies then we have to
874 * fix up the runqueue lock - which gets 'carried over' from 874 * fix up the runqueue lock - which gets 'carried over' from
875 * prev into current: 875 * prev into current:
876 */ 876 */
877 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 877 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
878 878
879 spin_unlock_irq(&rq->lock); 879 spin_unlock_irq(&rq->lock);
880 } 880 }
881 881
882 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 882 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
883 static inline int task_running(struct rq *rq, struct task_struct *p) 883 static inline int task_running(struct rq *rq, struct task_struct *p)
884 { 884 {
885 #ifdef CONFIG_SMP 885 #ifdef CONFIG_SMP
886 return p->oncpu; 886 return p->oncpu;
887 #else 887 #else
888 return task_current(rq, p); 888 return task_current(rq, p);
889 #endif 889 #endif
890 } 890 }
891 891
892 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 892 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
893 { 893 {
894 #ifdef CONFIG_SMP 894 #ifdef CONFIG_SMP
895 /* 895 /*
896 * We can optimise this out completely for !SMP, because the 896 * We can optimise this out completely for !SMP, because the
897 * SMP rebalancing from interrupt is the only thing that cares 897 * SMP rebalancing from interrupt is the only thing that cares
898 * here. 898 * here.
899 */ 899 */
900 next->oncpu = 1; 900 next->oncpu = 1;
901 #endif 901 #endif
902 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 902 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
903 spin_unlock_irq(&rq->lock); 903 spin_unlock_irq(&rq->lock);
904 #else 904 #else
905 spin_unlock(&rq->lock); 905 spin_unlock(&rq->lock);
906 #endif 906 #endif
907 } 907 }
908 908
909 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 909 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
910 { 910 {
911 #ifdef CONFIG_SMP 911 #ifdef CONFIG_SMP
912 /* 912 /*
913 * After ->oncpu is cleared, the task can be moved to a different CPU. 913 * After ->oncpu is cleared, the task can be moved to a different CPU.
914 * We must ensure this doesn't happen until the switch is completely 914 * We must ensure this doesn't happen until the switch is completely
915 * finished. 915 * finished.
916 */ 916 */
917 smp_wmb(); 917 smp_wmb();
918 prev->oncpu = 0; 918 prev->oncpu = 0;
919 #endif 919 #endif
920 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 920 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
921 local_irq_enable(); 921 local_irq_enable();
922 #endif 922 #endif
923 } 923 }
924 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 924 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
925 925
926 /* 926 /*
927 * __task_rq_lock - lock the runqueue a given task resides on. 927 * __task_rq_lock - lock the runqueue a given task resides on.
928 * Must be called with interrupts disabled. 928 * Must be called with interrupts disabled.
929 */ 929 */
930 static inline struct rq *__task_rq_lock(struct task_struct *p) 930 static inline struct rq *__task_rq_lock(struct task_struct *p)
931 __acquires(rq->lock) 931 __acquires(rq->lock)
932 { 932 {
933 for (;;) { 933 for (;;) {
934 struct rq *rq = task_rq(p); 934 struct rq *rq = task_rq(p);
935 spin_lock(&rq->lock); 935 spin_lock(&rq->lock);
936 if (likely(rq == task_rq(p))) 936 if (likely(rq == task_rq(p)))
937 return rq; 937 return rq;
938 spin_unlock(&rq->lock); 938 spin_unlock(&rq->lock);
939 } 939 }
940 } 940 }
941 941
942 /* 942 /*
943 * task_rq_lock - lock the runqueue a given task resides on and disable 943 * task_rq_lock - lock the runqueue a given task resides on and disable
944 * interrupts. Note the ordering: we can safely look up the task_rq without 944 * interrupts. Note the ordering: we can safely look up the task_rq without
945 * explicitly disabling preemption. 945 * explicitly disabling preemption.
946 */ 946 */
947 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 947 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
948 __acquires(rq->lock) 948 __acquires(rq->lock)
949 { 949 {
950 struct rq *rq; 950 struct rq *rq;
951 951
952 for (;;) { 952 for (;;) {
953 local_irq_save(*flags); 953 local_irq_save(*flags);
954 rq = task_rq(p); 954 rq = task_rq(p);
955 spin_lock(&rq->lock); 955 spin_lock(&rq->lock);
956 if (likely(rq == task_rq(p))) 956 if (likely(rq == task_rq(p)))
957 return rq; 957 return rq;
958 spin_unlock_irqrestore(&rq->lock, *flags); 958 spin_unlock_irqrestore(&rq->lock, *flags);
959 } 959 }
960 } 960 }
961 961
962 void task_rq_unlock_wait(struct task_struct *p) 962 void task_rq_unlock_wait(struct task_struct *p)
963 { 963 {
964 struct rq *rq = task_rq(p); 964 struct rq *rq = task_rq(p);
965 965
966 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 966 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
967 spin_unlock_wait(&rq->lock); 967 spin_unlock_wait(&rq->lock);
968 } 968 }
969 969
970 static void __task_rq_unlock(struct rq *rq) 970 static void __task_rq_unlock(struct rq *rq)
971 __releases(rq->lock) 971 __releases(rq->lock)
972 { 972 {
973 spin_unlock(&rq->lock); 973 spin_unlock(&rq->lock);
974 } 974 }
975 975
976 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 976 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
977 __releases(rq->lock) 977 __releases(rq->lock)
978 { 978 {
979 spin_unlock_irqrestore(&rq->lock, *flags); 979 spin_unlock_irqrestore(&rq->lock, *flags);
980 } 980 }
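
The retry loops in both lock helpers exist because a task can migrate between the task_rq(p) lookup and the moment its runqueue lock is actually taken, so the queue is re-checked under the lock and the whole sequence retried on a mismatch. Typical usage brackets a short critical section; the caller below is purely illustrative, not a function from this file:

    /* Hypothetical caller, assuming a valid task_struct *p is held. */
    static void frob_task(struct task_struct *p)
    {
            unsigned long flags;
            struct rq *rq;

            rq = task_rq_lock(p, &flags);   /* p cannot migrate away now */
            /* ... inspect or modify p's scheduling state under rq->lock ... */
            task_rq_unlock(rq, &flags);
    }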
981 981
982 /* 982 /*
983 * this_rq_lock - lock this runqueue and disable interrupts. 983 * this_rq_lock - lock this runqueue and disable interrupts.
984 */ 984 */
985 static struct rq *this_rq_lock(void) 985 static struct rq *this_rq_lock(void)
986 __acquires(rq->lock) 986 __acquires(rq->lock)
987 { 987 {
988 struct rq *rq; 988 struct rq *rq;
989 989
990 local_irq_disable(); 990 local_irq_disable();
991 rq = this_rq(); 991 rq = this_rq();
992 spin_lock(&rq->lock); 992 spin_lock(&rq->lock);
993 993
994 return rq; 994 return rq;
995 } 995 }
996 996
997 #ifdef CONFIG_SCHED_HRTICK 997 #ifdef CONFIG_SCHED_HRTICK
998 /* 998 /*
999 * Use HR-timers to deliver accurate preemption points. 999 * Use HR-timers to deliver accurate preemption points.
1000 * 1000 *
1001 * It's all a bit involved since we cannot program an hrt while holding the 1001 * It's all a bit involved since we cannot program an hrt while holding the
1002 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 1002 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
1003 * reschedule event. 1003 * reschedule event.
1004 * 1004 *
1005 * When we get rescheduled we reprogram the hrtick_timer outside of the 1005 * When we get rescheduled we reprogram the hrtick_timer outside of the
1006 * rq->lock. 1006 * rq->lock.
1007 */ 1007 */
1008 1008
1009 /* 1009 /*
1010 * Use hrtick when: 1010 * Use hrtick when:
1011 * - enabled by features 1011 * - enabled by features
1012 * - hrtimer is actually high res 1012 * - hrtimer is actually high res
1013 */ 1013 */
1014 static inline int hrtick_enabled(struct rq *rq) 1014 static inline int hrtick_enabled(struct rq *rq)
1015 { 1015 {
1016 if (!sched_feat(HRTICK)) 1016 if (!sched_feat(HRTICK))
1017 return 0; 1017 return 0;
1018 if (!cpu_active(cpu_of(rq))) 1018 if (!cpu_active(cpu_of(rq)))
1019 return 0; 1019 return 0;
1020 return hrtimer_is_hres_active(&rq->hrtick_timer); 1020 return hrtimer_is_hres_active(&rq->hrtick_timer);
1021 } 1021 }
1022 1022
1023 static void hrtick_clear(struct rq *rq) 1023 static void hrtick_clear(struct rq *rq)
1024 { 1024 {
1025 if (hrtimer_active(&rq->hrtick_timer)) 1025 if (hrtimer_active(&rq->hrtick_timer))
1026 hrtimer_cancel(&rq->hrtick_timer); 1026 hrtimer_cancel(&rq->hrtick_timer);
1027 } 1027 }
1028 1028
1029 /* 1029 /*
1030 * High-resolution timer tick. 1030 * High-resolution timer tick.
1031 * Runs from hardirq context with interrupts disabled. 1031 * Runs from hardirq context with interrupts disabled.
1032 */ 1032 */
1033 static enum hrtimer_restart hrtick(struct hrtimer *timer) 1033 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1034 { 1034 {
1035 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 1035 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1036 1036
1037 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1037 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1038 1038
1039 spin_lock(&rq->lock); 1039 spin_lock(&rq->lock);
1040 update_rq_clock(rq); 1040 update_rq_clock(rq);
1041 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1041 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1042 spin_unlock(&rq->lock); 1042 spin_unlock(&rq->lock);
1043 1043
1044 return HRTIMER_NORESTART; 1044 return HRTIMER_NORESTART;
1045 } 1045 }
1046 1046
1047 #ifdef CONFIG_SMP 1047 #ifdef CONFIG_SMP
1048 /* 1048 /*
1049 * called from hardirq (IPI) context 1049 * called from hardirq (IPI) context
1050 */ 1050 */
1051 static void __hrtick_start(void *arg) 1051 static void __hrtick_start(void *arg)
1052 { 1052 {
1053 struct rq *rq = arg; 1053 struct rq *rq = arg;
1054 1054
1055 spin_lock(&rq->lock); 1055 spin_lock(&rq->lock);
1056 hrtimer_restart(&rq->hrtick_timer); 1056 hrtimer_restart(&rq->hrtick_timer);
1057 rq->hrtick_csd_pending = 0; 1057 rq->hrtick_csd_pending = 0;
1058 spin_unlock(&rq->lock); 1058 spin_unlock(&rq->lock);
1059 } 1059 }
1060 1060
1061 /* 1061 /*
1062 * Called to set the hrtick timer state. 1062 * Called to set the hrtick timer state.
1063 * 1063 *
1064 * called with rq->lock held and irqs disabled 1064 * called with rq->lock held and irqs disabled
1065 */ 1065 */
1066 static void hrtick_start(struct rq *rq, u64 delay) 1066 static void hrtick_start(struct rq *rq, u64 delay)
1067 { 1067 {
1068 struct hrtimer *timer = &rq->hrtick_timer; 1068 struct hrtimer *timer = &rq->hrtick_timer;
1069 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1069 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1070 1070
1071 hrtimer_set_expires(timer, time); 1071 hrtimer_set_expires(timer, time);
1072 1072
1073 if (rq == this_rq()) { 1073 if (rq == this_rq()) {
1074 hrtimer_restart(timer); 1074 hrtimer_restart(timer);
1075 } else if (!rq->hrtick_csd_pending) { 1075 } else if (!rq->hrtick_csd_pending) {
1076 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1076 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1077 rq->hrtick_csd_pending = 1; 1077 rq->hrtick_csd_pending = 1;
1078 } 1078 }
1079 } 1079 }
1080 1080
1081 static int 1081 static int
1082 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 1082 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1083 { 1083 {
1084 int cpu = (int)(long)hcpu; 1084 int cpu = (int)(long)hcpu;
1085 1085
1086 switch (action) { 1086 switch (action) {
1087 case CPU_UP_CANCELED: 1087 case CPU_UP_CANCELED:
1088 case CPU_UP_CANCELED_FROZEN: 1088 case CPU_UP_CANCELED_FROZEN:
1089 case CPU_DOWN_PREPARE: 1089 case CPU_DOWN_PREPARE:
1090 case CPU_DOWN_PREPARE_FROZEN: 1090 case CPU_DOWN_PREPARE_FROZEN:
1091 case CPU_DEAD: 1091 case CPU_DEAD:
1092 case CPU_DEAD_FROZEN: 1092 case CPU_DEAD_FROZEN:
1093 hrtick_clear(cpu_rq(cpu)); 1093 hrtick_clear(cpu_rq(cpu));
1094 return NOTIFY_OK; 1094 return NOTIFY_OK;
1095 } 1095 }
1096 1096
1097 return NOTIFY_DONE; 1097 return NOTIFY_DONE;
1098 } 1098 }
1099 1099
1100 static __init void init_hrtick(void) 1100 static __init void init_hrtick(void)
1101 { 1101 {
1102 hotcpu_notifier(hotplug_hrtick, 0); 1102 hotcpu_notifier(hotplug_hrtick, 0);
1103 } 1103 }
1104 #else 1104 #else
1105 /* 1105 /*
1106 * Called to set the hrtick timer state. 1106 * Called to set the hrtick timer state.
1107 * 1107 *
1108 * called with rq->lock held and irqs disabled 1108 * called with rq->lock held and irqs disabled
1109 */ 1109 */
1110 static void hrtick_start(struct rq *rq, u64 delay) 1110 static void hrtick_start(struct rq *rq, u64 delay)
1111 { 1111 {
1112 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1112 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1113 } 1113 }
1114 1114
1115 static inline void init_hrtick(void) 1115 static inline void init_hrtick(void)
1116 { 1116 {
1117 } 1117 }
1118 #endif /* CONFIG_SMP */ 1118 #endif /* CONFIG_SMP */
1119 1119
1120 static void init_rq_hrtick(struct rq *rq) 1120 static void init_rq_hrtick(struct rq *rq)
1121 { 1121 {
1122 #ifdef CONFIG_SMP 1122 #ifdef CONFIG_SMP
1123 rq->hrtick_csd_pending = 0; 1123 rq->hrtick_csd_pending = 0;
1124 1124
1125 rq->hrtick_csd.flags = 0; 1125 rq->hrtick_csd.flags = 0;
1126 rq->hrtick_csd.func = __hrtick_start; 1126 rq->hrtick_csd.func = __hrtick_start;
1127 rq->hrtick_csd.info = rq; 1127 rq->hrtick_csd.info = rq;
1128 #endif 1128 #endif
1129 1129
1130 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1130 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1131 rq->hrtick_timer.function = hrtick; 1131 rq->hrtick_timer.function = hrtick;
1132 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; 1132 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1133 } 1133 }
1134 #else /* CONFIG_SCHED_HRTICK */ 1134 #else /* CONFIG_SCHED_HRTICK */
1135 static inline void hrtick_clear(struct rq *rq) 1135 static inline void hrtick_clear(struct rq *rq)
1136 { 1136 {
1137 } 1137 }
1138 1138
1139 static inline void init_rq_hrtick(struct rq *rq) 1139 static inline void init_rq_hrtick(struct rq *rq)
1140 { 1140 {
1141 } 1141 }
1142 1142
1143 static inline void init_hrtick(void) 1143 static inline void init_hrtick(void)
1144 { 1144 {
1145 } 1145 }
1146 #endif /* CONFIG_SCHED_HRTICK */ 1146 #endif /* CONFIG_SCHED_HRTICK */
1147 1147
1148 /* 1148 /*
1149 * resched_task - mark a task 'to be rescheduled now'. 1149 * resched_task - mark a task 'to be rescheduled now'.
1150 * 1150 *
1151 * On UP this means the setting of the need_resched flag, on SMP it 1151 * On UP this means the setting of the need_resched flag, on SMP it
1152 * might also involve a cross-CPU call to trigger the scheduler on 1152 * might also involve a cross-CPU call to trigger the scheduler on
1153 * the target CPU. 1153 * the target CPU.
1154 */ 1154 */
1155 #ifdef CONFIG_SMP 1155 #ifdef CONFIG_SMP
1156 1156
1157 #ifndef tsk_is_polling 1157 #ifndef tsk_is_polling
1158 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1158 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1159 #endif 1159 #endif
1160 1160
1161 static void resched_task(struct task_struct *p) 1161 static void resched_task(struct task_struct *p)
1162 { 1162 {
1163 int cpu; 1163 int cpu;
1164 1164
1165 assert_spin_locked(&task_rq(p)->lock); 1165 assert_spin_locked(&task_rq(p)->lock);
1166 1166
1167 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1167 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1168 return; 1168 return;
1169 1169
1170 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1170 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1171 1171
1172 cpu = task_cpu(p); 1172 cpu = task_cpu(p);
1173 if (cpu == smp_processor_id()) 1173 if (cpu == smp_processor_id())
1174 return; 1174 return;
1175 1175
1176 /* NEED_RESCHED must be visible before we test polling */ 1176 /* NEED_RESCHED must be visible before we test polling */
1177 smp_mb(); 1177 smp_mb();
1178 if (!tsk_is_polling(p)) 1178 if (!tsk_is_polling(p))
1179 smp_send_reschedule(cpu); 1179 smp_send_reschedule(cpu);
1180 } 1180 }
1181 1181
1182 static void resched_cpu(int cpu) 1182 static void resched_cpu(int cpu)
1183 { 1183 {
1184 struct rq *rq = cpu_rq(cpu); 1184 struct rq *rq = cpu_rq(cpu);
1185 unsigned long flags; 1185 unsigned long flags;
1186 1186
1187 if (!spin_trylock_irqsave(&rq->lock, flags)) 1187 if (!spin_trylock_irqsave(&rq->lock, flags))
1188 return; 1188 return;
1189 resched_task(cpu_curr(cpu)); 1189 resched_task(cpu_curr(cpu));
1190 spin_unlock_irqrestore(&rq->lock, flags); 1190 spin_unlock_irqrestore(&rq->lock, flags);
1191 } 1191 }
1192 1192
1193 #ifdef CONFIG_NO_HZ 1193 #ifdef CONFIG_NO_HZ
1194 /* 1194 /*
1195 * When add_timer_on() enqueues a timer into the timer wheel of an 1195 * When add_timer_on() enqueues a timer into the timer wheel of an
1196 * idle CPU then this timer might expire before the next timer event 1196 * idle CPU then this timer might expire before the next timer event
1197 * which is scheduled to wake up that CPU. In case of a completely 1197 * which is scheduled to wake up that CPU. In case of a completely
1198 * idle system the next event might even be infinite time into the 1198 * idle system the next event might even be infinite time into the
1199 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1199 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1200 * leaves the inner idle loop so the newly added timer is taken into 1200 * leaves the inner idle loop so the newly added timer is taken into
1201 * account when the CPU goes back to idle and evaluates the timer 1201 * account when the CPU goes back to idle and evaluates the timer
1202 * wheel for the next timer event. 1202 * wheel for the next timer event.
1203 */ 1203 */
1204 void wake_up_idle_cpu(int cpu) 1204 void wake_up_idle_cpu(int cpu)
1205 { 1205 {
1206 struct rq *rq = cpu_rq(cpu); 1206 struct rq *rq = cpu_rq(cpu);
1207 1207
1208 if (cpu == smp_processor_id()) 1208 if (cpu == smp_processor_id())
1209 return; 1209 return;
1210 1210
1211 /* 1211 /*
1212 * This is safe, as this function is called with the timer 1212 * This is safe, as this function is called with the timer
1213 * wheel base lock of (cpu) held. When the CPU is on the way 1213 * wheel base lock of (cpu) held. When the CPU is on the way
1214 * to idle and has not yet set rq->curr to idle then it will 1214 * to idle and has not yet set rq->curr to idle then it will
1215 * be serialized on the timer wheel base lock and take the new 1215 * be serialized on the timer wheel base lock and take the new
1216 * timer into account automatically. 1216 * timer into account automatically.
1217 */ 1217 */
1218 if (rq->curr != rq->idle) 1218 if (rq->curr != rq->idle)
1219 return; 1219 return;
1220 1220
1221 /* 1221 /*
1222 * We can set TIF_NEED_RESCHED on the idle task of the other CPU 1222 * We can set TIF_NEED_RESCHED on the idle task of the other CPU
1223 * lockless. The worst case is that the other CPU runs the 1223 * lockless. The worst case is that the other CPU runs the
1224 * idle task through an additional NOOP schedule() 1224 * idle task through an additional NOOP schedule()
1225 */ 1225 */
1226 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1226 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
1227 1227
1228 /* NEED_RESCHED must be visible before we test polling */ 1228 /* NEED_RESCHED must be visible before we test polling */
1229 smp_mb(); 1229 smp_mb();
1230 if (!tsk_is_polling(rq->idle)) 1230 if (!tsk_is_polling(rq->idle))
1231 smp_send_reschedule(cpu); 1231 smp_send_reschedule(cpu);
1232 } 1232 }
1233 #endif /* CONFIG_NO_HZ */ 1233 #endif /* CONFIG_NO_HZ */
1234 1234
1235 #else /* !CONFIG_SMP */ 1235 #else /* !CONFIG_SMP */
1236 static void resched_task(struct task_struct *p) 1236 static void resched_task(struct task_struct *p)
1237 { 1237 {
1238 assert_spin_locked(&task_rq(p)->lock); 1238 assert_spin_locked(&task_rq(p)->lock);
1239 set_tsk_need_resched(p); 1239 set_tsk_need_resched(p);
1240 } 1240 }
1241 #endif /* CONFIG_SMP */ 1241 #endif /* CONFIG_SMP */
1242 1242
1243 #if BITS_PER_LONG == 32 1243 #if BITS_PER_LONG == 32
1244 # define WMULT_CONST (~0UL) 1244 # define WMULT_CONST (~0UL)
1245 #else 1245 #else
1246 # define WMULT_CONST (1UL << 32) 1246 # define WMULT_CONST (1UL << 32)
1247 #endif 1247 #endif
1248 1248
1249 #define WMULT_SHIFT 32 1249 #define WMULT_SHIFT 32
1250 1250
1251 /* 1251 /*
1252 * Shift right and round: 1252 * Shift right and round:
1253 */ 1253 */
1254 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1254 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1255 1255
1256 /* 1256 /*
1257 * delta *= weight / lw 1257 * delta *= weight / lw
1258 */ 1258 */
1259 static unsigned long 1259 static unsigned long
1260 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1260 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1261 struct load_weight *lw) 1261 struct load_weight *lw)
1262 { 1262 {
1263 u64 tmp; 1263 u64 tmp;
1264 1264
1265 if (!lw->inv_weight) { 1265 if (!lw->inv_weight) {
1266 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1266 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1267 lw->inv_weight = 1; 1267 lw->inv_weight = 1;
1268 else 1268 else
1269 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1269 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1270 / (lw->weight+1); 1270 / (lw->weight+1);
1271 } 1271 }
1272 1272
1273 tmp = (u64)delta_exec * weight; 1273 tmp = (u64)delta_exec * weight;
1274 /* 1274 /*
1275 * Check whether we'd overflow the 64-bit multiplication: 1275 * Check whether we'd overflow the 64-bit multiplication:
1276 */ 1276 */
1277 if (unlikely(tmp > WMULT_CONST)) 1277 if (unlikely(tmp > WMULT_CONST))
1278 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1278 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1279 WMULT_SHIFT/2); 1279 WMULT_SHIFT/2);
1280 else 1280 else
1281 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1281 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1282 1282
1283 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1283 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1284 } 1284 }
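
To make the fixed-point scaling concrete: a nice-0 task (weight 1024) on a queue of total weight 2048 (two nice-0 tasks) should be credited exactly half of a 4ms slice. The sketch below replays the math with the exact inverse 2^32/2048, where the kernel computes a close approximation on demand:

    /* Standalone re-creation of the weight/inv_weight scaling. */
    #include <assert.h>
    #include <stdint.h>

    #define WMULT_SHIFT 32
    #define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

    int main(void)
    {
            uint64_t delta_exec = 4000000;             /* 4ms in ns     */
            uint64_t weight     = 1024;                /* nice-0 weight */
            uint64_t inv_weight = (1ULL << 32) / 2048; /* ~1/lw->weight */
            uint64_t tmp        = delta_exec * weight;

            /* delta * weight / lw  ==  4ms * 1024 / 2048  ==  2ms */
            assert(SRR(tmp * inv_weight, WMULT_SHIFT) == 2000000);
            return 0;
    }

With these round numbers tmp stays below WMULT_CONST, so the single-shift branch is taken; the double-SRR branch exists only to keep the 64-bit multiply from overflowing for large deltas and weights.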
1285 1285
1286 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1286 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1287 { 1287 {
1288 lw->weight += inc; 1288 lw->weight += inc;
1289 lw->inv_weight = 0; 1289 lw->inv_weight = 0;
1290 } 1290 }
1291 1291
1292 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1292 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1293 { 1293 {
1294 lw->weight -= dec; 1294 lw->weight -= dec;
1295 lw->inv_weight = 0; 1295 lw->inv_weight = 0;
1296 } 1296 }
1297 1297
1298 /* 1298 /*
1299 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1299 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1300 * of tasks with abnormal "nice" values across CPUs the contribution that 1300 * of tasks with abnormal "nice" values across CPUs the contribution that
1301 * each task makes to its run queue's load is weighted according to its 1301 * each task makes to its run queue's load is weighted according to its
1302 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1302 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1303 * scaled version of the new time slice allocation that they receive on time 1303 * scaled version of the new time slice allocation that they receive on time
1304 * slice expiry etc. 1304 * slice expiry etc.
1305 */ 1305 */
1306 1306
1307 #define WEIGHT_IDLEPRIO 2 1307 #define WEIGHT_IDLEPRIO 2
1308 #define WMULT_IDLEPRIO (1 << 31) 1308 #define WMULT_IDLEPRIO (1 << 31)
1309 1309
1310 /* 1310 /*
1311 * Nice levels are multiplicative, with a gentle 10% change for every 1311 * Nice levels are multiplicative, with a gentle 10% change for every
1312 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1312 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1313 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1313 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1314 * that remained on nice 0. 1314 * that remained on nice 0.
1315 * 1315 *
1316 * The "10% effect" is relative and cumulative: from _any_ nice level, 1316 * The "10% effect" is relative and cumulative: from _any_ nice level,
1317 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1317 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1318 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1318 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1319 * If a task goes up by ~10% and another task goes down by ~10% then 1319 * If a task goes up by ~10% and another task goes down by ~10% then
1320 * the relative distance between them is ~25%.) 1320 * the relative distance between them is ~25%.)
1321 */ 1321 */
1322 static const int prio_to_weight[40] = { 1322 static const int prio_to_weight[40] = {
1323 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1323 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1324 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1324 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1325 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1325 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1326 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1326 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1327 /* 0 */ 1024, 820, 655, 526, 423, 1327 /* 0 */ 1024, 820, 655, 526, 423,
1328 /* 5 */ 335, 272, 215, 172, 137, 1328 /* 5 */ 335, 272, 215, 172, 137,
1329 /* 10 */ 110, 87, 70, 56, 45, 1329 /* 10 */ 110, 87, 70, 56, 45,
1330 /* 15 */ 36, 29, 23, 18, 15, 1330 /* 15 */ 36, 29, 23, 18, 15,
1331 }; 1331 };
1332 1332
1333 /* 1333 /*
1334 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1334 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1335 * 1335 *
1336 * In cases where the weight does not change often, we can use the 1336 * In cases where the weight does not change often, we can use the
1337 * precalculated inverse to speed up arithmetics by turning divisions 1337 * precalculated inverse to speed up arithmetics by turning divisions
1338 * into multiplications: 1338 * into multiplications:
1339 */ 1339 */
1340 static const u32 prio_to_wmult[40] = { 1340 static const u32 prio_to_wmult[40] = {
1341 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1341 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1342 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1342 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1343 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1343 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1344 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1344 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1345 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1345 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1346 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1346 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1347 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1347 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1348 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1348 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1349 }; 1349 };
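
A worked example of the "10% effect": one nice-0 and one nice-1 task sharing a CPU split it 1024/(1024+820) ≈ 55.5% versus ≈ 44.5%, a relative gap of about 25%, exactly as the comment above the table promises. And each prio_to_wmult[] entry is simply 2^32 divided by the matching weight. A standalone sanity check:

    /* Check the weight tables' invariants (standalone sketch). */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* nice 0 vs nice 1: ~55.5% vs ~44.5% of the CPU */
            assert(1024 * 1000 / (1024 + 820) == 555);
            /* the inverse table is just 2^32 / weight */
            assert((1ULL << 32) / 1024  == 4194304);
            assert((1ULL << 32) / 88761 == 48388);
            return 0;
    }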
1350 1350
1351 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 1351 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1352 1352
1353 /* 1353 /*
1354 * runqueue iterator, to support SMP load-balancing between different 1354 * runqueue iterator, to support SMP load-balancing between different
1355 * scheduling classes, without having to expose their internal data 1355 * scheduling classes, without having to expose their internal data
1356 * structures to the load-balancing proper: 1356 * structures to the load-balancing proper:
1357 */ 1357 */
1358 struct rq_iterator { 1358 struct rq_iterator {
1359 void *arg; 1359 void *arg;
1360 struct task_struct *(*start)(void *); 1360 struct task_struct *(*start)(void *);
1361 struct task_struct *(*next)(void *); 1361 struct task_struct *(*next)(void *);
1362 }; 1362 };
1363 1363
1364 #ifdef CONFIG_SMP 1364 #ifdef CONFIG_SMP
1365 static unsigned long 1365 static unsigned long
1366 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 1366 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1367 unsigned long max_load_move, struct sched_domain *sd, 1367 unsigned long max_load_move, struct sched_domain *sd,
1368 enum cpu_idle_type idle, int *all_pinned, 1368 enum cpu_idle_type idle, int *all_pinned,
1369 int *this_best_prio, struct rq_iterator *iterator); 1369 int *this_best_prio, struct rq_iterator *iterator);
1370 1370
1371 static int 1371 static int
1372 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 1372 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1373 struct sched_domain *sd, enum cpu_idle_type idle, 1373 struct sched_domain *sd, enum cpu_idle_type idle,
1374 struct rq_iterator *iterator); 1374 struct rq_iterator *iterator);
1375 #endif 1375 #endif
1376 1376
1377 #ifdef CONFIG_CGROUP_CPUACCT 1377 #ifdef CONFIG_CGROUP_CPUACCT
1378 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1378 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1379 #else 1379 #else
1380 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1380 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1381 #endif 1381 #endif
1382 1382
1383 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1383 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1384 { 1384 {
1385 update_load_add(&rq->load, load); 1385 update_load_add(&rq->load, load);
1386 } 1386 }
1387 1387
1388 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1388 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1389 { 1389 {
1390 update_load_sub(&rq->load, load); 1390 update_load_sub(&rq->load, load);
1391 } 1391 }
1392 1392
1393 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1393 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1394 typedef int (*tg_visitor)(struct task_group *, void *); 1394 typedef int (*tg_visitor)(struct task_group *, void *);
1395 1395
1396 /* 1396 /*
1397 * Iterate the full tree, calling @down when first entering a node and @up when 1397 * Iterate the full tree, calling @down when first entering a node and @up when
1398 * leaving it for the final time. 1398 * leaving it for the final time.
1399 */ 1399 */
1400 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1400 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1401 { 1401 {
1402 struct task_group *parent, *child; 1402 struct task_group *parent, *child;
1403 int ret; 1403 int ret;
1404 1404
1405 rcu_read_lock(); 1405 rcu_read_lock();
1406 parent = &root_task_group; 1406 parent = &root_task_group;
1407 down: 1407 down:
1408 ret = (*down)(parent, data); 1408 ret = (*down)(parent, data);
1409 if (ret) 1409 if (ret)
1410 goto out_unlock; 1410 goto out_unlock;
1411 list_for_each_entry_rcu(child, &parent->children, siblings) { 1411 list_for_each_entry_rcu(child, &parent->children, siblings) {
1412 parent = child; 1412 parent = child;
1413 goto down; 1413 goto down;
1414 1414
1415 up: 1415 up:
1416 continue; 1416 continue;
1417 } 1417 }
1418 ret = (*up)(parent, data); 1418 ret = (*up)(parent, data);
1419 if (ret) 1419 if (ret)
1420 goto out_unlock; 1420 goto out_unlock;
1421 1421
1422 child = parent; 1422 child = parent;
1423 parent = parent->parent; 1423 parent = parent->parent;
1424 if (parent) 1424 if (parent)
1425 goto up; 1425 goto up;
1426 out_unlock: 1426 out_unlock:
1427 rcu_read_unlock(); 1427 rcu_read_unlock();
1428 1428
1429 return ret; 1429 return ret;
1430 } 1430 }
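
The goto-based loop above is an iterative depth-first walk: @down runs when a node is first entered, @up when it is left for the last time, and the parent/child juggling replaces a call stack. Unwound into a recursive equivalent it reads as below; this is illustrative only (walk_tg_tree_rec() is hypothetical), since the kernel deliberately avoids recursion here to keep stack usage bounded:

    /* Hypothetical recursive equivalent of walk_tg_tree(). */
    static int walk_tg_tree_rec(struct task_group *tg, tg_visitor down,
                                tg_visitor up, void *data)
    {
            struct task_group *child;
            int ret;

            ret = (*down)(tg, data);        /* visit on the way down */
            if (ret)
                    return ret;
            list_for_each_entry_rcu(child, &tg->children, siblings) {
                    ret = walk_tg_tree_rec(child, down, up, data);
                    if (ret)
                            return ret;
            }
            return (*up)(tg, data);         /* visit on the way up */
    }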
1431 1431
1432 static int tg_nop(struct task_group *tg, void *data) 1432 static int tg_nop(struct task_group *tg, void *data)
1433 { 1433 {
1434 return 0; 1434 return 0;
1435 } 1435 }
1436 #endif 1436 #endif
1437 1437
1438 #ifdef CONFIG_SMP 1438 #ifdef CONFIG_SMP
1439 static unsigned long source_load(int cpu, int type); 1439 static unsigned long source_load(int cpu, int type);
1440 static unsigned long target_load(int cpu, int type); 1440 static unsigned long target_load(int cpu, int type);
1441 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1441 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1442 1442
1443 static unsigned long cpu_avg_load_per_task(int cpu) 1443 static unsigned long cpu_avg_load_per_task(int cpu)
1444 { 1444 {
1445 struct rq *rq = cpu_rq(cpu); 1445 struct rq *rq = cpu_rq(cpu);
1446 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1446 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1447 1447
1448 if (nr_running) 1448 if (nr_running)
1449 rq->avg_load_per_task = rq->load.weight / nr_running; 1449 rq->avg_load_per_task = rq->load.weight / nr_running;
1450 else 1450 else
1451 rq->avg_load_per_task = 0; 1451 rq->avg_load_per_task = 0;
1452 1452
1453 return rq->avg_load_per_task; 1453 return rq->avg_load_per_task;
1454 } 1454 }
1455 1455
1456 #ifdef CONFIG_FAIR_GROUP_SCHED 1456 #ifdef CONFIG_FAIR_GROUP_SCHED
1457 1457
1458 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1458 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1459 1459
1460 /* 1460 /*
1461 * Calculate and set the cpu's group shares. 1461 * Calculate and set the cpu's group shares.
1462 */ 1462 */
1463 static void 1463 static void
1464 update_group_shares_cpu(struct task_group *tg, int cpu, 1464 update_group_shares_cpu(struct task_group *tg, int cpu,
1465 unsigned long sd_shares, unsigned long sd_rq_weight) 1465 unsigned long sd_shares, unsigned long sd_rq_weight)
1466 { 1466 {
1467 unsigned long shares; 1467 unsigned long shares;
1468 unsigned long rq_weight; 1468 unsigned long rq_weight;
1469 1469
1470 if (!tg->se[cpu]) 1470 if (!tg->se[cpu])
1471 return; 1471 return;
1472 1472
1473 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1473 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1474 1474
1475 /* 1475 /*
1476 * \Sum shares * rq_weight 1476 * \Sum shares * rq_weight
1477 * shares = ----------------------- 1477 * shares = -----------------------
1478 * \Sum rq_weight 1478 * \Sum rq_weight
1479 * 1479 *
1480 */ 1480 */
1481 shares = (sd_shares * rq_weight) / sd_rq_weight; 1481 shares = (sd_shares * rq_weight) / sd_rq_weight;
1482 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1482 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1483 1483
1484 if (abs(shares - tg->se[cpu]->load.weight) > 1484 if (abs(shares - tg->se[cpu]->load.weight) >
1485 sysctl_sched_shares_thresh) { 1485 sysctl_sched_shares_thresh) {
1486 struct rq *rq = cpu_rq(cpu); 1486 struct rq *rq = cpu_rq(cpu);
1487 unsigned long flags; 1487 unsigned long flags;
1488 1488
1489 spin_lock_irqsave(&rq->lock, flags); 1489 spin_lock_irqsave(&rq->lock, flags);
1490 tg->cfs_rq[cpu]->shares = shares; 1490 tg->cfs_rq[cpu]->shares = shares;
1491 1491
1492 __set_se_shares(tg->se[cpu], shares); 1492 __set_se_shares(tg->se[cpu], shares);
1493 spin_unlock_irqrestore(&rq->lock, flags); 1493 spin_unlock_irqrestore(&rq->lock, flags);
1494 } 1494 }
1495 } 1495 }
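
Plugging round numbers into the formula above: with sd_shares = 1024 spread over two CPUs whose rq_weights are 2048 and 1024 (so sd_rq_weight = 3072), the CPUs receive 682 and 341 shares respectively, and the write-back only happens when the change exceeds sysctl_sched_shares_thresh. A standalone check of the division:

    /* Worked example of the shares formula (standalone sketch). */
    #include <assert.h>

    int main(void)
    {
            unsigned long sd_shares = 1024, sd_rq_weight = 2048 + 1024;

            assert(sd_shares * 2048 / sd_rq_weight == 682);
            assert(sd_shares * 1024 / sd_rq_weight == 341);
            return 0;
    }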
1496 1496
1497 /* 1497 /*
1498 * Re-compute each task group's per-cpu shares over the given domain. 1498 * Re-compute each task group's per-cpu shares over the given domain.
1499 * This needs to be done in a bottom-up fashion because the rq weight of a 1499 * This needs to be done in a bottom-up fashion because the rq weight of a
1500 * parent group depends on the shares of its child groups. 1500 * parent group depends on the shares of its child groups.
1501 */ 1501 */
1502 static int tg_shares_up(struct task_group *tg, void *data) 1502 static int tg_shares_up(struct task_group *tg, void *data)
1503 { 1503 {
1504 unsigned long weight, rq_weight = 0; 1504 unsigned long weight, rq_weight = 0;
1505 unsigned long shares = 0; 1505 unsigned long shares = 0;
1506 struct sched_domain *sd = data; 1506 struct sched_domain *sd = data;
1507 int i; 1507 int i;
1508 1508
1509 for_each_cpu_mask(i, sd->span) { 1509 for_each_cpu_mask(i, sd->span) {
1510 /* 1510 /*
1511 * If there are currently no tasks on the cpu pretend there 1511 * If there are currently no tasks on the cpu pretend there
1512 * is one of average load so that when a new task gets to 1512 * is one of average load so that when a new task gets to
1513 * run here it will not get delayed by group starvation. 1513 * run here it will not get delayed by group starvation.
1514 */ 1514 */
1515 weight = tg->cfs_rq[i]->load.weight; 1515 weight = tg->cfs_rq[i]->load.weight;
1516 if (!weight) 1516 if (!weight)
1517 weight = NICE_0_LOAD; 1517 weight = NICE_0_LOAD;
1518 1518
1519 tg->cfs_rq[i]->rq_weight = weight; 1519 tg->cfs_rq[i]->rq_weight = weight;
1520 rq_weight += weight; 1520 rq_weight += weight;
1521 shares += tg->cfs_rq[i]->shares; 1521 shares += tg->cfs_rq[i]->shares;
1522 } 1522 }
1523 1523
1524 if ((!shares && rq_weight) || shares > tg->shares) 1524 if ((!shares && rq_weight) || shares > tg->shares)
1525 shares = tg->shares; 1525 shares = tg->shares;
1526 1526
1527 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1527 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1528 shares = tg->shares; 1528 shares = tg->shares;
1529 1529
1530 for_each_cpu_mask(i, sd->span) 1530 for_each_cpu_mask(i, sd->span)
1531 update_group_shares_cpu(tg, i, shares, rq_weight); 1531 update_group_shares_cpu(tg, i, shares, rq_weight);
1532 1532
1533 return 0; 1533 return 0;
1534 } 1534 }
1535 1535
1536 /* 1536 /*
1537 * Compute the cpu's hierarchical load factor for each task group. 1537 * Compute the cpu's hierarchical load factor for each task group.
1538 * This needs to be done in a top-down fashion because the load of a child 1538 * This needs to be done in a top-down fashion because the load of a child
1539 * group is a fraction of its parent's load. 1539 * group is a fraction of its parent's load.
1540 */ 1540 */
1541 static int tg_load_down(struct task_group *tg, void *data) 1541 static int tg_load_down(struct task_group *tg, void *data)
1542 { 1542 {
1543 unsigned long load; 1543 unsigned long load;
1544 long cpu = (long)data; 1544 long cpu = (long)data;
1545 1545
1546 if (!tg->parent) { 1546 if (!tg->parent) {
1547 load = cpu_rq(cpu)->load.weight; 1547 load = cpu_rq(cpu)->load.weight;
1548 } else { 1548 } else {
1549 load = tg->parent->cfs_rq[cpu]->h_load; 1549 load = tg->parent->cfs_rq[cpu]->h_load;
1550 load *= tg->cfs_rq[cpu]->shares; 1550 load *= tg->cfs_rq[cpu]->shares;
1551 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1551 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1552 } 1552 }
1553 1553
1554 tg->cfs_rq[cpu]->h_load = load; 1554 tg->cfs_rq[cpu]->h_load = load;
1555 1555
1556 return 0; 1556 return 0;
1557 } 1557 }
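
For example, if this CPU's root load.weight is 3072 and a child group holds 1024 shares while its parent's cfs_rq load.weight is 2048, the child's hierarchical load comes out to 3072 * 1024 / (2048 + 1) = 1535; the +1 in the divisor merely guards against dividing by zero when the parent's queue is idle.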
1558 1558
1559 static void update_shares(struct sched_domain *sd) 1559 static void update_shares(struct sched_domain *sd)
1560 { 1560 {
1561 u64 now = cpu_clock(raw_smp_processor_id()); 1561 u64 now = cpu_clock(raw_smp_processor_id());
1562 s64 elapsed = now - sd->last_update; 1562 s64 elapsed = now - sd->last_update;
1563 1563
1564 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1564 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1565 sd->last_update = now; 1565 sd->last_update = now;
1566 walk_tg_tree(tg_nop, tg_shares_up, sd); 1566 walk_tg_tree(tg_nop, tg_shares_up, sd);
1567 } 1567 }
1568 } 1568 }
1569 1569
1570 static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1570 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1571 { 1571 {
1572 spin_unlock(&rq->lock); 1572 spin_unlock(&rq->lock);
1573 update_shares(sd); 1573 update_shares(sd);
1574 spin_lock(&rq->lock); 1574 spin_lock(&rq->lock);
1575 } 1575 }
1576 1576
1577 static void update_h_load(long cpu) 1577 static void update_h_load(long cpu)
1578 { 1578 {
1579 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1579 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1580 } 1580 }
1581 1581
1582 #else 1582 #else
1583 1583
1584 static inline void update_shares(struct sched_domain *sd) 1584 static inline void update_shares(struct sched_domain *sd)
1585 { 1585 {
1586 } 1586 }
1587 1587
1588 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1588 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1589 { 1589 {
1590 } 1590 }
1591 1591
1592 #endif 1592 #endif
1593 1593
1594 /* 1594 /*
1595 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1595 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1596 */ 1596 */
1597 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1597 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1598 __releases(this_rq->lock) 1598 __releases(this_rq->lock)
1599 __acquires(busiest->lock) 1599 __acquires(busiest->lock)
1600 __acquires(this_rq->lock) 1600 __acquires(this_rq->lock)
1601 { 1601 {
1602 int ret = 0; 1602 int ret = 0;
1603 1603
1604 if (unlikely(!irqs_disabled())) { 1604 if (unlikely(!irqs_disabled())) {
1605 /* printk() doesn't work well under rq->lock */ 1605 /* printk() doesn't work well under rq->lock */
1606 spin_unlock(&this_rq->lock); 1606 spin_unlock(&this_rq->lock);
1607 BUG_ON(1); 1607 BUG_ON(1);
1608 } 1608 }
1609 if (unlikely(!spin_trylock(&busiest->lock))) { 1609 if (unlikely(!spin_trylock(&busiest->lock))) {
1610 if (busiest < this_rq) { 1610 if (busiest < this_rq) {
1611 spin_unlock(&this_rq->lock); 1611 spin_unlock(&this_rq->lock);
1612 spin_lock(&busiest->lock); 1612 spin_lock(&busiest->lock);
1613 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1613 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1614 ret = 1; 1614 ret = 1;
1615 } else 1615 } else
1616 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1616 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1617 } 1617 }
1618 return ret; 1618 return ret;
1619 } 1619 }
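The trylock-failure path above breaks the ABBA deadlock by always taking the lower-addressed runqueue lock first, whichever argument it arrived as. The same ordering rule in a standalone pthread sketch (illustrative only):

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Lock two locks in a globally consistent order (by address),
     * so two threads locking the same pair can never deadlock. */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ((uintptr_t)a < (uintptr_t)b) {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        } else {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    int main(void)
    {
        pthread_mutex_t l1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t l2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&l1, &l2);   /* same order... */
        pthread_mutex_unlock(&l2);
        pthread_mutex_unlock(&l1);

        double_lock(&l2, &l1);   /* ...even with the arguments swapped */
        pthread_mutex_unlock(&l2);
        pthread_mutex_unlock(&l1);

        puts("no deadlock: both paths acquired in address order");
        return 0;
    }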
1620 1620
1621 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1621 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1622 __releases(busiest->lock) 1622 __releases(busiest->lock)
1623 { 1623 {
1624 spin_unlock(&busiest->lock); 1624 spin_unlock(&busiest->lock);
1625 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1625 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1626 } 1626 }
1627 #endif 1627 #endif
1628 1628
1629 #ifdef CONFIG_FAIR_GROUP_SCHED 1629 #ifdef CONFIG_FAIR_GROUP_SCHED
1630 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1630 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1631 { 1631 {
1632 #ifdef CONFIG_SMP 1632 #ifdef CONFIG_SMP
1633 cfs_rq->shares = shares; 1633 cfs_rq->shares = shares;
1634 #endif 1634 #endif
1635 } 1635 }
1636 #endif 1636 #endif
1637 1637
1638 #include "sched_stats.h" 1638 #include "sched_stats.h"
1639 #include "sched_idletask.c" 1639 #include "sched_idletask.c"
1640 #include "sched_fair.c" 1640 #include "sched_fair.c"
1641 #include "sched_rt.c" 1641 #include "sched_rt.c"
1642 #ifdef CONFIG_SCHED_DEBUG 1642 #ifdef CONFIG_SCHED_DEBUG
1643 # include "sched_debug.c" 1643 # include "sched_debug.c"
1644 #endif 1644 #endif
1645 1645
1646 #define sched_class_highest (&rt_sched_class) 1646 #define sched_class_highest (&rt_sched_class)
1647 #define for_each_class(class) \ 1647 #define for_each_class(class) \
1648 for (class = sched_class_highest; class; class = class->next) 1648 for (class = sched_class_highest; class; class = class->next)
1649 1649
1650 static void inc_nr_running(struct rq *rq) 1650 static void inc_nr_running(struct rq *rq)
1651 { 1651 {
1652 rq->nr_running++; 1652 rq->nr_running++;
1653 } 1653 }
1654 1654
1655 static void dec_nr_running(struct rq *rq) 1655 static void dec_nr_running(struct rq *rq)
1656 { 1656 {
1657 rq->nr_running--; 1657 rq->nr_running--;
1658 } 1658 }
1659 1659
1660 static void set_load_weight(struct task_struct *p) 1660 static void set_load_weight(struct task_struct *p)
1661 { 1661 {
1662 if (task_has_rt_policy(p)) { 1662 if (task_has_rt_policy(p)) {
1663 p->se.load.weight = prio_to_weight[0] * 2; 1663 p->se.load.weight = prio_to_weight[0] * 2;
1664 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1664 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1665 return; 1665 return;
1666 } 1666 }
1667 1667
1668 /* 1668 /*
1669 * SCHED_IDLE tasks get minimal weight: 1669 * SCHED_IDLE tasks get minimal weight:
1670 */ 1670 */
1671 if (p->policy == SCHED_IDLE) { 1671 if (p->policy == SCHED_IDLE) {
1672 p->se.load.weight = WEIGHT_IDLEPRIO; 1672 p->se.load.weight = WEIGHT_IDLEPRIO;
1673 p->se.load.inv_weight = WMULT_IDLEPRIO; 1673 p->se.load.inv_weight = WMULT_IDLEPRIO;
1674 return; 1674 return;
1675 } 1675 }
1676 1676
1677 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1677 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1678 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1678 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1679 } 1679 }
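The prio_to_weight[] and prio_to_wmult[] tables indexed here (defined earlier in the file) encode CFS's rule that each nice level is worth roughly a 10% CPU-time step, implemented as a ~1.25x weight ratio around the nice-0 weight of 1024. A sketch that approximates the table from that ratio; the real table is precomputed, and this formula is only an approximation:

    #include <stdio.h>
    #include <math.h>   /* link with -lm */

    /* Approximate prio_to_weight[]: weight(nice) ~= 1024 / 1.25^nice. */
    static long approx_weight(int nice)
    {
        return lround(1024.0 / pow(1.25, nice));
    }

    int main(void)
    {
        int nices[] = { -20, -10, -5, 0, 5, 10, 19 };
        int i;

        /* prints ~88818, 9537, 3125, 1024, 336, 110, 15 --
         * within a fraction of a percent of the kernel table */
        for (i = 0; i < 7; i++)
            printf("nice %3d -> weight ~%ld\n",
                   nices[i], approx_weight(nices[i]));
        return 0;
    }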
1680 1680
1681 static void update_avg(u64 *avg, u64 sample) 1681 static void update_avg(u64 *avg, u64 sample)
1682 { 1682 {
1683 s64 diff = sample - *avg; 1683 s64 diff = sample - *avg;
1684 *avg += diff >> 3; 1684 *avg += diff >> 3;
1685 } 1685 }
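update_avg() is an exponentially weighted moving average with weight 1/8: each sample pulls the average an eighth of the way toward itself using just a subtract and an arithmetic shift. Its convergence in miniature:

    #include <stdio.h>

    /* Same arithmetic as update_avg(): avg += (sample - avg) / 8,
     * done with a signed shift so it also decays toward smaller samples. */
    static void update_avg(long long *avg, long long sample)
    {
        long long diff = sample - *avg;
        *avg += diff >> 3;
    }

    int main(void)
    {
        long long avg = 0;
        int i;

        for (i = 0; i < 32; i++)
            update_avg(&avg, 8000);   /* feed a constant sample */

        /* after 32 samples the average has pulled ~98% of the way
         * (prints 7885 here) */
        printf("avg = %lld (target 8000)\n", avg);
        return 0;
    }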
1686 1686
1687 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1687 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1688 { 1688 {
1689 sched_info_queued(p); 1689 sched_info_queued(p);
1690 p->sched_class->enqueue_task(rq, p, wakeup); 1690 p->sched_class->enqueue_task(rq, p, wakeup);
1691 p->se.on_rq = 1; 1691 p->se.on_rq = 1;
1692 } 1692 }
1693 1693
1694 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1694 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1695 { 1695 {
1696 if (sleep && p->se.last_wakeup) { 1696 if (sleep && p->se.last_wakeup) {
1697 update_avg(&p->se.avg_overlap, 1697 update_avg(&p->se.avg_overlap,
1698 p->se.sum_exec_runtime - p->se.last_wakeup); 1698 p->se.sum_exec_runtime - p->se.last_wakeup);
1699 p->se.last_wakeup = 0; 1699 p->se.last_wakeup = 0;
1700 } 1700 }
1701 1701
1702 sched_info_dequeued(p); 1702 sched_info_dequeued(p);
1703 p->sched_class->dequeue_task(rq, p, sleep); 1703 p->sched_class->dequeue_task(rq, p, sleep);
1704 p->se.on_rq = 0; 1704 p->se.on_rq = 0;
1705 } 1705 }
1706 1706
1707 /* 1707 /*
1708 * __normal_prio - return the priority that is based on the static prio 1708 * __normal_prio - return the priority that is based on the static prio
1709 */ 1709 */
1710 static inline int __normal_prio(struct task_struct *p) 1710 static inline int __normal_prio(struct task_struct *p)
1711 { 1711 {
1712 return p->static_prio; 1712 return p->static_prio;
1713 } 1713 }
1714 1714
1715 /* 1715 /*
1716 * Calculate the expected normal priority: i.e. priority 1716 * Calculate the expected normal priority: i.e. priority
1717 * without taking RT-inheritance into account. Might be 1717 * without taking RT-inheritance into account. Might be
1718 * boosted by interactivity modifiers. Changes upon fork, 1718 * boosted by interactivity modifiers. Changes upon fork,
1719 * setprio syscalls, and whenever the interactivity 1719 * setprio syscalls, and whenever the interactivity
1720 * estimator recalculates. 1720 * estimator recalculates.
1721 */ 1721 */
1722 static inline int normal_prio(struct task_struct *p) 1722 static inline int normal_prio(struct task_struct *p)
1723 { 1723 {
1724 int prio; 1724 int prio;
1725 1725
1726 if (task_has_rt_policy(p)) 1726 if (task_has_rt_policy(p))
1727 prio = MAX_RT_PRIO-1 - p->rt_priority; 1727 prio = MAX_RT_PRIO-1 - p->rt_priority;
1728 else 1728 else
1729 prio = __normal_prio(p); 1729 prio = __normal_prio(p);
1730 return prio; 1730 return prio;
1731 } 1731 }
1732 1732
1733 /* 1733 /*
1734 * Calculate the current priority, i.e. the priority 1734 * Calculate the current priority, i.e. the priority
1735 * taken into account by the scheduler. This value might 1735 * taken into account by the scheduler. This value might
1736 * be boosted by RT tasks, or might be boosted by 1736 * be boosted by RT tasks, or might be boosted by
1737 * interactivity modifiers. Will be RT if the task got 1737 * interactivity modifiers. Will be RT if the task got
1738 * RT-boosted. If not then it returns p->normal_prio. 1738 * RT-boosted. If not then it returns p->normal_prio.
1739 */ 1739 */
1740 static int effective_prio(struct task_struct *p) 1740 static int effective_prio(struct task_struct *p)
1741 { 1741 {
1742 p->normal_prio = normal_prio(p); 1742 p->normal_prio = normal_prio(p);
1743 /* 1743 /*
1744 * If we are RT tasks or we were boosted to RT priority, 1744 * If we are RT tasks or we were boosted to RT priority,
1745 * keep the priority unchanged. Otherwise, update priority 1745 * keep the priority unchanged. Otherwise, update priority
1746 * to the normal priority: 1746 * to the normal priority:
1747 */ 1747 */
1748 if (!rt_prio(p->prio)) 1748 if (!rt_prio(p->prio))
1749 return p->normal_prio; 1749 return p->normal_prio;
1750 return p->prio; 1750 return p->prio;
1751 } 1751 }
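In this numbering lower means more important: RT tasks land in 0..98 (rt_priority 99 maps to prio 0) and normal tasks keep static_prio, which is 100..139 for nice -20..19. A sketch of the mapping, assuming the standard constants MAX_RT_PRIO = 100 and a nice-0 static_prio of 120:

    #include <stdio.h>

    #define MAX_RT_PRIO  100
    #define DEFAULT_PRIO 120   /* static_prio of a nice-0 task */

    /* Mirror of normal_prio(): RT tasks map rt_priority (1..99,
     * higher = stronger) down into 0..98; everyone else keeps
     * static_prio = 120 + nice, giving 100..139. Lower is better. */
    static int toy_normal_prio(int is_rt, int rt_priority, int nice)
    {
        if (is_rt)
            return MAX_RT_PRIO - 1 - rt_priority;
        return DEFAULT_PRIO + nice;
    }

    int main(void)
    {
        printf("RT rt_priority=99 -> prio %d\n", toy_normal_prio(1, 99, 0));
        printf("RT rt_priority=1  -> prio %d\n", toy_normal_prio(1, 1, 0));
        printf("nice -20          -> prio %d\n", toy_normal_prio(0, 0, -20));
        printf("nice 0            -> prio %d\n", toy_normal_prio(0, 0, 0));
        printf("nice 19           -> prio %d\n", toy_normal_prio(0, 0, 19));
        return 0;   /* prints 0, 98, 100, 120, 139 */
    }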
1752 1752
1753 /* 1753 /*
1754 * activate_task - move a task to the runqueue. 1754 * activate_task - move a task to the runqueue.
1755 */ 1755 */
1756 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1756 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1757 { 1757 {
1758 if (task_contributes_to_load(p)) 1758 if (task_contributes_to_load(p))
1759 rq->nr_uninterruptible--; 1759 rq->nr_uninterruptible--;
1760 1760
1761 enqueue_task(rq, p, wakeup); 1761 enqueue_task(rq, p, wakeup);
1762 inc_nr_running(rq); 1762 inc_nr_running(rq);
1763 } 1763 }
1764 1764
1765 /* 1765 /*
1766 * deactivate_task - remove a task from the runqueue. 1766 * deactivate_task - remove a task from the runqueue.
1767 */ 1767 */
1768 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1768 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1769 { 1769 {
1770 if (task_contributes_to_load(p)) 1770 if (task_contributes_to_load(p))
1771 rq->nr_uninterruptible++; 1771 rq->nr_uninterruptible++;
1772 1772
1773 dequeue_task(rq, p, sleep); 1773 dequeue_task(rq, p, sleep);
1774 dec_nr_running(rq); 1774 dec_nr_running(rq);
1775 } 1775 }
1776 1776
1777 /** 1777 /**
1778 * task_curr - is this task currently executing on a CPU? 1778 * task_curr - is this task currently executing on a CPU?
1779 * @p: the task in question. 1779 * @p: the task in question.
1780 */ 1780 */
1781 inline int task_curr(const struct task_struct *p) 1781 inline int task_curr(const struct task_struct *p)
1782 { 1782 {
1783 return cpu_curr(task_cpu(p)) == p; 1783 return cpu_curr(task_cpu(p)) == p;
1784 } 1784 }
1785 1785
1786 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1786 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1787 { 1787 {
1788 set_task_rq(p, cpu); 1788 set_task_rq(p, cpu);
1789 #ifdef CONFIG_SMP 1789 #ifdef CONFIG_SMP
1790 /* 1790 /*
1791 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1791 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1792 * successfully executed on another CPU. We must ensure that updates of 1792 * successfully executed on another CPU. We must ensure that updates of
1793 * per-task data have been completed by this moment. 1793 * per-task data have been completed by this moment.
1794 */ 1794 */
1795 smp_wmb(); 1795 smp_wmb();
1796 task_thread_info(p)->cpu = cpu; 1796 task_thread_info(p)->cpu = cpu;
1797 #endif 1797 #endif
1798 } 1798 }
1799 1799
1800 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1800 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1801 const struct sched_class *prev_class, 1801 const struct sched_class *prev_class,
1802 int oldprio, int running) 1802 int oldprio, int running)
1803 { 1803 {
1804 if (prev_class != p->sched_class) { 1804 if (prev_class != p->sched_class) {
1805 if (prev_class->switched_from) 1805 if (prev_class->switched_from)
1806 prev_class->switched_from(rq, p, running); 1806 prev_class->switched_from(rq, p, running);
1807 p->sched_class->switched_to(rq, p, running); 1807 p->sched_class->switched_to(rq, p, running);
1808 } else 1808 } else
1809 p->sched_class->prio_changed(rq, p, oldprio, running); 1809 p->sched_class->prio_changed(rq, p, oldprio, running);
1810 } 1810 }
1811 1811
1812 #ifdef CONFIG_SMP 1812 #ifdef CONFIG_SMP
1813 1813
1814 /* Used instead of source_load when we know the type == 0 */ 1814 /* Used instead of source_load when we know the type == 0 */
1815 static unsigned long weighted_cpuload(const int cpu) 1815 static unsigned long weighted_cpuload(const int cpu)
1816 { 1816 {
1817 return cpu_rq(cpu)->load.weight; 1817 return cpu_rq(cpu)->load.weight;
1818 } 1818 }
1819 1819
1820 /* 1820 /*
1821 * Is this task likely cache-hot: 1821 * Is this task likely cache-hot:
1822 */ 1822 */
1823 static int 1823 static int
1824 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1824 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1825 { 1825 {
1826 s64 delta; 1826 s64 delta;
1827 1827
1828 /* 1828 /*
1829 * Buddy candidates are cache hot: 1829 * Buddy candidates are cache hot:
1830 */ 1830 */
1831 if (sched_feat(CACHE_HOT_BUDDY) && 1831 if (sched_feat(CACHE_HOT_BUDDY) &&
1832 (&p->se == cfs_rq_of(&p->se)->next || 1832 (&p->se == cfs_rq_of(&p->se)->next ||
1833 &p->se == cfs_rq_of(&p->se)->last)) 1833 &p->se == cfs_rq_of(&p->se)->last))
1834 return 1; 1834 return 1;
1835 1835
1836 if (p->sched_class != &fair_sched_class) 1836 if (p->sched_class != &fair_sched_class)
1837 return 0; 1837 return 0;
1838 1838
1839 if (sysctl_sched_migration_cost == -1) 1839 if (sysctl_sched_migration_cost == -1)
1840 return 1; 1840 return 1;
1841 if (sysctl_sched_migration_cost == 0) 1841 if (sysctl_sched_migration_cost == 0)
1842 return 0; 1842 return 0;
1843 1843
1844 delta = now - p->se.exec_start; 1844 delta = now - p->se.exec_start;
1845 1845
1846 return delta < (s64)sysctl_sched_migration_cost; 1846 return delta < (s64)sysctl_sched_migration_cost;
1847 } 1847 }
1848 1848
1849 1849
1850 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1850 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1851 { 1851 {
1852 int old_cpu = task_cpu(p); 1852 int old_cpu = task_cpu(p);
1853 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 1853 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1854 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 1854 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
1855 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 1855 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
1856 u64 clock_offset; 1856 u64 clock_offset;
1857 1857
1858 clock_offset = old_rq->clock - new_rq->clock; 1858 clock_offset = old_rq->clock - new_rq->clock;
1859 1859
1860 #ifdef CONFIG_SCHEDSTATS 1860 #ifdef CONFIG_SCHEDSTATS
1861 if (p->se.wait_start) 1861 if (p->se.wait_start)
1862 p->se.wait_start -= clock_offset; 1862 p->se.wait_start -= clock_offset;
1863 if (p->se.sleep_start) 1863 if (p->se.sleep_start)
1864 p->se.sleep_start -= clock_offset; 1864 p->se.sleep_start -= clock_offset;
1865 if (p->se.block_start) 1865 if (p->se.block_start)
1866 p->se.block_start -= clock_offset; 1866 p->se.block_start -= clock_offset;
1867 if (old_cpu != new_cpu) { 1867 if (old_cpu != new_cpu) {
1868 schedstat_inc(p, se.nr_migrations); 1868 schedstat_inc(p, se.nr_migrations);
1869 if (task_hot(p, old_rq->clock, NULL)) 1869 if (task_hot(p, old_rq->clock, NULL))
1870 schedstat_inc(p, se.nr_forced2_migrations); 1870 schedstat_inc(p, se.nr_forced2_migrations);
1871 } 1871 }
1872 #endif 1872 #endif
1873 p->se.vruntime -= old_cfsrq->min_vruntime - 1873 p->se.vruntime -= old_cfsrq->min_vruntime -
1874 new_cfsrq->min_vruntime; 1874 new_cfsrq->min_vruntime;
1875 1875
1876 __set_task_cpu(p, new_cpu); 1876 __set_task_cpu(p, new_cpu);
1877 } 1877 }
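The min_vruntime fix-up is what keeps a migrated task fair: CFS cares about vruntime relative to its queue's min_vruntime, so the subtraction re-bases the task onto the new queue while preserving its lag. Worked numbers (the unsigned wrap-around is intentional, exactly as in the kernel's u64 arithmetic):

    #include <stdio.h>

    /* Re-base vruntime on migration, as set_task_cpu does:
     * keep (vruntime - min_vruntime) constant across queues.
     * Modular unsigned arithmetic makes the two-step form safe
     * even when old_min < new_min. */
    int main(void)
    {
        unsigned long long vruntime = 10500;
        unsigned long long old_min  = 10000;  /* old cfs_rq */
        unsigned long long new_min  = 90000;  /* new cfs_rq */

        vruntime -= old_min - new_min;  /* same form as the kernel code */

        /* lag relative to the queue is preserved: 500 before, 500 after */
        printf("new vruntime = %llu, lag = %llu\n",
               vruntime, vruntime - new_min);
        return 0;
    }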
1878 1878
1879 struct migration_req { 1879 struct migration_req {
1880 struct list_head list; 1880 struct list_head list;
1881 1881
1882 struct task_struct *task; 1882 struct task_struct *task;
1883 int dest_cpu; 1883 int dest_cpu;
1884 1884
1885 struct completion done; 1885 struct completion done;
1886 }; 1886 };
1887 1887
1888 /* 1888 /*
1889 * The task's runqueue lock must be held. 1889 * The task's runqueue lock must be held.
1890 * Returns true if you have to wait for migration thread. 1890 * Returns true if you have to wait for migration thread.
1891 */ 1891 */
1892 static int 1892 static int
1893 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 1893 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1894 { 1894 {
1895 struct rq *rq = task_rq(p); 1895 struct rq *rq = task_rq(p);
1896 1896
1897 /* 1897 /*
1898 * If the task is not on a runqueue (and not running), then 1898 * If the task is not on a runqueue (and not running), then
1899 * it is sufficient to simply update the task's cpu field. 1899 * it is sufficient to simply update the task's cpu field.
1900 */ 1900 */
1901 if (!p->se.on_rq && !task_running(rq, p)) { 1901 if (!p->se.on_rq && !task_running(rq, p)) {
1902 set_task_cpu(p, dest_cpu); 1902 set_task_cpu(p, dest_cpu);
1903 return 0; 1903 return 0;
1904 } 1904 }
1905 1905
1906 init_completion(&req->done); 1906 init_completion(&req->done);
1907 req->task = p; 1907 req->task = p;
1908 req->dest_cpu = dest_cpu; 1908 req->dest_cpu = dest_cpu;
1909 list_add(&req->list, &rq->migration_queue); 1909 list_add(&req->list, &rq->migration_queue);
1910 1910
1911 return 1; 1911 return 1;
1912 } 1912 }
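struct migration_req is the handshake with the per-cpu migration thread: the caller queues the request under the rq lock, wakes the thread, and blocks on req->done until the move has happened. A userspace model of that completion pattern (names mirror the kernel's, the mechanics here are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    /* Userspace model of the migration handshake: the requester
     * queues work, wakes a worker, then sleeps on a completion
     * the worker fires when the move is done. */
    struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             done;
    };

    static void complete(struct completion *c)
    {
        pthread_mutex_lock(&c->lock);
        c->done = 1;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
    }

    static void wait_for_completion(struct completion *c)
    {
        pthread_mutex_lock(&c->lock);
        while (!c->done)
            pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
    }

    static void *migration_thread(void *arg)
    {
        struct completion *req = arg;
        puts("worker: task moved to dest cpu");
        complete(req);        /* like the kernel's complete(&req->done) */
        return NULL;
    }

    int main(void)
    {
        struct completion req = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
        };
        pthread_t thr;

        pthread_create(&thr, NULL, migration_thread, &req);
        wait_for_completion(&req);  /* requester blocks until done */
        pthread_join(thr, NULL);
        puts("requester: migration confirmed");
        return 0;
    }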
1913 1913
1914 /* 1914 /*
1915 * wait_task_inactive - wait for a thread to unschedule. 1915 * wait_task_inactive - wait for a thread to unschedule.
1916 * 1916 *
1917 * If @match_state is nonzero, it's the @p->state value just checked and 1917 * If @match_state is nonzero, it's the @p->state value just checked and
1918 * not expected to change. If it changes, i.e. @p might have woken up, 1918 * not expected to change. If it changes, i.e. @p might have woken up,
1919 * then return zero. When we succeed in waiting for @p to be off its CPU, 1919 * then return zero. When we succeed in waiting for @p to be off its CPU,
1920 * we return a positive number (its total switch count). If a second call 1920 * we return a positive number (its total switch count). If a second call
1921 * a short while later returns the same number, the caller can be sure that 1921 * a short while later returns the same number, the caller can be sure that
1922 * @p has remained unscheduled the whole time. 1922 * @p has remained unscheduled the whole time.
1923 * 1923 *
1924 * The caller must ensure that the task *will* unschedule sometime soon, 1924 * The caller must ensure that the task *will* unschedule sometime soon,
1925 * else this function might spin for a *long* time. This function can't 1925 * else this function might spin for a *long* time. This function can't
1926 * be called with interrupts off, or it may introduce deadlock with 1926 * be called with interrupts off, or it may introduce deadlock with
1927 * smp_call_function() if an IPI is sent by the same process we are 1927 * smp_call_function() if an IPI is sent by the same process we are
1928 * waiting to become inactive. 1928 * waiting to become inactive.
1929 */ 1929 */
1930 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1930 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1931 { 1931 {
1932 unsigned long flags; 1932 unsigned long flags;
1933 int running, on_rq; 1933 int running, on_rq;
1934 unsigned long ncsw; 1934 unsigned long ncsw;
1935 struct rq *rq; 1935 struct rq *rq;
1936 1936
1937 for (;;) { 1937 for (;;) {
1938 /* 1938 /*
1939 * We do the initial early heuristics without holding 1939 * We do the initial early heuristics without holding
1940 * any task-queue locks at all. We'll only try to get 1940 * any task-queue locks at all. We'll only try to get
1941 * the runqueue lock when things look like they will 1941 * the runqueue lock when things look like they will
1942 * work out! 1942 * work out!
1943 */ 1943 */
1944 rq = task_rq(p); 1944 rq = task_rq(p);
1945 1945
1946 /* 1946 /*
1947 * If the task is actively running on another CPU 1947 * If the task is actively running on another CPU
1948 * still, just relax and busy-wait without holding 1948 * still, just relax and busy-wait without holding
1949 * any locks. 1949 * any locks.
1950 * 1950 *
1951 * NOTE! Since we don't hold any locks, it's not 1951 * NOTE! Since we don't hold any locks, it's not
1952 * even certain that "rq" stays as the right runqueue! 1952 * even certain that "rq" stays as the right runqueue!
1953 * But we don't care, since "task_running()" will 1953 * But we don't care, since "task_running()" will
1954 * return false if the runqueue has changed and p 1954 * return false if the runqueue has changed and p
1955 * is actually now running somewhere else! 1955 * is actually now running somewhere else!
1956 */ 1956 */
1957 while (task_running(rq, p)) { 1957 while (task_running(rq, p)) {
1958 if (match_state && unlikely(p->state != match_state)) 1958 if (match_state && unlikely(p->state != match_state))
1959 return 0; 1959 return 0;
1960 cpu_relax(); 1960 cpu_relax();
1961 } 1961 }
1962 1962
1963 /* 1963 /*
1964 * Ok, time to look more closely! We need the rq 1964 * Ok, time to look more closely! We need the rq
1965 * lock now, to be *sure*. If we're wrong, we'll 1965 * lock now, to be *sure*. If we're wrong, we'll
1966 * just go back and repeat. 1966 * just go back and repeat.
1967 */ 1967 */
1968 rq = task_rq_lock(p, &flags); 1968 rq = task_rq_lock(p, &flags);
1969 trace_sched_wait_task(rq, p); 1969 trace_sched_wait_task(rq, p);
1970 running = task_running(rq, p); 1970 running = task_running(rq, p);
1971 on_rq = p->se.on_rq; 1971 on_rq = p->se.on_rq;
1972 ncsw = 0; 1972 ncsw = 0;
1973 if (!match_state || p->state == match_state) 1973 if (!match_state || p->state == match_state)
1974 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1974 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1975 task_rq_unlock(rq, &flags); 1975 task_rq_unlock(rq, &flags);
1976 1976
1977 /* 1977 /*
1978 * If it changed from the expected state, bail out now. 1978 * If it changed from the expected state, bail out now.
1979 */ 1979 */
1980 if (unlikely(!ncsw)) 1980 if (unlikely(!ncsw))
1981 break; 1981 break;
1982 1982
1983 /* 1983 /*
1984 * Was it really running after all now that we 1984 * Was it really running after all now that we
1985 * checked with the proper locks actually held? 1985 * checked with the proper locks actually held?
1986 * 1986 *
1987 * Oops. Go back and try again... 1987 * Oops. Go back and try again...
1988 */ 1988 */
1989 if (unlikely(running)) { 1989 if (unlikely(running)) {
1990 cpu_relax(); 1990 cpu_relax();
1991 continue; 1991 continue;
1992 } 1992 }
1993 1993
1994 /* 1994 /*
1995 * It's not enough that it's not actively running, 1995 * It's not enough that it's not actively running,
1996 * it must be off the runqueue _entirely_, and not 1996 * it must be off the runqueue _entirely_, and not
1997 * preempted! 1997 * preempted!
1998 * 1998 *
1999 * So if it was still runnable (but just not actively 1999 * So if it was still runnable (but just not actively
2000 * running right now), it's preempted, and we should 2000 * running right now), it's preempted, and we should
2001 * yield - it could be a while. 2001 * yield - it could be a while.
2002 */ 2002 */
2003 if (unlikely(on_rq)) { 2003 if (unlikely(on_rq)) {
2004 schedule_timeout_uninterruptible(1); 2004 schedule_timeout_uninterruptible(1);
2005 continue; 2005 continue;
2006 } 2006 }
2007 2007
2008 /* 2008 /*
2009 * Ahh, all good. It wasn't running, and it wasn't 2009 * Ahh, all good. It wasn't running, and it wasn't
2010 * runnable, which means that it will never become 2010 * runnable, which means that it will never become
2011 * running in the future either. We're all done! 2011 * running in the future either. We're all done!
2012 */ 2012 */
2013 break; 2013 break;
2014 } 2014 }
2015 2015
2016 return ncsw; 2016 return ncsw;
2017 } 2017 }
2018 2018
2019 /*** 2019 /***
2020 * kick_process - kick a running thread to enter/exit the kernel 2020 * kick_process - kick a running thread to enter/exit the kernel
2021 * @p: the to-be-kicked thread 2021 * @p: the to-be-kicked thread
2022 * 2022 *
2023 * Cause a process which is running on another CPU to enter 2023 * Cause a process which is running on another CPU to enter
2024 * kernel-mode, without any delay. (to get signals handled.) 2024 * kernel-mode, without any delay. (to get signals handled.)
2025 * 2025 *
2026 * NOTE: this function doesn't have to take the runqueue lock, 2026 * NOTE: this function doesn't have to take the runqueue lock,
2027 * because all it wants to ensure is that the remote task enters 2027 * because all it wants to ensure is that the remote task enters
2028 * the kernel. If the IPI races and the task has been migrated 2028 * the kernel. If the IPI races and the task has been migrated
2029 * to another CPU then no harm is done and the purpose has been 2029 * to another CPU then no harm is done and the purpose has been
2030 * achieved as well. 2030 * achieved as well.
2031 */ 2031 */
2032 void kick_process(struct task_struct *p) 2032 void kick_process(struct task_struct *p)
2033 { 2033 {
2034 int cpu; 2034 int cpu;
2035 2035
2036 preempt_disable(); 2036 preempt_disable();
2037 cpu = task_cpu(p); 2037 cpu = task_cpu(p);
2038 if ((cpu != smp_processor_id()) && task_curr(p)) 2038 if ((cpu != smp_processor_id()) && task_curr(p))
2039 smp_send_reschedule(cpu); 2039 smp_send_reschedule(cpu);
2040 preempt_enable(); 2040 preempt_enable();
2041 } 2041 }
2042 2042
2043 /* 2043 /*
2044 * Return a low guess at the load of a migration-source cpu weighted 2044 * Return a low guess at the load of a migration-source cpu weighted
2045 * according to the scheduling class and "nice" value. 2045 * according to the scheduling class and "nice" value.
2046 * 2046 *
2047 * We want to under-estimate the load of migration sources, to 2047 * We want to under-estimate the load of migration sources, to
2048 * balance conservatively. 2048 * balance conservatively.
2049 */ 2049 */
2050 static unsigned long source_load(int cpu, int type) 2050 static unsigned long source_load(int cpu, int type)
2051 { 2051 {
2052 struct rq *rq = cpu_rq(cpu); 2052 struct rq *rq = cpu_rq(cpu);
2053 unsigned long total = weighted_cpuload(cpu); 2053 unsigned long total = weighted_cpuload(cpu);
2054 2054
2055 if (type == 0 || !sched_feat(LB_BIAS)) 2055 if (type == 0 || !sched_feat(LB_BIAS))
2056 return total; 2056 return total;
2057 2057
2058 return min(rq->cpu_load[type-1], total); 2058 return min(rq->cpu_load[type-1], total);
2059 } 2059 }
2060 2060
2061 /* 2061 /*
2062 * Return a high guess at the load of a migration-target cpu weighted 2062 * Return a high guess at the load of a migration-target cpu weighted
2063 * according to the scheduling class and "nice" value. 2063 * according to the scheduling class and "nice" value.
2064 */ 2064 */
2065 static unsigned long target_load(int cpu, int type) 2065 static unsigned long target_load(int cpu, int type)
2066 { 2066 {
2067 struct rq *rq = cpu_rq(cpu); 2067 struct rq *rq = cpu_rq(cpu);
2068 unsigned long total = weighted_cpuload(cpu); 2068 unsigned long total = weighted_cpuload(cpu);
2069 2069
2070 if (type == 0 || !sched_feat(LB_BIAS)) 2070 if (type == 0 || !sched_feat(LB_BIAS))
2071 return total; 2071 return total;
2072 2072
2073 return max(rq->cpu_load[type-1], total); 2073 return max(rq->cpu_load[type-1], total);
2074 } 2074 }
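The min/max asymmetry is deliberate: source_load() under-estimates and target_load() over-estimates by mixing the instantaneous weight with the decayed cpu_load[] history, so a transient spike is less likely to trigger a migration the next tick would undo. The effect on one sample:

    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    { return a < b ? a : b; }

    static unsigned long max_ul(unsigned long a, unsigned long b)
    { return a > b ? a : b; }

    int main(void)
    {
        unsigned long instant = 3000;  /* weighted_cpuload(): spiky */
        unsigned long history = 1000;  /* rq->cpu_load[type-1]: smoothed */

        /* the same cpu looks light as a source, heavy as a target */
        printf("as source: %lu\n", min_ul(history, instant)); /* 1000 */
        printf("as target: %lu\n", max_ul(history, instant)); /* 3000 */
        return 0;
    }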
2075 2075
2076 /* 2076 /*
2077 * find_idlest_group finds and returns the least busy CPU group within the 2077 * find_idlest_group finds and returns the least busy CPU group within the
2078 * domain. 2078 * domain.
2079 */ 2079 */
2080 static struct sched_group * 2080 static struct sched_group *
2081 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 2081 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2082 { 2082 {
2083 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 2083 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2084 unsigned long min_load = ULONG_MAX, this_load = 0; 2084 unsigned long min_load = ULONG_MAX, this_load = 0;
2085 int load_idx = sd->forkexec_idx; 2085 int load_idx = sd->forkexec_idx;
2086 int imbalance = 100 + (sd->imbalance_pct-100)/2; 2086 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2087 2087
2088 do { 2088 do {
2089 unsigned long load, avg_load; 2089 unsigned long load, avg_load;
2090 int local_group; 2090 int local_group;
2091 int i; 2091 int i;
2092 2092
2093 /* Skip over this group if it has no CPUs allowed */ 2093 /* Skip over this group if it has no CPUs allowed */
2094 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2094 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
2095 continue; 2095 continue;
2096 2096
2097 local_group = cpu_isset(this_cpu, group->cpumask); 2097 local_group = cpu_isset(this_cpu, group->cpumask);
2098 2098
2099 /* Tally up the load of all CPUs in the group */ 2099 /* Tally up the load of all CPUs in the group */
2100 avg_load = 0; 2100 avg_load = 0;
2101 2101
2102 for_each_cpu_mask_nr(i, group->cpumask) { 2102 for_each_cpu_mask_nr(i, group->cpumask) {
2103 /* Bias balancing toward cpus of our domain */ 2103 /* Bias balancing toward cpus of our domain */
2104 if (local_group) 2104 if (local_group)
2105 load = source_load(i, load_idx); 2105 load = source_load(i, load_idx);
2106 else 2106 else
2107 load = target_load(i, load_idx); 2107 load = target_load(i, load_idx);
2108 2108
2109 avg_load += load; 2109 avg_load += load;
2110 } 2110 }
2111 2111
2112 /* Adjust by relative CPU power of the group */ 2112 /* Adjust by relative CPU power of the group */
2113 avg_load = sg_div_cpu_power(group, 2113 avg_load = sg_div_cpu_power(group,
2114 avg_load * SCHED_LOAD_SCALE); 2114 avg_load * SCHED_LOAD_SCALE);
2115 2115
2116 if (local_group) { 2116 if (local_group) {
2117 this_load = avg_load; 2117 this_load = avg_load;
2118 this = group; 2118 this = group;
2119 } else if (avg_load < min_load) { 2119 } else if (avg_load < min_load) {
2120 min_load = avg_load; 2120 min_load = avg_load;
2121 idlest = group; 2121 idlest = group;
2122 } 2122 }
2123 } while (group = group->next, group != sd->groups); 2123 } while (group = group->next, group != sd->groups);
2124 2124
2125 if (!idlest || 100*this_load < imbalance*min_load) 2125 if (!idlest || 100*this_load < imbalance*min_load)
2126 return NULL; 2126 return NULL;
2127 return idlest; 2127 return idlest;
2128 } 2128 }
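Concretely, with a typical sd->imbalance_pct of 125 the derived imbalance is 112, so the test above keeps the task local unless the local group is at least about 12% busier than the best remote group. Checking that arithmetic:

    #include <stdio.h>

    int main(void)
    {
        int imbalance_pct = 125;                          /* typical domain value */
        int imbalance = 100 + (imbalance_pct - 100) / 2;  /* -> 112 */

        unsigned long this_load = 1100, min_load = 1000;

        /* same test as find_idlest_group: NULL means "stay local" */
        int stay_local =
            (100 * this_load < (unsigned long)imbalance * min_load);

        /* 110000 < 112000 -> local is only 10% busier, so stay */
        printf("imbalance=%d this_load=%lu min_load=%lu -> %s\n",
               imbalance, this_load, min_load,
               stay_local ? "stay local" : "migrate");
        return 0;
    }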
2129 2129
2130 /* 2130 /*
2131 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2131 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2132 */ 2132 */
2133 static int 2133 static int
2134 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2134 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2135 cpumask_t *tmp) 2135 cpumask_t *tmp)
2136 { 2136 {
2137 unsigned long load, min_load = ULONG_MAX; 2137 unsigned long load, min_load = ULONG_MAX;
2138 int idlest = -1; 2138 int idlest = -1;
2139 int i; 2139 int i;
2140 2140
2141 /* Traverse only the allowed CPUs */ 2141 /* Traverse only the allowed CPUs */
2142 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2142 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2143 2143
2144 for_each_cpu_mask_nr(i, *tmp) { 2144 for_each_cpu_mask_nr(i, *tmp) {
2145 load = weighted_cpuload(i); 2145 load = weighted_cpuload(i);
2146 2146
2147 if (load < min_load || (load == min_load && i == this_cpu)) { 2147 if (load < min_load || (load == min_load && i == this_cpu)) {
2148 min_load = load; 2148 min_load = load;
2149 idlest = i; 2149 idlest = i;
2150 } 2150 }
2151 } 2151 }
2152 2152
2153 return idlest; 2153 return idlest;
2154 } 2154 }
2155 2155
2156 /* 2156 /*
2157 * sched_balance_self: balance the current task (running on cpu) in domains 2157 * sched_balance_self: balance the current task (running on cpu) in domains
2158 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 2158 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2159 * SD_BALANCE_EXEC. 2159 * SD_BALANCE_EXEC.
2160 * 2160 *
2161 * Balance, ie. select the least loaded group. 2161 * Balance, ie. select the least loaded group.
2162 * 2162 *
2163 * Returns the target CPU number, or the same CPU if no balancing is needed. 2163 * Returns the target CPU number, or the same CPU if no balancing is needed.
2164 * 2164 *
2165 * preempt must be disabled. 2165 * preempt must be disabled.
2166 */ 2166 */
2167 static int sched_balance_self(int cpu, int flag) 2167 static int sched_balance_self(int cpu, int flag)
2168 { 2168 {
2169 struct task_struct *t = current; 2169 struct task_struct *t = current;
2170 struct sched_domain *tmp, *sd = NULL; 2170 struct sched_domain *tmp, *sd = NULL;
2171 2171
2172 for_each_domain(cpu, tmp) { 2172 for_each_domain(cpu, tmp) {
2173 /* 2173 /*
2174 * If power savings logic is enabled for a domain, stop there. 2174 * If power savings logic is enabled for a domain, stop there.
2175 */ 2175 */
2176 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 2176 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2177 break; 2177 break;
2178 if (tmp->flags & flag) 2178 if (tmp->flags & flag)
2179 sd = tmp; 2179 sd = tmp;
2180 } 2180 }
2181 2181
2182 if (sd) 2182 if (sd)
2183 update_shares(sd); 2183 update_shares(sd);
2184 2184
2185 while (sd) { 2185 while (sd) {
2186 cpumask_t span, tmpmask; 2186 cpumask_t span, tmpmask;
2187 struct sched_group *group; 2187 struct sched_group *group;
2188 int new_cpu, weight; 2188 int new_cpu, weight;
2189 2189
2190 if (!(sd->flags & flag)) { 2190 if (!(sd->flags & flag)) {
2191 sd = sd->child; 2191 sd = sd->child;
2192 continue; 2192 continue;
2193 } 2193 }
2194 2194
2195 span = sd->span; 2195 span = sd->span;
2196 group = find_idlest_group(sd, t, cpu); 2196 group = find_idlest_group(sd, t, cpu);
2197 if (!group) { 2197 if (!group) {
2198 sd = sd->child; 2198 sd = sd->child;
2199 continue; 2199 continue;
2200 } 2200 }
2201 2201
2202 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2202 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
2203 if (new_cpu == -1 || new_cpu == cpu) { 2203 if (new_cpu == -1 || new_cpu == cpu) {
2204 /* Now try balancing at a lower domain level of cpu */ 2204 /* Now try balancing at a lower domain level of cpu */
2205 sd = sd->child; 2205 sd = sd->child;
2206 continue; 2206 continue;
2207 } 2207 }
2208 2208
2209 /* Now try balancing at a lower domain level of new_cpu */ 2209 /* Now try balancing at a lower domain level of new_cpu */
2210 cpu = new_cpu; 2210 cpu = new_cpu;
2211 sd = NULL; 2211 sd = NULL;
2212 weight = cpus_weight(span); 2212 weight = cpus_weight(span);
2213 for_each_domain(cpu, tmp) { 2213 for_each_domain(cpu, tmp) {
2214 if (weight <= cpus_weight(tmp->span)) 2214 if (weight <= cpus_weight(tmp->span))
2215 break; 2215 break;
2216 if (tmp->flags & flag) 2216 if (tmp->flags & flag)
2217 sd = tmp; 2217 sd = tmp;
2218 } 2218 }
2219 /* while loop will break here if sd == NULL */ 2219 /* while loop will break here if sd == NULL */
2220 } 2220 }
2221 2221
2222 return cpu; 2222 return cpu;
2223 } 2223 }
2224 2224
2225 #endif /* CONFIG_SMP */ 2225 #endif /* CONFIG_SMP */
2226 2226
2227 /*** 2227 /***
2228 * try_to_wake_up - wake up a thread 2228 * try_to_wake_up - wake up a thread
2229 * @p: the to-be-woken-up thread 2229 * @p: the to-be-woken-up thread
2230 * @state: the mask of task states that can be woken 2230 * @state: the mask of task states that can be woken
2231 * @sync: do a synchronous wakeup? 2231 * @sync: do a synchronous wakeup?
2232 * 2232 *
2233 * Put it on the run-queue if it's not already there. The "current" 2233 * Put it on the run-queue if it's not already there. The "current"
2234 * thread is always on the run-queue (except when the actual 2234 * thread is always on the run-queue (except when the actual
2235 * re-schedule is in progress), and as such you're allowed to do 2235 * re-schedule is in progress), and as such you're allowed to do
2236 * the simpler "current->state = TASK_RUNNING" to mark yourself 2236 * the simpler "current->state = TASK_RUNNING" to mark yourself
2237 * runnable without the overhead of this. 2237 * runnable without the overhead of this.
2238 * 2238 *
2239 * returns failure only if the task is already active. 2239 * returns failure only if the task is already active.
2240 */ 2240 */
2241 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2241 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2242 { 2242 {
2243 int cpu, orig_cpu, this_cpu, success = 0; 2243 int cpu, orig_cpu, this_cpu, success = 0;
2244 unsigned long flags; 2244 unsigned long flags;
2245 long old_state; 2245 long old_state;
2246 struct rq *rq; 2246 struct rq *rq;
2247 2247
2248 if (!sched_feat(SYNC_WAKEUPS)) 2248 if (!sched_feat(SYNC_WAKEUPS))
2249 sync = 0; 2249 sync = 0;
2250 2250
2251 #ifdef CONFIG_SMP 2251 #ifdef CONFIG_SMP
2252 if (sched_feat(LB_WAKEUP_UPDATE)) { 2252 if (sched_feat(LB_WAKEUP_UPDATE)) {
2253 struct sched_domain *sd; 2253 struct sched_domain *sd;
2254 2254
2255 this_cpu = raw_smp_processor_id(); 2255 this_cpu = raw_smp_processor_id();
2256 cpu = task_cpu(p); 2256 cpu = task_cpu(p);
2257 2257
2258 for_each_domain(this_cpu, sd) { 2258 for_each_domain(this_cpu, sd) {
2259 if (cpu_isset(cpu, sd->span)) { 2259 if (cpu_isset(cpu, sd->span)) {
2260 update_shares(sd); 2260 update_shares(sd);
2261 break; 2261 break;
2262 } 2262 }
2263 } 2263 }
2264 } 2264 }
2265 #endif 2265 #endif
2266 2266
2267 smp_wmb(); 2267 smp_wmb();
2268 rq = task_rq_lock(p, &flags); 2268 rq = task_rq_lock(p, &flags);
2269 old_state = p->state; 2269 old_state = p->state;
2270 if (!(old_state & state)) 2270 if (!(old_state & state))
2271 goto out; 2271 goto out;
2272 2272
2273 if (p->se.on_rq) 2273 if (p->se.on_rq)
2274 goto out_running; 2274 goto out_running;
2275 2275
2276 cpu = task_cpu(p); 2276 cpu = task_cpu(p);
2277 orig_cpu = cpu; 2277 orig_cpu = cpu;
2278 this_cpu = smp_processor_id(); 2278 this_cpu = smp_processor_id();
2279 2279
2280 #ifdef CONFIG_SMP 2280 #ifdef CONFIG_SMP
2281 if (unlikely(task_running(rq, p))) 2281 if (unlikely(task_running(rq, p)))
2282 goto out_activate; 2282 goto out_activate;
2283 2283
2284 cpu = p->sched_class->select_task_rq(p, sync); 2284 cpu = p->sched_class->select_task_rq(p, sync);
2285 if (cpu != orig_cpu) { 2285 if (cpu != orig_cpu) {
2286 set_task_cpu(p, cpu); 2286 set_task_cpu(p, cpu);
2287 task_rq_unlock(rq, &flags); 2287 task_rq_unlock(rq, &flags);
2288 /* might preempt at this point */ 2288 /* might preempt at this point */
2289 rq = task_rq_lock(p, &flags); 2289 rq = task_rq_lock(p, &flags);
2290 old_state = p->state; 2290 old_state = p->state;
2291 if (!(old_state & state)) 2291 if (!(old_state & state))
2292 goto out; 2292 goto out;
2293 if (p->se.on_rq) 2293 if (p->se.on_rq)
2294 goto out_running; 2294 goto out_running;
2295 2295
2296 this_cpu = smp_processor_id(); 2296 this_cpu = smp_processor_id();
2297 cpu = task_cpu(p); 2297 cpu = task_cpu(p);
2298 } 2298 }
2299 2299
2300 #ifdef CONFIG_SCHEDSTATS 2300 #ifdef CONFIG_SCHEDSTATS
2301 schedstat_inc(rq, ttwu_count); 2301 schedstat_inc(rq, ttwu_count);
2302 if (cpu == this_cpu) 2302 if (cpu == this_cpu)
2303 schedstat_inc(rq, ttwu_local); 2303 schedstat_inc(rq, ttwu_local);
2304 else { 2304 else {
2305 struct sched_domain *sd; 2305 struct sched_domain *sd;
2306 for_each_domain(this_cpu, sd) { 2306 for_each_domain(this_cpu, sd) {
2307 if (cpu_isset(cpu, sd->span)) { 2307 if (cpu_isset(cpu, sd->span)) {
2308 schedstat_inc(sd, ttwu_wake_remote); 2308 schedstat_inc(sd, ttwu_wake_remote);
2309 break; 2309 break;
2310 } 2310 }
2311 } 2311 }
2312 } 2312 }
2313 #endif /* CONFIG_SCHEDSTATS */ 2313 #endif /* CONFIG_SCHEDSTATS */
2314 2314
2315 out_activate: 2315 out_activate:
2316 #endif /* CONFIG_SMP */ 2316 #endif /* CONFIG_SMP */
2317 schedstat_inc(p, se.nr_wakeups); 2317 schedstat_inc(p, se.nr_wakeups);
2318 if (sync) 2318 if (sync)
2319 schedstat_inc(p, se.nr_wakeups_sync); 2319 schedstat_inc(p, se.nr_wakeups_sync);
2320 if (orig_cpu != cpu) 2320 if (orig_cpu != cpu)
2321 schedstat_inc(p, se.nr_wakeups_migrate); 2321 schedstat_inc(p, se.nr_wakeups_migrate);
2322 if (cpu == this_cpu) 2322 if (cpu == this_cpu)
2323 schedstat_inc(p, se.nr_wakeups_local); 2323 schedstat_inc(p, se.nr_wakeups_local);
2324 else 2324 else
2325 schedstat_inc(p, se.nr_wakeups_remote); 2325 schedstat_inc(p, se.nr_wakeups_remote);
2326 update_rq_clock(rq); 2326 update_rq_clock(rq);
2327 activate_task(rq, p, 1); 2327 activate_task(rq, p, 1);
2328 success = 1; 2328 success = 1;
2329 2329
2330 out_running: 2330 out_running:
2331 trace_sched_wakeup(rq, p); 2331 trace_sched_wakeup(rq, p);
2332 check_preempt_curr(rq, p, sync); 2332 check_preempt_curr(rq, p, sync);
2333 2333
2334 p->state = TASK_RUNNING; 2334 p->state = TASK_RUNNING;
2335 #ifdef CONFIG_SMP 2335 #ifdef CONFIG_SMP
2336 if (p->sched_class->task_wake_up) 2336 if (p->sched_class->task_wake_up)
2337 p->sched_class->task_wake_up(rq, p); 2337 p->sched_class->task_wake_up(rq, p);
2338 #endif 2338 #endif
2339 out: 2339 out:
2340 current->se.last_wakeup = current->se.sum_exec_runtime; 2340 current->se.last_wakeup = current->se.sum_exec_runtime;
2341 2341
2342 task_rq_unlock(rq, &flags); 2342 task_rq_unlock(rq, &flags);
2343 2343
2344 return success; 2344 return success;
2345 } 2345 }
2346 2346
2347 int wake_up_process(struct task_struct *p) 2347 int wake_up_process(struct task_struct *p)
2348 { 2348 {
2349 return try_to_wake_up(p, TASK_ALL, 0); 2349 return try_to_wake_up(p, TASK_ALL, 0);
2350 } 2350 }
2351 EXPORT_SYMBOL(wake_up_process); 2351 EXPORT_SYMBOL(wake_up_process);
2352 2352
2353 int wake_up_state(struct task_struct *p, unsigned int state) 2353 int wake_up_state(struct task_struct *p, unsigned int state)
2354 { 2354 {
2355 return try_to_wake_up(p, state, 0); 2355 return try_to_wake_up(p, state, 0);
2356 } 2356 }
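In both wrappers @state is a mask: try_to_wake_up() proceeds only when the task's current state intersects it, which is why wake_up_state(p, TASK_INTERRUPTIBLE) leaves a TASK_UNINTERRUPTIBLE sleeper alone. The gate in miniature (TASK_ALL is simplified here, ignoring the stopped/traced bits it also covers):

    #include <stdio.h>

    #define TASK_RUNNING         0
    #define TASK_INTERRUPTIBLE   1
    #define TASK_UNINTERRUPTIBLE 2
    #define TASK_ALL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

    /* The gate at the top of try_to_wake_up(): no state overlap,
     * no wakeup. */
    static int would_wake(long task_state, unsigned int wake_mask)
    {
        return (task_state & wake_mask) != 0;
    }

    int main(void)
    {
        printf("uninterruptible vs TASK_INTERRUPTIBLE: %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_INTERRUPTIBLE)); /* 0 */
        printf("uninterruptible vs TASK_ALL:           %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_ALL));           /* 1 */
        return 0;
    }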
2357 2357
2358 /* 2358 /*
2359 * Perform scheduler related setup for a newly forked process p. 2359 * Perform scheduler related setup for a newly forked process p.
2360 * p is forked by current. 2360 * p is forked by current.
2361 * 2361 *
2362 * __sched_fork() is basic setup used by init_idle() too: 2362 * __sched_fork() is basic setup used by init_idle() too:
2363 */ 2363 */
2364 static void __sched_fork(struct task_struct *p) 2364 static void __sched_fork(struct task_struct *p)
2365 { 2365 {
2366 p->se.exec_start = 0; 2366 p->se.exec_start = 0;
2367 p->se.sum_exec_runtime = 0; 2367 p->se.sum_exec_runtime = 0;
2368 p->se.prev_sum_exec_runtime = 0; 2368 p->se.prev_sum_exec_runtime = 0;
2369 p->se.last_wakeup = 0; 2369 p->se.last_wakeup = 0;
2370 p->se.avg_overlap = 0; 2370 p->se.avg_overlap = 0;
2371 2371
2372 #ifdef CONFIG_SCHEDSTATS 2372 #ifdef CONFIG_SCHEDSTATS
2373 p->se.wait_start = 0; 2373 p->se.wait_start = 0;
2374 p->se.sum_sleep_runtime = 0; 2374 p->se.sum_sleep_runtime = 0;
2375 p->se.sleep_start = 0; 2375 p->se.sleep_start = 0;
2376 p->se.block_start = 0; 2376 p->se.block_start = 0;
2377 p->se.sleep_max = 0; 2377 p->se.sleep_max = 0;
2378 p->se.block_max = 0; 2378 p->se.block_max = 0;
2379 p->se.exec_max = 0; 2379 p->se.exec_max = 0;
2380 p->se.slice_max = 0; 2380 p->se.slice_max = 0;
2381 p->se.wait_max = 0; 2381 p->se.wait_max = 0;
2382 #endif 2382 #endif
2383 2383
2384 INIT_LIST_HEAD(&p->rt.run_list); 2384 INIT_LIST_HEAD(&p->rt.run_list);
2385 p->se.on_rq = 0; 2385 p->se.on_rq = 0;
2386 INIT_LIST_HEAD(&p->se.group_node); 2386 INIT_LIST_HEAD(&p->se.group_node);
2387 2387
2388 #ifdef CONFIG_PREEMPT_NOTIFIERS 2388 #ifdef CONFIG_PREEMPT_NOTIFIERS
2389 INIT_HLIST_HEAD(&p->preempt_notifiers); 2389 INIT_HLIST_HEAD(&p->preempt_notifiers);
2390 #endif 2390 #endif
2391 2391
2392 /* 2392 /*
2393 * We mark the process as running here, but have not actually 2393 * We mark the process as running here, but have not actually
2394 * inserted it onto the runqueue yet. This guarantees that 2394 * inserted it onto the runqueue yet. This guarantees that
2395 * nobody will actually run it, and a signal or other external 2395 * nobody will actually run it, and a signal or other external
2396 * event cannot wake it up and insert it on the runqueue either. 2396 * event cannot wake it up and insert it on the runqueue either.
2397 */ 2397 */
2398 p->state = TASK_RUNNING; 2398 p->state = TASK_RUNNING;
2399 } 2399 }
2400 2400
2401 /* 2401 /*
2402 * fork()/clone()-time setup: 2402 * fork()/clone()-time setup:
2403 */ 2403 */
2404 void sched_fork(struct task_struct *p, int clone_flags) 2404 void sched_fork(struct task_struct *p, int clone_flags)
2405 { 2405 {
2406 int cpu = get_cpu(); 2406 int cpu = get_cpu();
2407 2407
2408 __sched_fork(p); 2408 __sched_fork(p);
2409 2409
2410 #ifdef CONFIG_SMP 2410 #ifdef CONFIG_SMP
2411 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 2411 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2412 #endif 2412 #endif
2413 set_task_cpu(p, cpu); 2413 set_task_cpu(p, cpu);
2414 2414
2415 /* 2415 /*
2416 * Make sure we do not leak PI boosting priority to the child: 2416 * Make sure we do not leak PI boosting priority to the child:
2417 */ 2417 */
2418 p->prio = current->normal_prio; 2418 p->prio = current->normal_prio;
2419 if (!rt_prio(p->prio)) 2419 if (!rt_prio(p->prio))
2420 p->sched_class = &fair_sched_class; 2420 p->sched_class = &fair_sched_class;
2421 2421
2422 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2422 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2423 if (likely(sched_info_on())) 2423 if (likely(sched_info_on()))
2424 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2424 memset(&p->sched_info, 0, sizeof(p->sched_info));
2425 #endif 2425 #endif
2426 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2426 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2427 p->oncpu = 0; 2427 p->oncpu = 0;
2428 #endif 2428 #endif
2429 #ifdef CONFIG_PREEMPT 2429 #ifdef CONFIG_PREEMPT
2430 /* Want to start with kernel preemption disabled. */ 2430 /* Want to start with kernel preemption disabled. */
2431 task_thread_info(p)->preempt_count = 1; 2431 task_thread_info(p)->preempt_count = 1;
2432 #endif 2432 #endif
2433 put_cpu(); 2433 put_cpu();
2434 } 2434 }
2435 2435
2436 /* 2436 /*
2437 * wake_up_new_task - wake up a newly created task for the first time. 2437 * wake_up_new_task - wake up a newly created task for the first time.
2438 * 2438 *
2439 * This function will do some initial scheduler statistics housekeeping 2439 * This function will do some initial scheduler statistics housekeeping
2440 * that must be done for every newly created context, then puts the task 2440 * that must be done for every newly created context, then puts the task
2441 * on the runqueue and wakes it. 2441 * on the runqueue and wakes it.
2442 */ 2442 */
2443 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2443 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2444 { 2444 {
2445 unsigned long flags; 2445 unsigned long flags;
2446 struct rq *rq; 2446 struct rq *rq;
2447 2447
2448 rq = task_rq_lock(p, &flags); 2448 rq = task_rq_lock(p, &flags);
2449 BUG_ON(p->state != TASK_RUNNING); 2449 BUG_ON(p->state != TASK_RUNNING);
2450 update_rq_clock(rq); 2450 update_rq_clock(rq);
2451 2451
2452 p->prio = effective_prio(p); 2452 p->prio = effective_prio(p);
2453 2453
2454 if (!p->sched_class->task_new || !current->se.on_rq) { 2454 if (!p->sched_class->task_new || !current->se.on_rq) {
2455 activate_task(rq, p, 0); 2455 activate_task(rq, p, 0);
2456 } else { 2456 } else {
2457 /* 2457 /*
2458 * Let the scheduling class do new task startup 2458 * Let the scheduling class do new task startup
2459 * management (if any): 2459 * management (if any):
2460 */ 2460 */
2461 p->sched_class->task_new(rq, p); 2461 p->sched_class->task_new(rq, p);
2462 inc_nr_running(rq); 2462 inc_nr_running(rq);
2463 } 2463 }
2464 trace_sched_wakeup_new(rq, p); 2464 trace_sched_wakeup_new(rq, p);
2465 check_preempt_curr(rq, p, 0); 2465 check_preempt_curr(rq, p, 0);
2466 #ifdef CONFIG_SMP 2466 #ifdef CONFIG_SMP
2467 if (p->sched_class->task_wake_up) 2467 if (p->sched_class->task_wake_up)
2468 p->sched_class->task_wake_up(rq, p); 2468 p->sched_class->task_wake_up(rq, p);
2469 #endif 2469 #endif
2470 task_rq_unlock(rq, &flags); 2470 task_rq_unlock(rq, &flags);
2471 } 2471 }
2472 2472
2473 #ifdef CONFIG_PREEMPT_NOTIFIERS 2473 #ifdef CONFIG_PREEMPT_NOTIFIERS
2474 2474
2475 /** 2475 /**
2476 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2476 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2477 * @notifier: notifier struct to register 2477 * @notifier: notifier struct to register
2478 */ 2478 */
2479 void preempt_notifier_register(struct preempt_notifier *notifier) 2479 void preempt_notifier_register(struct preempt_notifier *notifier)
2480 { 2480 {
2481 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2481 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2482 } 2482 }
2483 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2483 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2484 2484
2485 /** 2485 /**
2486 * preempt_notifier_unregister - no longer interested in preemption notifications 2486 * preempt_notifier_unregister - no longer interested in preemption notifications
2487 * @notifier: notifier struct to unregister 2487 * @notifier: notifier struct to unregister
2488 * 2488 *
2489 * This is safe to call from within a preemption notifier. 2489 * This is safe to call from within a preemption notifier.
2490 */ 2490 */
2491 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2491 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2492 { 2492 {
2493 hlist_del(&notifier->link); 2493 hlist_del(&notifier->link);
2494 } 2494 }
2495 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2495 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2496 2496
2497 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2497 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2498 { 2498 {
2499 struct preempt_notifier *notifier; 2499 struct preempt_notifier *notifier;
2500 struct hlist_node *node; 2500 struct hlist_node *node;
2501 2501
2502 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2502 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2503 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2503 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2504 } 2504 }
2505 2505
2506 static void 2506 static void
2507 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2507 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2508 struct task_struct *next) 2508 struct task_struct *next)
2509 { 2509 {
2510 struct preempt_notifier *notifier; 2510 struct preempt_notifier *notifier;
2511 struct hlist_node *node; 2511 struct hlist_node *node;
2512 2512
2513 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2513 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2514 notifier->ops->sched_out(notifier, next); 2514 notifier->ops->sched_out(notifier, next);
2515 } 2515 }
2516 2516
2517 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2517 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2518 2518
2519 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2519 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2520 { 2520 {
2521 } 2521 }
2522 2522
2523 static void 2523 static void
2524 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2524 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2525 struct task_struct *next) 2525 struct task_struct *next)
2526 { 2526 {
2527 } 2527 }
2528 2528
2529 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2529 #endif /* CONFIG_PREEMPT_NOTIFIERS */
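The two fire_* helpers above walk current's notifier list; a client (KVM is the in-tree user) supplies a struct preempt_ops and registers from the task that wants the callbacks. A hedged, kernel-style sketch of typical usage; the ops signatures follow the calls visible here, and the rest is illustrative:

    #include <linux/preempt.h>
    #include <linux/sched.h>
    #include <linux/kernel.h>

    /* Illustrative client: log when the registering task is
     * scheduled out and back in. Registration must happen from
     * the task itself, since the list head lives in current. */
    static void my_sched_in(struct preempt_notifier *pn, int cpu)
    {
        pr_debug("back on cpu %d\n", cpu);
    }

    static void my_sched_out(struct preempt_notifier *pn,
                             struct task_struct *next)
    {
        pr_debug("preempted by %s\n", next->comm);
    }

    static struct preempt_ops my_ops = {
        .sched_in  = my_sched_in,
        .sched_out = my_sched_out,
    };

    static struct preempt_notifier my_notifier;

    static void my_start_watching(void)
    {
        preempt_notifier_init(&my_notifier, &my_ops);
        preempt_notifier_register(&my_notifier);   /* from current */
    }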
2530 2530
2531 /** 2531 /**
2532 * prepare_task_switch - prepare to switch tasks 2532 * prepare_task_switch - prepare to switch tasks
2533 * @rq: the runqueue preparing to switch 2533 * @rq: the runqueue preparing to switch
2534 * @prev: the current task that is being switched out 2534 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 * Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
	finish_lock_switch(rq, prev);
#ifdef CONFIG_SMP
	if (current->sched_class->post_schedule)
		current->sched_class->post_schedule(rq);
#endif

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}

/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_enter_lazy_cpu_mode();

	if (unlikely(!mm)) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * The runqueue lock will be released by the next task (which is
	 * an invalid locking op, but in the case of the scheduler it's an
	 * obvious special-case), so we do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
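
The !mm branch above is the borrowed-address-space trick for kernel
threads: a kernel thread has no mm of its own, so it keeps running on
the previous task's active_mm and pins it with
atomic_inc(&oldmm->mm_count); the matching mmdrop() only happens later,
in finish_task_switch(), once the runqueue lock is no longer held. The
following is a minimal userspace sketch of that refcount dance using
C11 atomics; struct fake_mm, borrow_mm() and drop_mm() are invented
stand-ins for illustration, not kernel APIs.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_mm {
	atomic_int mm_count;	/* plays the role of mm_struct.mm_count */
};

static struct fake_mm *borrow_mm(struct fake_mm *oldmm)
{
	/* like atomic_inc(&oldmm->mm_count) when next->mm is NULL */
	atomic_fetch_add(&oldmm->mm_count, 1);
	return oldmm;
}

static void drop_mm(struct fake_mm *mm)
{
	/* like mmdrop(): free on the last reference */
	if (atomic_fetch_sub(&mm->mm_count, 1) == 1)
		free(mm);
}

int main(void)
{
	struct fake_mm *mm = malloc(sizeof(*mm));

	atomic_init(&mm->mm_count, 1);		/* reference held by "prev" */
	struct fake_mm *active_mm = borrow_mm(mm);	/* "next" borrows it */

	drop_mm(mm);		/* prev's reference goes away... */
	drop_mm(active_mm);	/* ...the borrowed one kept mm alive */
	puts("mm freed exactly once");
	return 0;
}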

/*
 * nr_running, nr_uninterruptible and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, current number of uninterruptible-sleeping threads, total
 * number of context switches performed since bootup.
 */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

unsigned long nr_uninterruptible(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_uninterruptible;

	/*
	 * Since we read the counters lockless, it might be slightly
	 * inaccurate. Do not allow it to go below zero though:
	 */
	if (unlikely((long)sum < 0))
		sum = 0;

	return sum;
}
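
The clamp above is needed because the nr_uninterruptible counters are
updated locklessly and not necessarily on the CPU where the task went
to sleep: a reader summing all the per-runqueue counters can observe
the decrement for a wakeup on one CPU before it observes the matching
increment on another, so the total it computes may briefly be negative,
which an unsigned sum turns into a huge value. A self-contained sketch
of why the (long) cast and clamp matter; the two-element percpu array
is an invented stand-in for the real per-runqueue counters:

#include <stdio.h>

int main(void)
{
	/* a task slept with accounting on cpu0 but woke up via cpu1 */
	unsigned long percpu[2] = { 0, 0 };

	percpu[1] -= 1;	/* reader sees the wakeup on cpu1 first... */

	/* ...and sums before it sees cpu0's matching +1 */
	unsigned long sum = percpu[0] + percpu[1];

	if ((long)sum < 0)	/* same clamp as nr_uninterruptible() */
		sum = 0;
	printf("sum = %lu\n", sum);	/* 0, not 18446744073709551615 */

	percpu[0] += 1;	/* the sleep accounting shows up late */
	return 0;
}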

unsigned long long nr_context_switches(void)
{
	int i;
	unsigned long long sum = 0;

	for_each_possible_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_possible_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

unsigned long nr_active(void)
{
	unsigned long i, running = 0, uninterruptible = 0;

	for_each_online_cpu(i) {
		running += cpu_rq(i)->nr_running;
		uninterruptible += cpu_rq(i)->nr_uninterruptible;
	}

	if (unlikely((long)uninterruptible < 0))
		uninterruptible = 0;

	return running + uninterruptible;
}

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */
static void update_cpu_load(struct rq *this_rq)
{
	unsigned long this_load = this_rq->load.weight;
	int i, scale;

	this_rq->nr_load_updates++;

	/* Update our load: */
	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale-1;
		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
	}
}
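
Each cpu_load[i] is thus a one-pole decaying average of the
instantaneous load with a time constant of 2^i ticks:
cpu_load[i] = (old * (2^i - 1) + new) / 2^i. Without the round-up,
truncating division would leave the average stuck one step short of a
higher target (9 forever while the load sits at 10). The standalone
sketch below replays the same recurrence for a constant load; the load
value 10 and the tick count are arbitrary choices for illustration.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX	5

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	unsigned long this_load = 10;	/* pretend this_rq->load.weight */

	for (int tick = 0; tick < 20; tick++) {
		unsigned long scale = 1;

		for (int i = 0; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = this_load;

			/* round up when rising, as update_cpu_load() does */
			if (new_load > old_load)
				new_load += scale - 1;
			cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
		}
	}
	for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)	/* all reach 10 */
		printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
	return 0;
}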

#ifdef CONFIG_SMP

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
		__acquire(rq2->lock);	/* Fake it out ;) */
	} else {
		if (rq1 < rq2) {
			spin_lock(&rq1->lock);
			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
		} else {
			spin_lock(&rq2->lock);
			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
		}
	}
	update_rq_clock(rq1);
	update_rq_clock(rq2);
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		spin_unlock(&rq2->lock);
	else
		__release(rq2->lock);
}
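
double_rq_lock() avoids the classic AB-BA deadlock without any global
lock-ordering table: whichever order the caller names the runqueues,
the lower-addressed lock is always taken first (and the rq1 == rq2
branch handles a runqueue paired with itself). Below is a minimal
pthreads sketch of the same address-ordering idea; double_lock(),
double_unlock() and worker() are invented names and the program is an
illustration, not kernel code.

#include <pthread.h>
#include <stdio.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);	/* same queue: one lock suffices */
	} else if (a < b) {		/* mirrors the rq1 < rq2 test */
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
	pthread_mutex_t **pair = arg;

	for (int i = 0; i < 100000; i++) {
		double_lock(pair[0], pair[1]);
		double_unlock(pair[0], pair[1]);
	}
	return NULL;
}

int main(void)
{
	/* one thread passes (m1, m2), the other (m2, m1) */
	pthread_mutex_t *fwd[2] = { &m1, &m2 }, *rev[2] = { &m2, &m1 };
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, fwd);
	pthread_create(&t2, NULL, worker, rev);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	puts("no deadlock");	/* naive lock order could hang here */
	return 0;
}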

/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
 * the cpu_allowed mask is restored.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
	struct migration_req req;
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);
	if (!cpu_isset(dest_cpu, p->cpus_allowed)
	    || unlikely(!cpu_active(dest_cpu)))
		goto out;

	trace_sched_migrate_task(rq, p, dest_cpu);
	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);

		return;
	}
out:
	task_rq_unlock(rq, &flags);
}

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
	int new_cpu, this_cpu = get_cpu();
	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
	put_cpu();
	if (new_cpu != this_cpu)
		sched_migrate_task(current, new_cpu);
}

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/*
	 * Note that idle threads have a prio of MAX_PRIO, so this test
	 * will always be true for them.
	 */
	check_preempt_curr(this_rq, p, 0);
}

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
		     struct sched_domain *sd, enum cpu_idle_type idle,
		     int *all_pinned)
{
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
		schedstat_inc(p, se.nr_failed_migrations_affine);
		return 0;
	}
	*all_pinned = 0;

	if (task_running(rq, p)) {
		schedstat_inc(p, se.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) task is cache cold, or
	 * 2) too many balance attempts have failed.
	 */

	if (!task_hot(p, rq->clock, sd) ||
			sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
		if (task_hot(p, rq->clock, sd)) {
			schedstat_inc(sd, lb_hot_gained[idle]);
			schedstat_inc(p, se.nr_forced_migrations);
		}
#endif
		return 1;
	}

	if (task_hot(p, rq->clock, sd)) {
		schedstat_inc(p, se.nr_failed_migrations_hot);
		return 0;
	}
	return 1;
}

static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
	      unsigned long max_load_move, struct sched_domain *sd,
	      enum cpu_idle_type idle, int *all_pinned,
	      int *this_best_prio, struct rq_iterator *iterator)
{
	int loops = 0, pulled = 0, pinned = 0;
	struct task_struct *p;
	long rem_load_move = max_load_move;

	if (max_load_move == 0)
		goto out;

	pinned = 1;

	/*
	 * Start the load-balancing iterator:
	 */
	p = iterator->start(iterator->arg);
next:
	if (!p || loops++ > sysctl_sched_nr_migrate)
		goto out;

	if ((p->se.load.weight >> 1) > rem_load_move ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
		p = iterator->next(iterator->arg);
		goto next;
	}

	pull_task(busiest, p, this_rq, this_cpu);
	pulled++;
	rem_load_move -= p->se.load.weight;

	/*
	 * We only want to steal up to the prescribed amount of weighted load.
	 */
	if (rem_load_move > 0) {
		if (p->prio < *this_best_prio)
			*this_best_prio = p->prio;
		p = iterator->next(iterator->arg);
		goto next;
	}
out:
	/*
	 * Right now, this is one of only two places pull_task() is called,
	 * so we can safely collect pull_task() stats here rather than
	 * inside pull_task().
	 */
	schedstat_add(sd, lb_gained[idle], pulled);

	if (all_pinned)
		*all_pinned = pinned;

	return max_load_move - rem_load_move;
}

/*
 * move_tasks tries to move up to max_load_move weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned)
{
	const struct sched_class *class = sched_class_highest;
	unsigned long total_load_moved = 0;
	int this_best_prio = this_rq->curr->prio;

	do {
		total_load_moved +=
			class->load_balance(this_rq, this_cpu, busiest,
				max_load_move - total_load_moved,
				sd, idle, all_pinned, &this_best_prio);
		class = class->next;

		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
			break;

	} while (class && max_load_move > total_load_moved);

	return total_load_moved > 0;
}

static int
iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
		   struct sched_domain *sd, enum cpu_idle_type idle,
		   struct rq_iterator *iterator)
{
	struct task_struct *p = iterator->start(iterator->arg);
	int pinned = 0;

	while (p) {
		if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
			pull_task(busiest, p, this_rq, this_cpu);
			/*
			 * Right now, this is only the second place pull_task()
			 * is called, so we can safely collect pull_task()
			 * stats here rather than inside pull_task().
			 */
			schedstat_inc(sd, lb_gained[idle]);

			return 1;
		}
		p = iterator->next(iterator->arg);
	}

	return 0;
}

/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
			 struct sched_domain *sd, enum cpu_idle_type idle)
{
	const struct sched_class *class;

	for (class = sched_class_highest; class; class = class->next)
		if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
			return 1;

	return 0;
}

/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const cpumask_t *cpus, int *balance)
{
	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
	unsigned long max_pull;
	unsigned long busiest_load_per_task, busiest_nr_running;
	unsigned long this_load_per_task, this_nr_running;
	int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance = 1;
	unsigned long leader_nr_running = 0, min_load_per_task = 0;
	unsigned long min_nr_running = ULONG_MAX;
	struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

	max_load = this_load = total_load = total_pwr = 0;
	busiest_load_per_task = busiest_nr_running = 0;
	this_load_per_task = this_nr_running = 0;

	if (idle == CPU_NOT_IDLE)
		load_idx = sd->busy_idx;
	else if (idle == CPU_NEWLY_IDLE)
		load_idx = sd->newidle_idx;
	else
		load_idx = sd->idle_idx;

	do {
		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
		int local_group;
		int i;
		int __group_imb = 0;
		unsigned int balance_cpu = -1, first_idle_cpu = 0;
		unsigned long sum_nr_running, sum_weighted_load;
		unsigned long sum_avg_load_per_task;
		unsigned long avg_load_per_task;

		local_group = cpu_isset(this_cpu, group->cpumask);

		if (local_group)
			balance_cpu = first_cpu(group->cpumask);

		/* Tally up the load of all CPUs in the group */
		sum_weighted_load = sum_nr_running = avg_load = 0;
		sum_avg_load_per_task = avg_load_per_task = 0;

		max_cpu_load = 0;
		min_cpu_load = ~0UL;

		for_each_cpu_mask_nr(i, group->cpumask) {
			struct rq *rq;

			if (!cpu_isset(i, *cpus))
				continue;

			rq = cpu_rq(i);

			if (*sd_idle && rq->nr_running)
				*sd_idle = 0;

			/* Bias balancing toward cpus of our domain */
			if (local_group) {
				if (idle_cpu(i) && !first_idle_cpu) {
					first_idle_cpu = 1;
					balance_cpu = i;
				}

				load = target_load(i, load_idx);
			} else {
				load = source_load(i, load_idx);
				if (load > max_cpu_load)
					max_cpu_load = load;
				if (min_cpu_load > load)
					min_cpu_load = load;
			}

			avg_load += load;
			sum_nr_running += rq->nr_running;
			sum_weighted_load += weighted_cpuload(i);

			sum_avg_load_per_task += cpu_avg_load_per_task(i);
		}
		/*
		 * The first idle cpu or the first cpu (busiest) in this
		 * sched group is eligible for doing load balancing at this
		 * and higher domains. In the newly idle case, we allow all
		 * the cpus to do the newly idle load balance.
		 */
		if (idle != CPU_NEWLY_IDLE && local_group &&
		    balance_cpu != this_cpu && balance) {
			*balance = 0;
			goto ret;
		}

		total_load += avg_load;
		total_pwr += group->__cpu_power;

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);


		/*
		 * Consider the group unbalanced when the imbalance is larger
		 * than the average weight of two tasks.
		 *
		 * APZ: with cgroup the avg task weight can vary wildly and
		 * might not be a suitable number - should we keep a
		 * normalized nr_running number somewhere that negates
		 * the hierarchy?
		 */
		avg_load_per_task = sg_div_cpu_power(group,
				sum_avg_load_per_task * SCHED_LOAD_SCALE);

		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
			__group_imb = 1;

		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

		if (local_group) {
			this_load = avg_load;
			this = group;
			this_nr_running = sum_nr_running;
			this_load_per_task = sum_weighted_load;
		} else if (avg_load > max_load &&
			   (sum_nr_running > group_capacity || __group_imb)) {
			max_load = avg_load;
			busiest = group;
			busiest_nr_running = sum_nr_running;
			busiest_load_per_task = sum_weighted_load;
			group_imb = __group_imb;
		}

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
		/*
		 * Busy processors will not participate in power savings
		 * balance.
		 */
		if (idle == CPU_NOT_IDLE ||
				!(sd->flags & SD_POWERSAVINGS_BALANCE))
			goto group_next;

		/*
		 * If the local group is idle or completely loaded
		 * no need to do power savings balance at this domain
		 */
		if (local_group && (this_nr_running >= group_capacity ||
				    !this_nr_running))
			power_savings_balance = 0;

		/*
		 * If a group is already running at full capacity or idle,
		 * don't include that group in power savings calculations
		 */
		if (!power_savings_balance || sum_nr_running >= group_capacity
		    || !sum_nr_running)
			goto group_next;

		/*
		 * Find the group which has the least non-idle load.
		 * This is the group from which we need to pick up load
		 * in order to save power.
		 */
		if ((sum_nr_running < min_nr_running) ||
		    (sum_nr_running == min_nr_running &&
		     first_cpu(group->cpumask) <
		     first_cpu(group_min->cpumask))) {
			group_min = group;
			min_nr_running = sum_nr_running;
			min_load_per_task = sum_weighted_load /
						sum_nr_running;
		}

		/*
		 * Find the group which is running close to its capacity
		 * but still has room to pick up some load from another
		 * group, to save more power.
		 */
		if (sum_nr_running <= group_capacity - 1) {
			if (sum_nr_running > leader_nr_running ||
			    (sum_nr_running == leader_nr_running &&
			     first_cpu(group->cpumask) >
			     first_cpu(group_leader->cpumask))) {
				group_leader = group;
				leader_nr_running = sum_nr_running;
			}
		}
group_next:
#endif
		group = group->next;
	} while (group != sd->groups);

	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
		goto out_balanced;

	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

	if (this_load >= avg_load ||
			100*max_load <= sd->imbalance_pct*this_load)
		goto out_balanced;

	busiest_load_per_task /= busiest_nr_running;
	if (group_imb)
		busiest_load_per_task = min(busiest_load_per_task, avg_load);

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load, as either of these
	 * actions would just result in more rebalancing later, and ping-pong
	 * tasks around. Thus we look for the minimum possible imbalance.
	 * Negative imbalances (*we* are more loaded than anyone else) will
	 * be counted as no imbalance for these purposes -- we can't fix that
	 * by pulling tasks to us. Be careful of negative numbers as they'll
	 * appear as very large values with unsigned longs.
	 */
	if (max_load <= busiest_load_per_task)
		goto out_balanced;

	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load (as we skip the groups at or below
	 * their cpu_power while calculating max_load..)
	 */
	if (max_load < avg_load) {
		*imbalance = 0;
		goto small_imbalance;
	}

	/* Don't want to pull so many tasks that a group would go idle */
	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = min(max_pull * busiest->__cpu_power,
				(avg_load - this_load) * this->__cpu_power)
			/ SCHED_LOAD_SCALE;

	/*
	 * If *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved, so we'll
	 * have a think about bumping its value to force at least one
	 * task to be moved.
	 */
	if (*imbalance < busiest_load_per_task) {
		unsigned long tmp, pwr_now, pwr_move;
		unsigned int imbn;

small_imbalance:
		pwr_move = pwr_now = 0;
		imbn = 2;
		if (this_nr_running) {
			this_load_per_task /= this_nr_running;
			if (busiest_load_per_task > this_load_per_task)
				imbn = 1;
		} else
			this_load_per_task = cpu_avg_load_per_task(this_cpu);

		if (max_load - this_load + busiest_load_per_task >=
					busiest_load_per_task * imbn) {
			*imbalance = busiest_load_per_task;
			return busiest;
		}

		/*
		 * OK, we don't have enough imbalance to justify moving tasks,
		 * however we may be able to increase total CPU power used by
		 * moving them.
		 */

		pwr_now += busiest->__cpu_power *
				min(busiest_load_per_task, max_load);
		pwr_now += this->__cpu_power *
				min(this_load_per_task, this_load);
		pwr_now /= SCHED_LOAD_SCALE;

		/* Amount of load we'd subtract */
		tmp = sg_div_cpu_power(busiest,
				busiest_load_per_task * SCHED_LOAD_SCALE);
		if (max_load > tmp)
			pwr_move += busiest->__cpu_power *
				min(busiest_load_per_task, max_load - tmp);

		/* Amount of load we'd add */
		if (max_load * busiest->__cpu_power <
				busiest_load_per_task * SCHED_LOAD_SCALE)
			tmp = sg_div_cpu_power(this,
					max_load * busiest->__cpu_power);
		else
			tmp = sg_div_cpu_power(this,
				busiest_load_per_task * SCHED_LOAD_SCALE);
		pwr_move += this->__cpu_power *
				min(this_load_per_task, this_load + tmp);
		pwr_move /= SCHED_LOAD_SCALE;

		/* Move if we gain throughput */
		if (pwr_move > pwr_now)
			*imbalance = busiest_load_per_task;
	}

	return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
		goto ret;

	if (this == group_leader && group_leader != group_min) {
		*imbalance = min_load_per_task;
		return group_min;
	}
#endif
ret:
	*imbalance = 0;
	return NULL;
}
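
The *imbalance computed above is the smaller of what the busiest group
can afford to give up (max_pull, capped so the donor neither drops
below the domain average nor goes idle) and what the local group can
absorb before rising above the average, each weighted by group power
and scaled back down by SCHED_LOAD_SCALE. A self-contained numeric
sketch of that arithmetic; all the loads below are invented and both
group powers are set to SCHED_LOAD_SCALE:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long busiest_power = SCHED_LOAD_SCALE;
	unsigned long this_power = SCHED_LOAD_SCALE;
	unsigned long max_load = 3072;	/* busiest group's scaled load */
	unsigned long this_load = 1024;	/* local group's scaled load */
	unsigned long avg_load = 2048;	/* domain-wide average */
	unsigned long busiest_load_per_task = 1024;

	/* don't pull so much that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* min of what busiest can give and what we can take */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
				(avg_load - this_load) * this_power)
			/ SCHED_LOAD_SCALE;

	/* prints: max_pull = 1024, imbalance = 1024 (about one task) */
	printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
	return 0;
}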

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
		   unsigned long imbalance, const cpumask_t *cpus)
{
	struct rq *busiest = NULL, *rq;
	unsigned long max_load = 0;
	int i;

	for_each_cpu_mask_nr(i, group->cpumask) {
		unsigned long wl;

		if (!cpu_isset(i, *cpus))
			continue;

		rq = cpu_rq(i);
		wl = weighted_cpuload(i);

		if (rq->nr_running == 1 && wl > imbalance)
			continue;

		if (wl > max_load) {
			max_load = wl;
			busiest = rq;
		}
	}

	return busiest;
}

/*
 * Max backoff if we encounter pinned tasks. This is a pretty arbitrary
 * value, but any value works so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance, cpumask_t *cpus)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;

	cpus_setall(*cpus);

	/*
	 * When power savings policy is enabled for the parent domain, idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
	 * portraying it as CPU_NOT_IDLE.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	update_shares(sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			cpu_clear(cpu_of(busiest), *cpus);
			if (!cpus_empty(*cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the migration_thread if the curr
			 * task on the busiest cpu can't be moved to this_cpu
			 */
3526 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3526 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
3527 spin_unlock_irqrestore(&busiest->lock, flags); 3527 spin_unlock_irqrestore(&busiest->lock, flags);
3528 all_pinned = 1; 3528 all_pinned = 1;
3529 goto out_one_pinned; 3529 goto out_one_pinned;
3530 } 3530 }
3531 3531
3532 if (!busiest->active_balance) { 3532 if (!busiest->active_balance) {
3533 busiest->active_balance = 1; 3533 busiest->active_balance = 1;
3534 busiest->push_cpu = this_cpu; 3534 busiest->push_cpu = this_cpu;
3535 active_balance = 1; 3535 active_balance = 1;
3536 } 3536 }
3537 spin_unlock_irqrestore(&busiest->lock, flags); 3537 spin_unlock_irqrestore(&busiest->lock, flags);
3538 if (active_balance) 3538 if (active_balance)
3539 wake_up_process(busiest->migration_thread); 3539 wake_up_process(busiest->migration_thread);
3540 3540
3541 /* 3541 /*
3542 * We've kicked active balancing, reset the failure 3542 * We've kicked active balancing, reset the failure
3543 * counter. 3543 * counter.
3544 */ 3544 */
3545 sd->nr_balance_failed = sd->cache_nice_tries+1; 3545 sd->nr_balance_failed = sd->cache_nice_tries+1;
3546 } 3546 }
3547 } else 3547 } else
3548 sd->nr_balance_failed = 0; 3548 sd->nr_balance_failed = 0;
3549 3549
3550 if (likely(!active_balance)) { 3550 if (likely(!active_balance)) {
3551 /* We were unbalanced, so reset the balancing interval */ 3551 /* We were unbalanced, so reset the balancing interval */
3552 sd->balance_interval = sd->min_interval; 3552 sd->balance_interval = sd->min_interval;
3553 } else { 3553 } else {
3554 /* 3554 /*
3555 * If we've begun active balancing, start to back off. This 3555 * If we've begun active balancing, start to back off. This
3556 * case may not be covered by the all_pinned logic if there 3556 * case may not be covered by the all_pinned logic if there
3557 * is only 1 task on the busy runqueue (because we don't call 3557 * is only 1 task on the busy runqueue (because we don't call
3558 * move_tasks). 3558 * move_tasks).
3559 */ 3559 */
3560 if (sd->balance_interval < sd->max_interval) 3560 if (sd->balance_interval < sd->max_interval)
3561 sd->balance_interval *= 2; 3561 sd->balance_interval *= 2;
3562 } 3562 }
3563 3563
3564 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3564 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3565 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3565 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3566 ld_moved = -1; 3566 ld_moved = -1;
3567 3567
3568 goto out; 3568 goto out;
3569 3569
3570 out_balanced: 3570 out_balanced:
3571 schedstat_inc(sd, lb_balanced[idle]); 3571 schedstat_inc(sd, lb_balanced[idle]);
3572 3572
3573 sd->nr_balance_failed = 0; 3573 sd->nr_balance_failed = 0;
3574 3574
3575 out_one_pinned: 3575 out_one_pinned:
3576 /* tune up the balancing interval */ 3576 /* tune up the balancing interval */
3577 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 3577 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3578 (sd->balance_interval < sd->max_interval)) 3578 (sd->balance_interval < sd->max_interval))
3579 sd->balance_interval *= 2; 3579 sd->balance_interval *= 2;
3580 3580
3581 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3581 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3582 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3582 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3583 ld_moved = -1; 3583 ld_moved = -1;
3584 else 3584 else
3585 ld_moved = 0; 3585 ld_moved = 0;
3586 out: 3586 out:
3587 if (ld_moved) 3587 if (ld_moved)
3588 update_shares(sd); 3588 update_shares(sd);
3589 return ld_moved; 3589 return ld_moved;
3590 } 3590 }
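The interval handling above is a clamped exponential backoff: a successful balance snaps sd->balance_interval back to sd->min_interval, while pinned tasks or active balancing double it toward sd->max_interval. A minimal userspace sketch of that policy follows; the field names mirror struct sched_domain but this is illustrative, not kernel code.

	#include <stdio.h>

	struct dom {
		unsigned long balance_interval;	/* current interval, in ms */
		unsigned long min_interval;	/* reset target after a successful balance */
		unsigned long max_interval;	/* cap for the exponential backoff */
	};

	static void balance_succeeded(struct dom *d)
	{
		/* we were unbalanced, so balance again soon */
		d->balance_interval = d->min_interval;
	}

	static void balance_backed_off(struct dom *d)
	{
		/* pinned tasks or active balancing: check less often */
		if (d->balance_interval < d->max_interval)
			d->balance_interval *= 2;
	}

	int main(void)
	{
		struct dom d = { .balance_interval = 8, .min_interval = 8, .max_interval = 64 };

		for (int i = 0; i < 5; i++) {
			balance_backed_off(&d);
			printf("interval now %lu ms\n", d.balance_interval);
		}
		balance_succeeded(&d);
		printf("after success: %lu ms\n", d.balance_interval);
		return 0;
	}

As in the kernel code, the doubling stops once the cap is reached rather than being clipped exactly to it.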
3591 3591
3592 /* 3592 /*
3593 * Check this_cpu to ensure it is balanced within the domain. Attempt to move 3593 * Check this_cpu to ensure it is balanced within the domain. Attempt to move
3594 * tasks if there is an imbalance. 3594 * tasks if there is an imbalance.
3595 * 3595 *
3596 * Called from schedule() when this_rq is about to become idle (CPU_NEWLY_IDLE). 3596 * Called from schedule() when this_rq is about to become idle (CPU_NEWLY_IDLE).
3597 * this_rq is locked. 3597 * this_rq is locked.
3598 */ 3598 */
3599 static int 3599 static int
3600 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3600 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3601 cpumask_t *cpus) 3601 cpumask_t *cpus)
3602 { 3602 {
3603 struct sched_group *group; 3603 struct sched_group *group;
3604 struct rq *busiest = NULL; 3604 struct rq *busiest = NULL;
3605 unsigned long imbalance; 3605 unsigned long imbalance;
3606 int ld_moved = 0; 3606 int ld_moved = 0;
3607 int sd_idle = 0; 3607 int sd_idle = 0;
3608 int all_pinned = 0; 3608 int all_pinned = 0;
3609 3609
3610 cpus_setall(*cpus); 3610 cpus_setall(*cpus);
3611 3611
3612 /* 3612 /*
3613 	 * When power savings policy is enabled for the parent domain, an idle 3613 	 * When power savings policy is enabled for the parent domain, an idle
3614 	 * sibling can pick up load irrespective of busy siblings. In this case, 3614 	 * sibling can pick up load irrespective of busy siblings. In this case,
3615 	 * let the state of the idle sibling percolate up as IDLE, instead of 3615 	 * let the state of the idle sibling percolate up as IDLE, instead of
3616 	 * portraying it as CPU_NOT_IDLE. 3616 	 * portraying it as CPU_NOT_IDLE.
3617 */ 3617 */
3618 if (sd->flags & SD_SHARE_CPUPOWER && 3618 if (sd->flags & SD_SHARE_CPUPOWER &&
3619 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3619 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3620 sd_idle = 1; 3620 sd_idle = 1;
3621 3621
3622 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3622 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3623 redo: 3623 redo:
3624 update_shares_locked(this_rq, sd); 3624 update_shares_locked(this_rq, sd);
3625 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3625 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3626 &sd_idle, cpus, NULL); 3626 &sd_idle, cpus, NULL);
3627 if (!group) { 3627 if (!group) {
3628 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 3628 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
3629 goto out_balanced; 3629 goto out_balanced;
3630 } 3630 }
3631 3631
3632 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); 3632 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
3633 if (!busiest) { 3633 if (!busiest) {
3634 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 3634 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
3635 goto out_balanced; 3635 goto out_balanced;
3636 } 3636 }
3637 3637
3638 BUG_ON(busiest == this_rq); 3638 BUG_ON(busiest == this_rq);
3639 3639
3640 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 3640 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
3641 3641
3642 ld_moved = 0; 3642 ld_moved = 0;
3643 if (busiest->nr_running > 1) { 3643 if (busiest->nr_running > 1) {
3644 /* Attempt to move tasks */ 3644 /* Attempt to move tasks */
3645 double_lock_balance(this_rq, busiest); 3645 double_lock_balance(this_rq, busiest);
3646 /* this_rq->clock is already updated */ 3646 /* this_rq->clock is already updated */
3647 update_rq_clock(busiest); 3647 update_rq_clock(busiest);
3648 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3648 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3649 imbalance, sd, CPU_NEWLY_IDLE, 3649 imbalance, sd, CPU_NEWLY_IDLE,
3650 &all_pinned); 3650 &all_pinned);
3651 double_unlock_balance(this_rq, busiest); 3651 double_unlock_balance(this_rq, busiest);
3652 3652
3653 if (unlikely(all_pinned)) { 3653 if (unlikely(all_pinned)) {
3654 cpu_clear(cpu_of(busiest), *cpus); 3654 cpu_clear(cpu_of(busiest), *cpus);
3655 if (!cpus_empty(*cpus)) 3655 if (!cpus_empty(*cpus))
3656 goto redo; 3656 goto redo;
3657 } 3657 }
3658 } 3658 }
3659 3659
3660 if (!ld_moved) { 3660 if (!ld_moved) {
3661 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3661 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3662 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3662 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3663 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3663 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3664 return -1; 3664 return -1;
3665 } else 3665 } else
3666 sd->nr_balance_failed = 0; 3666 sd->nr_balance_failed = 0;
3667 3667
3668 update_shares_locked(this_rq, sd); 3668 update_shares_locked(this_rq, sd);
3669 return ld_moved; 3669 return ld_moved;
3670 3670
3671 out_balanced: 3671 out_balanced:
3672 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 3672 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
3673 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3673 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3674 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3674 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3675 return -1; 3675 return -1;
3676 sd->nr_balance_failed = 0; 3676 sd->nr_balance_failed = 0;
3677 3677
3678 return 0; 3678 return 0;
3679 } 3679 }
3680 3680
3681 /* 3681 /*
3682 * idle_balance is called by schedule() if this_cpu is about to become 3682 * idle_balance is called by schedule() if this_cpu is about to become
3683 * idle. Attempts to pull tasks from other CPUs. 3683 * idle. Attempts to pull tasks from other CPUs.
3684 */ 3684 */
3685 static void idle_balance(int this_cpu, struct rq *this_rq) 3685 static void idle_balance(int this_cpu, struct rq *this_rq)
3686 { 3686 {
3687 struct sched_domain *sd; 3687 struct sched_domain *sd;
3688 int pulled_task = 0; 3688 int pulled_task = 0;
3689 unsigned long next_balance = jiffies + HZ; 3689 unsigned long next_balance = jiffies + HZ;
3690 cpumask_t tmpmask; 3690 cpumask_t tmpmask;
3691 3691
3692 for_each_domain(this_cpu, sd) { 3692 for_each_domain(this_cpu, sd) {
3693 unsigned long interval; 3693 unsigned long interval;
3694 3694
3695 if (!(sd->flags & SD_LOAD_BALANCE)) 3695 if (!(sd->flags & SD_LOAD_BALANCE))
3696 continue; 3696 continue;
3697 3697
3698 if (sd->flags & SD_BALANCE_NEWIDLE) 3698 if (sd->flags & SD_BALANCE_NEWIDLE)
3699 			/* If we've pulled tasks over, stop searching: */ 3699 			/* If we've pulled tasks over, stop searching: */
3700 pulled_task = load_balance_newidle(this_cpu, this_rq, 3700 pulled_task = load_balance_newidle(this_cpu, this_rq,
3701 sd, &tmpmask); 3701 sd, &tmpmask);
3702 3702
3703 interval = msecs_to_jiffies(sd->balance_interval); 3703 interval = msecs_to_jiffies(sd->balance_interval);
3704 if (time_after(next_balance, sd->last_balance + interval)) 3704 if (time_after(next_balance, sd->last_balance + interval))
3705 next_balance = sd->last_balance + interval; 3705 next_balance = sd->last_balance + interval;
3706 if (pulled_task) 3706 if (pulled_task)
3707 break; 3707 break;
3708 } 3708 }
3709 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 3709 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3710 /* 3710 /*
3711 * We are going idle. next_balance may be set based on 3711 * We are going idle. next_balance may be set based on
3712 * a busy processor. So reset next_balance. 3712 * a busy processor. So reset next_balance.
3713 */ 3713 */
3714 this_rq->next_balance = next_balance; 3714 this_rq->next_balance = next_balance;
3715 } 3715 }
3716 } 3716 }
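idle_balance() records the earliest (last_balance + interval) it sees while walking the domain hierarchy and installs it as the runqueue's next_balance once the cpu actually goes idle. A small sketch of that minimum computation follows, using plain integers in place of jiffies; the kernel must go through time_after() to survive counter wraparound, which this toy ignores.

	#include <stdio.h>

	struct dom { unsigned long last_balance, interval_ms; };

	static unsigned long earliest_next_balance(const struct dom *doms, int n,
						   unsigned long horizon)
	{
		unsigned long next = horizon;	/* default: far in the future */

		for (int i = 0; i < n; i++) {
			unsigned long due = doms[i].last_balance + doms[i].interval_ms;
			if (due < next)
				next = due;	/* keep the earliest due time */
		}
		return next;
	}

	int main(void)
	{
		struct dom doms[] = { { 100, 50 }, { 100, 20 }, { 90, 200 } };

		/* horizon of now + 1000 ms, analogous to jiffies + HZ above */
		printf("next balance at %lu\n",
		       earliest_next_balance(doms, 3, 100 + 1000));
		return 0;
	}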
3717 3717
3718 /* 3718 /*
3719 * active_load_balance is run by migration threads. It pushes running tasks 3719 * active_load_balance is run by migration threads. It pushes running tasks
3720 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3720 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3721 * running on each physical CPU where possible, and avoids physical / 3721 * running on each physical CPU where possible, and avoids physical /
3722 * logical imbalances. 3722 * logical imbalances.
3723 * 3723 *
3724 * Called with busiest_rq locked. 3724 * Called with busiest_rq locked.
3725 */ 3725 */
3726 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3726 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3727 { 3727 {
3728 int target_cpu = busiest_rq->push_cpu; 3728 int target_cpu = busiest_rq->push_cpu;
3729 struct sched_domain *sd; 3729 struct sched_domain *sd;
3730 struct rq *target_rq; 3730 struct rq *target_rq;
3731 3731
3732 /* Is there any task to move? */ 3732 /* Is there any task to move? */
3733 if (busiest_rq->nr_running <= 1) 3733 if (busiest_rq->nr_running <= 1)
3734 return; 3734 return;
3735 3735
3736 target_rq = cpu_rq(target_cpu); 3736 target_rq = cpu_rq(target_cpu);
3737 3737
3738 /* 3738 /*
3739 	 * This condition is "impossible"; if it occurs, 3739 	 * This condition is "impossible"; if it occurs,
3740 * we need to fix it. Originally reported by 3740 * we need to fix it. Originally reported by
3741 * Bjorn Helgaas on a 128-cpu setup. 3741 * Bjorn Helgaas on a 128-cpu setup.
3742 */ 3742 */
3743 BUG_ON(busiest_rq == target_rq); 3743 BUG_ON(busiest_rq == target_rq);
3744 3744
3745 /* move a task from busiest_rq to target_rq */ 3745 /* move a task from busiest_rq to target_rq */
3746 double_lock_balance(busiest_rq, target_rq); 3746 double_lock_balance(busiest_rq, target_rq);
3747 update_rq_clock(busiest_rq); 3747 update_rq_clock(busiest_rq);
3748 update_rq_clock(target_rq); 3748 update_rq_clock(target_rq);
3749 3749
3750 /* Search for an sd spanning us and the target CPU. */ 3750 /* Search for an sd spanning us and the target CPU. */
3751 for_each_domain(target_cpu, sd) { 3751 for_each_domain(target_cpu, sd) {
3752 if ((sd->flags & SD_LOAD_BALANCE) && 3752 if ((sd->flags & SD_LOAD_BALANCE) &&
3753 cpu_isset(busiest_cpu, sd->span)) 3753 cpu_isset(busiest_cpu, sd->span))
3754 break; 3754 break;
3755 } 3755 }
3756 3756
3757 if (likely(sd)) { 3757 if (likely(sd)) {
3758 schedstat_inc(sd, alb_count); 3758 schedstat_inc(sd, alb_count);
3759 3759
3760 if (move_one_task(target_rq, target_cpu, busiest_rq, 3760 if (move_one_task(target_rq, target_cpu, busiest_rq,
3761 sd, CPU_IDLE)) 3761 sd, CPU_IDLE))
3762 schedstat_inc(sd, alb_pushed); 3762 schedstat_inc(sd, alb_pushed);
3763 else 3763 else
3764 schedstat_inc(sd, alb_failed); 3764 schedstat_inc(sd, alb_failed);
3765 } 3765 }
3766 double_unlock_balance(busiest_rq, target_rq); 3766 double_unlock_balance(busiest_rq, target_rq);
3767 } 3767 }
3768 3768
3769 #ifdef CONFIG_NO_HZ 3769 #ifdef CONFIG_NO_HZ
3770 static struct { 3770 static struct {
3771 atomic_t load_balancer; 3771 atomic_t load_balancer;
3772 cpumask_t cpu_mask; 3772 cpumask_t cpu_mask;
3773 } nohz ____cacheline_aligned = { 3773 } nohz ____cacheline_aligned = {
3774 .load_balancer = ATOMIC_INIT(-1), 3774 .load_balancer = ATOMIC_INIT(-1),
3775 .cpu_mask = CPU_MASK_NONE, 3775 .cpu_mask = CPU_MASK_NONE,
3776 }; 3776 };
3777 3777
3778 /* 3778 /*
3779 * This routine tries to nominate the ilb (idle load balancing) 3779 * This routine tries to nominate the ilb (idle load balancing)
3780 * owner among the cpus whose ticks are stopped. The ilb owner does the idle 3780 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
3781 * load balancing on behalf of all those cpus. If all the cpus in the system 3781 * load balancing on behalf of all those cpus. If all the cpus in the system
3782 * go into this tickless mode, then there will be no ilb owner (as there is 3782 * go into this tickless mode, then there will be no ilb owner (as there is
3783 * no need for one) and all the cpus will sleep until the next wakeup event 3783 * no need for one) and all the cpus will sleep until the next wakeup event
3784 * arrives... 3784 * arrives...
3785 * 3785 *
3786 * For the ilb owner, the tick is not stopped, and this tick is used 3786 * For the ilb owner, the tick is not stopped, and this tick is used
3787 * for idle load balancing. The ilb owner also remains part of 3787 * for idle load balancing. The ilb owner also remains part of
3788 * nohz.cpu_mask. 3788 * nohz.cpu_mask.
3789 * 3789 *
3790 * While stopping the tick, this cpu becomes the ilb owner if there 3790 * While stopping the tick, this cpu becomes the ilb owner if there
3791 * is no other owner, and it remains the owner until that cpu becomes busy 3791 * is no other owner, and it remains the owner until that cpu becomes busy
3792 * or until all cpus in the system stop their ticks, at which point 3792 * or until all cpus in the system stop their ticks, at which point
3793 * there is no need for an ilb owner. 3793 * there is no need for an ilb owner.
3794 * 3794 *
3795 * When the ilb owner becomes busy, it nominates another owner during the 3795 * When the ilb owner becomes busy, it nominates another owner during the
3796 * next busy scheduler_tick(). 3796 * next busy scheduler_tick().
3797 */ 3797 */
3798 int select_nohz_load_balancer(int stop_tick) 3798 int select_nohz_load_balancer(int stop_tick)
3799 { 3799 {
3800 int cpu = smp_processor_id(); 3800 int cpu = smp_processor_id();
3801 3801
3802 if (stop_tick) { 3802 if (stop_tick) {
3803 cpu_set(cpu, nohz.cpu_mask); 3803 cpu_set(cpu, nohz.cpu_mask);
3804 cpu_rq(cpu)->in_nohz_recently = 1; 3804 cpu_rq(cpu)->in_nohz_recently = 1;
3805 3805
3806 /* 3806 /*
3807 * If we are going offline and still the leader, give up! 3807 * If we are going offline and still the leader, give up!
3808 */ 3808 */
3809 if (!cpu_active(cpu) && 3809 if (!cpu_active(cpu) &&
3810 atomic_read(&nohz.load_balancer) == cpu) { 3810 atomic_read(&nohz.load_balancer) == cpu) {
3811 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3811 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3812 BUG(); 3812 BUG();
3813 return 0; 3813 return 0;
3814 } 3814 }
3815 3815
3816 /* time for ilb owner also to sleep */ 3816 /* time for ilb owner also to sleep */
3817 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3817 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3818 if (atomic_read(&nohz.load_balancer) == cpu) 3818 if (atomic_read(&nohz.load_balancer) == cpu)
3819 atomic_set(&nohz.load_balancer, -1); 3819 atomic_set(&nohz.load_balancer, -1);
3820 return 0; 3820 return 0;
3821 } 3821 }
3822 3822
3823 if (atomic_read(&nohz.load_balancer) == -1) { 3823 if (atomic_read(&nohz.load_balancer) == -1) {
3824 /* make me the ilb owner */ 3824 /* make me the ilb owner */
3825 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 3825 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3826 return 1; 3826 return 1;
3827 } else if (atomic_read(&nohz.load_balancer) == cpu) 3827 } else if (atomic_read(&nohz.load_balancer) == cpu)
3828 return 1; 3828 return 1;
3829 } else { 3829 } else {
3830 if (!cpu_isset(cpu, nohz.cpu_mask)) 3830 if (!cpu_isset(cpu, nohz.cpu_mask))
3831 return 0; 3831 return 0;
3832 3832
3833 cpu_clear(cpu, nohz.cpu_mask); 3833 cpu_clear(cpu, nohz.cpu_mask);
3834 3834
3835 if (atomic_read(&nohz.load_balancer) == cpu) 3835 if (atomic_read(&nohz.load_balancer) == cpu)
3836 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3836 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3837 BUG(); 3837 BUG();
3838 } 3838 }
3839 return 0; 3839 return 0;
3840 } 3840 }
3841 #endif 3841 #endif
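Ownership of the ilb role is handed around with atomic compare-and-swap: a cpu becomes owner only if it swings nohz.load_balancer from -1 to its own id, and only the current owner may clear it back to -1. A userspace sketch of that election pattern using C11 atomics follows; the names are illustrative stand-ins, not the kernel's.

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

	static int try_claim(int cpu)
	{
		int expected = -1;

		/* succeeds for exactly one of any set of concurrent claimants */
		return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
	}

	static void release(int cpu)
	{
		int expected = cpu;

		/* only the current owner may clear the slot */
		atomic_compare_exchange_strong(&load_balancer, &expected, -1);
	}

	int main(void)
	{
		printf("cpu0 claims: %d\n", try_claim(0));	/* 1: won */
		printf("cpu1 claims: %d\n", try_claim(1));	/* 0: lost */
		release(0);
		printf("cpu1 claims: %d\n", try_claim(1));	/* 1: now wins */
		return 0;
	}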
3842 3842
3843 static DEFINE_SPINLOCK(balancing); 3843 static DEFINE_SPINLOCK(balancing);
3844 3844
3845 /* 3845 /*
3846 * It checks each scheduling domain to see if it is due to be balanced, 3846 * It checks each scheduling domain to see if it is due to be balanced,
3847 * and initiates a balancing operation if so. 3847 * and initiates a balancing operation if so.
3848 * 3848 *
3849 * Balancing parameters are set up in arch_init_sched_domains. 3849 * Balancing parameters are set up in arch_init_sched_domains.
3850 */ 3850 */
3851 static void rebalance_domains(int cpu, enum cpu_idle_type idle) 3851 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3852 { 3852 {
3853 int balance = 1; 3853 int balance = 1;
3854 struct rq *rq = cpu_rq(cpu); 3854 struct rq *rq = cpu_rq(cpu);
3855 unsigned long interval; 3855 unsigned long interval;
3856 struct sched_domain *sd; 3856 struct sched_domain *sd;
3857 /* Earliest time when we have to do rebalance again */ 3857 /* Earliest time when we have to do rebalance again */
3858 unsigned long next_balance = jiffies + 60*HZ; 3858 unsigned long next_balance = jiffies + 60*HZ;
3859 int update_next_balance = 0; 3859 int update_next_balance = 0;
3860 int need_serialize; 3860 int need_serialize;
3861 cpumask_t tmp; 3861 cpumask_t tmp;
3862 3862
3863 for_each_domain(cpu, sd) { 3863 for_each_domain(cpu, sd) {
3864 if (!(sd->flags & SD_LOAD_BALANCE)) 3864 if (!(sd->flags & SD_LOAD_BALANCE))
3865 continue; 3865 continue;
3866 3866
3867 interval = sd->balance_interval; 3867 interval = sd->balance_interval;
3868 if (idle != CPU_IDLE) 3868 if (idle != CPU_IDLE)
3869 interval *= sd->busy_factor; 3869 interval *= sd->busy_factor;
3870 3870
3871 /* scale ms to jiffies */ 3871 /* scale ms to jiffies */
3872 interval = msecs_to_jiffies(interval); 3872 interval = msecs_to_jiffies(interval);
3873 if (unlikely(!interval)) 3873 if (unlikely(!interval))
3874 interval = 1; 3874 interval = 1;
3875 if (interval > HZ*NR_CPUS/10) 3875 if (interval > HZ*NR_CPUS/10)
3876 interval = HZ*NR_CPUS/10; 3876 interval = HZ*NR_CPUS/10;
3877 3877
3878 need_serialize = sd->flags & SD_SERIALIZE; 3878 need_serialize = sd->flags & SD_SERIALIZE;
3879 3879
3880 if (need_serialize) { 3880 if (need_serialize) {
3881 if (!spin_trylock(&balancing)) 3881 if (!spin_trylock(&balancing))
3882 goto out; 3882 goto out;
3883 } 3883 }
3884 3884
3885 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3885 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3886 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3886 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
3887 /* 3887 /*
3888 * We've pulled tasks over so either we're no 3888 * We've pulled tasks over so either we're no
3889 * longer idle, or one of our SMT siblings is 3889 * longer idle, or one of our SMT siblings is
3890 * not idle. 3890 * not idle.
3891 */ 3891 */
3892 idle = CPU_NOT_IDLE; 3892 idle = CPU_NOT_IDLE;
3893 } 3893 }
3894 sd->last_balance = jiffies; 3894 sd->last_balance = jiffies;
3895 } 3895 }
3896 if (need_serialize) 3896 if (need_serialize)
3897 spin_unlock(&balancing); 3897 spin_unlock(&balancing);
3898 out: 3898 out:
3899 if (time_after(next_balance, sd->last_balance + interval)) { 3899 if (time_after(next_balance, sd->last_balance + interval)) {
3900 next_balance = sd->last_balance + interval; 3900 next_balance = sd->last_balance + interval;
3901 update_next_balance = 1; 3901 update_next_balance = 1;
3902 } 3902 }
3903 3903
3904 /* 3904 /*
3905 * Stop the load balance at this level. There is another 3905 * Stop the load balance at this level. There is another
3906 * CPU in our sched group which is doing load balancing more 3906 * CPU in our sched group which is doing load balancing more
3907 * actively. 3907 * actively.
3908 */ 3908 */
3909 if (!balance) 3909 if (!balance)
3910 break; 3910 break;
3911 } 3911 }
3912 3912
3913 /* 3913 /*
3914 * next_balance will be updated only when there is a need. 3914 * next_balance will be updated only when there is a need.
3915 	 * When the cpu is attached to the null domain, for example, it will 3915 	 * When the cpu is attached to the null domain, for example, it will
3916 	 * not be updated. 3916 	 * not be updated.
3917 */ 3917 */
3918 if (likely(update_next_balance)) 3918 if (likely(update_next_balance))
3919 rq->next_balance = next_balance; 3919 rq->next_balance = next_balance;
3920 } 3920 }
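The per-domain interval above is stretched by busy_factor when the cpu is not idle, converted to jiffies, and then clamped to the range [1, HZ*NR_CPUS/10]. A self-contained sketch of that computation follows; the constants and the rounding in msecs_to_jiffies() are approximate stand-ins, not the kernel's definitions.

	#include <stdio.h>

	#define HZ	250
	#define NR_CPUS	8

	static unsigned long msecs_to_jiffies(unsigned long ms)
	{
		return (ms * HZ + 999) / 1000;	/* round up; an approximation */
	}

	static unsigned long effective_interval(unsigned long base_ms,
						unsigned int busy_factor, int idle)
	{
		unsigned long interval = base_ms;

		if (!idle)
			interval *= busy_factor;	/* busy cpus balance less often */

		interval = msecs_to_jiffies(interval);
		if (!interval)
			interval = 1;			/* never a zero period */
		if (interval > HZ * NR_CPUS / 10)
			interval = HZ * NR_CPUS / 10;	/* global upper bound */
		return interval;
	}

	int main(void)
	{
		printf("idle: %lu jiffies\n", effective_interval(64, 32, 1));
		printf("busy: %lu jiffies\n", effective_interval(64, 32, 0));
		return 0;
	}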
3921 3921
3922 /* 3922 /*
3923 * run_rebalance_domains is triggered when needed from the scheduler tick. 3923 * run_rebalance_domains is triggered when needed from the scheduler tick.
3924 * In CONFIG_NO_HZ case, the idle load balance owner will do the 3924 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3925 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3925 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3926 */ 3926 */
3927 static void run_rebalance_domains(struct softirq_action *h) 3927 static void run_rebalance_domains(struct softirq_action *h)
3928 { 3928 {
3929 int this_cpu = smp_processor_id(); 3929 int this_cpu = smp_processor_id();
3930 struct rq *this_rq = cpu_rq(this_cpu); 3930 struct rq *this_rq = cpu_rq(this_cpu);
3931 enum cpu_idle_type idle = this_rq->idle_at_tick ? 3931 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3932 CPU_IDLE : CPU_NOT_IDLE; 3932 CPU_IDLE : CPU_NOT_IDLE;
3933 3933
3934 rebalance_domains(this_cpu, idle); 3934 rebalance_domains(this_cpu, idle);
3935 3935
3936 #ifdef CONFIG_NO_HZ 3936 #ifdef CONFIG_NO_HZ
3937 /* 3937 /*
3938 * If this cpu is the owner for idle load balancing, then do the 3938 * If this cpu is the owner for idle load balancing, then do the
3939 * balancing on behalf of the other idle cpus whose ticks are 3939 * balancing on behalf of the other idle cpus whose ticks are
3940 * stopped. 3940 * stopped.
3941 */ 3941 */
3942 if (this_rq->idle_at_tick && 3942 if (this_rq->idle_at_tick &&
3943 atomic_read(&nohz.load_balancer) == this_cpu) { 3943 atomic_read(&nohz.load_balancer) == this_cpu) {
3944 cpumask_t cpus = nohz.cpu_mask; 3944 cpumask_t cpus = nohz.cpu_mask;
3945 struct rq *rq; 3945 struct rq *rq;
3946 int balance_cpu; 3946 int balance_cpu;
3947 3947
3948 cpu_clear(this_cpu, cpus); 3948 cpu_clear(this_cpu, cpus);
3949 for_each_cpu_mask_nr(balance_cpu, cpus) { 3949 for_each_cpu_mask_nr(balance_cpu, cpus) {
3950 /* 3950 /*
3951 * If this cpu gets work to do, stop the load balancing 3951 * If this cpu gets work to do, stop the load balancing
3952 			 * work being done for other cpus. The next load 3952 			 * work being done for other cpus. The next load
3953 * balancing owner will pick it up. 3953 * balancing owner will pick it up.
3954 */ 3954 */
3955 if (need_resched()) 3955 if (need_resched())
3956 break; 3956 break;
3957 3957
3958 rebalance_domains(balance_cpu, CPU_IDLE); 3958 rebalance_domains(balance_cpu, CPU_IDLE);
3959 3959
3960 rq = cpu_rq(balance_cpu); 3960 rq = cpu_rq(balance_cpu);
3961 if (time_after(this_rq->next_balance, rq->next_balance)) 3961 if (time_after(this_rq->next_balance, rq->next_balance))
3962 this_rq->next_balance = rq->next_balance; 3962 this_rq->next_balance = rq->next_balance;
3963 } 3963 }
3964 } 3964 }
3965 #endif 3965 #endif
3966 } 3966 }
3967 3967
3968 /* 3968 /*
3969 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3969 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3970 * 3970 *
3971 * In case of CONFIG_NO_HZ, this is the place where we nominate a new 3971 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3972 * idle load balancing owner or decide to stop the periodic load balancing, 3972 * idle load balancing owner or decide to stop the periodic load balancing,
3973 * if the whole system is idle. 3973 * if the whole system is idle.
3974 */ 3974 */
3975 static inline void trigger_load_balance(struct rq *rq, int cpu) 3975 static inline void trigger_load_balance(struct rq *rq, int cpu)
3976 { 3976 {
3977 #ifdef CONFIG_NO_HZ 3977 #ifdef CONFIG_NO_HZ
3978 /* 3978 /*
3979 * If we were in the nohz mode recently and busy at the current 3979 * If we were in the nohz mode recently and busy at the current
3980 * scheduler tick, then check if we need to nominate new idle 3980 * scheduler tick, then check if we need to nominate new idle
3981 * load balancer. 3981 * load balancer.
3982 */ 3982 */
3983 if (rq->in_nohz_recently && !rq->idle_at_tick) { 3983 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3984 rq->in_nohz_recently = 0; 3984 rq->in_nohz_recently = 0;
3985 3985
3986 if (atomic_read(&nohz.load_balancer) == cpu) { 3986 if (atomic_read(&nohz.load_balancer) == cpu) {
3987 cpu_clear(cpu, nohz.cpu_mask); 3987 cpu_clear(cpu, nohz.cpu_mask);
3988 atomic_set(&nohz.load_balancer, -1); 3988 atomic_set(&nohz.load_balancer, -1);
3989 } 3989 }
3990 3990
3991 if (atomic_read(&nohz.load_balancer) == -1) { 3991 if (atomic_read(&nohz.load_balancer) == -1) {
3992 /* 3992 /*
3993 * simple selection for now: Nominate the 3993 * simple selection for now: Nominate the
3994 * first cpu in the nohz list to be the next 3994 * first cpu in the nohz list to be the next
3995 * ilb owner. 3995 * ilb owner.
3996 * 3996 *
3997 * TBD: Traverse the sched domains and nominate 3997 * TBD: Traverse the sched domains and nominate
3998 * the nearest cpu in the nohz.cpu_mask. 3998 * the nearest cpu in the nohz.cpu_mask.
3999 */ 3999 */
4000 int ilb = first_cpu(nohz.cpu_mask); 4000 int ilb = first_cpu(nohz.cpu_mask);
4001 4001
4002 if (ilb < nr_cpu_ids) 4002 if (ilb < nr_cpu_ids)
4003 resched_cpu(ilb); 4003 resched_cpu(ilb);
4004 } 4004 }
4005 } 4005 }
4006 4006
4007 /* 4007 /*
4008 * If this cpu is idle and doing idle load balancing for all the 4008 * If this cpu is idle and doing idle load balancing for all the
4009 * cpus with ticks stopped, is it time for that to stop? 4009 * cpus with ticks stopped, is it time for that to stop?
4010 */ 4010 */
4011 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4011 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4012 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4012 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
4013 resched_cpu(cpu); 4013 resched_cpu(cpu);
4014 return; 4014 return;
4015 } 4015 }
4016 4016
4017 /* 4017 /*
4018 * If this cpu is idle and the idle load balancing is done by 4018 * If this cpu is idle and the idle load balancing is done by
4019 	 * someone else, then there is no need to raise the SCHED_SOFTIRQ. 4019 	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4020 */ 4020 */
4021 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4021 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4022 cpu_isset(cpu, nohz.cpu_mask)) 4022 cpu_isset(cpu, nohz.cpu_mask))
4023 return; 4023 return;
4024 #endif 4024 #endif
4025 if (time_after_eq(jiffies, rq->next_balance)) 4025 if (time_after_eq(jiffies, rq->next_balance))
4026 raise_softirq(SCHED_SOFTIRQ); 4026 raise_softirq(SCHED_SOFTIRQ);
4027 } 4027 }
4028 4028
4029 #else /* CONFIG_SMP */ 4029 #else /* CONFIG_SMP */
4030 4030
4031 /* 4031 /*
4032 * on UP we do not need to balance between CPUs: 4032 * on UP we do not need to balance between CPUs:
4033 */ 4033 */
4034 static inline void idle_balance(int cpu, struct rq *rq) 4034 static inline void idle_balance(int cpu, struct rq *rq)
4035 { 4035 {
4036 } 4036 }
4037 4037
4038 #endif 4038 #endif
4039 4039
4040 DEFINE_PER_CPU(struct kernel_stat, kstat); 4040 DEFINE_PER_CPU(struct kernel_stat, kstat);
4041 4041
4042 EXPORT_PER_CPU_SYMBOL(kstat); 4042 EXPORT_PER_CPU_SYMBOL(kstat);
4043 4043
4044 /* 4044 /*
4045 * Return any ns on the sched_clock that have not yet been banked in 4045 * Return any ns on the sched_clock that have not yet been banked in
4046 * @p in case that task is currently running. 4046 * @p in case that task is currently running.
4047 */ 4047 */
4048 unsigned long long task_delta_exec(struct task_struct *p) 4048 unsigned long long task_delta_exec(struct task_struct *p)
4049 { 4049 {
4050 unsigned long flags; 4050 unsigned long flags;
4051 struct rq *rq; 4051 struct rq *rq;
4052 u64 ns = 0; 4052 u64 ns = 0;
4053 4053
4054 rq = task_rq_lock(p, &flags); 4054 rq = task_rq_lock(p, &flags);
4055 4055
4056 if (task_current(rq, p)) { 4056 if (task_current(rq, p)) {
4057 u64 delta_exec; 4057 u64 delta_exec;
4058 4058
4059 update_rq_clock(rq); 4059 update_rq_clock(rq);
4060 delta_exec = rq->clock - p->se.exec_start; 4060 delta_exec = rq->clock - p->se.exec_start;
4061 if ((s64)delta_exec > 0) 4061 if ((s64)delta_exec > 0)
4062 ns = delta_exec; 4062 ns = delta_exec;
4063 } 4063 }
4064 4064
4065 task_rq_unlock(rq, &flags); 4065 task_rq_unlock(rq, &flags);
4066 4066
4067 return ns; 4067 return ns;
4068 } 4068 }
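The (s64) cast above guards against p->se.exec_start reading ahead of rq->clock, in which case the unsigned difference would wrap to a huge value; the function reports zero instead. A minimal model of that guarded delta:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t unbanked_ns(uint64_t rq_clock, uint64_t exec_start)
	{
		/* reinterpret the wrapped unsigned difference as signed */
		int64_t delta = (int64_t)(rq_clock - exec_start);

		return delta > 0 ? (uint64_t)delta : 0;
	}

	int main(void)
	{
		printf("%llu\n", (unsigned long long)unbanked_ns(1000, 400));	/* 600 */
		printf("%llu\n", (unsigned long long)unbanked_ns(400, 1000));	/* 0 */
		return 0;
	}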
4069 4069
4070 /* 4070 /*
4071 * Account user cpu time to a process. 4071 * Account user cpu time to a process.
4072 * @p: the process that the cpu time gets accounted to 4072 * @p: the process that the cpu time gets accounted to
4073 * @cputime: the cpu time spent in user space since the last update 4073 * @cputime: the cpu time spent in user space since the last update
4074 */ 4074 */
4075 void account_user_time(struct task_struct *p, cputime_t cputime) 4075 void account_user_time(struct task_struct *p, cputime_t cputime)
4076 { 4076 {
4077 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4077 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4078 cputime64_t tmp; 4078 cputime64_t tmp;
4079 4079
4080 p->utime = cputime_add(p->utime, cputime); 4080 p->utime = cputime_add(p->utime, cputime);
4081 account_group_user_time(p, cputime); 4081 account_group_user_time(p, cputime);
4082 4082
4083 /* Add user time to cpustat. */ 4083 /* Add user time to cpustat. */
4084 tmp = cputime_to_cputime64(cputime); 4084 tmp = cputime_to_cputime64(cputime);
4085 if (TASK_NICE(p) > 0) 4085 if (TASK_NICE(p) > 0)
4086 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4086 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4087 else 4087 else
4088 cpustat->user = cputime64_add(cpustat->user, tmp); 4088 cpustat->user = cputime64_add(cpustat->user, tmp);
4089 /* Account for user time used */ 4089 /* Account for user time used */
4090 acct_update_integrals(p); 4090 acct_update_integrals(p);
4091 } 4091 }
4092 4092
4093 /* 4093 /*
4094 * Account guest cpu time to a process. 4094 * Account guest cpu time to a process.
4095 * @p: the process that the cpu time gets accounted to 4095 * @p: the process that the cpu time gets accounted to
4096 * @cputime: the cpu time spent in virtual machine since the last update 4096 * @cputime: the cpu time spent in virtual machine since the last update
4097 */ 4097 */
4098 static void account_guest_time(struct task_struct *p, cputime_t cputime) 4098 static void account_guest_time(struct task_struct *p, cputime_t cputime)
4099 { 4099 {
4100 cputime64_t tmp; 4100 cputime64_t tmp;
4101 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4101 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4102 4102
4103 tmp = cputime_to_cputime64(cputime); 4103 tmp = cputime_to_cputime64(cputime);
4104 4104
4105 p->utime = cputime_add(p->utime, cputime); 4105 p->utime = cputime_add(p->utime, cputime);
4106 account_group_user_time(p, cputime); 4106 account_group_user_time(p, cputime);
4107 p->gtime = cputime_add(p->gtime, cputime); 4107 p->gtime = cputime_add(p->gtime, cputime);
4108 4108
4109 cpustat->user = cputime64_add(cpustat->user, tmp); 4109 cpustat->user = cputime64_add(cpustat->user, tmp);
4110 cpustat->guest = cputime64_add(cpustat->guest, tmp); 4110 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4111 } 4111 }
4112 4112
4113 /* 4113 /*
4114 * Account scaled user cpu time to a process. 4114 * Account scaled user cpu time to a process.
4115 * @p: the process that the cpu time gets accounted to 4115 * @p: the process that the cpu time gets accounted to
4116 * @cputime: the cpu time spent in user space since the last update 4116 * @cputime: the cpu time spent in user space since the last update
4117 */ 4117 */
4118 void account_user_time_scaled(struct task_struct *p, cputime_t cputime) 4118 void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4119 { 4119 {
4120 p->utimescaled = cputime_add(p->utimescaled, cputime); 4120 p->utimescaled = cputime_add(p->utimescaled, cputime);
4121 } 4121 }
4122 4122
4123 /* 4123 /*
4124 * Account system cpu time to a process. 4124 * Account system cpu time to a process.
4125 * @p: the process that the cpu time gets accounted to 4125 * @p: the process that the cpu time gets accounted to
4126 * @hardirq_offset: the offset to subtract from hardirq_count() 4126 * @hardirq_offset: the offset to subtract from hardirq_count()
4127 * @cputime: the cpu time spent in kernel space since the last update 4127 * @cputime: the cpu time spent in kernel space since the last update
4128 */ 4128 */
4129 void account_system_time(struct task_struct *p, int hardirq_offset, 4129 void account_system_time(struct task_struct *p, int hardirq_offset,
4130 cputime_t cputime) 4130 cputime_t cputime)
4131 { 4131 {
4132 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4132 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4133 struct rq *rq = this_rq(); 4133 struct rq *rq = this_rq();
4134 cputime64_t tmp; 4134 cputime64_t tmp;
4135 4135
4136 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 4136 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4137 account_guest_time(p, cputime); 4137 account_guest_time(p, cputime);
4138 return; 4138 return;
4139 } 4139 }
4140 4140
4141 p->stime = cputime_add(p->stime, cputime); 4141 p->stime = cputime_add(p->stime, cputime);
4142 account_group_system_time(p, cputime); 4142 account_group_system_time(p, cputime);
4143 4143
4144 /* Add system time to cpustat. */ 4144 /* Add system time to cpustat. */
4145 tmp = cputime_to_cputime64(cputime); 4145 tmp = cputime_to_cputime64(cputime);
4146 if (hardirq_count() - hardirq_offset) 4146 if (hardirq_count() - hardirq_offset)
4147 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4147 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4148 else if (softirq_count()) 4148 else if (softirq_count())
4149 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4149 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4150 else if (p != rq->idle) 4150 else if (p != rq->idle)
4151 cpustat->system = cputime64_add(cpustat->system, tmp); 4151 cpustat->system = cputime64_add(cpustat->system, tmp);
4152 else if (atomic_read(&rq->nr_iowait) > 0) 4152 else if (atomic_read(&rq->nr_iowait) > 0)
4153 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4153 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4154 else 4154 else
4155 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4155 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4156 /* Account for system time used */ 4156 /* Account for system time used */
4157 acct_update_integrals(p); 4157 acct_update_integrals(p);
4158 } 4158 }
4159 4159
4160 /* 4160 /*
4161 * Account scaled system cpu time to a process. 4161 * Account scaled system cpu time to a process.
4162 * @p: the process that the cpu time gets accounted to 4162 * @p: the process that the cpu time gets accounted to
4163 * @hardirq_offset: the offset to subtract from hardirq_count() 4163 * @hardirq_offset: the offset to subtract from hardirq_count()
4164 * @cputime: the cpu time spent in kernel space since the last update 4164 * @cputime: the cpu time spent in kernel space since the last update
4165 */ 4165 */
4166 void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 4166 void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
4167 { 4167 {
4168 p->stimescaled = cputime_add(p->stimescaled, cputime); 4168 p->stimescaled = cputime_add(p->stimescaled, cputime);
4169 } 4169 }
4170 4170
4171 /* 4171 /*
4172 * Account for involuntary wait time. 4172 * Account for involuntary wait time.
4173 * @p: the process from which the cpu time has been stolen 4173 * @p: the process from which the cpu time has been stolen
4174 * @steal: the cpu time spent in involuntary wait 4174 * @steal: the cpu time spent in involuntary wait
4175 */ 4175 */
4176 void account_steal_time(struct task_struct *p, cputime_t steal) 4176 void account_steal_time(struct task_struct *p, cputime_t steal)
4177 { 4177 {
4178 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4178 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4179 cputime64_t tmp = cputime_to_cputime64(steal); 4179 cputime64_t tmp = cputime_to_cputime64(steal);
4180 struct rq *rq = this_rq(); 4180 struct rq *rq = this_rq();
4181 4181
4182 if (p == rq->idle) { 4182 if (p == rq->idle) {
4183 p->stime = cputime_add(p->stime, steal); 4183 p->stime = cputime_add(p->stime, steal);
4184 account_group_system_time(p, steal); 4184 account_group_system_time(p, steal);
4185 if (atomic_read(&rq->nr_iowait) > 0) 4185 if (atomic_read(&rq->nr_iowait) > 0)
4186 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4186 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4187 else 4187 else
4188 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4188 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4189 } else 4189 } else
4190 cpustat->steal = cputime64_add(cpustat->steal, tmp); 4190 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4191 } 4191 }
4192 4192
4193 /* 4193 /*
4194 * Use precise platform statistics if available: 4194 * Use precise platform statistics if available:
4195 */ 4195 */
4196 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 4196 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
4197 cputime_t task_utime(struct task_struct *p) 4197 cputime_t task_utime(struct task_struct *p)
4198 { 4198 {
4199 return p->utime; 4199 return p->utime;
4200 } 4200 }
4201 4201
4202 cputime_t task_stime(struct task_struct *p) 4202 cputime_t task_stime(struct task_struct *p)
4203 { 4203 {
4204 return p->stime; 4204 return p->stime;
4205 } 4205 }
4206 #else 4206 #else
4207 cputime_t task_utime(struct task_struct *p) 4207 cputime_t task_utime(struct task_struct *p)
4208 { 4208 {
4209 clock_t utime = cputime_to_clock_t(p->utime), 4209 clock_t utime = cputime_to_clock_t(p->utime),
4210 total = utime + cputime_to_clock_t(p->stime); 4210 total = utime + cputime_to_clock_t(p->stime);
4211 u64 temp; 4211 u64 temp;
4212 4212
4213 /* 4213 /*
4214 * Use CFS's precise accounting: 4214 * Use CFS's precise accounting:
4215 */ 4215 */
4216 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 4216 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4217 4217
4218 if (total) { 4218 if (total) {
4219 temp *= utime; 4219 temp *= utime;
4220 do_div(temp, total); 4220 do_div(temp, total);
4221 } 4221 }
4222 utime = (clock_t)temp; 4222 utime = (clock_t)temp;
4223 4223
4224 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 4224 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4225 return p->prev_utime; 4225 return p->prev_utime;
4226 } 4226 }
4227 4227
4228 cputime_t task_stime(struct task_struct *p) 4228 cputime_t task_stime(struct task_struct *p)
4229 { 4229 {
4230 clock_t stime; 4230 clock_t stime;
4231 4231
4232 /* 4232 /*
4233 * Use CFS's precise accounting. (we subtract utime from 4233 * Use CFS's precise accounting. (we subtract utime from
4234 * the total, to make sure the total observed by userspace 4234 * the total, to make sure the total observed by userspace
4235 * grows monotonically - apps rely on that): 4235 * grows monotonically - apps rely on that):
4236 */ 4236 */
4237 stime = nsec_to_clock_t(p->se.sum_exec_runtime) - 4237 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4238 cputime_to_clock_t(task_utime(p)); 4238 cputime_to_clock_t(task_utime(p));
4239 4239
4240 if (stime >= 0) 4240 if (stime >= 0)
4241 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 4241 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4242 4242
4243 return p->prev_stime; 4243 return p->prev_stime;
4244 } 4244 }
4245 #endif 4245 #endif
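The !CONFIG_VIRT_CPU_ACCOUNTING variants split the precisely accounted sum_exec_runtime in proportion to the sampled utime:stime ratio, then clamp against the previously returned value so the totals userspace observes never move backwards. A userspace sketch of that scheme follows, with integer types simplified; the kernel version goes through clock_t conversions and do_div().

	#include <stdint.h>
	#include <stdio.h>

	struct acct {
		uint64_t prev_utime;	/* last value handed to userspace */
	};

	static uint64_t report_utime(struct acct *a, uint64_t sum_exec,
				     uint64_t utime_ticks, uint64_t stime_ticks)
	{
		uint64_t total = utime_ticks + stime_ticks;
		/* scale precise runtime by the sampled user fraction */
		uint64_t utime = total ? sum_exec * utime_ticks / total : sum_exec;

		if (utime > a->prev_utime)	/* monotonic: only move forward */
			a->prev_utime = utime;
		return a->prev_utime;
	}

	int main(void)
	{
		struct acct a = { 0 };

		printf("%llu\n", (unsigned long long)report_utime(&a, 1000, 3, 1)); /* 750 */
		/* a later sample with a smaller ratio must not regress */
		printf("%llu\n", (unsigned long long)report_utime(&a, 1100, 1, 1)); /* 750 */
		return 0;
	}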
4246 4246
4247 inline cputime_t task_gtime(struct task_struct *p) 4247 inline cputime_t task_gtime(struct task_struct *p)
4248 { 4248 {
4249 return p->gtime; 4249 return p->gtime;
4250 } 4250 }
4251 4251
4252 /* 4252 /*
4253 * This function gets called by the timer code, with HZ frequency. 4253 * This function gets called by the timer code, with HZ frequency.
4254 * We call it with interrupts disabled. 4254 * We call it with interrupts disabled.
4255 * 4255 *
4256 * It also gets called by the fork code, when changing the parent's 4256 * It also gets called by the fork code, when changing the parent's
4257 * timeslices. 4257 * timeslices.
4258 */ 4258 */
4259 void scheduler_tick(void) 4259 void scheduler_tick(void)
4260 { 4260 {
4261 int cpu = smp_processor_id(); 4261 int cpu = smp_processor_id();
4262 struct rq *rq = cpu_rq(cpu); 4262 struct rq *rq = cpu_rq(cpu);
4263 struct task_struct *curr = rq->curr; 4263 struct task_struct *curr = rq->curr;
4264 4264
4265 sched_clock_tick(); 4265 sched_clock_tick();
4266 4266
4267 spin_lock(&rq->lock); 4267 spin_lock(&rq->lock);
4268 update_rq_clock(rq); 4268 update_rq_clock(rq);
4269 update_cpu_load(rq); 4269 update_cpu_load(rq);
4270 curr->sched_class->task_tick(rq, curr, 0); 4270 curr->sched_class->task_tick(rq, curr, 0);
4271 spin_unlock(&rq->lock); 4271 spin_unlock(&rq->lock);
4272 4272
4273 #ifdef CONFIG_SMP 4273 #ifdef CONFIG_SMP
4274 rq->idle_at_tick = idle_cpu(cpu); 4274 rq->idle_at_tick = idle_cpu(cpu);
4275 trigger_load_balance(rq, cpu); 4275 trigger_load_balance(rq, cpu);
4276 #endif 4276 #endif
4277 } 4277 }
4278 4278
4279 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4279 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4280 defined(CONFIG_PREEMPT_TRACER)) 4280 defined(CONFIG_PREEMPT_TRACER))
4281 4281
4282 static inline unsigned long get_parent_ip(unsigned long addr) 4282 static inline unsigned long get_parent_ip(unsigned long addr)
4283 { 4283 {
4284 if (in_lock_functions(addr)) { 4284 if (in_lock_functions(addr)) {
4285 addr = CALLER_ADDR2; 4285 addr = CALLER_ADDR2;
4286 if (in_lock_functions(addr)) 4286 if (in_lock_functions(addr))
4287 addr = CALLER_ADDR3; 4287 addr = CALLER_ADDR3;
4288 } 4288 }
4289 return addr; 4289 return addr;
4290 } 4290 }
4291 4291
4292 void __kprobes add_preempt_count(int val) 4292 void __kprobes add_preempt_count(int val)
4293 { 4293 {
4294 #ifdef CONFIG_DEBUG_PREEMPT 4294 #ifdef CONFIG_DEBUG_PREEMPT
4295 /* 4295 /*
4296 * Underflow? 4296 * Underflow?
4297 */ 4297 */
4298 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4298 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4299 return; 4299 return;
4300 #endif 4300 #endif
4301 preempt_count() += val; 4301 preempt_count() += val;
4302 #ifdef CONFIG_DEBUG_PREEMPT 4302 #ifdef CONFIG_DEBUG_PREEMPT
4303 /* 4303 /*
4304 * Spinlock count overflowing soon? 4304 * Spinlock count overflowing soon?
4305 */ 4305 */
4306 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4306 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4307 PREEMPT_MASK - 10); 4307 PREEMPT_MASK - 10);
4308 #endif 4308 #endif
4309 if (preempt_count() == val) 4309 if (preempt_count() == val)
4310 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4310 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4311 } 4311 }
4312 EXPORT_SYMBOL(add_preempt_count); 4312 EXPORT_SYMBOL(add_preempt_count);
4313 4313
4314 void __kprobes sub_preempt_count(int val) 4314 void __kprobes sub_preempt_count(int val)
4315 { 4315 {
4316 #ifdef CONFIG_DEBUG_PREEMPT 4316 #ifdef CONFIG_DEBUG_PREEMPT
4317 /* 4317 /*
4318 * Underflow? 4318 * Underflow?
4319 */ 4319 */
4320 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4320 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4321 return; 4321 return;
4322 /* 4322 /*
4323 * Is the spinlock portion underflowing? 4323 * Is the spinlock portion underflowing?
4324 */ 4324 */
4325 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4325 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4326 !(preempt_count() & PREEMPT_MASK))) 4326 !(preempt_count() & PREEMPT_MASK)))
4327 return; 4327 return;
4328 #endif 4328 #endif
4329 4329
4330 if (preempt_count() == val) 4330 if (preempt_count() == val)
4331 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 4331 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4332 preempt_count() -= val; 4332 preempt_count() -= val;
4333 } 4333 }
4334 EXPORT_SYMBOL(sub_preempt_count); 4334 EXPORT_SYMBOL(sub_preempt_count);
4335 4335
4336 #endif 4336 #endif
4337 4337
4338 /* 4338 /*
4339 * Print scheduling while atomic bug: 4339 * Print scheduling while atomic bug:
4340 */ 4340 */
4341 static noinline void __schedule_bug(struct task_struct *prev) 4341 static noinline void __schedule_bug(struct task_struct *prev)
4342 { 4342 {
4343 struct pt_regs *regs = get_irq_regs(); 4343 struct pt_regs *regs = get_irq_regs();
4344 4344
4345 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 4345 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4346 prev->comm, prev->pid, preempt_count()); 4346 prev->comm, prev->pid, preempt_count());
4347 4347
4348 debug_show_held_locks(prev); 4348 debug_show_held_locks(prev);
4349 print_modules(); 4349 print_modules();
4350 if (irqs_disabled()) 4350 if (irqs_disabled())
4351 print_irqtrace_events(prev); 4351 print_irqtrace_events(prev);
4352 4352
4353 if (regs) 4353 if (regs)
4354 show_regs(regs); 4354 show_regs(regs);
4355 else 4355 else
4356 dump_stack(); 4356 dump_stack();
4357 } 4357 }
4358 4358
4359 /* 4359 /*
4360 * Various schedule()-time debugging checks and statistics: 4360 * Various schedule()-time debugging checks and statistics:
4361 */ 4361 */
4362 static inline void schedule_debug(struct task_struct *prev) 4362 static inline void schedule_debug(struct task_struct *prev)
4363 { 4363 {
4364 /* 4364 /*
4365 * Test if we are atomic. Since do_exit() needs to call into 4365 * Test if we are atomic. Since do_exit() needs to call into
4366 * schedule() atomically, we ignore that path for now. 4366 * schedule() atomically, we ignore that path for now.
4367 * Otherwise, whine if we are scheduling when we should not be. 4367 * Otherwise, whine if we are scheduling when we should not be.
4368 */ 4368 */
4369 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4369 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4370 __schedule_bug(prev); 4370 __schedule_bug(prev);
4371 4371
4372 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4372 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4373 4373
4374 schedstat_inc(this_rq(), sched_count); 4374 schedstat_inc(this_rq(), sched_count);
4375 #ifdef CONFIG_SCHEDSTATS 4375 #ifdef CONFIG_SCHEDSTATS
4376 if (unlikely(prev->lock_depth >= 0)) { 4376 if (unlikely(prev->lock_depth >= 0)) {
4377 schedstat_inc(this_rq(), bkl_count); 4377 schedstat_inc(this_rq(), bkl_count);
4378 schedstat_inc(prev, sched_info.bkl_count); 4378 schedstat_inc(prev, sched_info.bkl_count);
4379 } 4379 }
4380 #endif 4380 #endif
4381 } 4381 }
4382 4382
4383 /* 4383 /*
4384 * Pick up the highest-prio task: 4384 * Pick up the highest-prio task:
4385 */ 4385 */
4386 static inline struct task_struct * 4386 static inline struct task_struct *
4387 pick_next_task(struct rq *rq, struct task_struct *prev) 4387 pick_next_task(struct rq *rq, struct task_struct *prev)
4388 { 4388 {
4389 const struct sched_class *class; 4389 const struct sched_class *class;
4390 struct task_struct *p; 4390 struct task_struct *p;
4391 4391
4392 /* 4392 /*
4393 * Optimization: we know that if all tasks are in 4393 * Optimization: we know that if all tasks are in
4394 * the fair class we can call that function directly: 4394 * the fair class we can call that function directly:
4395 */ 4395 */
4396 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4396 if (likely(rq->nr_running == rq->cfs.nr_running)) {
4397 p = fair_sched_class.pick_next_task(rq); 4397 p = fair_sched_class.pick_next_task(rq);
4398 if (likely(p)) 4398 if (likely(p))
4399 return p; 4399 return p;
4400 } 4400 }
4401 4401
4402 class = sched_class_highest; 4402 class = sched_class_highest;
4403 for ( ; ; ) { 4403 for ( ; ; ) {
4404 p = class->pick_next_task(rq); 4404 p = class->pick_next_task(rq);
4405 if (p) 4405 if (p)
4406 return p; 4406 return p;
4407 /* 4407 /*
4408 * Will never be NULL as the idle class always 4408 * Will never be NULL as the idle class always
4409 * returns a non-NULL p: 4409 * returns a non-NULL p:
4410 */ 4410 */
4411 class = class->next; 4411 class = class->next;
4412 } 4412 }
4413 } 4413 }
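pick_next_task() relies on the classes forming a priority-ordered singly linked list whose tail, the idle class, always yields a task, so the loop needs no explicit termination check. A toy model of that walk, with made-up class and task names:

	#include <stddef.h>
	#include <stdio.h>

	struct sched_class_m {
		const char *name;
		const char *(*pick)(void);
		const struct sched_class_m *next;
	};

	static const char *pick_rt(void)   { return NULL; }	/* no RT tasks queued */
	static const char *pick_fair(void) { return "task-a"; }
	static const char *pick_idle(void) { return "swapper"; }	/* never NULL */

	static const struct sched_class_m idle_c = { "idle", pick_idle, NULL };
	static const struct sched_class_m fair_c = { "fair", pick_fair, &idle_c };
	static const struct sched_class_m rt_c   = { "rt",   pick_rt,   &fair_c };

	int main(void)
	{
		/* walk classes from highest priority; take the first offer */
		for (const struct sched_class_m *c = &rt_c; c; c = c->next) {
			const char *p = c->pick();
			if (p) {
				printf("%s picked %s\n", c->name, p);
				break;
			}
		}
		return 0;
	}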
4414 4414
4415 /* 4415 /*
4416 * schedule() is the main scheduler function. 4416 * schedule() is the main scheduler function.
4417 */ 4417 */
4418 asmlinkage void __sched schedule(void) 4418 asmlinkage void __sched schedule(void)
4419 { 4419 {
4420 struct task_struct *prev, *next; 4420 struct task_struct *prev, *next;
4421 unsigned long *switch_count; 4421 unsigned long *switch_count;
4422 struct rq *rq; 4422 struct rq *rq;
4423 int cpu; 4423 int cpu;
4424 4424
4425 need_resched: 4425 need_resched:
4426 preempt_disable(); 4426 preempt_disable();
4427 cpu = smp_processor_id(); 4427 cpu = smp_processor_id();
4428 rq = cpu_rq(cpu); 4428 rq = cpu_rq(cpu);
4429 rcu_qsctr_inc(cpu); 4429 rcu_qsctr_inc(cpu);
4430 prev = rq->curr; 4430 prev = rq->curr;
4431 switch_count = &prev->nivcsw; 4431 switch_count = &prev->nivcsw;
4432 4432
4433 release_kernel_lock(prev); 4433 release_kernel_lock(prev);
4434 need_resched_nonpreemptible: 4434 need_resched_nonpreemptible:
4435 4435
4436 schedule_debug(prev); 4436 schedule_debug(prev);
4437 4437
4438 if (sched_feat(HRTICK)) 4438 if (sched_feat(HRTICK))
4439 hrtick_clear(rq); 4439 hrtick_clear(rq);
4440 4440
4441 spin_lock_irq(&rq->lock); 4441 spin_lock_irq(&rq->lock);
4442 update_rq_clock(rq); 4442 update_rq_clock(rq);
4443 clear_tsk_need_resched(prev); 4443 clear_tsk_need_resched(prev);
4444 4444
4445 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4445 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4446 if (unlikely(signal_pending_state(prev->state, prev))) 4446 if (unlikely(signal_pending_state(prev->state, prev)))
4447 prev->state = TASK_RUNNING; 4447 prev->state = TASK_RUNNING;
4448 else 4448 else
4449 deactivate_task(rq, prev, 1); 4449 deactivate_task(rq, prev, 1);
4450 switch_count = &prev->nvcsw; 4450 switch_count = &prev->nvcsw;
4451 } 4451 }
4452 4452
4453 #ifdef CONFIG_SMP 4453 #ifdef CONFIG_SMP
4454 if (prev->sched_class->pre_schedule) 4454 if (prev->sched_class->pre_schedule)
4455 prev->sched_class->pre_schedule(rq, prev); 4455 prev->sched_class->pre_schedule(rq, prev);
4456 #endif 4456 #endif
4457 4457
4458 if (unlikely(!rq->nr_running)) 4458 if (unlikely(!rq->nr_running))
4459 idle_balance(cpu, rq); 4459 idle_balance(cpu, rq);
4460 4460
4461 prev->sched_class->put_prev_task(rq, prev); 4461 prev->sched_class->put_prev_task(rq, prev);
4462 next = pick_next_task(rq, prev); 4462 next = pick_next_task(rq, prev);
4463 4463
4464 if (likely(prev != next)) { 4464 if (likely(prev != next)) {
4465 sched_info_switch(prev, next); 4465 sched_info_switch(prev, next);
4466 4466
4467 rq->nr_switches++; 4467 rq->nr_switches++;
4468 rq->curr = next; 4468 rq->curr = next;
4469 ++*switch_count; 4469 ++*switch_count;
4470 4470
4471 context_switch(rq, prev, next); /* unlocks the rq */ 4471 context_switch(rq, prev, next); /* unlocks the rq */
4472 /* 4472 /*
4473 * the context switch might have flipped the stack from under 4473 * the context switch might have flipped the stack from under
4474 * us, hence refresh the local variables. 4474 * us, hence refresh the local variables.
4475 */ 4475 */
4476 cpu = smp_processor_id(); 4476 cpu = smp_processor_id();
4477 rq = cpu_rq(cpu); 4477 rq = cpu_rq(cpu);
4478 } else 4478 } else
4479 spin_unlock_irq(&rq->lock); 4479 spin_unlock_irq(&rq->lock);
4480 4480
4481 if (unlikely(reacquire_kernel_lock(current) < 0)) 4481 if (unlikely(reacquire_kernel_lock(current) < 0))
4482 goto need_resched_nonpreemptible; 4482 goto need_resched_nonpreemptible;
4483 4483
4484 preempt_enable_no_resched(); 4484 preempt_enable_no_resched();
4485 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 4485 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4486 goto need_resched; 4486 goto need_resched;
4487 } 4487 }
4488 EXPORT_SYMBOL(schedule); 4488 EXPORT_SYMBOL(schedule);
4489 4489
4490 #ifdef CONFIG_PREEMPT 4490 #ifdef CONFIG_PREEMPT
4491 /* 4491 /*
4492 * this is the entry point to schedule() from in-kernel preemption 4492 * this is the entry point to schedule() from in-kernel preemption
4493 * off of preempt_enable. Kernel preemption off of a return from interrupt 4493 * off of preempt_enable. Kernel preemption off of a return from interrupt
4494 * is handled by preempt_schedule_irq() below, which calls schedule() directly. 4494 * is handled by preempt_schedule_irq() below, which calls schedule() directly.
4495 */ 4495 */
4496 asmlinkage void __sched preempt_schedule(void) 4496 asmlinkage void __sched preempt_schedule(void)
4497 { 4497 {
4498 struct thread_info *ti = current_thread_info(); 4498 struct thread_info *ti = current_thread_info();
4499 4499
4500 /* 4500 /*
4501 * If there is a non-zero preempt_count or interrupts are disabled, 4501 * If there is a non-zero preempt_count or interrupts are disabled,
4502 	 * we do not want to preempt the current task. Just return. 4502 	 * we do not want to preempt the current task. Just return.
4503 */ 4503 */
4504 if (likely(ti->preempt_count || irqs_disabled())) 4504 if (likely(ti->preempt_count || irqs_disabled()))
4505 return; 4505 return;
4506 4506
4507 do { 4507 do {
4508 add_preempt_count(PREEMPT_ACTIVE); 4508 add_preempt_count(PREEMPT_ACTIVE);
4509 schedule(); 4509 schedule();
4510 sub_preempt_count(PREEMPT_ACTIVE); 4510 sub_preempt_count(PREEMPT_ACTIVE);
4511 4511
4512 /* 4512 /*
4513 * Check again in case we missed a preemption opportunity 4513 * Check again in case we missed a preemption opportunity
4514 * between schedule and now. 4514 * between schedule and now.
4515 */ 4515 */
4516 barrier(); 4516 barrier();
4517 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 4517 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4518 } 4518 }
4519 EXPORT_SYMBOL(preempt_schedule); 4519 EXPORT_SYMBOL(preempt_schedule);
4520 4520
4521 /* 4521 /*
4522 * this is the entry point to schedule() from kernel preemption 4522 * this is the entry point to schedule() from kernel preemption
4523 * off of irq context. 4523 * off of irq context.
4524 * Note that this is called and returns with irqs disabled. This 4524 * Note that this is called and returns with irqs disabled. This
4525 * protects us against recursive calls from irq context. 4525 * protects us against recursive calls from irq context.
4526 */ 4526 */
4527 asmlinkage void __sched preempt_schedule_irq(void) 4527 asmlinkage void __sched preempt_schedule_irq(void)
4528 { 4528 {
4529 struct thread_info *ti = current_thread_info(); 4529 struct thread_info *ti = current_thread_info();
4530 4530
4531 /* Catch callers which need to be fixed */ 4531 /* Catch callers which need to be fixed */
4532 BUG_ON(ti->preempt_count || !irqs_disabled()); 4532 BUG_ON(ti->preempt_count || !irqs_disabled());
4533 4533
4534 do { 4534 do {
4535 add_preempt_count(PREEMPT_ACTIVE); 4535 add_preempt_count(PREEMPT_ACTIVE);
4536 local_irq_enable(); 4536 local_irq_enable();
4537 schedule(); 4537 schedule();
4538 local_irq_disable(); 4538 local_irq_disable();
4539 sub_preempt_count(PREEMPT_ACTIVE); 4539 sub_preempt_count(PREEMPT_ACTIVE);
4540 4540
4541 /* 4541 /*
4542 * Check again in case we missed a preemption opportunity 4542 * Check again in case we missed a preemption opportunity
4543 * between schedule and now. 4543 * between schedule and now.
4544 */ 4544 */
4545 barrier(); 4545 barrier();
4546 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 4546 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
4547 } 4547 }
4548 4548
4549 #endif /* CONFIG_PREEMPT */ 4549 #endif /* CONFIG_PREEMPT */
4550 4550
4551 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 4551 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
4552 void *key) 4552 void *key)
4553 { 4553 {
4554 return try_to_wake_up(curr->private, mode, sync); 4554 return try_to_wake_up(curr->private, mode, sync);
4555 } 4555 }
4556 EXPORT_SYMBOL(default_wake_function); 4556 EXPORT_SYMBOL(default_wake_function);
4557 4557
4558 /* 4558 /*
4559 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4559 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4560 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4560 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4561 * number) then we wake all the non-exclusive tasks and one exclusive task. 4561 * number) then we wake all the non-exclusive tasks and one exclusive task.
4562 * 4562 *
4563 * There are circumstances in which we can try to wake a task which has already 4563 * There are circumstances in which we can try to wake a task which has already
4564 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4564 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4565 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4565 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4566 */ 4566 */
4567 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4567 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4568 int nr_exclusive, int sync, void *key) 4568 int nr_exclusive, int sync, void *key)
4569 { 4569 {
4570 wait_queue_t *curr, *next; 4570 wait_queue_t *curr, *next;
4571 4571
4572 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 4572 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4573 unsigned flags = curr->flags; 4573 unsigned flags = curr->flags;
4574 4574
4575 if (curr->func(curr, mode, sync, key) && 4575 if (curr->func(curr, mode, sync, key) &&
4576 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 4576 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4577 break; 4577 break;
4578 } 4578 }
4579 } 4579 }
4580 4580
4581 /** 4581 /**
4582 * __wake_up - wake up threads blocked on a waitqueue. 4582 * __wake_up - wake up threads blocked on a waitqueue.
4583 * @q: the waitqueue 4583 * @q: the waitqueue
4584 * @mode: which threads 4584 * @mode: which threads
4585 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4585 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4586 * @key: is directly passed to the wakeup function 4586 * @key: is directly passed to the wakeup function
4587 */ 4587 */
4588 void __wake_up(wait_queue_head_t *q, unsigned int mode, 4588 void __wake_up(wait_queue_head_t *q, unsigned int mode,
4589 int nr_exclusive, void *key) 4589 int nr_exclusive, void *key)
4590 { 4590 {
4591 unsigned long flags; 4591 unsigned long flags;
4592 4592
4593 spin_lock_irqsave(&q->lock, flags); 4593 spin_lock_irqsave(&q->lock, flags);
4594 __wake_up_common(q, mode, nr_exclusive, 0, key); 4594 __wake_up_common(q, mode, nr_exclusive, 0, key);
4595 spin_unlock_irqrestore(&q->lock, flags); 4595 spin_unlock_irqrestore(&q->lock, flags);
4596 } 4596 }
4597 EXPORT_SYMBOL(__wake_up); 4597 EXPORT_SYMBOL(__wake_up);
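For context, a minimal sketch (not part of this commit) of both sides of the exclusive-wakeup rule described above: a consumer that queues itself with prepare_to_wait_exclusive() gets WQ_FLAG_EXCLUSIVE set, so the producer's wake_up(), which expands to __wake_up(q, TASK_NORMAL, 1, NULL), wakes all non-exclusive waiters but at most one exclusive one. request_pending() and submit_request() are hypothetical names.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(req_wait);

/* Consumer: sleeps exclusively, so one event wakes one consumer. */
static int wait_for_request(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&req_wait, &wait, TASK_INTERRUPTIBLE);
	if (!request_pending())		/* hypothetical predicate */
		schedule();
	finish_wait(&req_wait, &wait);
	return signal_pending(current) ? -ERESTARTSYS : 0;
}

/* Producer: wakes all non-exclusive waiters plus one exclusive one. */
static void submit_request(void)
{
	/* ... enqueue the request (hypothetical) ... */
	wake_up(&req_wait);
}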
4598 4598
4599 /* 4599 /*
4600 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 4600 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4601 */ 4601 */
4602 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 4602 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4603 { 4603 {
4604 __wake_up_common(q, mode, 1, 0, NULL); 4604 __wake_up_common(q, mode, 1, 0, NULL);
4605 } 4605 }
4606 4606
4607 /** 4607 /**
4608 * __wake_up_sync - wake up threads blocked on a waitqueue. 4608 * __wake_up_sync - wake up threads blocked on a waitqueue.
4609 * @q: the waitqueue 4609 * @q: the waitqueue
4610 * @mode: which threads 4610 * @mode: which threads
4611 * @nr_exclusive: how many wake-one or wake-many threads to wake up 4611 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4612 * 4612 *
4613 * The sync wakeup differs in that the waker knows that it will schedule 4613 * The sync wakeup differs in that the waker knows that it will schedule
4614 * away soon, so while the target thread will be woken up, it will not 4614 * away soon, so while the target thread will be woken up, it will not
4615 * be migrated to another CPU - ie. the two threads are 'synchronized' 4615 * be migrated to another CPU - ie. the two threads are 'synchronized'
4616 * with each other. This can prevent needless bouncing between CPUs. 4616 * with each other. This can prevent needless bouncing between CPUs.
4617 * 4617 *
4618 * On UP it can prevent extra preemption. 4618 * On UP it can prevent extra preemption.
4619 */ 4619 */
4620 void 4620 void
4621 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 4621 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4622 { 4622 {
4623 unsigned long flags; 4623 unsigned long flags;
4624 int sync = 1; 4624 int sync = 1;
4625 4625
4626 if (unlikely(!q)) 4626 if (unlikely(!q))
4627 return; 4627 return;
4628 4628
4629 if (unlikely(!nr_exclusive)) 4629 if (unlikely(!nr_exclusive))
4630 sync = 0; 4630 sync = 0;
4631 4631
4632 spin_lock_irqsave(&q->lock, flags); 4632 spin_lock_irqsave(&q->lock, flags);
4633 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 4633 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
4634 spin_unlock_irqrestore(&q->lock, flags); 4634 spin_unlock_irqrestore(&q->lock, flags);
4635 } 4635 }
4636 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4636 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
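A hedged illustration of where the sync hint pays off: a writer that hands data to a reader and then blocks itself almost immediately. wake_up_interruptible_sync() resolves to __wake_up_sync(q, TASK_INTERRUPTIBLE, 1); the buffer and function names below are hypothetical.

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(read_wait);

/* Writer side of a hypothetical pipe-like buffer. Since the writer is
 * about to sleep anyway, the sync wakeup hints the scheduler to run the
 * reader on this CPU rather than migrating it to another one. */
static void publish_and_sleep(void)
{
	/* ... copy data into the shared buffer (hypothetical) ... */
	wake_up_interruptible_sync(&read_wait);
	/* ... writer now sleeps until there is free space again ... */
}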
4637 4637
4638 /** 4638 /**
4639 * complete: - signals a single thread waiting on this completion 4639 * complete: - signals a single thread waiting on this completion
4640 * @x: holds the state of this particular completion 4640 * @x: holds the state of this particular completion
4641 * 4641 *
4642 * This will wake up a single thread waiting on this completion. Threads will be 4642 * This will wake up a single thread waiting on this completion. Threads will be
4643 * awakened in the same order in which they were queued. 4643 * awakened in the same order in which they were queued.
4644 * 4644 *
4645 * See also complete_all(), wait_for_completion() and related routines. 4645 * See also complete_all(), wait_for_completion() and related routines.
4646 */ 4646 */
4647 void complete(struct completion *x) 4647 void complete(struct completion *x)
4648 { 4648 {
4649 unsigned long flags; 4649 unsigned long flags;
4650 4650
4651 spin_lock_irqsave(&x->wait.lock, flags); 4651 spin_lock_irqsave(&x->wait.lock, flags);
4652 x->done++; 4652 x->done++;
4653 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4653 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4654 spin_unlock_irqrestore(&x->wait.lock, flags); 4654 spin_unlock_irqrestore(&x->wait.lock, flags);
4655 } 4655 }
4656 EXPORT_SYMBOL(complete); 4656 EXPORT_SYMBOL(complete);
4657 4657
4658 /** 4658 /**
4659 * complete_all: - signals all threads waiting on this completion 4659 * complete_all: - signals all threads waiting on this completion
4660 * @x: holds the state of this particular completion 4660 * @x: holds the state of this particular completion
4661 * 4661 *
4662 * This will wake up all threads waiting on this particular completion event. 4662 * This will wake up all threads waiting on this particular completion event.
4663 */ 4663 */
4664 void complete_all(struct completion *x) 4664 void complete_all(struct completion *x)
4665 { 4665 {
4666 unsigned long flags; 4666 unsigned long flags;
4667 4667
4668 spin_lock_irqsave(&x->wait.lock, flags); 4668 spin_lock_irqsave(&x->wait.lock, flags);
4669 x->done += UINT_MAX/2; 4669 x->done += UINT_MAX/2;
4670 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4670 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4671 spin_unlock_irqrestore(&x->wait.lock, flags); 4671 spin_unlock_irqrestore(&x->wait.lock, flags);
4672 } 4672 }
4673 EXPORT_SYMBOL(complete_all); 4673 EXPORT_SYMBOL(complete_all);
4674 4674
4675 static inline long __sched 4675 static inline long __sched
4676 do_wait_for_common(struct completion *x, long timeout, int state) 4676 do_wait_for_common(struct completion *x, long timeout, int state)
4677 { 4677 {
4678 if (!x->done) { 4678 if (!x->done) {
4679 DECLARE_WAITQUEUE(wait, current); 4679 DECLARE_WAITQUEUE(wait, current);
4680 4680
4681 wait.flags |= WQ_FLAG_EXCLUSIVE; 4681 wait.flags |= WQ_FLAG_EXCLUSIVE;
4682 __add_wait_queue_tail(&x->wait, &wait); 4682 __add_wait_queue_tail(&x->wait, &wait);
4683 do { 4683 do {
4684 if (signal_pending_state(state, current)) { 4684 if (signal_pending_state(state, current)) {
4685 timeout = -ERESTARTSYS; 4685 timeout = -ERESTARTSYS;
4686 break; 4686 break;
4687 } 4687 }
4688 __set_current_state(state); 4688 __set_current_state(state);
4689 spin_unlock_irq(&x->wait.lock); 4689 spin_unlock_irq(&x->wait.lock);
4690 timeout = schedule_timeout(timeout); 4690 timeout = schedule_timeout(timeout);
4691 spin_lock_irq(&x->wait.lock); 4691 spin_lock_irq(&x->wait.lock);
4692 } while (!x->done && timeout); 4692 } while (!x->done && timeout);
4693 __remove_wait_queue(&x->wait, &wait); 4693 __remove_wait_queue(&x->wait, &wait);
4694 if (!x->done) 4694 if (!x->done)
4695 return timeout; 4695 return timeout;
4696 } 4696 }
4697 x->done--; 4697 x->done--;
4698 return timeout ?: 1; 4698 return timeout ?: 1;
4699 } 4699 }
4700 4700
4701 static long __sched 4701 static long __sched
4702 wait_for_common(struct completion *x, long timeout, int state) 4702 wait_for_common(struct completion *x, long timeout, int state)
4703 { 4703 {
4704 might_sleep(); 4704 might_sleep();
4705 4705
4706 spin_lock_irq(&x->wait.lock); 4706 spin_lock_irq(&x->wait.lock);
4707 timeout = do_wait_for_common(x, timeout, state); 4707 timeout = do_wait_for_common(x, timeout, state);
4708 spin_unlock_irq(&x->wait.lock); 4708 spin_unlock_irq(&x->wait.lock);
4709 return timeout; 4709 return timeout;
4710 } 4710 }
4711 4711
4712 /** 4712 /**
4713 * wait_for_completion: - waits for completion of a task 4713 * wait_for_completion: - waits for completion of a task
4714 * @x: holds the state of this particular completion 4714 * @x: holds the state of this particular completion
4715 * 4715 *
4716 * This waits to be signaled for completion of a specific task. It is NOT 4716 * This waits to be signaled for completion of a specific task. It is NOT
4717 * interruptible and there is no timeout. 4717 * interruptible and there is no timeout.
4718 * 4718 *
4719 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 4719 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
4720 * and interrupt capability. Also see complete(). 4720 * and interrupt capability. Also see complete().
4721 */ 4721 */
4722 void __sched wait_for_completion(struct completion *x) 4722 void __sched wait_for_completion(struct completion *x)
4723 { 4723 {
4724 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4724 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4725 } 4725 }
4726 EXPORT_SYMBOL(wait_for_completion); 4726 EXPORT_SYMBOL(wait_for_completion);
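A minimal sketch of the usual complete()/wait_for_completion() pairing, assuming a hypothetical driver that spawns a kthread and waits for it to signal readiness (worker_fn and start_worker are made-up names):

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *data)
{
	/* ... hypothetical device bring-up ... */
	complete(&setup_done);		/* wakes one waiter, FIFO order */
	return 0;
}

static int start_worker(void)
{
	struct task_struct *tsk = kthread_run(worker_fn, NULL, "worker");

	if (IS_ERR(tsk))
		return PTR_ERR(tsk);
	wait_for_completion(&setup_done);  /* uninterruptible, no timeout */
	return 0;
}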
4727 4727
4728 /** 4728 /**
4729 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4729 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4730 * @x: holds the state of this particular completion 4730 * @x: holds the state of this particular completion
4731 * @timeout: timeout value in jiffies 4731 * @timeout: timeout value in jiffies
4732 * 4732 *
4733 * This waits for either a completion of a specific task to be signaled or for a 4733 * This waits for either a completion of a specific task to be signaled or for a
4734 * specified timeout to expire. The timeout is in jiffies. It is not 4734 * specified timeout to expire. The timeout is in jiffies. It is not
4735 * interruptible. 4735 * interruptible.
4736 */ 4736 */
4737 unsigned long __sched 4737 unsigned long __sched
4738 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4738 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4739 { 4739 {
4740 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4740 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4741 } 4741 }
4742 EXPORT_SYMBOL(wait_for_completion_timeout); 4742 EXPORT_SYMBOL(wait_for_completion_timeout);
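Since the timeout variant returns 0 on expiry and the remaining jiffies otherwise, callers usually map the two outcomes explicitly. A sketch under assumed names (io_done, HW_TIMEOUT_MS):

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

#define HW_TIMEOUT_MS	500	/* hypothetical hardware timeout */

static DECLARE_COMPLETION(io_done);

static int wait_for_io(void)
{
	unsigned long left;

	left = wait_for_completion_timeout(&io_done,
					   msecs_to_jiffies(HW_TIMEOUT_MS));
	if (!left)
		return -ETIMEDOUT;  /* expired, completion never signaled */
	return 0;		    /* signaled with 'left' jiffies to spare */
}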
4743 4743
4744 /** 4744 /**
4745 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4745 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4746 * @x: holds the state of this particular completion 4746 * @x: holds the state of this particular completion
4747 * 4747 *
4748 * This waits for completion of a specific task to be signaled. It is 4748 * This waits for completion of a specific task to be signaled. It is
4749 * interruptible. 4749 * interruptible.
4750 */ 4750 */
4751 int __sched wait_for_completion_interruptible(struct completion *x) 4751 int __sched wait_for_completion_interruptible(struct completion *x)
4752 { 4752 {
4753 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4753 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4754 if (t == -ERESTARTSYS) 4754 if (t == -ERESTARTSYS)
4755 return t; 4755 return t;
4756 return 0; 4756 return 0;
4757 } 4757 }
4758 EXPORT_SYMBOL(wait_for_completion_interruptible); 4758 EXPORT_SYMBOL(wait_for_completion_interruptible);
4759 4759
4760 /** 4760 /**
4761 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4761 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4762 * @x: holds the state of this particular completion 4762 * @x: holds the state of this particular completion
4763 * @timeout: timeout value in jiffies 4763 * @timeout: timeout value in jiffies
4764 * 4764 *
4765 * This waits for either a completion of a specific task to be signaled or for a 4765 * This waits for either a completion of a specific task to be signaled or for a
4766 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4766 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4767 */ 4767 */
4768 unsigned long __sched 4768 unsigned long __sched
4769 wait_for_completion_interruptible_timeout(struct completion *x, 4769 wait_for_completion_interruptible_timeout(struct completion *x,
4770 unsigned long timeout) 4770 unsigned long timeout)
4771 { 4771 {
4772 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4772 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4773 } 4773 }
4774 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4774 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4775 4775
4776 /** 4776 /**
4777 * wait_for_completion_killable: - waits for completion of a task (killable) 4777 * wait_for_completion_killable: - waits for completion of a task (killable)
4778 * @x: holds the state of this particular completion 4778 * @x: holds the state of this particular completion
4779 * 4779 *
4780 * This waits to be signaled for completion of a specific task. It can be 4780 * This waits to be signaled for completion of a specific task. It can be
4781 * interrupted by a kill signal. 4781 * interrupted by a kill signal.
4782 */ 4782 */
4783 int __sched wait_for_completion_killable(struct completion *x) 4783 int __sched wait_for_completion_killable(struct completion *x)
4784 { 4784 {
4785 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4785 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4786 if (t == -ERESTARTSYS) 4786 if (t == -ERESTARTSYS)
4787 return t; 4787 return t;
4788 return 0; 4788 return 0;
4789 } 4789 }
4790 EXPORT_SYMBOL(wait_for_completion_killable); 4790 EXPORT_SYMBOL(wait_for_completion_killable);
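Both signal-aware variants hand -ERESTARTSYS back to the caller, which should propagate it so the syscall machinery can restart or abort. A hedged sketch (fw_loaded and wait_for_firmware() are invented):

#include <linux/completion.h>

static DECLARE_COMPLETION(fw_loaded);

static int wait_for_firmware(void)
{
	int ret = wait_for_completion_killable(&fw_loaded);

	if (ret)		/* -ERESTARTSYS: a fatal signal arrived */
		return ret;
	return 0;		/* completion was signaled */
}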
4791 4791
4792 /** 4792 /**
4793 * try_wait_for_completion - try to decrement a completion without blocking 4793 * try_wait_for_completion - try to decrement a completion without blocking
4794 * @x: completion structure 4794 * @x: completion structure
4795 * 4795 *
4796 * Returns: 0 if a decrement cannot be done without blocking 4796 * Returns: 0 if a decrement cannot be done without blocking
4797 * 1 if a decrement succeeded. 4797 * 1 if a decrement succeeded.
4798 * 4798 *
4799 * If a completion is being used as a counting completion, 4799 * If a completion is being used as a counting completion,
4800 * attempt to decrement the counter without blocking. This 4800 * attempt to decrement the counter without blocking. This
4801 * enables us to avoid waiting if the resource the completion 4801 * enables us to avoid waiting if the resource the completion
4802 * is protecting is not available. 4802 * is protecting is not available.
4803 */ 4803 */
4804 bool try_wait_for_completion(struct completion *x) 4804 bool try_wait_for_completion(struct completion *x)
4805 { 4805 {
4806 int ret = 1; 4806 int ret = 1;
4807 4807
4808 spin_lock_irq(&x->wait.lock); 4808 spin_lock_irq(&x->wait.lock);
4809 if (!x->done) 4809 if (!x->done)
4810 ret = 0; 4810 ret = 0;
4811 else 4811 else
4812 x->done--; 4812 x->done--;
4813 spin_unlock_irq(&x->wait.lock); 4813 spin_unlock_irq(&x->wait.lock);
4814 return ret; 4814 return ret;
4815 } 4815 }
4816 EXPORT_SYMBOL(try_wait_for_completion); 4816 EXPORT_SYMBOL(try_wait_for_completion);
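When a completion is used as a counting resource, try_wait_for_completion() gives a non-blocking fast path. A sketch with hypothetical names (slot_free, grab_slot_nowait, release_slot):

#include <linux/completion.h>
#include <linux/errno.h>

static DECLARE_COMPLETION(slot_free);

/* Take a slot if one is available; never blocks. */
static int grab_slot_nowait(void)
{
	if (try_wait_for_completion(&slot_free))
		return 0;	/* decremented x->done: slot is ours */
	return -EBUSY;
}

/* Returning a slot increments the count and wakes one waiter. */
static void release_slot(void)
{
	complete(&slot_free);
}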
4817 4817
4818 /** 4818 /**
4819 * completion_done - Test to see if a completion has any waiters 4819 * completion_done - Test to see if a completion has any waiters
4820 * @x: completion structure 4820 * @x: completion structure
4821 * 4821 *
4822 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4822 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4823 * 1 if there are no waiters. 4823 * 1 if there are no waiters.
4824 * 4824 *
4825 */ 4825 */
4826 bool completion_done(struct completion *x) 4826 bool completion_done(struct completion *x)
4827 { 4827 {
4828 int ret = 1; 4828 int ret = 1;
4829 4829
4830 spin_lock_irq(&x->wait.lock); 4830 spin_lock_irq(&x->wait.lock);
4831 if (!x->done) 4831 if (!x->done)
4832 ret = 0; 4832 ret = 0;
4833 spin_unlock_irq(&x->wait.lock); 4833 spin_unlock_irq(&x->wait.lock);
4834 return ret; 4834 return ret;
4835 } 4835 }
4836 EXPORT_SYMBOL(completion_done); 4836 EXPORT_SYMBOL(completion_done);
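Because completion_done() only peeks at x->done under the wait-queue lock, it is cheap enough to use as a pre-check before taking a slower path. A hypothetical sketch (io_done and arm_watchdog() are assumptions):

#include <linux/completion.h>

/* Only arm the recovery watchdog if the I/O has not completed yet. */
static void maybe_arm_watchdog(struct completion *io_done)
{
	if (!completion_done(io_done))
		arm_watchdog();		/* hypothetical helper */
}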
4837 4837
4838 static long __sched 4838 static long __sched
4839 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4839 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4840 { 4840 {
4841 unsigned long flags; 4841 unsigned long flags;
4842 wait_queue_t wait; 4842 wait_queue_t wait;
4843 4843
4844 init_waitqueue_entry(&wait, current); 4844 init_waitqueue_entry(&wait, current);
4845 4845
4846 __set_current_state(state); 4846 __set_current_state(state);
4847 4847
4848 spin_lock_irqsave(&q->lock, flags); 4848 spin_lock_irqsave(&q->lock, flags);
4849 __add_wait_queue(q, &wait); 4849 __add_wait_queue(q, &wait);
4850 spin_unlock(&q->lock); 4850 spin_unlock(&q->lock);
4851 timeout = schedule_timeout(timeout); 4851 timeout = schedule_timeout(timeout);
4852 spin_lock_irq(&q->lock); 4852 spin_lock_irq(&q->lock);
4853 __remove_wait_queue(q, &wait); 4853 __remove_wait_queue(q, &wait);
4854 spin_unlock_irqrestore(&q->lock, flags); 4854 spin_unlock_irqrestore(&q->lock, flags);
4855 4855
4856 return timeout; 4856 return timeout;
4857 } 4857 }
4858 4858
4859 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4859 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4860 { 4860 {
4861 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4861 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4862 } 4862 }
4863 EXPORT_SYMBOL(interruptible_sleep_on); 4863 EXPORT_SYMBOL(interruptible_sleep_on);
4864 4864
4865 long __sched 4865 long __sched
4866 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4866 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4867 { 4867 {
4868 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4868 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4869 } 4869 }
4870 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4870 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4871 4871
4872 void __sched sleep_on(wait_queue_head_t *q) 4872 void __sched sleep_on(wait_queue_head_t *q)
4873 { 4873 {
4874 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4874 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4875 } 4875 }
4876 EXPORT_SYMBOL(sleep_on); 4876 EXPORT_SYMBOL(sleep_on);
4877 4877
4878 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4878 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4879 { 4879 {
4880 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4880 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4881 } 4881 }
4882 EXPORT_SYMBOL(sleep_on_timeout); 4882 EXPORT_SYMBOL(sleep_on_timeout);
4883 4883
4884 #ifdef CONFIG_RT_MUTEXES 4884 #ifdef CONFIG_RT_MUTEXES
4885 4885
4886 /* 4886 /*
4887 * rt_mutex_setprio - set the current priority of a task 4887 * rt_mutex_setprio - set the current priority of a task
4888 * @p: task 4888 * @p: task
4889 * @prio: prio value (kernel-internal form) 4889 * @prio: prio value (kernel-internal form)
4890 * 4890 *
4891 * This function changes the 'effective' priority of a task. It does 4891 * This function changes the 'effective' priority of a task. It does
4892 * not touch ->normal_prio like __setscheduler(). 4892 * not touch ->normal_prio like __setscheduler().
4893 * 4893 *
4894 * Used by the rt_mutex code to implement priority inheritance logic. 4894 * Used by the rt_mutex code to implement priority inheritance logic.
4895 */ 4895 */
4896 void rt_mutex_setprio(struct task_struct *p, int prio) 4896 void rt_mutex_setprio(struct task_struct *p, int prio)
4897 { 4897 {
4898 unsigned long flags; 4898 unsigned long flags;
4899 int oldprio, on_rq, running; 4899 int oldprio, on_rq, running;
4900 struct rq *rq; 4900 struct rq *rq;
4901 const struct sched_class *prev_class = p->sched_class; 4901 const struct sched_class *prev_class = p->sched_class;
4902 4902
4903 BUG_ON(prio < 0 || prio > MAX_PRIO); 4903 BUG_ON(prio < 0 || prio > MAX_PRIO);
4904 4904
4905 rq = task_rq_lock(p, &flags); 4905 rq = task_rq_lock(p, &flags);
4906 update_rq_clock(rq); 4906 update_rq_clock(rq);
4907 4907
4908 oldprio = p->prio; 4908 oldprio = p->prio;
4909 on_rq = p->se.on_rq; 4909 on_rq = p->se.on_rq;
4910 running = task_current(rq, p); 4910 running = task_current(rq, p);
4911 if (on_rq) 4911 if (on_rq)
4912 dequeue_task(rq, p, 0); 4912 dequeue_task(rq, p, 0);
4913 if (running) 4913 if (running)
4914 p->sched_class->put_prev_task(rq, p); 4914 p->sched_class->put_prev_task(rq, p);
4915 4915
4916 if (rt_prio(prio)) 4916 if (rt_prio(prio))
4917 p->sched_class = &rt_sched_class; 4917 p->sched_class = &rt_sched_class;
4918 else 4918 else
4919 p->sched_class = &fair_sched_class; 4919 p->sched_class = &fair_sched_class;
4920 4920
4921 p->prio = prio; 4921 p->prio = prio;
4922 4922
4923 if (running) 4923 if (running)
4924 p->sched_class->set_curr_task(rq); 4924 p->sched_class->set_curr_task(rq);
4925 if (on_rq) { 4925 if (on_rq) {
4926 enqueue_task(rq, p, 0); 4926 enqueue_task(rq, p, 0);
4927 4927
4928 check_class_changed(rq, p, prev_class, oldprio, running); 4928 check_class_changed(rq, p, prev_class, oldprio, running);
4929 } 4929 }
4930 task_rq_unlock(rq, &flags); 4930 task_rq_unlock(rq, &flags);
4931 } 4931 }
4932 4932
4933 #endif 4933 #endif
4934 4934
4935 void set_user_nice(struct task_struct *p, long nice) 4935 void set_user_nice(struct task_struct *p, long nice)
4936 { 4936 {
4937 int old_prio, delta, on_rq; 4937 int old_prio, delta, on_rq;
4938 unsigned long flags; 4938 unsigned long flags;
4939 struct rq *rq; 4939 struct rq *rq;
4940 4940
4941 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4941 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4942 return; 4942 return;
4943 /* 4943 /*
4944 * We have to be careful, if called from sys_setpriority(), 4944 * We have to be careful, if called from sys_setpriority(),
4945 * the task might be in the middle of scheduling on another CPU. 4945 * the task might be in the middle of scheduling on another CPU.
4946 */ 4946 */
4947 rq = task_rq_lock(p, &flags); 4947 rq = task_rq_lock(p, &flags);
4948 update_rq_clock(rq); 4948 update_rq_clock(rq);
4949 /* 4949 /*
4950 * The RT priorities are set via sched_setscheduler(), but we still 4950 * The RT priorities are set via sched_setscheduler(), but we still
4951 * allow the 'normal' nice value to be set - but as expected 4951 * allow the 'normal' nice value to be set - but as expected
4952 * it won't have any effect on scheduling as long as the task 4952 * it won't have any effect on scheduling as long as the task
4953 * remains SCHED_FIFO/SCHED_RR: 4953 * remains SCHED_FIFO/SCHED_RR:
4954 */ 4954 */
4955 if (task_has_rt_policy(p)) { 4955 if (task_has_rt_policy(p)) {
4956 p->static_prio = NICE_TO_PRIO(nice); 4956 p->static_prio = NICE_TO_PRIO(nice);
4957 goto out_unlock; 4957 goto out_unlock;
4958 } 4958 }
4959 on_rq = p->se.on_rq; 4959 on_rq = p->se.on_rq;
4960 if (on_rq) 4960 if (on_rq)
4961 dequeue_task(rq, p, 0); 4961 dequeue_task(rq, p, 0);
4962 4962
4963 p->static_prio = NICE_TO_PRIO(nice); 4963 p->static_prio = NICE_TO_PRIO(nice);
4964 set_load_weight(p); 4964 set_load_weight(p);
4965 old_prio = p->prio; 4965 old_prio = p->prio;
4966 p->prio = effective_prio(p); 4966 p->prio = effective_prio(p);
4967 delta = p->prio - old_prio; 4967 delta = p->prio - old_prio;
4968 4968
4969 if (on_rq) { 4969 if (on_rq) {
4970 enqueue_task(rq, p, 0); 4970 enqueue_task(rq, p, 0);
4971 /* 4971 /*
4972 * If the task increased its priority or is running and 4972 * If the task increased its priority or is running and
4973 * lowered its priority, then reschedule its CPU: 4973 * lowered its priority, then reschedule its CPU:
4974 */ 4974 */
4975 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4975 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4976 resched_task(rq->curr); 4976 resched_task(rq->curr);
4977 } 4977 }
4978 out_unlock: 4978 out_unlock:
4979 task_rq_unlock(rq, &flags); 4979 task_rq_unlock(rq, &flags);
4980 } 4980 }
4981 EXPORT_SYMBOL(set_user_nice); 4981 EXPORT_SYMBOL(set_user_nice);
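In-kernel users call set_user_nice() directly on a task they control, bypassing the syscall path and its permission checks. A minimal sketch, assuming a hypothetical background kthread that should run below default priority:

#include <linux/sched.h>
#include <linux/kthread.h>

static int background_fn(void *data)
{
	set_user_nice(current, 10);	/* deprioritize this worker */
	while (!kthread_should_stop()) {
		/* ... one batch of hypothetical housekeeping ... */
		cond_resched();
	}
	return 0;
}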
4982 4982
4983 /* 4983 /*
4984 * can_nice - check if a task can reduce its nice value 4984 * can_nice - check if a task can reduce its nice value
4985 * @p: task 4985 * @p: task
4986 * @nice: nice value 4986 * @nice: nice value
4987 */ 4987 */
4988 int can_nice(const struct task_struct *p, const int nice) 4988 int can_nice(const struct task_struct *p, const int nice)
4989 { 4989 {
4990 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4990 /* convert nice value [19,-20] to rlimit style value [1,40] */
4991 int nice_rlim = 20 - nice; 4991 int nice_rlim = 20 - nice;
4992 4992
4993 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4993 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4994 capable(CAP_SYS_NICE)); 4994 capable(CAP_SYS_NICE));
4995 } 4995 }
4996 4996
4997 #ifdef __ARCH_WANT_SYS_NICE 4997 #ifdef __ARCH_WANT_SYS_NICE
4998 4998
4999 /* 4999 /*
5000 * sys_nice - change the priority of the current process. 5000 * sys_nice - change the priority of the current process.
5001 * @increment: priority increment 5001 * @increment: priority increment
5002 * 5002 *
5003 * sys_setpriority is a more generic, but much slower function that 5003 * sys_setpriority is a more generic, but much slower function that
5004 * does similar things. 5004 * does similar things.
5005 */ 5005 */
5006 asmlinkage long sys_nice(int increment) 5006 asmlinkage long sys_nice(int increment)
5007 { 5007 {
5008 long nice, retval; 5008 long nice, retval;
5009 5009
5010 /* 5010 /*
5011 * Setpriority might change our priority at the same moment. 5011 * Setpriority might change our priority at the same moment.
5012 * We don't have to worry. Conceptually one call occurs first 5012 * We don't have to worry. Conceptually one call occurs first
5013 * and we have a single winner. 5013 * and we have a single winner.
5014 */ 5014 */
5015 if (increment < -40) 5015 if (increment < -40)
5016 increment = -40; 5016 increment = -40;
5017 if (increment > 40) 5017 if (increment > 40)
5018 increment = 40; 5018 increment = 40;
5019 5019
5020 nice = PRIO_TO_NICE(current->static_prio) + increment; 5020 nice = PRIO_TO_NICE(current->static_prio) + increment;
5021 if (nice < -20) 5021 if (nice < -20)
5022 nice = -20; 5022 nice = -20;
5023 if (nice > 19) 5023 if (nice > 19)
5024 nice = 19; 5024 nice = 19;
5025 5025
5026 if (increment < 0 && !can_nice(current, nice)) 5026 if (increment < 0 && !can_nice(current, nice))
5027 return -EPERM; 5027 return -EPERM;
5028 5028
5029 retval = security_task_setnice(current, nice); 5029 retval = security_task_setnice(current, nice);
5030 if (retval) 5030 if (retval)
5031 return retval; 5031 return retval;
5032 5032
5033 set_user_nice(current, nice); 5033 set_user_nice(current, nice);
5034 return 0; 5034 return 0;
5035 } 5035 }
5036 5036
5037 #endif 5037 #endif
5038 5038
5039 /** 5039 /**
5040 * task_prio - return the priority value of a given task. 5040 * task_prio - return the priority value of a given task.
5041 * @p: the task in question. 5041 * @p: the task in question.
5042 * 5042 *
5043 * This is the priority value as seen by users in /proc. 5043 * This is the priority value as seen by users in /proc.
5044 * RT tasks are offset by -MAX_RT_PRIO and appear as -100..-1. 5044 * RT tasks are offset by -MAX_RT_PRIO and appear as -100..-1.
5045 * Normal tasks map their nice value onto 0..39, centered around 20. 5045 * Normal tasks map their nice value onto 0..39, centered around 20.
5046 */ 5046 */
5047 int task_prio(const struct task_struct *p) 5047 int task_prio(const struct task_struct *p)
5048 { 5048 {
5049 return p->prio - MAX_RT_PRIO; 5049 return p->prio - MAX_RT_PRIO;
5050 } 5050 }
5051 5051
5052 /** 5052 /**
5053 * task_nice - return the nice value of a given task. 5053 * task_nice - return the nice value of a given task.
5054 * @p: the task in question. 5054 * @p: the task in question.
5055 */ 5055 */
5056 int task_nice(const struct task_struct *p) 5056 int task_nice(const struct task_struct *p)
5057 { 5057 {
5058 return TASK_NICE(p); 5058 return TASK_NICE(p);
5059 } 5059 }
5060 EXPORT_SYMBOL(task_nice); 5060 EXPORT_SYMBOL(task_nice);
5061 5061
5062 /** 5062 /**
5063 * idle_cpu - is a given cpu idle currently? 5063 * idle_cpu - is a given cpu idle currently?
5064 * @cpu: the processor in question. 5064 * @cpu: the processor in question.
5065 */ 5065 */
5066 int idle_cpu(int cpu) 5066 int idle_cpu(int cpu)
5067 { 5067 {
5068 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5068 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
5069 } 5069 }
5070 5070
5071 /** 5071 /**
5072 * idle_task - return the idle task for a given cpu. 5072 * idle_task - return the idle task for a given cpu.
5073 * @cpu: the processor in question. 5073 * @cpu: the processor in question.
5074 */ 5074 */
5075 struct task_struct *idle_task(int cpu) 5075 struct task_struct *idle_task(int cpu)
5076 { 5076 {
5077 return cpu_rq(cpu)->idle; 5077 return cpu_rq(cpu)->idle;
5078 } 5078 }
5079 5079
5080 /** 5080 /**
5081 * find_process_by_pid - find a process with a matching PID value. 5081 * find_process_by_pid - find a process with a matching PID value.
5082 * @pid: the pid in question. 5082 * @pid: the pid in question.
5083 */ 5083 */
5084 static struct task_struct *find_process_by_pid(pid_t pid) 5084 static struct task_struct *find_process_by_pid(pid_t pid)
5085 { 5085 {
5086 return pid ? find_task_by_vpid(pid) : current; 5086 return pid ? find_task_by_vpid(pid) : current;
5087 } 5087 }
5088 5088
5089 /* Actually do priority change: must hold rq lock. */ 5089 /* Actually do priority change: must hold rq lock. */
5090 static void 5090 static void
5091 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 5091 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5092 { 5092 {
5093 BUG_ON(p->se.on_rq); 5093 BUG_ON(p->se.on_rq);
5094 5094
5095 p->policy = policy; 5095 p->policy = policy;
5096 switch (p->policy) { 5096 switch (p->policy) {
5097 case SCHED_NORMAL: 5097 case SCHED_NORMAL:
5098 case SCHED_BATCH: 5098 case SCHED_BATCH:
5099 case SCHED_IDLE: 5099 case SCHED_IDLE:
5100 p->sched_class = &fair_sched_class; 5100 p->sched_class = &fair_sched_class;
5101 break; 5101 break;
5102 case SCHED_FIFO: 5102 case SCHED_FIFO:
5103 case SCHED_RR: 5103 case SCHED_RR:
5104 p->sched_class = &rt_sched_class; 5104 p->sched_class = &rt_sched_class;
5105 break; 5105 break;
5106 } 5106 }
5107 5107
5108 p->rt_priority = prio; 5108 p->rt_priority = prio;
5109 p->normal_prio = normal_prio(p); 5109 p->normal_prio = normal_prio(p);
5110 /* we are holding p->pi_lock already */ 5110 /* we are holding p->pi_lock already */
5111 p->prio = rt_mutex_getprio(p); 5111 p->prio = rt_mutex_getprio(p);
5112 set_load_weight(p); 5112 set_load_weight(p);
5113 } 5113 }
5114 5114
5115 static int __sched_setscheduler(struct task_struct *p, int policy, 5115 static int __sched_setscheduler(struct task_struct *p, int policy,
5116 struct sched_param *param, bool user) 5116 struct sched_param *param, bool user)
5117 { 5117 {
5118 int retval, oldprio, oldpolicy = -1, on_rq, running; 5118 int retval, oldprio, oldpolicy = -1, on_rq, running;
5119 unsigned long flags; 5119 unsigned long flags;
5120 const struct sched_class *prev_class = p->sched_class; 5120 const struct sched_class *prev_class = p->sched_class;
5121 struct rq *rq; 5121 struct rq *rq;
5122 5122
5123 /* may grab non-irq protected spin_locks */ 5123 /* may grab non-irq protected spin_locks */
5124 BUG_ON(in_interrupt()); 5124 BUG_ON(in_interrupt());
5125 recheck: 5125 recheck:
5126 /* double check policy once rq lock held */ 5126 /* double check policy once rq lock held */
5127 if (policy < 0) 5127 if (policy < 0)
5128 policy = oldpolicy = p->policy; 5128 policy = oldpolicy = p->policy;
5129 else if (policy != SCHED_FIFO && policy != SCHED_RR && 5129 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
5130 policy != SCHED_NORMAL && policy != SCHED_BATCH && 5130 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5131 policy != SCHED_IDLE) 5131 policy != SCHED_IDLE)
5132 return -EINVAL; 5132 return -EINVAL;
5133 /* 5133 /*
5134 * Valid priorities for SCHED_FIFO and SCHED_RR are 5134 * Valid priorities for SCHED_FIFO and SCHED_RR are
5135 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 5135 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5136 * SCHED_BATCH and SCHED_IDLE is 0. 5136 * SCHED_BATCH and SCHED_IDLE is 0.
5137 */ 5137 */
5138 if (param->sched_priority < 0 || 5138 if (param->sched_priority < 0 ||
5139 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 5139 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5140 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 5140 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5141 return -EINVAL; 5141 return -EINVAL;
5142 if (rt_policy(policy) != (param->sched_priority != 0)) 5142 if (rt_policy(policy) != (param->sched_priority != 0))
5143 return -EINVAL; 5143 return -EINVAL;
5144 5144
5145 /* 5145 /*
5146 * Allow unprivileged RT tasks to decrease priority: 5146 * Allow unprivileged RT tasks to decrease priority:
5147 */ 5147 */
5148 if (user && !capable(CAP_SYS_NICE)) { 5148 if (user && !capable(CAP_SYS_NICE)) {
5149 if (rt_policy(policy)) { 5149 if (rt_policy(policy)) {
5150 unsigned long rlim_rtprio; 5150 unsigned long rlim_rtprio;
5151 5151
5152 if (!lock_task_sighand(p, &flags)) 5152 if (!lock_task_sighand(p, &flags))
5153 return -ESRCH; 5153 return -ESRCH;
5154 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 5154 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
5155 unlock_task_sighand(p, &flags); 5155 unlock_task_sighand(p, &flags);
5156 5156
5157 /* can't set/change the rt policy */ 5157 /* can't set/change the rt policy */
5158 if (policy != p->policy && !rlim_rtprio) 5158 if (policy != p->policy && !rlim_rtprio)
5159 return -EPERM; 5159 return -EPERM;
5160 5160
5161 /* can't increase priority */ 5161 /* can't increase priority */
5162 if (param->sched_priority > p->rt_priority && 5162 if (param->sched_priority > p->rt_priority &&
5163 param->sched_priority > rlim_rtprio) 5163 param->sched_priority > rlim_rtprio)
5164 return -EPERM; 5164 return -EPERM;
5165 } 5165 }
5166 /* 5166 /*
5167 * Like positive nice levels, don't allow tasks to 5167 * Like positive nice levels, don't allow tasks to
5168 * move out of SCHED_IDLE either: 5168 * move out of SCHED_IDLE either:
5169 */ 5169 */
5170 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5170 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
5171 return -EPERM; 5171 return -EPERM;
5172 5172
5173 /* can't change other user's priorities */ 5173 /* can't change other user's priorities */
5174 if ((current->euid != p->euid) && 5174 if ((current->euid != p->euid) &&
5175 (current->euid != p->uid)) 5175 (current->euid != p->uid))
5176 return -EPERM; 5176 return -EPERM;
5177 } 5177 }
5178 5178
5179 if (user) { 5179 if (user) {
5180 #ifdef CONFIG_RT_GROUP_SCHED 5180 #ifdef CONFIG_RT_GROUP_SCHED
5181 /* 5181 /*
5182 * Do not allow realtime tasks into groups that have no runtime 5182 * Do not allow realtime tasks into groups that have no runtime
5183 * assigned. 5183 * assigned.
5184 */ 5184 */
5185 if (rt_bandwidth_enabled() && rt_policy(policy) && 5185 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5186 task_group(p)->rt_bandwidth.rt_runtime == 0) 5186 task_group(p)->rt_bandwidth.rt_runtime == 0)
5187 return -EPERM; 5187 return -EPERM;
5188 #endif 5188 #endif
5189 5189
5190 retval = security_task_setscheduler(p, policy, param); 5190 retval = security_task_setscheduler(p, policy, param);
5191 if (retval) 5191 if (retval)
5192 return retval; 5192 return retval;
5193 } 5193 }
5194 5194
5195 /* 5195 /*
5196 * make sure no PI-waiters arrive (or leave) while we are 5196 * make sure no PI-waiters arrive (or leave) while we are
5197 * changing the priority of the task: 5197 * changing the priority of the task:
5198 */ 5198 */
5199 spin_lock_irqsave(&p->pi_lock, flags); 5199 spin_lock_irqsave(&p->pi_lock, flags);
5200 /* 5200 /*
5201 * To be able to change p->policy safely, the appropriate 5201 * To be able to change p->policy safely, the appropriate
5202 * runqueue lock must be held. 5202 * runqueue lock must be held.
5203 */ 5203 */
5204 rq = __task_rq_lock(p); 5204 rq = __task_rq_lock(p);
5205 /* recheck policy now with rq lock held */ 5205 /* recheck policy now with rq lock held */
5206 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5206 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5207 policy = oldpolicy = -1; 5207 policy = oldpolicy = -1;
5208 __task_rq_unlock(rq); 5208 __task_rq_unlock(rq);
5209 spin_unlock_irqrestore(&p->pi_lock, flags); 5209 spin_unlock_irqrestore(&p->pi_lock, flags);
5210 goto recheck; 5210 goto recheck;
5211 } 5211 }
5212 update_rq_clock(rq); 5212 update_rq_clock(rq);
5213 on_rq = p->se.on_rq; 5213 on_rq = p->se.on_rq;
5214 running = task_current(rq, p); 5214 running = task_current(rq, p);
5215 if (on_rq) 5215 if (on_rq)
5216 deactivate_task(rq, p, 0); 5216 deactivate_task(rq, p, 0);
5217 if (running) 5217 if (running)
5218 p->sched_class->put_prev_task(rq, p); 5218 p->sched_class->put_prev_task(rq, p);
5219 5219
5220 oldprio = p->prio; 5220 oldprio = p->prio;
5221 __setscheduler(rq, p, policy, param->sched_priority); 5221 __setscheduler(rq, p, policy, param->sched_priority);
5222 5222
5223 if (running) 5223 if (running)
5224 p->sched_class->set_curr_task(rq); 5224 p->sched_class->set_curr_task(rq);
5225 if (on_rq) { 5225 if (on_rq) {
5226 activate_task(rq, p, 0); 5226 activate_task(rq, p, 0);
5227 5227
5228 check_class_changed(rq, p, prev_class, oldprio, running); 5228 check_class_changed(rq, p, prev_class, oldprio, running);
5229 } 5229 }
5230 __task_rq_unlock(rq); 5230 __task_rq_unlock(rq);
5231 spin_unlock_irqrestore(&p->pi_lock, flags); 5231 spin_unlock_irqrestore(&p->pi_lock, flags);
5232 5232
5233 rt_mutex_adjust_pi(p); 5233 rt_mutex_adjust_pi(p);
5234 5234
5235 return 0; 5235 return 0;
5236 } 5236 }
5237 5237
5238 /** 5238 /**
5239 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5239 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5240 * @p: the task in question. 5240 * @p: the task in question.
5241 * @policy: new policy. 5241 * @policy: new policy.
5242 * @param: structure containing the new RT priority. 5242 * @param: structure containing the new RT priority.
5243 * 5243 *
5244 * NOTE that the task may be already dead. 5244 * NOTE that the task may be already dead.
5245 */ 5245 */
5246 int sched_setscheduler(struct task_struct *p, int policy, 5246 int sched_setscheduler(struct task_struct *p, int policy,
5247 struct sched_param *param) 5247 struct sched_param *param)
5248 { 5248 {
5249 return __sched_setscheduler(p, policy, param, true); 5249 return __sched_setscheduler(p, policy, param, true);
5250 } 5250 }
5251 EXPORT_SYMBOL_GPL(sched_setscheduler); 5251 EXPORT_SYMBOL_GPL(sched_setscheduler);
5252 5252
5253 /** 5253 /**
5254 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 5254 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5255 * @p: the task in question. 5255 * @p: the task in question.
5256 * @policy: new policy. 5256 * @policy: new policy.
5257 * @param: structure containing the new RT priority. 5257 * @param: structure containing the new RT priority.
5258 * 5258 *
5259 * Just like sched_setscheduler, only don't bother checking if the 5259 * Just like sched_setscheduler, only don't bother checking if the
5260 * current context has permission. For example, this is needed in 5260 * current context has permission. For example, this is needed in
5261 * stop_machine(): we create temporary high priority worker threads, 5261 * stop_machine(): we create temporary high priority worker threads,
5262 * but our caller might not have that capability. 5262 * but our caller might not have that capability.
5263 */ 5263 */
5264 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5264 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5265 struct sched_param *param) 5265 struct sched_param *param)
5266 { 5266 {
5267 return __sched_setscheduler(p, policy, param, false); 5267 return __sched_setscheduler(p, policy, param, false);
5268 } 5268 }
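As the comment above describes, the nocheck variant is for callers who vouch for the change themselves. A hedged sketch of a kernel subsystem promoting a helper thread to SCHED_FIFO (make_rt_helper() and the chosen priority are assumptions):

#include <linux/sched.h>

static int make_rt_helper(struct task_struct *helper)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	/* no CAP_SYS_NICE or rlimit check: the caller takes responsibility */
	return sched_setscheduler_nocheck(helper, SCHED_FIFO, &param);
}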
5269 5269
5270 static int 5270 static int
5271 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5271 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5272 { 5272 {
5273 struct sched_param lparam; 5273 struct sched_param lparam;
5274 struct task_struct *p; 5274 struct task_struct *p;
5275 int retval; 5275 int retval;
5276 5276
5277 if (!param || pid < 0) 5277 if (!param || pid < 0)
5278 return -EINVAL; 5278 return -EINVAL;
5279 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 5279 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5280 return -EFAULT; 5280 return -EFAULT;
5281 5281
5282 rcu_read_lock(); 5282 rcu_read_lock();
5283 retval = -ESRCH; 5283 retval = -ESRCH;
5284 p = find_process_by_pid(pid); 5284 p = find_process_by_pid(pid);
5285 if (p != NULL) 5285 if (p != NULL)
5286 retval = sched_setscheduler(p, policy, &lparam); 5286 retval = sched_setscheduler(p, policy, &lparam);
5287 rcu_read_unlock(); 5287 rcu_read_unlock();
5288 5288
5289 return retval; 5289 return retval;
5290 } 5290 }
5291 5291
5292 /** 5292 /**
5293 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 5293 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5294 * @pid: the pid in question. 5294 * @pid: the pid in question.
5295 * @policy: new policy. 5295 * @policy: new policy.
5296 * @param: structure containing the new RT priority. 5296 * @param: structure containing the new RT priority.
5297 */ 5297 */
5298 asmlinkage long 5298 asmlinkage long
5299 sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5299 sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5300 { 5300 {
5301 /* negative values for policy are not valid */ 5301 /* negative values for policy are not valid */
5302 if (policy < 0) 5302 if (policy < 0)
5303 return -EINVAL; 5303 return -EINVAL;
5304 5304
5305 return do_sched_setscheduler(pid, policy, param); 5305 return do_sched_setscheduler(pid, policy, param);
5306 } 5306 }
5307 5307
5308 /** 5308 /**
5309 * sys_sched_setparam - set/change the RT priority of a thread 5309 * sys_sched_setparam - set/change the RT priority of a thread
5310 * @pid: the pid in question. 5310 * @pid: the pid in question.
5311 * @param: structure containing the new RT priority. 5311 * @param: structure containing the new RT priority.
5312 */ 5312 */
5313 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 5313 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
5314 { 5314 {
5315 return do_sched_setscheduler(pid, -1, param); 5315 return do_sched_setscheduler(pid, -1, param);
5316 } 5316 }
5317 5317
5318 /** 5318 /**
5319 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5319 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5320 * @pid: the pid in question. 5320 * @pid: the pid in question.
5321 */ 5321 */
5322 asmlinkage long sys_sched_getscheduler(pid_t pid) 5322 asmlinkage long sys_sched_getscheduler(pid_t pid)
5323 { 5323 {
5324 struct task_struct *p; 5324 struct task_struct *p;
5325 int retval; 5325 int retval;
5326 5326
5327 if (pid < 0) 5327 if (pid < 0)
5328 return -EINVAL; 5328 return -EINVAL;
5329 5329
5330 retval = -ESRCH; 5330 retval = -ESRCH;
5331 read_lock(&tasklist_lock); 5331 read_lock(&tasklist_lock);
5332 p = find_process_by_pid(pid); 5332 p = find_process_by_pid(pid);
5333 if (p) { 5333 if (p) {
5334 retval = security_task_getscheduler(p); 5334 retval = security_task_getscheduler(p);
5335 if (!retval) 5335 if (!retval)
5336 retval = p->policy; 5336 retval = p->policy;
5337 } 5337 }
5338 read_unlock(&tasklist_lock); 5338 read_unlock(&tasklist_lock);
5339 return retval; 5339 return retval;
5340 } 5340 }
5341 5341
5342 /** 5342 /**
5343 * sys_sched_getparam - get the RT priority of a thread 5343 * sys_sched_getparam - get the RT priority of a thread
5344 * @pid: the pid in question. 5344 * @pid: the pid in question.
5345 * @param: structure containing the RT priority. 5345 * @param: structure containing the RT priority.
5346 */ 5346 */
5347 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 5347 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
5348 { 5348 {
5349 struct sched_param lp; 5349 struct sched_param lp;
5350 struct task_struct *p; 5350 struct task_struct *p;
5351 int retval; 5351 int retval;
5352 5352
5353 if (!param || pid < 0) 5353 if (!param || pid < 0)
5354 return -EINVAL; 5354 return -EINVAL;
5355 5355
5356 read_lock(&tasklist_lock); 5356 read_lock(&tasklist_lock);
5357 p = find_process_by_pid(pid); 5357 p = find_process_by_pid(pid);
5358 retval = -ESRCH; 5358 retval = -ESRCH;
5359 if (!p) 5359 if (!p)
5360 goto out_unlock; 5360 goto out_unlock;
5361 5361
5362 retval = security_task_getscheduler(p); 5362 retval = security_task_getscheduler(p);
5363 if (retval) 5363 if (retval)
5364 goto out_unlock; 5364 goto out_unlock;
5365 5365
5366 lp.sched_priority = p->rt_priority; 5366 lp.sched_priority = p->rt_priority;
5367 read_unlock(&tasklist_lock); 5367 read_unlock(&tasklist_lock);
5368 5368
5369 /* 5369 /*
5370 * This one might sleep, we cannot do it with a spinlock held ... 5370 * This one might sleep, we cannot do it with a spinlock held ...
5371 */ 5371 */
5372 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5372 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5373 5373
5374 return retval; 5374 return retval;
5375 5375
5376 out_unlock: 5376 out_unlock:
5377 read_unlock(&tasklist_lock); 5377 read_unlock(&tasklist_lock);
5378 return retval; 5378 return retval;
5379 } 5379 }
5380 5380
5381 long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5381 long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5382 { 5382 {
5383 cpumask_t cpus_allowed; 5383 cpumask_t cpus_allowed;
5384 cpumask_t new_mask = *in_mask; 5384 cpumask_t new_mask = *in_mask;
5385 struct task_struct *p; 5385 struct task_struct *p;
5386 int retval; 5386 int retval;
5387 5387
5388 get_online_cpus(); 5388 get_online_cpus();
5389 read_lock(&tasklist_lock); 5389 read_lock(&tasklist_lock);
5390 5390
5391 p = find_process_by_pid(pid); 5391 p = find_process_by_pid(pid);
5392 if (!p) { 5392 if (!p) {
5393 read_unlock(&tasklist_lock); 5393 read_unlock(&tasklist_lock);
5394 put_online_cpus(); 5394 put_online_cpus();
5395 return -ESRCH; 5395 return -ESRCH;
5396 } 5396 }
5397 5397
5398 /* 5398 /*
5399 * It is not safe to call set_cpus_allowed with the 5399 * It is not safe to call set_cpus_allowed with the
5400 * tasklist_lock held. We will bump the task_struct's 5400 * tasklist_lock held. We will bump the task_struct's
5401 * usage count and then drop tasklist_lock. 5401 * usage count and then drop tasklist_lock.
5402 */ 5402 */
5403 get_task_struct(p); 5403 get_task_struct(p);
5404 read_unlock(&tasklist_lock); 5404 read_unlock(&tasklist_lock);
5405 5405
5406 retval = -EPERM; 5406 retval = -EPERM;
5407 if ((current->euid != p->euid) && (current->euid != p->uid) && 5407 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5408 !capable(CAP_SYS_NICE)) 5408 !capable(CAP_SYS_NICE))
5409 goto out_unlock; 5409 goto out_unlock;
5410 5410
5411 retval = security_task_setscheduler(p, 0, NULL); 5411 retval = security_task_setscheduler(p, 0, NULL);
5412 if (retval) 5412 if (retval)
5413 goto out_unlock; 5413 goto out_unlock;
5414 5414
5415 cpuset_cpus_allowed(p, &cpus_allowed); 5415 cpuset_cpus_allowed(p, &cpus_allowed);
5416 cpus_and(new_mask, new_mask, cpus_allowed); 5416 cpus_and(new_mask, new_mask, cpus_allowed);
5417 again: 5417 again:
5418 retval = set_cpus_allowed_ptr(p, &new_mask); 5418 retval = set_cpus_allowed_ptr(p, &new_mask);
5419 5419
5420 if (!retval) { 5420 if (!retval) {
5421 cpuset_cpus_allowed(p, &cpus_allowed); 5421 cpuset_cpus_allowed(p, &cpus_allowed);
5422 if (!cpus_subset(new_mask, cpus_allowed)) { 5422 if (!cpus_subset(new_mask, cpus_allowed)) {
5423 /* 5423 /*
5424 * We must have raced with a concurrent cpuset 5424 * We must have raced with a concurrent cpuset
5425 * update. Just reset the cpus_allowed to the 5425 * update. Just reset the cpus_allowed to the
5426 * cpuset's cpus_allowed 5426 * cpuset's cpus_allowed
5427 */ 5427 */
5428 new_mask = cpus_allowed; 5428 new_mask = cpus_allowed;
5429 goto again; 5429 goto again;
5430 } 5430 }
5431 } 5431 }
5432 out_unlock: 5432 out_unlock:
5433 put_task_struct(p); 5433 put_task_struct(p);
5434 put_online_cpus(); 5434 put_online_cpus();
5435 return retval; 5435 return retval;
5436 } 5436 }
5437 5437
5438 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5438 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5439 cpumask_t *new_mask) 5439 cpumask_t *new_mask)
5440 { 5440 {
5441 if (len < sizeof(cpumask_t)) { 5441 if (len < sizeof(cpumask_t)) {
5442 memset(new_mask, 0, sizeof(cpumask_t)); 5442 memset(new_mask, 0, sizeof(cpumask_t));
5443 } else if (len > sizeof(cpumask_t)) { 5443 } else if (len > sizeof(cpumask_t)) {
5444 len = sizeof(cpumask_t); 5444 len = sizeof(cpumask_t);
5445 } 5445 }
5446 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5446 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5447 } 5447 }
5448 5448
5449 /** 5449 /**
5450 * sys_sched_setaffinity - set the cpu affinity of a process 5450 * sys_sched_setaffinity - set the cpu affinity of a process
5451 * @pid: pid of the process 5451 * @pid: pid of the process
5452 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5452 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5453 * @user_mask_ptr: user-space pointer to the new cpu mask 5453 * @user_mask_ptr: user-space pointer to the new cpu mask
5454 */ 5454 */
5455 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5455 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5456 unsigned long __user *user_mask_ptr) 5456 unsigned long __user *user_mask_ptr)
5457 { 5457 {
5458 cpumask_t new_mask; 5458 cpumask_t new_mask;
5459 int retval; 5459 int retval;
5460 5460
5461 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5461 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
5462 if (retval) 5462 if (retval)
5463 return retval; 5463 return retval;
5464 5464
5465 return sched_setaffinity(pid, &new_mask); 5465 return sched_setaffinity(pid, &new_mask);
5466 } 5466 }
5467 5467
5468 long sched_getaffinity(pid_t pid, cpumask_t *mask) 5468 long sched_getaffinity(pid_t pid, cpumask_t *mask)
5469 { 5469 {
5470 struct task_struct *p; 5470 struct task_struct *p;
5471 int retval; 5471 int retval;
5472 5472
5473 get_online_cpus(); 5473 get_online_cpus();
5474 read_lock(&tasklist_lock); 5474 read_lock(&tasklist_lock);
5475 5475
5476 retval = -ESRCH; 5476 retval = -ESRCH;
5477 p = find_process_by_pid(pid); 5477 p = find_process_by_pid(pid);
5478 if (!p) 5478 if (!p)
5479 goto out_unlock; 5479 goto out_unlock;
5480 5480
5481 retval = security_task_getscheduler(p); 5481 retval = security_task_getscheduler(p);
5482 if (retval) 5482 if (retval)
5483 goto out_unlock; 5483 goto out_unlock;
5484 5484
5485 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5485 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
5486 5486
5487 out_unlock: 5487 out_unlock:
5488 read_unlock(&tasklist_lock); 5488 read_unlock(&tasklist_lock);
5489 put_online_cpus(); 5489 put_online_cpus();
5490 5490
5491 return retval; 5491 return retval;
5492 } 5492 }
5493 5493
5494 /** 5494 /**
5495 * sys_sched_getaffinity - get the cpu affinity of a process 5495 * sys_sched_getaffinity - get the cpu affinity of a process
5496 * @pid: pid of the process 5496 * @pid: pid of the process
5497 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5497 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5498 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5498 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5499 */ 5499 */
5500 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 5500 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5501 unsigned long __user *user_mask_ptr) 5501 unsigned long __user *user_mask_ptr)
5502 { 5502 {
5503 int ret; 5503 int ret;
5504 cpumask_t mask; 5504 cpumask_t mask;
5505 5505
5506 if (len < sizeof(cpumask_t)) 5506 if (len < sizeof(cpumask_t))
5507 return -EINVAL; 5507 return -EINVAL;
5508 5508
5509 ret = sched_getaffinity(pid, &mask); 5509 ret = sched_getaffinity(pid, &mask);
5510 if (ret < 0) 5510 if (ret < 0)
5511 return ret; 5511 return ret;
5512 5512
5513 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5513 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
5514 return -EFAULT; 5514 return -EFAULT;
5515 5515
5516 return sizeof(cpumask_t); 5516 return sizeof(cpumask_t);
5517 } 5517 }
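
Note the hard len < sizeof(cpumask_t) check above: a binary built with a cpu_set_t smaller than the kernel's cpumask_t gets -EINVAL, which is how large-NR_CPUS kernels can surprise old userspace. A hedged sketch listing the allowed CPUs of the calling process via glibc's wrapper:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;
		int cpu;

		if (sched_getaffinity(0, sizeof(set), &set) == -1) {
			perror("sched_getaffinity");
			return 1;
		}
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				printf("allowed: cpu%d\n", cpu);
		return 0;
	}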
5518 5518
5519 /** 5519 /**
5520 * sys_sched_yield - yield the current processor to other threads. 5520 * sys_sched_yield - yield the current processor to other threads.
5521 * 5521 *
5522 * This function yields the current CPU to other tasks. If there are no 5522 * This function yields the current CPU to other tasks. If there are no
5523 * other threads running on this CPU then this function will return. 5523 * other threads running on this CPU then this function will return.
5524 */ 5524 */
5525 asmlinkage long sys_sched_yield(void) 5525 asmlinkage long sys_sched_yield(void)
5526 { 5526 {
5527 struct rq *rq = this_rq_lock(); 5527 struct rq *rq = this_rq_lock();
5528 5528
5529 schedstat_inc(rq, yld_count); 5529 schedstat_inc(rq, yld_count);
5530 current->sched_class->yield_task(rq); 5530 current->sched_class->yield_task(rq);
5531 5531
5532 /* 5532 /*
5533 * Since we are going to call schedule() anyway, there's 5533 * Since we are going to call schedule() anyway, there's
5534 * no need to preempt or enable interrupts: 5534 * no need to preempt or enable interrupts:
5535 */ 5535 */
5536 __release(rq->lock); 5536 __release(rq->lock);
5537 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 5537 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5538 _raw_spin_unlock(&rq->lock); 5538 _raw_spin_unlock(&rq->lock);
5539 preempt_enable_no_resched(); 5539 preempt_enable_no_resched();
5540 5540
5541 schedule(); 5541 schedule();
5542 5542
5543 return 0; 5543 return 0;
5544 } 5544 }
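
From userspace this is the sched_yield(2) entry point. A trivial sketch (illustrative only) of a busy worker offering the CPU between work chunks:

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		for (int i = 0; i < 5; i++) {
			printf("chunk %d done\n", i);
			sched_yield();	/* enters sys_sched_yield() above */
		}
		return 0;
	}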
5545 5545
5546 static void __cond_resched(void) 5546 static void __cond_resched(void)
5547 { 5547 {
5548 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 5548 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
5549 __might_sleep(__FILE__, __LINE__); 5549 __might_sleep(__FILE__, __LINE__);
5550 #endif 5550 #endif
5551 /* 5551 /*
5552 * The BKS might be reacquired before we have dropped 5552 * The BKS might be reacquired before we have dropped
5553 * PREEMPT_ACTIVE, which could trigger a second 5553 * PREEMPT_ACTIVE, which could trigger a second
5554 * cond_resched() call. 5554 * cond_resched() call.
5555 */ 5555 */
5556 do { 5556 do {
5557 add_preempt_count(PREEMPT_ACTIVE); 5557 add_preempt_count(PREEMPT_ACTIVE);
5558 schedule(); 5558 schedule();
5559 sub_preempt_count(PREEMPT_ACTIVE); 5559 sub_preempt_count(PREEMPT_ACTIVE);
5560 } while (need_resched()); 5560 } while (need_resched());
5561 } 5561 }
5562 5562
5563 int __sched _cond_resched(void) 5563 int __sched _cond_resched(void)
5564 { 5564 {
5565 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 5565 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
5566 system_state == SYSTEM_RUNNING) { 5566 system_state == SYSTEM_RUNNING) {
5567 __cond_resched(); 5567 __cond_resched();
5568 return 1; 5568 return 1;
5569 } 5569 }
5570 return 0; 5570 return 0;
5571 } 5571 }
5572 EXPORT_SYMBOL(_cond_resched); 5572 EXPORT_SYMBOL(_cond_resched);
5573 5573
5574 /* 5574 /*
5575 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 5575 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
5576 * call schedule, and on return reacquire the lock. 5576 * call schedule, and on return reacquire the lock.
5577 * 5577 *
5578 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5578 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5579 * operations here to prevent schedule() from being called twice (once via 5579 * operations here to prevent schedule() from being called twice (once via
5580 * spin_unlock(), once by hand). 5580 * spin_unlock(), once by hand).
5581 */ 5581 */
5582 int cond_resched_lock(spinlock_t *lock) 5582 int cond_resched_lock(spinlock_t *lock)
5583 { 5583 {
5584 int resched = need_resched() && system_state == SYSTEM_RUNNING; 5584 int resched = need_resched() && system_state == SYSTEM_RUNNING;
5585 int ret = 0; 5585 int ret = 0;
5586 5586
5587 if (spin_needbreak(lock) || resched) { 5587 if (spin_needbreak(lock) || resched) {
5588 spin_unlock(lock); 5588 spin_unlock(lock);
5589 if (resched && need_resched()) 5589 if (resched && need_resched())
5590 __cond_resched(); 5590 __cond_resched();
5591 else 5591 else
5592 cpu_relax(); 5592 cpu_relax();
5593 ret = 1; 5593 ret = 1;
5594 spin_lock(lock); 5594 spin_lock(lock);
5595 } 5595 }
5596 return ret; 5596 return ret;
5597 } 5597 }
5598 EXPORT_SYMBOL(cond_resched_lock); 5598 EXPORT_SYMBOL(cond_resched_lock);
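
cond_resched_lock() is kernel-internal, so a runnable sketch can only be an analogue. The pthread model below (illustrative only; every name here is mine, not kernel API) shows the same shape: drop a long-held lock at a safe point, let contenders run, then retake it. Compile with -pthread:

	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>

	static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Periodically drop the mutex so waiters can get in, mirroring
	 * the "unlock, resched, relock" pattern above. */
	static void process_items(int nitems)
	{
		pthread_mutex_lock(&big_lock);
		for (int i = 0; i < nitems; i++) {
			/* ... work on item i under big_lock ... */
			if (i % 64 == 63) {	/* lock-break point */
				pthread_mutex_unlock(&big_lock);
				sched_yield();	/* give contenders a chance */
				pthread_mutex_lock(&big_lock);
			}
		}
		pthread_mutex_unlock(&big_lock);
	}

	int main(void)
	{
		process_items(1000);
		printf("done\n");
		return 0;
	}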
5599 5599
5600 int __sched cond_resched_softirq(void) 5600 int __sched cond_resched_softirq(void)
5601 { 5601 {
5602 BUG_ON(!in_softirq()); 5602 BUG_ON(!in_softirq());
5603 5603
5604 if (need_resched() && system_state == SYSTEM_RUNNING) { 5604 if (need_resched() && system_state == SYSTEM_RUNNING) {
5605 local_bh_enable(); 5605 local_bh_enable();
5606 __cond_resched(); 5606 __cond_resched();
5607 local_bh_disable(); 5607 local_bh_disable();
5608 return 1; 5608 return 1;
5609 } 5609 }
5610 return 0; 5610 return 0;
5611 } 5611 }
5612 EXPORT_SYMBOL(cond_resched_softirq); 5612 EXPORT_SYMBOL(cond_resched_softirq);
5613 5613
5614 /** 5614 /**
5615 * yield - yield the current processor to other threads. 5615 * yield - yield the current processor to other threads.
5616 * 5616 *
5617 * This is a shortcut for kernel-space yielding - it marks the 5617 * This is a shortcut for kernel-space yielding - it marks the
5618 * thread runnable and calls sys_sched_yield(). 5618 * thread runnable and calls sys_sched_yield().
5619 */ 5619 */
5620 void __sched yield(void) 5620 void __sched yield(void)
5621 { 5621 {
5622 set_current_state(TASK_RUNNING); 5622 set_current_state(TASK_RUNNING);
5623 sys_sched_yield(); 5623 sys_sched_yield();
5624 } 5624 }
5625 EXPORT_SYMBOL(yield); 5625 EXPORT_SYMBOL(yield);
5626 5626
5627 /* 5627 /*
5628 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5628 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5629 * that process accounting knows that this is a task in IO wait state. 5629 * that process accounting knows that this is a task in IO wait state.
5630 * 5630 *
5631 * But don't do that if it is a deliberate, throttling IO wait (this task 5631 * But don't do that if it is a deliberate, throttling IO wait (this task
5632 * has set its backing_dev_info: the queue against which it should throttle) 5632 * has set its backing_dev_info: the queue against which it should throttle)
5633 */ 5633 */
5634 void __sched io_schedule(void) 5634 void __sched io_schedule(void)
5635 { 5635 {
5636 struct rq *rq = &__raw_get_cpu_var(runqueues); 5636 struct rq *rq = &__raw_get_cpu_var(runqueues);
5637 5637
5638 delayacct_blkio_start(); 5638 delayacct_blkio_start();
5639 atomic_inc(&rq->nr_iowait); 5639 atomic_inc(&rq->nr_iowait);
5640 schedule(); 5640 schedule();
5641 atomic_dec(&rq->nr_iowait); 5641 atomic_dec(&rq->nr_iowait);
5642 delayacct_blkio_end(); 5642 delayacct_blkio_end();
5643 } 5643 }
5644 EXPORT_SYMBOL(io_schedule); 5644 EXPORT_SYMBOL(io_schedule);
5645 5645
5646 long __sched io_schedule_timeout(long timeout) 5646 long __sched io_schedule_timeout(long timeout)
5647 { 5647 {
5648 struct rq *rq = &__raw_get_cpu_var(runqueues); 5648 struct rq *rq = &__raw_get_cpu_var(runqueues);
5649 long ret; 5649 long ret;
5650 5650
5651 delayacct_blkio_start(); 5651 delayacct_blkio_start();
5652 atomic_inc(&rq->nr_iowait); 5652 atomic_inc(&rq->nr_iowait);
5653 ret = schedule_timeout(timeout); 5653 ret = schedule_timeout(timeout);
5654 atomic_dec(&rq->nr_iowait); 5654 atomic_dec(&rq->nr_iowait);
5655 delayacct_blkio_end(); 5655 delayacct_blkio_end();
5656 return ret; 5656 return ret;
5657 } 5657 }
5658 5658
5659 /** 5659 /**
5660 * sys_sched_get_priority_max - return maximum RT priority. 5660 * sys_sched_get_priority_max - return maximum RT priority.
5661 * @policy: scheduling class. 5661 * @policy: scheduling class.
5662 * 5662 *
5663 * this syscall returns the maximum rt_priority that can be used 5663 * this syscall returns the maximum rt_priority that can be used
5664 * by a given scheduling class. 5664 * by a given scheduling class.
5665 */ 5665 */
5666 asmlinkage long sys_sched_get_priority_max(int policy) 5666 asmlinkage long sys_sched_get_priority_max(int policy)
5667 { 5667 {
5668 int ret = -EINVAL; 5668 int ret = -EINVAL;
5669 5669
5670 switch (policy) { 5670 switch (policy) {
5671 case SCHED_FIFO: 5671 case SCHED_FIFO:
5672 case SCHED_RR: 5672 case SCHED_RR:
5673 ret = MAX_USER_RT_PRIO-1; 5673 ret = MAX_USER_RT_PRIO-1;
5674 break; 5674 break;
5675 case SCHED_NORMAL: 5675 case SCHED_NORMAL:
5676 case SCHED_BATCH: 5676 case SCHED_BATCH:
5677 case SCHED_IDLE: 5677 case SCHED_IDLE:
5678 ret = 0; 5678 ret = 0;
5679 break; 5679 break;
5680 } 5680 }
5681 return ret; 5681 return ret;
5682 } 5682 }
5683 5683
5684 /** 5684 /**
5685 * sys_sched_get_priority_min - return minimum RT priority. 5685 * sys_sched_get_priority_min - return minimum RT priority.
5686 * @policy: scheduling class. 5686 * @policy: scheduling class.
5687 * 5687 *
5688 * this syscall returns the minimum rt_priority that can be used 5688 * this syscall returns the minimum rt_priority that can be used
5689 * by a given scheduling class. 5689 * by a given scheduling class.
5690 */ 5690 */
5691 asmlinkage long sys_sched_get_priority_min(int policy) 5691 asmlinkage long sys_sched_get_priority_min(int policy)
5692 { 5692 {
5693 int ret = -EINVAL; 5693 int ret = -EINVAL;
5694 5694
5695 switch (policy) { 5695 switch (policy) {
5696 case SCHED_FIFO: 5696 case SCHED_FIFO:
5697 case SCHED_RR: 5697 case SCHED_RR:
5698 ret = 1; 5698 ret = 1;
5699 break; 5699 break;
5700 case SCHED_NORMAL: 5700 case SCHED_NORMAL:
5701 case SCHED_BATCH: 5701 case SCHED_BATCH:
5702 case SCHED_IDLE: 5702 case SCHED_IDLE:
5703 ret = 0; 5703 ret = 0;
5704 } 5704 }
5705 return ret; 5705 return ret;
5706 } 5706 }
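
From userspace these are sched_get_priority_max(2) and sched_get_priority_min(2); note that userspace spells SCHED_NORMAL as SCHED_OTHER. Per the switch statements above, the FIFO/RR range is 1..MAX_USER_RT_PRIO-1 (typically 1..99) and 0..0 for the fair classes. Sketch:

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		printf("SCHED_FIFO  priority range: %d..%d\n",
		       sched_get_priority_min(SCHED_FIFO),
		       sched_get_priority_max(SCHED_FIFO));
		printf("SCHED_OTHER priority range: %d..%d\n",
		       sched_get_priority_min(SCHED_OTHER),
		       sched_get_priority_max(SCHED_OTHER));
		return 0;
	}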
5707 5707
5708 /** 5708 /**
5709 * sys_sched_rr_get_interval - return the default timeslice of a process. 5709 * sys_sched_rr_get_interval - return the default timeslice of a process.
5710 * @pid: pid of the process. 5710 * @pid: pid of the process.
5711 * @interval: userspace pointer to the timeslice value. 5711 * @interval: userspace pointer to the timeslice value.
5712 * 5712 *
5713 * this syscall writes the default timeslice value of a given process 5713 * this syscall writes the default timeslice value of a given process
5714 * into the user-space timespec buffer. A value of '0' means infinity. 5714 * into the user-space timespec buffer. A value of '0' means infinity.
5715 */ 5715 */
5716 asmlinkage 5716 asmlinkage
5717 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 5717 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5718 { 5718 {
5719 struct task_struct *p; 5719 struct task_struct *p;
5720 unsigned int time_slice; 5720 unsigned int time_slice;
5721 int retval; 5721 int retval;
5722 struct timespec t; 5722 struct timespec t;
5723 5723
5724 if (pid < 0) 5724 if (pid < 0)
5725 return -EINVAL; 5725 return -EINVAL;
5726 5726
5727 retval = -ESRCH; 5727 retval = -ESRCH;
5728 read_lock(&tasklist_lock); 5728 read_lock(&tasklist_lock);
5729 p = find_process_by_pid(pid); 5729 p = find_process_by_pid(pid);
5730 if (!p) 5730 if (!p)
5731 goto out_unlock; 5731 goto out_unlock;
5732 5732
5733 retval = security_task_getscheduler(p); 5733 retval = security_task_getscheduler(p);
5734 if (retval) 5734 if (retval)
5735 goto out_unlock; 5735 goto out_unlock;
5736 5736
5737 /* 5737 /*
5738 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER 5738 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
5739 * tasks that are on an otherwise idle runqueue: 5739 * tasks that are on an otherwise idle runqueue:
5740 */ 5740 */
5741 time_slice = 0; 5741 time_slice = 0;
5742 if (p->policy == SCHED_RR) { 5742 if (p->policy == SCHED_RR) {
5743 time_slice = DEF_TIMESLICE; 5743 time_slice = DEF_TIMESLICE;
5744 } else if (p->policy != SCHED_FIFO) { 5744 } else if (p->policy != SCHED_FIFO) {
5745 struct sched_entity *se = &p->se; 5745 struct sched_entity *se = &p->se;
5746 unsigned long flags; 5746 unsigned long flags;
5747 struct rq *rq; 5747 struct rq *rq;
5748 5748
5749 rq = task_rq_lock(p, &flags); 5749 rq = task_rq_lock(p, &flags);
5750 if (rq->cfs.load.weight) 5750 if (rq->cfs.load.weight)
5751 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 5751 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
5752 task_rq_unlock(rq, &flags); 5752 task_rq_unlock(rq, &flags);
5753 } 5753 }
5754 read_unlock(&tasklist_lock); 5754 read_unlock(&tasklist_lock);
5755 jiffies_to_timespec(time_slice, &t); 5755 jiffies_to_timespec(time_slice, &t);
5756 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5756 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5757 return retval; 5757 return retval;
5758 5758
5759 out_unlock: 5759 out_unlock:
5760 read_unlock(&tasklist_lock); 5760 read_unlock(&tasklist_lock);
5761 return retval; 5761 return retval;
5762 } 5762 }
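
A userspace sketch via glibc's sched_rr_get_interval() wrapper (pid 0 means the calling task); per the comment in the function above, a 0.0s result means no fixed timeslice applies:

	#include <sched.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		if (sched_rr_get_interval(0, &ts) == -1) {
			perror("sched_rr_get_interval");
			return 1;
		}
		printf("timeslice: %ld.%09lds\n", (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}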
5763 5763
5764 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5764 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5765 5765
5766 void sched_show_task(struct task_struct *p) 5766 void sched_show_task(struct task_struct *p)
5767 { 5767 {
5768 unsigned long free = 0; 5768 unsigned long free = 0;
5769 unsigned state; 5769 unsigned state;
5770 5770
5771 state = p->state ? __ffs(p->state) + 1 : 0; 5771 state = p->state ? __ffs(p->state) + 1 : 0;
5772 printk(KERN_INFO "%-13.13s %c", p->comm, 5772 printk(KERN_INFO "%-13.13s %c", p->comm,
5773 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5773 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5774 #if BITS_PER_LONG == 32 5774 #if BITS_PER_LONG == 32
5775 if (state == TASK_RUNNING) 5775 if (state == TASK_RUNNING)
5776 printk(KERN_CONT " running "); 5776 printk(KERN_CONT " running ");
5777 else 5777 else
5778 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5778 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5779 #else 5779 #else
5780 if (state == TASK_RUNNING) 5780 if (state == TASK_RUNNING)
5781 printk(KERN_CONT " running task "); 5781 printk(KERN_CONT " running task ");
5782 else 5782 else
5783 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5783 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5784 #endif 5784 #endif
5785 #ifdef CONFIG_DEBUG_STACK_USAGE 5785 #ifdef CONFIG_DEBUG_STACK_USAGE
5786 { 5786 {
5787 unsigned long *n = end_of_stack(p); 5787 unsigned long *n = end_of_stack(p);
5788 while (!*n) 5788 while (!*n)
5789 n++; 5789 n++;
5790 free = (unsigned long)n - (unsigned long)end_of_stack(p); 5790 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5791 } 5791 }
5792 #endif 5792 #endif
5793 printk(KERN_CONT "%5lu %5d %6d\n", free, 5793 printk(KERN_CONT "%5lu %5d %6d\n", free,
5794 task_pid_nr(p), task_pid_nr(p->real_parent)); 5794 task_pid_nr(p), task_pid_nr(p->real_parent));
5795 5795
5796 show_stack(p, NULL); 5796 show_stack(p, NULL);
5797 } 5797 }
5798 5798
5799 void show_state_filter(unsigned long state_filter) 5799 void show_state_filter(unsigned long state_filter)
5800 { 5800 {
5801 struct task_struct *g, *p; 5801 struct task_struct *g, *p;
5802 5802
5803 #if BITS_PER_LONG == 32 5803 #if BITS_PER_LONG == 32
5804 printk(KERN_INFO 5804 printk(KERN_INFO
5805 " task PC stack pid father\n"); 5805 " task PC stack pid father\n");
5806 #else 5806 #else
5807 printk(KERN_INFO 5807 printk(KERN_INFO
5808 " task PC stack pid father\n"); 5808 " task PC stack pid father\n");
5809 #endif 5809 #endif
5810 read_lock(&tasklist_lock); 5810 read_lock(&tasklist_lock);
5811 do_each_thread(g, p) { 5811 do_each_thread(g, p) {
5812 /* 5812 /*
5813 * reset the NMI-timeout, listing all files on a slow 5813 * reset the NMI-timeout, listing all files on a slow
5814 * console might take a lot of time: 5814 * console might take a lot of time:
5815 */ 5815 */
5816 touch_nmi_watchdog(); 5816 touch_nmi_watchdog();
5817 if (!state_filter || (p->state & state_filter)) 5817 if (!state_filter || (p->state & state_filter))
5818 sched_show_task(p); 5818 sched_show_task(p);
5819 } while_each_thread(g, p); 5819 } while_each_thread(g, p);
5820 5820
5821 touch_all_softlockup_watchdogs(); 5821 touch_all_softlockup_watchdogs();
5822 5822
5823 #ifdef CONFIG_SCHED_DEBUG 5823 #ifdef CONFIG_SCHED_DEBUG
5824 sysrq_sched_debug_show(); 5824 sysrq_sched_debug_show();
5825 #endif 5825 #endif
5826 read_unlock(&tasklist_lock); 5826 read_unlock(&tasklist_lock);
5827 /* 5827 /*
5828 * Only show locks if all tasks are dumped: 5828 * Only show locks if all tasks are dumped:
5829 */ 5829 */
5830 if (state_filter == -1) 5830 if (state_filter == -1)
5831 debug_show_all_locks(); 5831 debug_show_all_locks();
5832 } 5832 }
5833 5833
5834 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5834 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5835 { 5835 {
5836 idle->sched_class = &idle_sched_class; 5836 idle->sched_class = &idle_sched_class;
5837 } 5837 }
5838 5838
5839 /** 5839 /**
5840 * init_idle - set up an idle thread for a given CPU 5840 * init_idle - set up an idle thread for a given CPU
5841 * @idle: task in question 5841 * @idle: task in question
5842 * @cpu: cpu the idle task belongs to 5842 * @cpu: cpu the idle task belongs to
5843 * 5843 *
5844 * NOTE: this function does not set the idle thread's NEED_RESCHED 5844 * NOTE: this function does not set the idle thread's NEED_RESCHED
5845 * flag, to make booting more robust. 5845 * flag, to make booting more robust.
5846 */ 5846 */
5847 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5847 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5848 { 5848 {
5849 struct rq *rq = cpu_rq(cpu); 5849 struct rq *rq = cpu_rq(cpu);
5850 unsigned long flags; 5850 unsigned long flags;
5851 5851
5852 spin_lock_irqsave(&rq->lock, flags); 5852 spin_lock_irqsave(&rq->lock, flags);
5853 5853
5854 __sched_fork(idle); 5854 __sched_fork(idle);
5855 idle->se.exec_start = sched_clock(); 5855 idle->se.exec_start = sched_clock();
5856 5856
5857 idle->prio = idle->normal_prio = MAX_PRIO; 5857 idle->prio = idle->normal_prio = MAX_PRIO;
5858 idle->cpus_allowed = cpumask_of_cpu(cpu); 5858 idle->cpus_allowed = cpumask_of_cpu(cpu);
5859 __set_task_cpu(idle, cpu); 5859 __set_task_cpu(idle, cpu);
5860 5860
5861 rq->curr = rq->idle = idle; 5861 rq->curr = rq->idle = idle;
5862 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5862 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5863 idle->oncpu = 1; 5863 idle->oncpu = 1;
5864 #endif 5864 #endif
5865 spin_unlock_irqrestore(&rq->lock, flags); 5865 spin_unlock_irqrestore(&rq->lock, flags);
5866 5866
5867 /* Set the preempt count _outside_ the spinlocks! */ 5867 /* Set the preempt count _outside_ the spinlocks! */
5868 #if defined(CONFIG_PREEMPT) 5868 #if defined(CONFIG_PREEMPT)
5869 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5869 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5870 #else 5870 #else
5871 task_thread_info(idle)->preempt_count = 0; 5871 task_thread_info(idle)->preempt_count = 0;
5872 #endif 5872 #endif
5873 /* 5873 /*
5874 * The idle tasks have their own, simple scheduling class: 5874 * The idle tasks have their own, simple scheduling class:
5875 */ 5875 */
5876 idle->sched_class = &idle_sched_class; 5876 idle->sched_class = &idle_sched_class;
5877 } 5877 }
5878 5878
5879 /* 5879 /*
5880 * In a system that switches off the HZ timer nohz_cpu_mask 5880 * In a system that switches off the HZ timer nohz_cpu_mask
5881 * indicates which cpus entered this state. This is used 5881 * indicates which cpus entered this state. This is used
5882 * in the rcu update to wait only for active cpus. For systems 5882 * in the rcu update to wait only for active cpus. For systems
5883 * which do not switch off the HZ timer nohz_cpu_mask should 5883 * which do not switch off the HZ timer nohz_cpu_mask should
5884 * always be CPU_MASK_NONE. 5884 * always be CPU_MASK_NONE.
5885 */ 5885 */
5886 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 5886 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5887 5887
5888 /* 5888 /*
5889 * Increase the granularity value when there are more CPUs, 5889 * Increase the granularity value when there are more CPUs,
5890 * because with more CPUs the 'effective latency' as visible 5890 * because with more CPUs the 'effective latency' as visible
5891 * to users decreases. But the relationship is not linear, 5891 * to users decreases. But the relationship is not linear,
5892 * so pick a second-best guess by going with the log2 of the 5892 * so pick a second-best guess by going with the log2 of the
5893 * number of CPUs. 5893 * number of CPUs.
5894 * 5894 *
5895 * This idea comes from the SD scheduler of Con Kolivas: 5895 * This idea comes from the SD scheduler of Con Kolivas:
5896 */ 5896 */
5897 static inline void sched_init_granularity(void) 5897 static inline void sched_init_granularity(void)
5898 { 5898 {
5899 unsigned int factor = 1 + ilog2(num_online_cpus()); 5899 unsigned int factor = 1 + ilog2(num_online_cpus());
5900 const unsigned long limit = 200000000; 5900 const unsigned long limit = 200000000;
5901 5901
5902 sysctl_sched_min_granularity *= factor; 5902 sysctl_sched_min_granularity *= factor;
5903 if (sysctl_sched_min_granularity > limit) 5903 if (sysctl_sched_min_granularity > limit)
5904 sysctl_sched_min_granularity = limit; 5904 sysctl_sched_min_granularity = limit;
5905 5905
5906 sysctl_sched_latency *= factor; 5906 sysctl_sched_latency *= factor;
5907 if (sysctl_sched_latency > limit) 5907 if (sysctl_sched_latency > limit)
5908 sysctl_sched_latency = limit; 5908 sysctl_sched_latency = limit;
5909 5909
5910 sysctl_sched_wakeup_granularity *= factor; 5910 sysctl_sched_wakeup_granularity *= factor;
5911 5911
5912 sysctl_sched_shares_ratelimit *= factor; 5912 sysctl_sched_shares_ratelimit *= factor;
5913 } 5913 }
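
To see the scaling concretely: factor = 1 + ilog2(ncpus), so 1 CPU leaves the defaults alone, 8 CPUs scale them 4x, and the 200 ms ceiling only matters on very large machines. A small userspace model (the 20 ms base latency is an assumed default, not taken from this diff):

	#include <stdio.h>

	/* Userspace model of sched_init_granularity()'s scaling. */
	static unsigned int ilog2_u(unsigned int x)
	{
		unsigned int r = 0;

		while (x >>= 1)
			r++;
		return r;
	}

	int main(void)
	{
		const unsigned long limit = 200000000UL;	/* 200 ms in ns */
		unsigned long base_latency = 20000000UL;	/* assumed 20 ms */

		for (unsigned int cpus = 1; cpus <= 4096; cpus *= 2) {
			unsigned int factor = 1 + ilog2_u(cpus);
			unsigned long scaled = base_latency * factor;

			if (scaled > limit)	/* clamp, as above */
				scaled = limit;
			printf("%4u cpus: factor %2u -> latency %9lu ns\n",
			       cpus, factor, scaled);
		}
		return 0;
	}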
5914 5914
5915 #ifdef CONFIG_SMP 5915 #ifdef CONFIG_SMP
5916 /* 5916 /*
5917 * This is how migration works: 5917 * This is how migration works:
5918 * 5918 *
5919 * 1) we queue a struct migration_req structure in the source CPU's 5919 * 1) we queue a struct migration_req structure in the source CPU's
5920 * runqueue and wake up that CPU's migration thread. 5920 * runqueue and wake up that CPU's migration thread.
5921 * 2) we down() the locked semaphore => thread blocks. 5921 * 2) we down() the locked semaphore => thread blocks.
5922 * 3) migration thread wakes up (implicitly it forces the migrated 5922 * 3) migration thread wakes up (implicitly it forces the migrated
5923 * thread off the CPU) 5923 * thread off the CPU)
5924 * 4) it gets the migration request and checks whether the migrated 5924 * 4) it gets the migration request and checks whether the migrated
5925 * task is still in the wrong runqueue. 5925 * task is still in the wrong runqueue.
5926 * 5) if it's in the wrong runqueue then the migration thread removes 5926 * 5) if it's in the wrong runqueue then the migration thread removes
5927 * it and puts it into the right queue. 5927 * it and puts it into the right queue.
5928 * 6) migration thread up()s the semaphore. 5928 * 6) migration thread up()s the semaphore.
5929 * 7) we wake up and the migration is done. 5929 * 7) we wake up and the migration is done.
5930 */ 5930 */
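
A hedged userspace model of steps 1-2 and 6-7 of that protocol (queue a request, wake the worker, sleep on a completion until the worker signals done), using pthreads; everything below is illustrative, not kernel API. Compile with -pthread:

	#include <pthread.h>
	#include <stdio.h>

	/* Minimal stand-in for the kernel's struct completion. */
	struct completion {
		pthread_mutex_t lock;
		pthread_cond_t cond;
		int done;
	};

	struct request {
		struct completion done;
		int payload;
	};

	static struct request *pending;		/* one-slot "queue" */
	static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER;

	static void *worker(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&queue_lock);
		while (!pending)		/* steps 3-4: wake up, fetch */
			pthread_cond_wait(&queue_cond, &queue_lock);
		struct request *req = pending;
		pending = NULL;
		pthread_mutex_unlock(&queue_lock);

		printf("worker handled payload %d\n", req->payload); /* step 5 */

		pthread_mutex_lock(&req->done.lock);	/* step 6: complete() */
		req->done.done = 1;
		pthread_cond_signal(&req->done.cond);
		pthread_mutex_unlock(&req->done.lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;
		struct request req = {
			.done = { PTHREAD_MUTEX_INITIALIZER,
				  PTHREAD_COND_INITIALIZER, 0 },
			.payload = 42,
		};

		pthread_create(&tid, NULL, worker, NULL);

		pthread_mutex_lock(&queue_lock);	/* step 1: queue req */
		pending = &req;
		pthread_cond_signal(&queue_cond);	/* ... wake the worker */
		pthread_mutex_unlock(&queue_lock);

		pthread_mutex_lock(&req.done.lock);	/* step 2: block */
		while (!req.done.done)
			pthread_cond_wait(&req.done.cond, &req.done.lock);
		pthread_mutex_unlock(&req.done.lock);	/* step 7: done */

		pthread_join(tid, NULL);
		return 0;
	}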
5931 5931
5932 /* 5932 /*
5933 * Change a given task's CPU affinity. Migrate the thread to a 5933 * Change a given task's CPU affinity. Migrate the thread to a
5934 * proper CPU and schedule it away if the CPU it's executing on 5934 * proper CPU and schedule it away if the CPU it's executing on
5935 * is removed from the allowed bitmask. 5935 * is removed from the allowed bitmask.
5936 * 5936 *
5937 * NOTE: the caller must have a valid reference to the task, the 5937 * NOTE: the caller must have a valid reference to the task, the
5938 * task must not exit() & deallocate itself prematurely. The 5938 * task must not exit() & deallocate itself prematurely. The
5939 * call is not atomic; no spinlocks may be held. 5939 * call is not atomic; no spinlocks may be held.
5940 */ 5940 */
5941 int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 5941 int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5942 { 5942 {
5943 struct migration_req req; 5943 struct migration_req req;
5944 unsigned long flags; 5944 unsigned long flags;
5945 struct rq *rq; 5945 struct rq *rq;
5946 int ret = 0; 5946 int ret = 0;
5947 5947
5948 rq = task_rq_lock(p, &flags); 5948 rq = task_rq_lock(p, &flags);
5949 if (!cpus_intersects(*new_mask, cpu_online_map)) { 5949 if (!cpus_intersects(*new_mask, cpu_online_map)) {
5950 ret = -EINVAL; 5950 ret = -EINVAL;
5951 goto out; 5951 goto out;
5952 } 5952 }
5953 5953
5954 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5954 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5955 !cpus_equal(p->cpus_allowed, *new_mask))) { 5955 !cpus_equal(p->cpus_allowed, *new_mask))) {
5956 ret = -EINVAL; 5956 ret = -EINVAL;
5957 goto out; 5957 goto out;
5958 } 5958 }
5959 5959
5960 if (p->sched_class->set_cpus_allowed) 5960 if (p->sched_class->set_cpus_allowed)
5961 p->sched_class->set_cpus_allowed(p, new_mask); 5961 p->sched_class->set_cpus_allowed(p, new_mask);
5962 else { 5962 else {
5963 p->cpus_allowed = *new_mask; 5963 p->cpus_allowed = *new_mask;
5964 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 5964 p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
5965 } 5965 }
5966 5966
5967 /* Can the task run on the task's current CPU? If so, we're done */ 5967 /* Can the task run on the task's current CPU? If so, we're done */
5968 if (cpu_isset(task_cpu(p), *new_mask)) 5968 if (cpu_isset(task_cpu(p), *new_mask))
5969 goto out; 5969 goto out;
5970 5970
5971 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 5971 if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
5972 /* Need help from migration thread: drop lock and wait. */ 5972 /* Need help from migration thread: drop lock and wait. */
5973 task_rq_unlock(rq, &flags); 5973 task_rq_unlock(rq, &flags);
5974 wake_up_process(rq->migration_thread); 5974 wake_up_process(rq->migration_thread);
5975 wait_for_completion(&req.done); 5975 wait_for_completion(&req.done);
5976 tlb_migrate_finish(p->mm); 5976 tlb_migrate_finish(p->mm);
5977 return 0; 5977 return 0;
5978 } 5978 }
5979 out: 5979 out:
5980 task_rq_unlock(rq, &flags); 5980 task_rq_unlock(rq, &flags);
5981 5981
5982 return ret; 5982 return ret;
5983 } 5983 }
5984 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 5984 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5985 5985
5986 /* 5986 /*
5987 * Move (not current) task off this cpu, onto dest cpu. We're doing 5987 * Move (not current) task off this cpu, onto dest cpu. We're doing
5988 * this either because it can't run here any more (set_cpus_allowed() 5988 * this either because it can't run here any more (set_cpus_allowed()
5989 * moved it away from this CPU, or the CPU is going down), or because we're 5989 * moved it away from this CPU, or the CPU is going down), or because we're
5990 * attempting to rebalance this task on exec (sched_exec). 5990 * attempting to rebalance this task on exec (sched_exec).
5991 * 5991 *
5992 * So we race with normal scheduler movements, but that's OK, as long 5992 * So we race with normal scheduler movements, but that's OK, as long
5993 * as the task is no longer on this CPU. 5993 * as the task is no longer on this CPU.
5994 * 5994 *
5995 * Returns non-zero if task was successfully migrated. 5995 * Returns non-zero if task was successfully migrated.
5996 */ 5996 */
5997 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5997 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5998 { 5998 {
5999 struct rq *rq_dest, *rq_src; 5999 struct rq *rq_dest, *rq_src;
6000 int ret = 0, on_rq; 6000 int ret = 0, on_rq;
6001 6001
6002 if (unlikely(!cpu_active(dest_cpu))) 6002 if (unlikely(!cpu_active(dest_cpu)))
6003 return ret; 6003 return ret;
6004 6004
6005 rq_src = cpu_rq(src_cpu); 6005 rq_src = cpu_rq(src_cpu);
6006 rq_dest = cpu_rq(dest_cpu); 6006 rq_dest = cpu_rq(dest_cpu);
6007 6007
6008 double_rq_lock(rq_src, rq_dest); 6008 double_rq_lock(rq_src, rq_dest);
6009 /* Already moved. */ 6009 /* Already moved. */
6010 if (task_cpu(p) != src_cpu) 6010 if (task_cpu(p) != src_cpu)
6011 goto done; 6011 goto done;
6012 /* Affinity changed (again). */ 6012 /* Affinity changed (again). */
6013 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6013 if (!cpu_isset(dest_cpu, p->cpus_allowed))
6014 goto fail; 6014 goto fail;
6015 6015
6016 on_rq = p->se.on_rq; 6016 on_rq = p->se.on_rq;
6017 if (on_rq) 6017 if (on_rq)
6018 deactivate_task(rq_src, p, 0); 6018 deactivate_task(rq_src, p, 0);
6019 6019
6020 set_task_cpu(p, dest_cpu); 6020 set_task_cpu(p, dest_cpu);
6021 if (on_rq) { 6021 if (on_rq) {
6022 activate_task(rq_dest, p, 0); 6022 activate_task(rq_dest, p, 0);
6023 check_preempt_curr(rq_dest, p, 0); 6023 check_preempt_curr(rq_dest, p, 0);
6024 } 6024 }
6025 done: 6025 done:
6026 ret = 1; 6026 ret = 1;
6027 fail: 6027 fail:
6028 double_rq_unlock(rq_src, rq_dest); 6028 double_rq_unlock(rq_src, rq_dest);
6029 return ret; 6029 return ret;
6030 } 6030 }
6031 6031
6032 /* 6032 /*
6033 * migration_thread - this is a highprio system thread that performs 6033 * migration_thread - this is a highprio system thread that performs
6034 * thread migration by bumping thread off CPU then 'pushing' onto 6034 * thread migration by bumping thread off CPU then 'pushing' onto
6035 * another runqueue. 6035 * another runqueue.
6036 */ 6036 */
6037 static int migration_thread(void *data) 6037 static int migration_thread(void *data)
6038 { 6038 {
6039 int cpu = (long)data; 6039 int cpu = (long)data;
6040 struct rq *rq; 6040 struct rq *rq;
6041 6041
6042 rq = cpu_rq(cpu); 6042 rq = cpu_rq(cpu);
6043 BUG_ON(rq->migration_thread != current); 6043 BUG_ON(rq->migration_thread != current);
6044 6044
6045 set_current_state(TASK_INTERRUPTIBLE); 6045 set_current_state(TASK_INTERRUPTIBLE);
6046 while (!kthread_should_stop()) { 6046 while (!kthread_should_stop()) {
6047 struct migration_req *req; 6047 struct migration_req *req;
6048 struct list_head *head; 6048 struct list_head *head;
6049 6049
6050 spin_lock_irq(&rq->lock); 6050 spin_lock_irq(&rq->lock);
6051 6051
6052 if (cpu_is_offline(cpu)) { 6052 if (cpu_is_offline(cpu)) {
6053 spin_unlock_irq(&rq->lock); 6053 spin_unlock_irq(&rq->lock);
6054 goto wait_to_die; 6054 goto wait_to_die;
6055 } 6055 }
6056 6056
6057 if (rq->active_balance) { 6057 if (rq->active_balance) {
6058 active_load_balance(rq, cpu); 6058 active_load_balance(rq, cpu);
6059 rq->active_balance = 0; 6059 rq->active_balance = 0;
6060 } 6060 }
6061 6061
6062 head = &rq->migration_queue; 6062 head = &rq->migration_queue;
6063 6063
6064 if (list_empty(head)) { 6064 if (list_empty(head)) {
6065 spin_unlock_irq(&rq->lock); 6065 spin_unlock_irq(&rq->lock);
6066 schedule(); 6066 schedule();
6067 set_current_state(TASK_INTERRUPTIBLE); 6067 set_current_state(TASK_INTERRUPTIBLE);
6068 continue; 6068 continue;
6069 } 6069 }
6070 req = list_entry(head->next, struct migration_req, list); 6070 req = list_entry(head->next, struct migration_req, list);
6071 list_del_init(head->next); 6071 list_del_init(head->next);
6072 6072
6073 spin_unlock(&rq->lock); 6073 spin_unlock(&rq->lock);
6074 __migrate_task(req->task, cpu, req->dest_cpu); 6074 __migrate_task(req->task, cpu, req->dest_cpu);
6075 local_irq_enable(); 6075 local_irq_enable();
6076 6076
6077 complete(&req->done); 6077 complete(&req->done);
6078 } 6078 }
6079 __set_current_state(TASK_RUNNING); 6079 __set_current_state(TASK_RUNNING);
6080 return 0; 6080 return 0;
6081 6081
6082 wait_to_die: 6082 wait_to_die:
6083 /* Wait for kthread_stop */ 6083 /* Wait for kthread_stop */
6084 set_current_state(TASK_INTERRUPTIBLE); 6084 set_current_state(TASK_INTERRUPTIBLE);
6085 while (!kthread_should_stop()) { 6085 while (!kthread_should_stop()) {
6086 schedule(); 6086 schedule();
6087 set_current_state(TASK_INTERRUPTIBLE); 6087 set_current_state(TASK_INTERRUPTIBLE);
6088 } 6088 }
6089 __set_current_state(TASK_RUNNING); 6089 __set_current_state(TASK_RUNNING);
6090 return 0; 6090 return 0;
6091 } 6091 }
6092 6092
6093 #ifdef CONFIG_HOTPLUG_CPU 6093 #ifdef CONFIG_HOTPLUG_CPU
6094 6094
6095 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) 6095 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6096 { 6096 {
6097 int ret; 6097 int ret;
6098 6098
6099 local_irq_disable(); 6099 local_irq_disable();
6100 ret = __migrate_task(p, src_cpu, dest_cpu); 6100 ret = __migrate_task(p, src_cpu, dest_cpu);
6101 local_irq_enable(); 6101 local_irq_enable();
6102 return ret; 6102 return ret;
6103 } 6103 }
6104 6104
6105 /* 6105 /*
6106 * Figure out where task on dead CPU should go, use force if necessary. 6106 * Figure out where task on dead CPU should go, use force if necessary.
6107 */ 6107 */
6108 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6108 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6109 { 6109 {
6110 unsigned long flags; 6110 unsigned long flags;
6111 cpumask_t mask; 6111 cpumask_t mask;
6112 struct rq *rq; 6112 struct rq *rq;
6113 int dest_cpu; 6113 int dest_cpu;
6114 6114
6115 do { 6115 do {
6116 /* On same node? */ 6116 /* On same node? */
6117 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6117 mask = node_to_cpumask(cpu_to_node(dead_cpu));
6118 cpus_and(mask, mask, p->cpus_allowed); 6118 cpus_and(mask, mask, p->cpus_allowed);
6119 dest_cpu = any_online_cpu(mask); 6119 dest_cpu = any_online_cpu(mask);
6120 6120
6121 /* On any allowed CPU? */ 6121 /* On any allowed CPU? */
6122 if (dest_cpu >= nr_cpu_ids) 6122 if (dest_cpu >= nr_cpu_ids)
6123 dest_cpu = any_online_cpu(p->cpus_allowed); 6123 dest_cpu = any_online_cpu(p->cpus_allowed);
6124 6124
6125 /* No more Mr. Nice Guy. */ 6125 /* No more Mr. Nice Guy. */
6126 if (dest_cpu >= nr_cpu_ids) { 6126 if (dest_cpu >= nr_cpu_ids) {
6127 cpumask_t cpus_allowed; 6127 cpumask_t cpus_allowed;
6128 6128
6129 cpuset_cpus_allowed_locked(p, &cpus_allowed); 6129 cpuset_cpus_allowed_locked(p, &cpus_allowed);
6130 /* 6130 /*
6131 * Try to stay on the same cpuset, where the 6131 * Try to stay on the same cpuset, where the
6132 * current cpuset may be a subset of all cpus. 6132 * current cpuset may be a subset of all cpus.
6133 * The cpuset_cpus_allowed_locked() variant of 6133 * The cpuset_cpus_allowed_locked() variant of
6134 * cpuset_cpus_allowed() will not block. It must be 6134 * cpuset_cpus_allowed() will not block. It must be
6135 * called within calls to cpuset_lock/cpuset_unlock. 6135 * called within calls to cpuset_lock/cpuset_unlock.
6136 */ 6136 */
6137 rq = task_rq_lock(p, &flags); 6137 rq = task_rq_lock(p, &flags);
6138 p->cpus_allowed = cpus_allowed; 6138 p->cpus_allowed = cpus_allowed;
6139 dest_cpu = any_online_cpu(p->cpus_allowed); 6139 dest_cpu = any_online_cpu(p->cpus_allowed);
6140 task_rq_unlock(rq, &flags); 6140 task_rq_unlock(rq, &flags);
6141 6141
6142 /* 6142 /*
6143 * Don't tell them about moving exiting tasks or 6143 * Don't tell them about moving exiting tasks or
6144 * kernel threads (both mm NULL), since they never 6144 * kernel threads (both mm NULL), since they never
6145 * leave kernel. 6145 * leave kernel.
6146 */ 6146 */
6147 if (p->mm && printk_ratelimit()) { 6147 if (p->mm && printk_ratelimit()) {
6148 printk(KERN_INFO "process %d (%s) no " 6148 printk(KERN_INFO "process %d (%s) no "
6149 "longer affine to cpu%d\n", 6149 "longer affine to cpu%d\n",
6150 task_pid_nr(p), p->comm, dead_cpu); 6150 task_pid_nr(p), p->comm, dead_cpu);
6151 } 6151 }
6152 } 6152 }
6153 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6153 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
6154 } 6154 }
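
The fallback order above (same node first, then any allowed CPU, then widening to the enclosing cpuset) can be modeled with plain bitmasks. A sketch with hypothetical 8-bit masks standing in for cpumask_t:

	#include <stdio.h>

	/* Returns the lowest set "cpu" bit, or -1 (like >= nr_cpu_ids). */
	static int first_set(unsigned int mask)
	{
		for (int cpu = 0; cpu < 8; cpu++)
			if (mask & (1u << cpu))
				return cpu;
		return -1;
	}

	int main(void)
	{
		unsigned int online      = 0xF0;	/* cpus 4-7 up */
		unsigned int same_node   = 0x0F;	/* dead cpu's node: 0-3 */
		unsigned int allowed     = 0x02;	/* task allowed cpu 1 only */
		unsigned int cpuset_cpus = 0xFF;	/* enclosing cpuset */
		int dest;

		/* 1) on the same node and allowed */
		dest = first_set(same_node & allowed & online);
		/* 2) any allowed cpu */
		if (dest < 0)
			dest = first_set(allowed & online);
		/* 3) no more Mr. Nice Guy: widen to the cpuset */
		if (dest < 0)
			dest = first_set(cpuset_cpus & online);

		printf("dest cpu: %d\n", dest);	/* prints 4 here */
		return 0;
	}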
6155 6155
6156 /* 6156 /*
6157 * While a dead CPU has no uninterruptible tasks queued at this point, 6157 * While a dead CPU has no uninterruptible tasks queued at this point,
6158 * it might still have a nonzero ->nr_uninterruptible counter, because 6158 * it might still have a nonzero ->nr_uninterruptible counter, because
6159 * for performance reasons the counter is not strictly tracking tasks to 6159 * for performance reasons the counter is not strictly tracking tasks to
6160 * their home CPUs. So we just add the counter to another CPU's counter, 6160 * their home CPUs. So we just add the counter to another CPU's counter,
6161 * to keep the global sum constant after CPU-down: 6161 * to keep the global sum constant after CPU-down:
6162 */ 6162 */
6163 static void migrate_nr_uninterruptible(struct rq *rq_src) 6163 static void migrate_nr_uninterruptible(struct rq *rq_src)
6164 { 6164 {
6165 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6165 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
6166 unsigned long flags; 6166 unsigned long flags;
6167 6167
6168 local_irq_save(flags); 6168 local_irq_save(flags);
6169 double_rq_lock(rq_src, rq_dest); 6169 double_rq_lock(rq_src, rq_dest);
6170 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 6170 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6171 rq_src->nr_uninterruptible = 0; 6171 rq_src->nr_uninterruptible = 0;
6172 double_rq_unlock(rq_src, rq_dest); 6172 double_rq_unlock(rq_src, rq_dest);
6173 local_irq_restore(flags); 6173 local_irq_restore(flags);
6174 } 6174 }
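
The invariant is purely additive: per-CPU nr_uninterruptible values may be individually skewed (even negative), but their global sum is meaningful, so folding src into dest preserves it. A tiny model:

	#include <stdio.h>

	int main(void)
	{
		long nr_uninterruptible[2] = { -3, 5 };	/* src, dest */
		long before = nr_uninterruptible[0] + nr_uninterruptible[1];

		/* on CPU-down, fold src into dest: */
		nr_uninterruptible[1] += nr_uninterruptible[0];
		nr_uninterruptible[0] = 0;

		long after = nr_uninterruptible[0] + nr_uninterruptible[1];
		printf("global sum: %ld -> %ld (unchanged)\n", before, after);
		return 0;
	}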
6175 6175
6176 /* Run through task list and migrate tasks from the dead cpu. */ 6176 /* Run through task list and migrate tasks from the dead cpu. */
6177 static void migrate_live_tasks(int src_cpu) 6177 static void migrate_live_tasks(int src_cpu)
6178 { 6178 {
6179 struct task_struct *p, *t; 6179 struct task_struct *p, *t;
6180 6180
6181 read_lock(&tasklist_lock); 6181 read_lock(&tasklist_lock);
6182 6182
6183 do_each_thread(t, p) { 6183 do_each_thread(t, p) {
6184 if (p == current) 6184 if (p == current)
6185 continue; 6185 continue;
6186 6186
6187 if (task_cpu(p) == src_cpu) 6187 if (task_cpu(p) == src_cpu)
6188 move_task_off_dead_cpu(src_cpu, p); 6188 move_task_off_dead_cpu(src_cpu, p);
6189 } while_each_thread(t, p); 6189 } while_each_thread(t, p);
6190 6190
6191 read_unlock(&tasklist_lock); 6191 read_unlock(&tasklist_lock);
6192 } 6192 }
6193 6193
6194 /* 6194 /*
6195 * Schedules idle task to be the next runnable task on current CPU. 6195 * Schedules idle task to be the next runnable task on current CPU.
6196 * It does so by boosting its priority to highest possible. 6196 * It does so by boosting its priority to highest possible.
6197 * Used by CPU offline code. 6197 * Used by CPU offline code.
6198 */ 6198 */
6199 void sched_idle_next(void) 6199 void sched_idle_next(void)
6200 { 6200 {
6201 int this_cpu = smp_processor_id(); 6201 int this_cpu = smp_processor_id();
6202 struct rq *rq = cpu_rq(this_cpu); 6202 struct rq *rq = cpu_rq(this_cpu);
6203 struct task_struct *p = rq->idle; 6203 struct task_struct *p = rq->idle;
6204 unsigned long flags; 6204 unsigned long flags;
6205 6205
6206 /* cpu has to be offline */ 6206 /* cpu has to be offline */
6207 BUG_ON(cpu_online(this_cpu)); 6207 BUG_ON(cpu_online(this_cpu));
6208 6208
6209 /* 6209 /*
6210 * Strictly not necessary since rest of the CPUs are stopped by now 6210 * Strictly not necessary since rest of the CPUs are stopped by now
6211 * and interrupts disabled on the current cpu. 6211 * and interrupts disabled on the current cpu.
6212 */ 6212 */
6213 spin_lock_irqsave(&rq->lock, flags); 6213 spin_lock_irqsave(&rq->lock, flags);
6214 6214
6215 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 6215 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6216 6216
6217 update_rq_clock(rq); 6217 update_rq_clock(rq);
6218 activate_task(rq, p, 0); 6218 activate_task(rq, p, 0);
6219 6219
6220 spin_unlock_irqrestore(&rq->lock, flags); 6220 spin_unlock_irqrestore(&rq->lock, flags);
6221 } 6221 }
6222 6222
6223 /* 6223 /*
6224 * Ensures that the idle task is using init_mm right before its cpu goes 6224 * Ensures that the idle task is using init_mm right before its cpu goes
6225 * offline. 6225 * offline.
6226 */ 6226 */
6227 void idle_task_exit(void) 6227 void idle_task_exit(void)
6228 { 6228 {
6229 struct mm_struct *mm = current->active_mm; 6229 struct mm_struct *mm = current->active_mm;
6230 6230
6231 BUG_ON(cpu_online(smp_processor_id())); 6231 BUG_ON(cpu_online(smp_processor_id()));
6232 6232
6233 if (mm != &init_mm) 6233 if (mm != &init_mm)
6234 switch_mm(mm, &init_mm, current); 6234 switch_mm(mm, &init_mm, current);
6235 mmdrop(mm); 6235 mmdrop(mm);
6236 } 6236 }
6237 6237
6238 /* called under rq->lock with disabled interrupts */ 6238 /* called under rq->lock with disabled interrupts */
6239 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 6239 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6240 { 6240 {
6241 struct rq *rq = cpu_rq(dead_cpu); 6241 struct rq *rq = cpu_rq(dead_cpu);
6242 6242
6243 /* Must be exiting, otherwise would be on tasklist. */ 6243 /* Must be exiting, otherwise would be on tasklist. */
6244 BUG_ON(!p->exit_state); 6244 BUG_ON(!p->exit_state);
6245 6245
6246 /* Cannot have done final schedule yet: would have vanished. */ 6246 /* Cannot have done final schedule yet: would have vanished. */
6247 BUG_ON(p->state == TASK_DEAD); 6247 BUG_ON(p->state == TASK_DEAD);
6248 6248
6249 get_task_struct(p); 6249 get_task_struct(p);
6250 6250
6251 /* 6251 /*
6252 * Drop lock around migration; if someone else moves it, 6252 * Drop lock around migration; if someone else moves it,
6253 * that's OK. No task can be added to this CPU, so iteration is 6253 * that's OK. No task can be added to this CPU, so iteration is
6254 * fine. 6254 * fine.
6255 */ 6255 */
6256 spin_unlock_irq(&rq->lock); 6256 spin_unlock_irq(&rq->lock);
6257 move_task_off_dead_cpu(dead_cpu, p); 6257 move_task_off_dead_cpu(dead_cpu, p);
6258 spin_lock_irq(&rq->lock); 6258 spin_lock_irq(&rq->lock);
6259 6259
6260 put_task_struct(p); 6260 put_task_struct(p);
6261 } 6261 }
6262 6262
6263 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 6263 /* release_task() removes task from tasklist, so we won't find dead tasks. */
6264 static void migrate_dead_tasks(unsigned int dead_cpu) 6264 static void migrate_dead_tasks(unsigned int dead_cpu)
6265 { 6265 {
6266 struct rq *rq = cpu_rq(dead_cpu); 6266 struct rq *rq = cpu_rq(dead_cpu);
6267 struct task_struct *next; 6267 struct task_struct *next;
6268 6268
6269 for ( ; ; ) { 6269 for ( ; ; ) {
6270 if (!rq->nr_running) 6270 if (!rq->nr_running)
6271 break; 6271 break;
6272 update_rq_clock(rq); 6272 update_rq_clock(rq);
6273 next = pick_next_task(rq, rq->curr); 6273 next = pick_next_task(rq, rq->curr);
6274 if (!next) 6274 if (!next)
6275 break; 6275 break;
6276 next->sched_class->put_prev_task(rq, next); 6276 next->sched_class->put_prev_task(rq, next);
6277 migrate_dead(dead_cpu, next); 6277 migrate_dead(dead_cpu, next);
6278 6278
6279 } 6279 }
6280 } 6280 }
6281 #endif /* CONFIG_HOTPLUG_CPU */ 6281 #endif /* CONFIG_HOTPLUG_CPU */
6282 6282
6283 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 6283 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6284 6284
6285 static struct ctl_table sd_ctl_dir[] = { 6285 static struct ctl_table sd_ctl_dir[] = {
6286 { 6286 {
6287 .procname = "sched_domain", 6287 .procname = "sched_domain",
6288 .mode = 0555, 6288 .mode = 0555,
6289 }, 6289 },
6290 {0, }, 6290 {0, },
6291 }; 6291 };
6292 6292
6293 static struct ctl_table sd_ctl_root[] = { 6293 static struct ctl_table sd_ctl_root[] = {
6294 { 6294 {
6295 .ctl_name = CTL_KERN, 6295 .ctl_name = CTL_KERN,
6296 .procname = "kernel", 6296 .procname = "kernel",
6297 .mode = 0555, 6297 .mode = 0555,
6298 .child = sd_ctl_dir, 6298 .child = sd_ctl_dir,
6299 }, 6299 },
6300 {0, }, 6300 {0, },
6301 }; 6301 };
6302 6302
6303 static struct ctl_table *sd_alloc_ctl_entry(int n) 6303 static struct ctl_table *sd_alloc_ctl_entry(int n)
6304 { 6304 {
6305 struct ctl_table *entry = 6305 struct ctl_table *entry =
6306 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 6306 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6307 6307
6308 return entry; 6308 return entry;
6309 } 6309 }
6310 6310
6311 static void sd_free_ctl_entry(struct ctl_table **tablep) 6311 static void sd_free_ctl_entry(struct ctl_table **tablep)
6312 { 6312 {
6313 struct ctl_table *entry; 6313 struct ctl_table *entry;
6314 6314
6315 /* 6315 /*
6316 * In the intermediate directories, both the child directory and 6316 * In the intermediate directories, both the child directory and
6317 * procname are dynamically allocated and could fail but the mode 6317 * procname are dynamically allocated and could fail but the mode
6318 * will always be set. In the lowest directory the names are 6318 * will always be set. In the lowest directory the names are
6319 * static strings and all have proc handlers. 6319 * static strings and all have proc handlers.
6320 */ 6320 */
6321 for (entry = *tablep; entry->mode; entry++) { 6321 for (entry = *tablep; entry->mode; entry++) {
6322 if (entry->child) 6322 if (entry->child)
6323 sd_free_ctl_entry(&entry->child); 6323 sd_free_ctl_entry(&entry->child);
6324 if (entry->proc_handler == NULL) 6324 if (entry->proc_handler == NULL)
6325 kfree(entry->procname); 6325 kfree(entry->procname);
6326 } 6326 }
6327 6327
6328 kfree(*tablep); 6328 kfree(*tablep);
6329 *tablep = NULL; 6329 *tablep = NULL;
6330 } 6330 }
6331 6331
6332 static void 6332 static void
6333 set_table_entry(struct ctl_table *entry, 6333 set_table_entry(struct ctl_table *entry,
6334 const char *procname, void *data, int maxlen, 6334 const char *procname, void *data, int maxlen,
6335 mode_t mode, proc_handler *proc_handler) 6335 mode_t mode, proc_handler *proc_handler)
6336 { 6336 {
6337 entry->procname = procname; 6337 entry->procname = procname;
6338 entry->data = data; 6338 entry->data = data;
6339 entry->maxlen = maxlen; 6339 entry->maxlen = maxlen;
6340 entry->mode = mode; 6340 entry->mode = mode;
6341 entry->proc_handler = proc_handler; 6341 entry->proc_handler = proc_handler;
6342 } 6342 }
6343 6343
6344 static struct ctl_table * 6344 static struct ctl_table *
6345 sd_alloc_ctl_domain_table(struct sched_domain *sd) 6345 sd_alloc_ctl_domain_table(struct sched_domain *sd)
6346 { 6346 {
6347 struct ctl_table *table = sd_alloc_ctl_entry(13); 6347 struct ctl_table *table = sd_alloc_ctl_entry(13);
6348 6348
6349 if (table == NULL) 6349 if (table == NULL)
6350 return NULL; 6350 return NULL;
6351 6351
6352 set_table_entry(&table[0], "min_interval", &sd->min_interval, 6352 set_table_entry(&table[0], "min_interval", &sd->min_interval,
6353 sizeof(long), 0644, proc_doulongvec_minmax); 6353 sizeof(long), 0644, proc_doulongvec_minmax);
6354 set_table_entry(&table[1], "max_interval", &sd->max_interval, 6354 set_table_entry(&table[1], "max_interval", &sd->max_interval,
6355 sizeof(long), 0644, proc_doulongvec_minmax); 6355 sizeof(long), 0644, proc_doulongvec_minmax);
6356 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 6356 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6357 sizeof(int), 0644, proc_dointvec_minmax); 6357 sizeof(int), 0644, proc_dointvec_minmax);
6358 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 6358 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6359 sizeof(int), 0644, proc_dointvec_minmax); 6359 sizeof(int), 0644, proc_dointvec_minmax);
6360 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 6360 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6361 sizeof(int), 0644, proc_dointvec_minmax); 6361 sizeof(int), 0644, proc_dointvec_minmax);
6362 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 6362 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6363 sizeof(int), 0644, proc_dointvec_minmax); 6363 sizeof(int), 0644, proc_dointvec_minmax);
6364 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 6364 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6365 sizeof(int), 0644, proc_dointvec_minmax); 6365 sizeof(int), 0644, proc_dointvec_minmax);
6366 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 6366 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6367 sizeof(int), 0644, proc_dointvec_minmax); 6367 sizeof(int), 0644, proc_dointvec_minmax);
6368 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 6368 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6369 sizeof(int), 0644, proc_dointvec_minmax); 6369 sizeof(int), 0644, proc_dointvec_minmax);
6370 set_table_entry(&table[9], "cache_nice_tries", 6370 set_table_entry(&table[9], "cache_nice_tries",
6371 &sd->cache_nice_tries, 6371 &sd->cache_nice_tries,
6372 sizeof(int), 0644, proc_dointvec_minmax); 6372 sizeof(int), 0644, proc_dointvec_minmax);
6373 set_table_entry(&table[10], "flags", &sd->flags, 6373 set_table_entry(&table[10], "flags", &sd->flags,
6374 sizeof(int), 0644, proc_dointvec_minmax); 6374 sizeof(int), 0644, proc_dointvec_minmax);
6375 set_table_entry(&table[11], "name", sd->name, 6375 set_table_entry(&table[11], "name", sd->name,
6376 CORENAME_MAX_SIZE, 0444, proc_dostring); 6376 CORENAME_MAX_SIZE, 0444, proc_dostring);
6377 /* &table[12] is terminator */ 6377 /* &table[12] is terminator */
6378 6378
6379 return table; 6379 return table;
6380 } 6380 }
6381 6381
6382 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 6382 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6383 { 6383 {
6384 struct ctl_table *entry, *table; 6384 struct ctl_table *entry, *table;
6385 struct sched_domain *sd; 6385 struct sched_domain *sd;
6386 int domain_num = 0, i; 6386 int domain_num = 0, i;
6387 char buf[32]; 6387 char buf[32];
6388 6388
6389 for_each_domain(cpu, sd) 6389 for_each_domain(cpu, sd)
6390 domain_num++; 6390 domain_num++;
6391 entry = table = sd_alloc_ctl_entry(domain_num + 1); 6391 entry = table = sd_alloc_ctl_entry(domain_num + 1);
6392 if (table == NULL) 6392 if (table == NULL)
6393 return NULL; 6393 return NULL;
6394 6394
6395 i = 0; 6395 i = 0;
6396 for_each_domain(cpu, sd) { 6396 for_each_domain(cpu, sd) {
6397 snprintf(buf, 32, "domain%d", i); 6397 snprintf(buf, 32, "domain%d", i);
6398 entry->procname = kstrdup(buf, GFP_KERNEL); 6398 entry->procname = kstrdup(buf, GFP_KERNEL);
6399 entry->mode = 0555; 6399 entry->mode = 0555;
6400 entry->child = sd_alloc_ctl_domain_table(sd); 6400 entry->child = sd_alloc_ctl_domain_table(sd);
6401 entry++; 6401 entry++;
6402 i++; 6402 i++;
6403 } 6403 }
6404 return table; 6404 return table;
6405 } 6405 }
6406 6406
6407 static struct ctl_table_header *sd_sysctl_header; 6407 static struct ctl_table_header *sd_sysctl_header;
6408 static void register_sched_domain_sysctl(void) 6408 static void register_sched_domain_sysctl(void)
6409 { 6409 {
6410 int i, cpu_num = num_online_cpus(); 6410 int i, cpu_num = num_online_cpus();
6411 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 6411 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6412 char buf[32]; 6412 char buf[32];
6413 6413
6414 WARN_ON(sd_ctl_dir[0].child); 6414 WARN_ON(sd_ctl_dir[0].child);
6415 sd_ctl_dir[0].child = entry; 6415 sd_ctl_dir[0].child = entry;
6416 6416
6417 if (entry == NULL) 6417 if (entry == NULL)
6418 return; 6418 return;
6419 6419
6420 for_each_online_cpu(i) { 6420 for_each_online_cpu(i) {
6421 snprintf(buf, 32, "cpu%d", i); 6421 snprintf(buf, 32, "cpu%d", i);
6422 entry->procname = kstrdup(buf, GFP_KERNEL); 6422 entry->procname = kstrdup(buf, GFP_KERNEL);
6423 entry->mode = 0555; 6423 entry->mode = 0555;
6424 entry->child = sd_alloc_ctl_cpu_table(i); 6424 entry->child = sd_alloc_ctl_cpu_table(i);
6425 entry++; 6425 entry++;
6426 } 6426 }
6427 6427
6428 WARN_ON(sd_sysctl_header); 6428 WARN_ON(sd_sysctl_header);
6429 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 6429 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6430 } 6430 }
6431 6431
6432 /* may be called multiple times per register */ 6432 /* may be called multiple times per register */
6433 static void unregister_sched_domain_sysctl(void) 6433 static void unregister_sched_domain_sysctl(void)
6434 { 6434 {
6435 if (sd_sysctl_header) 6435 if (sd_sysctl_header)
6436 unregister_sysctl_table(sd_sysctl_header); 6436 unregister_sysctl_table(sd_sysctl_header);
6437 sd_sysctl_header = NULL; 6437 sd_sysctl_header = NULL;
6438 if (sd_ctl_dir[0].child) 6438 if (sd_ctl_dir[0].child)
6439 sd_free_ctl_entry(&sd_ctl_dir[0].child); 6439 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6440 } 6440 }
6441 #else 6441 #else
6442 static void register_sched_domain_sysctl(void) 6442 static void register_sched_domain_sysctl(void)
6443 { 6443 {
6444 } 6444 }
6445 static void unregister_sched_domain_sysctl(void) 6445 static void unregister_sched_domain_sysctl(void)
6446 { 6446 {
6447 } 6447 }
6448 #endif 6448 #endif
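
When built with CONFIG_SCHED_DEBUG and CONFIG_SYSCTL, the tables registered above surface as files like /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval. A sketch that reads one entry (the exact path and the cpu/domain indices depend on the machine's topology):

	#include <stdio.h>

	int main(void)
	{
		const char *path =
			"/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval";
		char buf[64];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("min_interval = %s", buf);
		fclose(f);
		return 0;
	}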
6449 6449
6450 static void set_rq_online(struct rq *rq) 6450 static void set_rq_online(struct rq *rq)
6451 { 6451 {
6452 if (!rq->online) { 6452 if (!rq->online) {
6453 const struct sched_class *class; 6453 const struct sched_class *class;
6454 6454
6455 cpu_set(rq->cpu, rq->rd->online); 6455 cpu_set(rq->cpu, rq->rd->online);
6456 rq->online = 1; 6456 rq->online = 1;
6457 6457
6458 for_each_class(class) { 6458 for_each_class(class) {
6459 if (class->rq_online) 6459 if (class->rq_online)
6460 class->rq_online(rq); 6460 class->rq_online(rq);
6461 } 6461 }
6462 } 6462 }
6463 } 6463 }
6464 6464
6465 static void set_rq_offline(struct rq *rq) 6465 static void set_rq_offline(struct rq *rq)
6466 { 6466 {
6467 if (rq->online) { 6467 if (rq->online) {
6468 const struct sched_class *class; 6468 const struct sched_class *class;
6469 6469
6470 for_each_class(class) { 6470 for_each_class(class) {
6471 if (class->rq_offline) 6471 if (class->rq_offline)
6472 class->rq_offline(rq); 6472 class->rq_offline(rq);
6473 } 6473 }
6474 6474
6475 cpu_clear(rq->cpu, rq->rd->online); 6475 cpu_clear(rq->cpu, rq->rd->online);
6476 rq->online = 0; 6476 rq->online = 0;
6477 } 6477 }
6478 } 6478 }
6479 6479
6480 /* 6480 /*
6481 * migration_call - callback that gets triggered when a CPU is added. 6481 * migration_call - callback that gets triggered when a CPU is added.
6482 * Here we can start up the necessary migration thread for the new CPU. 6482 * Here we can start up the necessary migration thread for the new CPU.
6483 */ 6483 */
6484 static int __cpuinit 6484 static int __cpuinit
6485 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 6485 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6486 { 6486 {
6487 struct task_struct *p; 6487 struct task_struct *p;
6488 int cpu = (long)hcpu; 6488 int cpu = (long)hcpu;
6489 unsigned long flags; 6489 unsigned long flags;
6490 struct rq *rq; 6490 struct rq *rq;
6491 6491
6492 switch (action) { 6492 switch (action) {
6493 6493
6494 case CPU_UP_PREPARE: 6494 case CPU_UP_PREPARE:
6495 case CPU_UP_PREPARE_FROZEN: 6495 case CPU_UP_PREPARE_FROZEN:
6496 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 6496 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
6497 if (IS_ERR(p)) 6497 if (IS_ERR(p))
6498 return NOTIFY_BAD; 6498 return NOTIFY_BAD;
6499 kthread_bind(p, cpu); 6499 kthread_bind(p, cpu);
6500 /* Must be high prio: stop_machine expects to yield to it. */ 6500 /* Must be high prio: stop_machine expects to yield to it. */
6501 rq = task_rq_lock(p, &flags); 6501 rq = task_rq_lock(p, &flags);
6502 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 6502 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6503 task_rq_unlock(rq, &flags); 6503 task_rq_unlock(rq, &flags);
6504 cpu_rq(cpu)->migration_thread = p; 6504 cpu_rq(cpu)->migration_thread = p;
6505 break; 6505 break;
6506 6506
6507 case CPU_ONLINE: 6507 case CPU_ONLINE:
6508 case CPU_ONLINE_FROZEN: 6508 case CPU_ONLINE_FROZEN:
6509 /* Strictly unnecessary, as first user will wake it. */ 6509 /* Strictly unnecessary, as first user will wake it. */
6510 wake_up_process(cpu_rq(cpu)->migration_thread); 6510 wake_up_process(cpu_rq(cpu)->migration_thread);
6511 6511
6512 /* Update our root-domain */ 6512 /* Update our root-domain */
6513 rq = cpu_rq(cpu); 6513 rq = cpu_rq(cpu);
6514 spin_lock_irqsave(&rq->lock, flags); 6514 spin_lock_irqsave(&rq->lock, flags);
6515 if (rq->rd) { 6515 if (rq->rd) {
6516 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6516 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6517 6517
6518 set_rq_online(rq); 6518 set_rq_online(rq);
6519 } 6519 }
6520 spin_unlock_irqrestore(&rq->lock, flags); 6520 spin_unlock_irqrestore(&rq->lock, flags);
6521 break; 6521 break;
6522 6522
6523 #ifdef CONFIG_HOTPLUG_CPU 6523 #ifdef CONFIG_HOTPLUG_CPU
6524 case CPU_UP_CANCELED: 6524 case CPU_UP_CANCELED:
6525 case CPU_UP_CANCELED_FROZEN: 6525 case CPU_UP_CANCELED_FROZEN:
6526 if (!cpu_rq(cpu)->migration_thread) 6526 if (!cpu_rq(cpu)->migration_thread)
6527 break; 6527 break;
6528 /* Unbind it from offline cpu so it can run. Fall thru. */ 6528 /* Unbind it from offline cpu so it can run. Fall thru. */
6529 kthread_bind(cpu_rq(cpu)->migration_thread, 6529 kthread_bind(cpu_rq(cpu)->migration_thread,
6530 any_online_cpu(cpu_online_map)); 6530 any_online_cpu(cpu_online_map));
6531 kthread_stop(cpu_rq(cpu)->migration_thread); 6531 kthread_stop(cpu_rq(cpu)->migration_thread);
6532 cpu_rq(cpu)->migration_thread = NULL; 6532 cpu_rq(cpu)->migration_thread = NULL;
6533 break; 6533 break;
6534 6534
6535 case CPU_DEAD: 6535 case CPU_DEAD:
6536 case CPU_DEAD_FROZEN: 6536 case CPU_DEAD_FROZEN:
6537 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ 6537 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
6538 migrate_live_tasks(cpu); 6538 migrate_live_tasks(cpu);
6539 rq = cpu_rq(cpu); 6539 rq = cpu_rq(cpu);
6540 kthread_stop(rq->migration_thread); 6540 kthread_stop(rq->migration_thread);
6541 rq->migration_thread = NULL; 6541 rq->migration_thread = NULL;
6542 /* Idle task back to normal (off runqueue, low prio) */ 6542 /* Idle task back to normal (off runqueue, low prio) */
6543 spin_lock_irq(&rq->lock); 6543 spin_lock_irq(&rq->lock);
6544 update_rq_clock(rq); 6544 update_rq_clock(rq);
6545 deactivate_task(rq, rq->idle, 0); 6545 deactivate_task(rq, rq->idle, 0);
6546 rq->idle->static_prio = MAX_PRIO; 6546 rq->idle->static_prio = MAX_PRIO;
6547 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 6547 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6548 rq->idle->sched_class = &idle_sched_class; 6548 rq->idle->sched_class = &idle_sched_class;
6549 migrate_dead_tasks(cpu); 6549 migrate_dead_tasks(cpu);
6550 spin_unlock_irq(&rq->lock); 6550 spin_unlock_irq(&rq->lock);
6551 cpuset_unlock(); 6551 cpuset_unlock();
6552 migrate_nr_uninterruptible(rq); 6552 migrate_nr_uninterruptible(rq);
6553 BUG_ON(rq->nr_running != 0); 6553 BUG_ON(rq->nr_running != 0);
6554 6554
6555 /* 6555 /*
6556 * No need to migrate the tasks: it was best-effort if 6556 * No need to migrate the tasks: it was best-effort if
6557 * they didn't take sched_hotcpu_mutex. Just wake up 6557 * they didn't take sched_hotcpu_mutex. Just wake up
6558 * the requestors. 6558 * the requestors.
6559 */ 6559 */
6560 spin_lock_irq(&rq->lock); 6560 spin_lock_irq(&rq->lock);
6561 while (!list_empty(&rq->migration_queue)) { 6561 while (!list_empty(&rq->migration_queue)) {
6562 struct migration_req *req; 6562 struct migration_req *req;
6563 6563
6564 req = list_entry(rq->migration_queue.next, 6564 req = list_entry(rq->migration_queue.next,
6565 struct migration_req, list); 6565 struct migration_req, list);
6566 list_del_init(&req->list); 6566 list_del_init(&req->list);
6567 spin_unlock_irq(&rq->lock); 6567 spin_unlock_irq(&rq->lock);
6568 complete(&req->done); 6568 complete(&req->done);
6569 spin_lock_irq(&rq->lock); 6569 spin_lock_irq(&rq->lock);
6570 } 6570 }
6571 spin_unlock_irq(&rq->lock); 6571 spin_unlock_irq(&rq->lock);
6572 break; 6572 break;
6573 6573
6574 case CPU_DYING: 6574 case CPU_DYING:
6575 case CPU_DYING_FROZEN: 6575 case CPU_DYING_FROZEN:
6576 /* Update our root-domain */ 6576 /* Update our root-domain */
6577 rq = cpu_rq(cpu); 6577 rq = cpu_rq(cpu);
6578 spin_lock_irqsave(&rq->lock, flags); 6578 spin_lock_irqsave(&rq->lock, flags);
6579 if (rq->rd) { 6579 if (rq->rd) {
6580 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6580 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6581 set_rq_offline(rq); 6581 set_rq_offline(rq);
6582 } 6582 }
6583 spin_unlock_irqrestore(&rq->lock, flags); 6583 spin_unlock_irqrestore(&rq->lock, flags);
6584 break; 6584 break;
6585 #endif 6585 #endif
6586 } 6586 }
6587 return NOTIFY_OK; 6587 return NOTIFY_OK;
6588 } 6588 }
6589 6589
6590 /* Register at highest priority so that task migration (migrate_all_tasks) 6590 /* Register at highest priority so that task migration (migrate_all_tasks)
6591 * happens before everything else. 6591 * happens before everything else.
6592 */ 6592 */
6593 static struct notifier_block __cpuinitdata migration_notifier = { 6593 static struct notifier_block __cpuinitdata migration_notifier = {
6594 .notifier_call = migration_call, 6594 .notifier_call = migration_call,
6595 .priority = 10 6595 .priority = 10
6596 }; 6596 };
6597 6597
6598 static int __init migration_init(void) 6598 static int __init migration_init(void)
6599 { 6599 {
6600 void *cpu = (void *)(long)smp_processor_id(); 6600 void *cpu = (void *)(long)smp_processor_id();
6601 int err; 6601 int err;
6602 6602
6603 /* Start one for the boot CPU: */ 6603 /* Start one for the boot CPU: */
6604 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6604 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6605 BUG_ON(err == NOTIFY_BAD); 6605 BUG_ON(err == NOTIFY_BAD);
6606 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6606 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6607 register_cpu_notifier(&migration_notifier); 6607 register_cpu_notifier(&migration_notifier);
6608 6608
6609 return err; 6609 return err;
6610 } 6610 }
6611 early_initcall(migration_init); 6611 early_initcall(migration_init);
6612 #endif 6612 #endif
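
The migration notifier above is the template for any CPU hotplug observer in this era of the kernel. A hedged sketch of the smallest useful one, restricted to the API already visible above (the printk strings and names are illustrative):

static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
				     unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		printk(KERN_INFO "cpu %d came online\n", cpu);
		break;
	case CPU_DEAD:
		printk(KERN_INFO "cpu %d went down\n", cpu);
		break;
	}
	return NOTIFY_OK;	/* a passive observer never vetoes */
}

static struct notifier_block my_cpu_notifier __cpuinitdata = {
	.notifier_call = my_cpu_callback,
	/* default priority 0: runs after the priority-10 migration notifier */
};

/* in init code: register_cpu_notifier(&my_cpu_notifier); */
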
6613 6613
6614 #ifdef CONFIG_SMP 6614 #ifdef CONFIG_SMP
6615 6615
6616 #ifdef CONFIG_SCHED_DEBUG 6616 #ifdef CONFIG_SCHED_DEBUG
6617 6617
6618 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6618 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6619 cpumask_t *groupmask) 6619 cpumask_t *groupmask)
6620 { 6620 {
6621 struct sched_group *group = sd->groups; 6621 struct sched_group *group = sd->groups;
6622 char str[256]; 6622 char str[256];
6623 6623
6624 cpulist_scnprintf(str, sizeof(str), sd->span); 6624 cpulist_scnprintf(str, sizeof(str), sd->span);
6625 cpus_clear(*groupmask); 6625 cpus_clear(*groupmask);
6626 6626
6627 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6627 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6628 6628
6629 if (!(sd->flags & SD_LOAD_BALANCE)) { 6629 if (!(sd->flags & SD_LOAD_BALANCE)) {
6630 printk("does not load-balance\n"); 6630 printk("does not load-balance\n");
6631 if (sd->parent) 6631 if (sd->parent)
6632 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6632 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6633 " has parent"); 6633 " has parent");
6634 return -1; 6634 return -1;
6635 } 6635 }
6636 6636
6637 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6637 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6638 6638
6639 if (!cpu_isset(cpu, sd->span)) { 6639 if (!cpu_isset(cpu, sd->span)) {
6640 printk(KERN_ERR "ERROR: domain->span does not contain " 6640 printk(KERN_ERR "ERROR: domain->span does not contain "
6641 "CPU%d\n", cpu); 6641 "CPU%d\n", cpu);
6642 } 6642 }
6643 if (!cpu_isset(cpu, group->cpumask)) { 6643 if (!cpu_isset(cpu, group->cpumask)) {
6644 printk(KERN_ERR "ERROR: domain->groups does not contain" 6644 printk(KERN_ERR "ERROR: domain->groups does not contain"
6645 " CPU%d\n", cpu); 6645 " CPU%d\n", cpu);
6646 } 6646 }
6647 6647
6648 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6648 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6649 do { 6649 do {
6650 if (!group) { 6650 if (!group) {
6651 printk("\n"); 6651 printk("\n");
6652 printk(KERN_ERR "ERROR: group is NULL\n"); 6652 printk(KERN_ERR "ERROR: group is NULL\n");
6653 break; 6653 break;
6654 } 6654 }
6655 6655
6656 if (!group->__cpu_power) { 6656 if (!group->__cpu_power) {
6657 printk(KERN_CONT "\n"); 6657 printk(KERN_CONT "\n");
6658 printk(KERN_ERR "ERROR: domain->cpu_power not " 6658 printk(KERN_ERR "ERROR: domain->cpu_power not "
6659 "set\n"); 6659 "set\n");
6660 break; 6660 break;
6661 } 6661 }
6662 6662
6663 if (!cpus_weight(group->cpumask)) { 6663 if (!cpus_weight(group->cpumask)) {
6664 printk(KERN_CONT "\n"); 6664 printk(KERN_CONT "\n");
6665 printk(KERN_ERR "ERROR: empty group\n"); 6665 printk(KERN_ERR "ERROR: empty group\n");
6666 break; 6666 break;
6667 } 6667 }
6668 6668
6669 if (cpus_intersects(*groupmask, group->cpumask)) { 6669 if (cpus_intersects(*groupmask, group->cpumask)) {
6670 printk(KERN_CONT "\n"); 6670 printk(KERN_CONT "\n");
6671 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6671 printk(KERN_ERR "ERROR: repeated CPUs\n");
6672 break; 6672 break;
6673 } 6673 }
6674 6674
6675 cpus_or(*groupmask, *groupmask, group->cpumask); 6675 cpus_or(*groupmask, *groupmask, group->cpumask);
6676 6676
6677 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6677 cpulist_scnprintf(str, sizeof(str), group->cpumask);
6678 printk(KERN_CONT " %s", str); 6678 printk(KERN_CONT " %s", str);
6679 6679
6680 group = group->next; 6680 group = group->next;
6681 } while (group != sd->groups); 6681 } while (group != sd->groups);
6682 printk(KERN_CONT "\n"); 6682 printk(KERN_CONT "\n");
6683 6683
6684 if (!cpus_equal(sd->span, *groupmask)) 6684 if (!cpus_equal(sd->span, *groupmask))
6685 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6685 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6686 6686
6687 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6687 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
6688 printk(KERN_ERR "ERROR: parent span is not a superset " 6688 printk(KERN_ERR "ERROR: parent span is not a superset "
6689 "of domain->span\n"); 6689 "of domain->span\n");
6690 return 0; 6690 return 0;
6691 } 6691 }
6692 6692
6693 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6693 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6694 { 6694 {
6695 cpumask_t *groupmask; 6695 cpumask_t *groupmask;
6696 int level = 0; 6696 int level = 0;
6697 6697
6698 if (!sd) { 6698 if (!sd) {
6699 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6699 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6700 return; 6700 return;
6701 } 6701 }
6702 6702
6703 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6703 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6704 6704
6705 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6705 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6706 if (!groupmask) { 6706 if (!groupmask) {
6707 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6707 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6708 return; 6708 return;
6709 } 6709 }
6710 6710
6711 for (;;) { 6711 for (;;) {
6712 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6712 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6713 break; 6713 break;
6714 level++; 6714 level++;
6715 sd = sd->parent; 6715 sd = sd->parent;
6716 if (!sd) 6716 if (!sd)
6717 break; 6717 break;
6718 } 6718 }
6719 kfree(groupmask); 6719 kfree(groupmask);
6720 } 6720 }
6721 #else /* !CONFIG_SCHED_DEBUG */ 6721 #else /* !CONFIG_SCHED_DEBUG */
6722 # define sched_domain_debug(sd, cpu) do { } while (0) 6722 # define sched_domain_debug(sd, cpu) do { } while (0)
6723 #endif /* CONFIG_SCHED_DEBUG */ 6723 #endif /* CONFIG_SCHED_DEBUG */
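
The per-level indentation in the domain dump comes from printf's dynamic field width: "%*s" pads an empty string to `level` columns. The idiom in isolation:

#include <stdio.h>

int main(void)
{
	for (int level = 0; level < 3; level++)
		/* "%*s" prints `level` spaces before the text */
		printf("%*s domain %d\n", level, "", level);
	return 0;
}
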
6724 6724
6725 static int sd_degenerate(struct sched_domain *sd) 6725 static int sd_degenerate(struct sched_domain *sd)
6726 { 6726 {
6727 if (cpus_weight(sd->span) == 1) 6727 if (cpus_weight(sd->span) == 1)
6728 return 1; 6728 return 1;
6729 6729
6730 /* Following flags need at least 2 groups */ 6730 /* Following flags need at least 2 groups */
6731 if (sd->flags & (SD_LOAD_BALANCE | 6731 if (sd->flags & (SD_LOAD_BALANCE |
6732 SD_BALANCE_NEWIDLE | 6732 SD_BALANCE_NEWIDLE |
6733 SD_BALANCE_FORK | 6733 SD_BALANCE_FORK |
6734 SD_BALANCE_EXEC | 6734 SD_BALANCE_EXEC |
6735 SD_SHARE_CPUPOWER | 6735 SD_SHARE_CPUPOWER |
6736 SD_SHARE_PKG_RESOURCES)) { 6736 SD_SHARE_PKG_RESOURCES)) {
6737 if (sd->groups != sd->groups->next) 6737 if (sd->groups != sd->groups->next)
6738 return 0; 6738 return 0;
6739 } 6739 }
6740 6740
6741 /* Following flags don't use groups */ 6741 /* Following flags don't use groups */
6742 if (sd->flags & (SD_WAKE_IDLE | 6742 if (sd->flags & (SD_WAKE_IDLE |
6743 SD_WAKE_AFFINE | 6743 SD_WAKE_AFFINE |
6744 SD_WAKE_BALANCE)) 6744 SD_WAKE_BALANCE))
6745 return 0; 6745 return 0;
6746 6746
6747 return 1; 6747 return 1;
6748 } 6748 }
6749 6749
6750 static int 6750 static int
6751 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6751 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6752 { 6752 {
6753 unsigned long cflags = sd->flags, pflags = parent->flags; 6753 unsigned long cflags = sd->flags, pflags = parent->flags;
6754 6754
6755 if (sd_degenerate(parent)) 6755 if (sd_degenerate(parent))
6756 return 1; 6756 return 1;
6757 6757
6758 if (!cpus_equal(sd->span, parent->span)) 6758 if (!cpus_equal(sd->span, parent->span))
6759 return 0; 6759 return 0;
6760 6760
6761 /* Does parent contain flags not in child? */ 6761 /* Does parent contain flags not in child? */
6762 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 6762 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
6763 if (cflags & SD_WAKE_AFFINE) 6763 if (cflags & SD_WAKE_AFFINE)
6764 pflags &= ~SD_WAKE_BALANCE; 6764 pflags &= ~SD_WAKE_BALANCE;
6765 /* Flags needing groups don't count if only 1 group in parent */ 6765 /* Flags needing groups don't count if only 1 group in parent */
6766 if (parent->groups == parent->groups->next) { 6766 if (parent->groups == parent->groups->next) {
6767 pflags &= ~(SD_LOAD_BALANCE | 6767 pflags &= ~(SD_LOAD_BALANCE |
6768 SD_BALANCE_NEWIDLE | 6768 SD_BALANCE_NEWIDLE |
6769 SD_BALANCE_FORK | 6769 SD_BALANCE_FORK |
6770 SD_BALANCE_EXEC | 6770 SD_BALANCE_EXEC |
6771 SD_SHARE_CPUPOWER | 6771 SD_SHARE_CPUPOWER |
6772 SD_SHARE_PKG_RESOURCES); 6772 SD_SHARE_PKG_RESOURCES);
6773 if (nr_node_ids == 1) 6773 if (nr_node_ids == 1)
6774 pflags &= ~SD_SERIALIZE; 6774 pflags &= ~SD_SERIALIZE;
6775 } 6775 }
6776 if (~cflags & pflags) 6776 if (~cflags & pflags)
6777 return 0; 6777 return 0;
6778 6778
6779 return 1; 6779 return 1;
6780 } 6780 }
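
The final `~cflags & pflags` test in sd_parent_degenerate() is a bitmask subset check: the parent survives only if it sets some flag the child lacks. With made-up flag values:

#include <assert.h>

int main(void)
{
	unsigned long cflags = 0x3;	/* child: bits 0 and 1 */
	unsigned long pflags = 0x1;	/* parent: bit 0 only   */

	/* nonzero iff the parent sets a bit the child does not */
	assert((~cflags & pflags) == 0);	/* parent adds nothing: degenerate */

	pflags = 0x5;				/* parent also sets bit 2 */
	assert((~cflags & pflags) != 0);	/* parent is kept */
	return 0;
}
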
6781 6781
6782 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6782 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6783 { 6783 {
6784 unsigned long flags; 6784 unsigned long flags;
6785 6785
6786 spin_lock_irqsave(&rq->lock, flags); 6786 spin_lock_irqsave(&rq->lock, flags);
6787 6787
6788 if (rq->rd) { 6788 if (rq->rd) {
6789 struct root_domain *old_rd = rq->rd; 6789 struct root_domain *old_rd = rq->rd;
6790 6790
6791 if (cpu_isset(rq->cpu, old_rd->online)) 6791 if (cpu_isset(rq->cpu, old_rd->online))
6792 set_rq_offline(rq); 6792 set_rq_offline(rq);
6793 6793
6794 cpu_clear(rq->cpu, old_rd->span); 6794 cpu_clear(rq->cpu, old_rd->span);
6795 6795
6796 if (atomic_dec_and_test(&old_rd->refcount)) 6796 if (atomic_dec_and_test(&old_rd->refcount))
6797 kfree(old_rd); 6797 kfree(old_rd);
6798 } 6798 }
6799 6799
6800 atomic_inc(&rd->refcount); 6800 atomic_inc(&rd->refcount);
6801 rq->rd = rd; 6801 rq->rd = rd;
6802 6802
6803 cpu_set(rq->cpu, rd->span); 6803 cpu_set(rq->cpu, rd->span);
6804 if (cpu_isset(rq->cpu, cpu_online_map)) 6804 if (cpu_isset(rq->cpu, cpu_online_map))
6805 set_rq_online(rq); 6805 set_rq_online(rq);
6806 6806
6807 spin_unlock_irqrestore(&rq->lock, flags); 6807 spin_unlock_irqrestore(&rq->lock, flags);
6808 } 6808 }
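
rq_attach_root() is a refcounted-pointer swap: drop the reference on the old root domain (freeing it on the last put), then take a reference on the new one before publishing it, all under rq->lock. A minimal sketch of the pattern in C11 atomics (illustrative; the kernel uses atomic_dec_and_test()/atomic_inc()):

#include <stdatomic.h>
#include <stdlib.h>

struct root {
	atomic_int refcount;
};

/* Caller holds the lock protecting *slot (rq->lock above). */
static void attach_root(struct root **slot, struct root *new_rd)
{
	struct root *old = *slot;

	if (old && atomic_fetch_sub(&old->refcount, 1) == 1)
		free(old);			/* last reference gone */

	atomic_fetch_add(&new_rd->refcount, 1);	/* take ours first... */
	*slot = new_rd;				/* ...then publish */
}
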
6809 6809
6810 static void init_rootdomain(struct root_domain *rd) 6810 static void init_rootdomain(struct root_domain *rd)
6811 { 6811 {
6812 memset(rd, 0, sizeof(*rd)); 6812 memset(rd, 0, sizeof(*rd));
6813 6813
6814 cpus_clear(rd->span); 6814 cpus_clear(rd->span);
6815 cpus_clear(rd->online); 6815 cpus_clear(rd->online);
6816 6816
6817 cpupri_init(&rd->cpupri); 6817 cpupri_init(&rd->cpupri);
6818 } 6818 }
6819 6819
6820 static void init_defrootdomain(void) 6820 static void init_defrootdomain(void)
6821 { 6821 {
6822 init_rootdomain(&def_root_domain); 6822 init_rootdomain(&def_root_domain);
6823 atomic_set(&def_root_domain.refcount, 1); 6823 atomic_set(&def_root_domain.refcount, 1);
6824 } 6824 }
6825 6825
6826 static struct root_domain *alloc_rootdomain(void) 6826 static struct root_domain *alloc_rootdomain(void)
6827 { 6827 {
6828 struct root_domain *rd; 6828 struct root_domain *rd;
6829 6829
6830 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6830 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6831 if (!rd) 6831 if (!rd)
6832 return NULL; 6832 return NULL;
6833 6833
6834 init_rootdomain(rd); 6834 init_rootdomain(rd);
6835 6835
6836 return rd; 6836 return rd;
6837 } 6837 }
6838 6838
6839 /* 6839 /*
6840 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6840 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6841 * hold the hotplug lock. 6841 * hold the hotplug lock.
6842 */ 6842 */
6843 static void 6843 static void
6844 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6844 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6845 { 6845 {
6846 struct rq *rq = cpu_rq(cpu); 6846 struct rq *rq = cpu_rq(cpu);
6847 struct sched_domain *tmp; 6847 struct sched_domain *tmp;
6848 6848
6849 /* Remove the sched domains which do not contribute to scheduling. */ 6849 /* Remove the sched domains which do not contribute to scheduling. */
6850 for (tmp = sd; tmp; ) { 6850 for (tmp = sd; tmp; ) {
6851 struct sched_domain *parent = tmp->parent; 6851 struct sched_domain *parent = tmp->parent;
6852 if (!parent) 6852 if (!parent)
6853 break; 6853 break;
6854 6854
6855 if (sd_parent_degenerate(tmp, parent)) { 6855 if (sd_parent_degenerate(tmp, parent)) {
6856 tmp->parent = parent->parent; 6856 tmp->parent = parent->parent;
6857 if (parent->parent) 6857 if (parent->parent)
6858 parent->parent->child = tmp; 6858 parent->parent->child = tmp;
6859 } else 6859 } else
6860 tmp = tmp->parent; 6860 tmp = tmp->parent;
6861 } 6861 }
6862 6862
6863 if (sd && sd_degenerate(sd)) { 6863 if (sd && sd_degenerate(sd)) {
6864 sd = sd->parent; 6864 sd = sd->parent;
6865 if (sd) 6865 if (sd)
6866 sd->child = NULL; 6866 sd->child = NULL;
6867 } 6867 }
6868 6868
6869 sched_domain_debug(sd, cpu); 6869 sched_domain_debug(sd, cpu);
6870 6870
6871 rq_attach_root(rq, rd); 6871 rq_attach_root(rq, rd);
6872 rcu_assign_pointer(rq->sd, sd); 6872 rcu_assign_pointer(rq->sd, sd);
6873 } 6873 }
6874 6874
6875 /* cpus with isolated domains */ 6875 /* cpus with isolated domains */
6876 static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 6876 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6877 6877
6878 /* Setup the mask of cpus configured for isolated domains */ 6878 /* Setup the mask of cpus configured for isolated domains */
6879 static int __init isolated_cpu_setup(char *str) 6879 static int __init isolated_cpu_setup(char *str)
6880 { 6880 {
6881 static int __initdata ints[NR_CPUS]; 6881 static int __initdata ints[NR_CPUS];
6882 int i; 6882 int i;
6883 6883
6884 str = get_options(str, ARRAY_SIZE(ints), ints); 6884 str = get_options(str, ARRAY_SIZE(ints), ints);
6885 cpus_clear(cpu_isolated_map); 6885 cpus_clear(cpu_isolated_map);
6886 for (i = 1; i <= ints[0]; i++) 6886 for (i = 1; i <= ints[0]; i++)
6887 if (ints[i] < NR_CPUS) 6887 if (ints[i] < NR_CPUS)
6888 cpu_set(ints[i], cpu_isolated_map); 6888 cpu_set(ints[i], cpu_isolated_map);
6889 return 1; 6889 return 1;
6890 } 6890 }
6891 6891
6892 __setup("isolcpus=", isolated_cpu_setup); 6892 __setup("isolcpus=", isolated_cpu_setup);
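
Usage note: this parses the isolcpus= boot parameter, so booting with e.g.

	isolcpus=2,3

keeps CPUs 2 and 3 out of every sched domain built below; they then run only tasks explicitly bound to them (e.g. via sched_setaffinity()).
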
6893 6893
6894 /* 6894 /*
6895 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6895 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6896 * to a function which identifies what group (along with sched group) a CPU 6896 * to a function which identifies what group (along with sched group) a CPU
6897 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 6897 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
6898 * (due to the fact that we keep track of groups covered with a cpumask_t). 6898 * (due to the fact that we keep track of groups covered with a cpumask_t).
6899 * 6899 *
6900 * init_sched_build_groups will build a circular linked list of the groups 6900 * init_sched_build_groups will build a circular linked list of the groups
6901 * covered by the given span, and will set each group's ->cpumask correctly, 6901 * covered by the given span, and will set each group's ->cpumask correctly,
6902 * and ->cpu_power to 0. 6902 * and ->cpu_power to 0.
6903 */ 6903 */
6904 static void 6904 static void
6905 init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 6905 init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6906 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6906 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
6907 struct sched_group **sg, 6907 struct sched_group **sg,
6908 cpumask_t *tmpmask), 6908 cpumask_t *tmpmask),
6909 cpumask_t *covered, cpumask_t *tmpmask) 6909 cpumask_t *covered, cpumask_t *tmpmask)
6910 { 6910 {
6911 struct sched_group *first = NULL, *last = NULL; 6911 struct sched_group *first = NULL, *last = NULL;
6912 int i; 6912 int i;
6913 6913
6914 cpus_clear(*covered); 6914 cpus_clear(*covered);
6915 6915
6916 for_each_cpu_mask_nr(i, *span) { 6916 for_each_cpu_mask_nr(i, *span) {
6917 struct sched_group *sg; 6917 struct sched_group *sg;
6918 int group = group_fn(i, cpu_map, &sg, tmpmask); 6918 int group = group_fn(i, cpu_map, &sg, tmpmask);
6919 int j; 6919 int j;
6920 6920
6921 if (cpu_isset(i, *covered)) 6921 if (cpu_isset(i, *covered))
6922 continue; 6922 continue;
6923 6923
6924 cpus_clear(sg->cpumask); 6924 cpus_clear(sg->cpumask);
6925 sg->__cpu_power = 0; 6925 sg->__cpu_power = 0;
6926 6926
6927 for_each_cpu_mask_nr(j, *span) { 6927 for_each_cpu_mask_nr(j, *span) {
6928 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6928 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6929 continue; 6929 continue;
6930 6930
6931 cpu_set(j, *covered); 6931 cpu_set(j, *covered);
6932 cpu_set(j, sg->cpumask); 6932 cpu_set(j, sg->cpumask);
6933 } 6933 }
6934 if (!first) 6934 if (!first)
6935 first = sg; 6935 first = sg;
6936 if (last) 6936 if (last)
6937 last->next = sg; 6937 last->next = sg;
6938 last = sg; 6938 last = sg;
6939 } 6939 }
6940 last->next = first; 6940 last->next = first;
6941 } 6941 }
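
init_sched_build_groups() builds a circular singly linked list in one pass with the usual first/last bookkeeping (assuming, as here, at least one group in the span). Reduced to its skeleton:

#include <stdio.h>
#include <stdlib.h>

struct group {
	int id;
	struct group *next;
};

int main(void)
{
	struct group *first = NULL, *last = NULL;

	for (int i = 0; i < 3; i++) {
		struct group *g = malloc(sizeof(*g));	/* error handling elided */

		g->id = i;
		g->next = NULL;
		if (!first)
			first = g;	/* remember the head */
		if (last)
			last->next = g;
		last = g;
	}
	last->next = first;		/* close the ring */

	struct group *g = first;
	do {				/* same do/while walk the kernel uses */
		printf("group %d\n", g->id);
		g = g->next;
	} while (g != first);
	return 0;
}
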
6942 6942
6943 #define SD_NODES_PER_DOMAIN 16 6943 #define SD_NODES_PER_DOMAIN 16
6944 6944
6945 #ifdef CONFIG_NUMA 6945 #ifdef CONFIG_NUMA
6946 6946
6947 /** 6947 /**
6948 * find_next_best_node - find the next node to include in a sched_domain 6948 * find_next_best_node - find the next node to include in a sched_domain
6949 * @node: node whose sched_domain we're building 6949 * @node: node whose sched_domain we're building
6950 * @used_nodes: nodes already in the sched_domain 6950 * @used_nodes: nodes already in the sched_domain
6951 * 6951 *
6952 * Find the next node to include in a given scheduling domain. Simply 6952 * Find the next node to include in a given scheduling domain. Simply
6953 * finds the closest node not already in the @used_nodes map. 6953 * finds the closest node not already in the @used_nodes map.
6954 * 6954 *
6955 * Should use nodemask_t. 6955 * Should use nodemask_t.
6956 */ 6956 */
6957 static int find_next_best_node(int node, nodemask_t *used_nodes) 6957 static int find_next_best_node(int node, nodemask_t *used_nodes)
6958 { 6958 {
6959 int i, n, val, min_val, best_node = 0; 6959 int i, n, val, min_val, best_node = 0;
6960 6960
6961 min_val = INT_MAX; 6961 min_val = INT_MAX;
6962 6962
6963 for (i = 0; i < nr_node_ids; i++) { 6963 for (i = 0; i < nr_node_ids; i++) {
6964 /* Start at @node */ 6964 /* Start at @node */
6965 n = (node + i) % nr_node_ids; 6965 n = (node + i) % nr_node_ids;
6966 6966
6967 if (!nr_cpus_node(n)) 6967 if (!nr_cpus_node(n))
6968 continue; 6968 continue;
6969 6969
6970 /* Skip already used nodes */ 6970 /* Skip already used nodes */
6971 if (node_isset(n, *used_nodes)) 6971 if (node_isset(n, *used_nodes))
6972 continue; 6972 continue;
6973 6973
6974 /* Simple min distance search */ 6974 /* Simple min distance search */
6975 val = node_distance(node, n); 6975 val = node_distance(node, n);
6976 6976
6977 if (val < min_val) { 6977 if (val < min_val) {
6978 min_val = val; 6978 min_val = val;
6979 best_node = n; 6979 best_node = n;
6980 } 6980 }
6981 } 6981 }
6982 6982
6983 node_set(best_node, *used_nodes); 6983 node_set(best_node, *used_nodes);
6984 return best_node; 6984 return best_node;
6985 } 6985 }
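
find_next_best_node() is a greedy nearest-neighbor search over the node distance table. A user-space rendering with a hypothetical 4-node SLIT-style table standing in for node_distance():

#include <limits.h>
#include <stdio.h>

#define NODES 4

/* hypothetical distance table: dist[i][j], 10 = local */
static const int dist[NODES][NODES] = {
	{ 10, 20, 30, 20 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 20, 30, 20, 10 },
};

static int find_next_best(int node, unsigned int *used)
{
	int best = 0, min_val = INT_MAX;

	for (int i = 0; i < NODES; i++) {
		int n = (node + i) % NODES;	/* start at @node */

		if (*used & (1u << n))		/* skip already-used nodes */
			continue;
		if (dist[node][n] < min_val) {
			min_val = dist[node][n];
			best = n;
		}
	}
	*used |= 1u << best;
	return best;
}

int main(void)
{
	unsigned int used = 1u << 0;	/* node 0 starts the domain */

	printf("%d\n", find_next_best(0, &used));	/* 1 (distance 20) */
	printf("%d\n", find_next_best(0, &used));	/* 3 (distance 20) */
	return 0;
}
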
6986 6986
6987 /** 6987 /**
6988 * sched_domain_node_span - get a cpumask for a node's sched_domain 6988 * sched_domain_node_span - get a cpumask for a node's sched_domain
6989 * @node: node whose cpumask we're constructing 6989 * @node: node whose cpumask we're constructing
6990 * @span: resulting cpumask 6990 * @span: resulting cpumask
6991 * 6991 *
6992 * Given a node, construct a good cpumask for its sched_domain to span. It 6992 * Given a node, construct a good cpumask for its sched_domain to span. It
6993 * should be one that prevents unnecessary balancing, but also spreads tasks 6993 * should be one that prevents unnecessary balancing, but also spreads tasks
6994 * out optimally. 6994 * out optimally.
6995 */ 6995 */
6996 static void sched_domain_node_span(int node, cpumask_t *span) 6996 static void sched_domain_node_span(int node, cpumask_t *span)
6997 { 6997 {
6998 nodemask_t used_nodes; 6998 nodemask_t used_nodes;
6999 node_to_cpumask_ptr(nodemask, node); 6999 node_to_cpumask_ptr(nodemask, node);
7000 int i; 7000 int i;
7001 7001
7002 cpus_clear(*span); 7002 cpus_clear(*span);
7003 nodes_clear(used_nodes); 7003 nodes_clear(used_nodes);
7004 7004
7005 cpus_or(*span, *span, *nodemask); 7005 cpus_or(*span, *span, *nodemask);
7006 node_set(node, used_nodes); 7006 node_set(node, used_nodes);
7007 7007
7008 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7008 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7009 int next_node = find_next_best_node(node, &used_nodes); 7009 int next_node = find_next_best_node(node, &used_nodes);
7010 7010
7011 node_to_cpumask_ptr_next(nodemask, next_node); 7011 node_to_cpumask_ptr_next(nodemask, next_node);
7012 cpus_or(*span, *span, *nodemask); 7012 cpus_or(*span, *span, *nodemask);
7013 } 7013 }
7014 } 7014 }
7015 #endif /* CONFIG_NUMA */ 7015 #endif /* CONFIG_NUMA */
7016 7016
7017 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7017 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7018 7018
7019 /* 7019 /*
7020 * SMT sched-domains: 7020 * SMT sched-domains:
7021 */ 7021 */
7022 #ifdef CONFIG_SCHED_SMT 7022 #ifdef CONFIG_SCHED_SMT
7023 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7023 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
7024 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7024 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
7025 7025
7026 static int 7026 static int
7027 cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7027 cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7028 cpumask_t *unused) 7028 cpumask_t *unused)
7029 { 7029 {
7030 if (sg) 7030 if (sg)
7031 *sg = &per_cpu(sched_group_cpus, cpu); 7031 *sg = &per_cpu(sched_group_cpus, cpu);
7032 return cpu; 7032 return cpu;
7033 } 7033 }
7034 #endif /* CONFIG_SCHED_SMT */ 7034 #endif /* CONFIG_SCHED_SMT */
7035 7035
7036 /* 7036 /*
7037 * multi-core sched-domains: 7037 * multi-core sched-domains:
7038 */ 7038 */
7039 #ifdef CONFIG_SCHED_MC 7039 #ifdef CONFIG_SCHED_MC
7040 static DEFINE_PER_CPU(struct sched_domain, core_domains); 7040 static DEFINE_PER_CPU(struct sched_domain, core_domains);
7041 static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7041 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
7042 #endif /* CONFIG_SCHED_MC */ 7042 #endif /* CONFIG_SCHED_MC */
7043 7043
7044 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7044 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7045 static int 7045 static int
7046 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7046 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7047 cpumask_t *mask) 7047 cpumask_t *mask)
7048 { 7048 {
7049 int group; 7049 int group;
7050 7050
7051 *mask = per_cpu(cpu_sibling_map, cpu); 7051 *mask = per_cpu(cpu_sibling_map, cpu);
7052 cpus_and(*mask, *mask, *cpu_map); 7052 cpus_and(*mask, *mask, *cpu_map);
7053 group = first_cpu(*mask); 7053 group = first_cpu(*mask);
7054 if (sg) 7054 if (sg)
7055 *sg = &per_cpu(sched_group_core, group); 7055 *sg = &per_cpu(sched_group_core, group);
7056 return group; 7056 return group;
7057 } 7057 }
7058 #elif defined(CONFIG_SCHED_MC) 7058 #elif defined(CONFIG_SCHED_MC)
7059 static int 7059 static int
7060 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7060 cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7061 cpumask_t *unused) 7061 cpumask_t *unused)
7062 { 7062 {
7063 if (sg) 7063 if (sg)
7064 *sg = &per_cpu(sched_group_core, cpu); 7064 *sg = &per_cpu(sched_group_core, cpu);
7065 return cpu; 7065 return cpu;
7066 } 7066 }
7067 #endif 7067 #endif
7068 7068
7069 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7069 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
7070 static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7070 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
7071 7071
7072 static int 7072 static int
7073 cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7073 cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7074 cpumask_t *mask) 7074 cpumask_t *mask)
7075 { 7075 {
7076 int group; 7076 int group;
7077 #ifdef CONFIG_SCHED_MC 7077 #ifdef CONFIG_SCHED_MC
7078 *mask = cpu_coregroup_map(cpu); 7078 *mask = cpu_coregroup_map(cpu);
7079 cpus_and(*mask, *mask, *cpu_map); 7079 cpus_and(*mask, *mask, *cpu_map);
7080 group = first_cpu(*mask); 7080 group = first_cpu(*mask);
7081 #elif defined(CONFIG_SCHED_SMT) 7081 #elif defined(CONFIG_SCHED_SMT)
7082 *mask = per_cpu(cpu_sibling_map, cpu); 7082 *mask = per_cpu(cpu_sibling_map, cpu);
7083 cpus_and(*mask, *mask, *cpu_map); 7083 cpus_and(*mask, *mask, *cpu_map);
7084 group = first_cpu(*mask); 7084 group = first_cpu(*mask);
7085 #else 7085 #else
7086 group = cpu; 7086 group = cpu;
7087 #endif 7087 #endif
7088 if (sg) 7088 if (sg)
7089 *sg = &per_cpu(sched_group_phys, group); 7089 *sg = &per_cpu(sched_group_phys, group);
7090 return group; 7090 return group;
7091 } 7091 }
7092 7092
7093 #ifdef CONFIG_NUMA 7093 #ifdef CONFIG_NUMA
7094 /* 7094 /*
7095 * The init_sched_build_groups can't handle what we want to do with node 7095 * The init_sched_build_groups can't handle what we want to do with node
7096 * groups, so roll our own. Now each node has its own list of groups which 7096 * groups, so roll our own. Now each node has its own list of groups which
7097 * gets dynamically allocated. 7097 * gets dynamically allocated.
7098 */ 7098 */
7099 static DEFINE_PER_CPU(struct sched_domain, node_domains); 7099 static DEFINE_PER_CPU(struct sched_domain, node_domains);
7100 static struct sched_group ***sched_group_nodes_bycpu; 7100 static struct sched_group ***sched_group_nodes_bycpu;
7101 7101
7102 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7102 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7103 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7103 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
7104 7104
7105 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7105 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
7106 struct sched_group **sg, cpumask_t *nodemask) 7106 struct sched_group **sg, cpumask_t *nodemask)
7107 { 7107 {
7108 int group; 7108 int group;
7109 7109
7110 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7110 *nodemask = node_to_cpumask(cpu_to_node(cpu));
7111 cpus_and(*nodemask, *nodemask, *cpu_map); 7111 cpus_and(*nodemask, *nodemask, *cpu_map);
7112 group = first_cpu(*nodemask); 7112 group = first_cpu(*nodemask);
7113 7113
7114 if (sg) 7114 if (sg)
7115 *sg = &per_cpu(sched_group_allnodes, group); 7115 *sg = &per_cpu(sched_group_allnodes, group);
7116 return group; 7116 return group;
7117 } 7117 }
7118 7118
7119 static void init_numa_sched_groups_power(struct sched_group *group_head) 7119 static void init_numa_sched_groups_power(struct sched_group *group_head)
7120 { 7120 {
7121 struct sched_group *sg = group_head; 7121 struct sched_group *sg = group_head;
7122 int j; 7122 int j;
7123 7123
7124 if (!sg) 7124 if (!sg)
7125 return; 7125 return;
7126 do { 7126 do {
7127 for_each_cpu_mask_nr(j, sg->cpumask) { 7127 for_each_cpu_mask_nr(j, sg->cpumask) {
7128 struct sched_domain *sd; 7128 struct sched_domain *sd;
7129 7129
7130 sd = &per_cpu(phys_domains, j); 7130 sd = &per_cpu(phys_domains, j);
7131 if (j != first_cpu(sd->groups->cpumask)) { 7131 if (j != first_cpu(sd->groups->cpumask)) {
7132 /* 7132 /*
7133 * Only add "power" once for each 7133 * Only add "power" once for each
7134 * physical package. 7134 * physical package.
7135 */ 7135 */
7136 continue; 7136 continue;
7137 } 7137 }
7138 7138
7139 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 7139 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
7140 } 7140 }
7141 sg = sg->next; 7141 sg = sg->next;
7142 } while (sg != group_head); 7142 } while (sg != group_head);
7143 } 7143 }
7144 #endif /* CONFIG_NUMA */ 7144 #endif /* CONFIG_NUMA */
7145 7145
7146 #ifdef CONFIG_NUMA 7146 #ifdef CONFIG_NUMA
7147 /* Free memory allocated for various sched_group structures */ 7147 /* Free memory allocated for various sched_group structures */
7148 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7148 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7149 { 7149 {
7150 int cpu, i; 7150 int cpu, i;
7151 7151
7152 for_each_cpu_mask_nr(cpu, *cpu_map) { 7152 for_each_cpu_mask_nr(cpu, *cpu_map) {
7153 struct sched_group **sched_group_nodes 7153 struct sched_group **sched_group_nodes
7154 = sched_group_nodes_bycpu[cpu]; 7154 = sched_group_nodes_bycpu[cpu];
7155 7155
7156 if (!sched_group_nodes) 7156 if (!sched_group_nodes)
7157 continue; 7157 continue;
7158 7158
7159 for (i = 0; i < nr_node_ids; i++) { 7159 for (i = 0; i < nr_node_ids; i++) {
7160 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7160 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7161 7161
7162 *nodemask = node_to_cpumask(i); 7162 *nodemask = node_to_cpumask(i);
7163 cpus_and(*nodemask, *nodemask, *cpu_map); 7163 cpus_and(*nodemask, *nodemask, *cpu_map);
7164 if (cpus_empty(*nodemask)) 7164 if (cpus_empty(*nodemask))
7165 continue; 7165 continue;
7166 7166
7167 if (sg == NULL) 7167 if (sg == NULL)
7168 continue; 7168 continue;
7169 sg = sg->next; 7169 sg = sg->next;
7170 next_sg: 7170 next_sg:
7171 oldsg = sg; 7171 oldsg = sg;
7172 sg = sg->next; 7172 sg = sg->next;
7173 kfree(oldsg); 7173 kfree(oldsg);
7174 if (oldsg != sched_group_nodes[i]) 7174 if (oldsg != sched_group_nodes[i])
7175 goto next_sg; 7175 goto next_sg;
7176 } 7176 }
7177 kfree(sched_group_nodes); 7177 kfree(sched_group_nodes);
7178 sched_group_nodes_bycpu[cpu] = NULL; 7178 sched_group_nodes_bycpu[cpu] = NULL;
7179 } 7179 }
7180 } 7180 }
7181 #else /* !CONFIG_NUMA */ 7181 #else /* !CONFIG_NUMA */
7182 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7182 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7183 { 7183 {
7184 } 7184 }
7185 #endif /* CONFIG_NUMA */ 7185 #endif /* CONFIG_NUMA */
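
The goto loop in the NUMA free_sched_groups() frees a group ring starting at head->next and stops after freeing the head itself. An equivalent plain loop, which also handles a one-element ring:

#include <stdlib.h>

struct group {
	struct group *next;
};

static void free_ring(struct group *head)
{
	struct group *g = head->next;

	while (g != head) {		/* free everyone but the head... */
		struct group *next = g->next;

		free(g);
		g = next;
	}
	free(head);			/* ...then the head itself */
}
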
7186 7186
7187 /* 7187 /*
7188 * Initialize sched groups cpu_power. 7188 * Initialize sched groups cpu_power.
7189 * 7189 *
7190 * cpu_power indicates the capacity of sched group, which is used while 7190 * cpu_power indicates the capacity of sched group, which is used while
7191 * distributing the load between different sched groups in a sched domain. 7191 * distributing the load between different sched groups in a sched domain.
7192 * Typically cpu_power for all the groups in a sched domain will be the same 7192 * Typically cpu_power for all the groups in a sched domain will be the same
7193 * there are asymmetries in the topology. If there are asymmetries, group 7193 * there are asymmetries in the topology. If there are asymmetries, group
7194 * having more cpu_power will pick up more load compared to the group having 7194 * having more cpu_power will pick up more load compared to the group having
7195 * less cpu_power. 7195 * less cpu_power.
7196 * 7196 *
7197 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents 7197 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
7198 * the maximum number of tasks a group can handle in the presence of other idle 7198 * the maximum number of tasks a group can handle in the presence of other idle
7199 * or lightly loaded groups in the same sched domain. 7199 * or lightly loaded groups in the same sched domain.
7200 */ 7200 */
7201 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7201 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7202 { 7202 {
7203 struct sched_domain *child; 7203 struct sched_domain *child;
7204 struct sched_group *group; 7204 struct sched_group *group;
7205 7205
7206 WARN_ON(!sd || !sd->groups); 7206 WARN_ON(!sd || !sd->groups);
7207 7207
7208 if (cpu != first_cpu(sd->groups->cpumask)) 7208 if (cpu != first_cpu(sd->groups->cpumask))
7209 return; 7209 return;
7210 7210
7211 child = sd->child; 7211 child = sd->child;
7212 7212
7213 sd->groups->__cpu_power = 0; 7213 sd->groups->__cpu_power = 0;
7214 7214
7215 /* 7215 /*
7216 * For perf policy, if the groups in child domain share resources 7216 * For perf policy, if the groups in child domain share resources
7217 * (for example cores sharing some portions of the cache hierarchy 7217 * (for example cores sharing some portions of the cache hierarchy
7218 * or SMT), then set this domain's groups' cpu_power such that each group 7218 * or SMT), then set this domain's groups' cpu_power such that each group
7219 * can handle only one task, when there are other idle groups in the 7219 * can handle only one task, when there are other idle groups in the
7220 * same sched domain. 7220 * same sched domain.
7221 */ 7221 */
7222 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 7222 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
7223 (child->flags & 7223 (child->flags &
7224 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 7224 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
7225 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 7225 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
7226 return; 7226 return;
7227 } 7227 }
7228 7228
7229 /* 7229 /*
7230 * add cpu_power of each child group to this group's cpu_power 7230 * add cpu_power of each child group to this group's cpu_power
7231 */ 7231 */
7232 group = child->groups; 7232 group = child->groups;
7233 do { 7233 do {
7234 sg_inc_cpu_power(sd->groups, group->__cpu_power); 7234 sg_inc_cpu_power(sd->groups, group->__cpu_power);
7235 group = group->next; 7235 group = group->next;
7236 } while (group != child->groups); 7236 } while (group != child->groups);
7237 } 7237 }
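
A worked example of the two branches above, assuming SCHED_LOAD_SCALE is 1024 and a machine with two dual-SMT packages: the resource-sharing (SMT) level contributes one unit per package, and the physical level sums its children:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024	/* one task's worth of capacity (assumed) */

int main(void)
{
	/* SMT siblings share CPU power: one unit per package, not two */
	unsigned int package_power[2] = { SCHED_LOAD_SCALE, SCHED_LOAD_SCALE };

	/* the physical level adds up its child groups' power */
	unsigned int phys_power = package_power[0] + package_power[1];

	printf("%u\n", phys_power);	/* 2048: room to spread two tasks */
	return 0;
}
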
7238 7238
7239 /* 7239 /*
7240 * Initializers for schedule domains 7240 * Initializers for schedule domains
7241 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7241 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7242 */ 7242 */
7243 7243
7244 #ifdef CONFIG_SCHED_DEBUG 7244 #ifdef CONFIG_SCHED_DEBUG
7245 # define SD_INIT_NAME(sd, type) sd->name = #type 7245 # define SD_INIT_NAME(sd, type) sd->name = #type
7246 #else 7246 #else
7247 # define SD_INIT_NAME(sd, type) do { } while (0) 7247 # define SD_INIT_NAME(sd, type) do { } while (0)
7248 #endif 7248 #endif
7249 7249
7250 #define SD_INIT(sd, type) sd_init_##type(sd) 7250 #define SD_INIT(sd, type) sd_init_##type(sd)
7251 7251
7252 #define SD_INIT_FUNC(type) \ 7252 #define SD_INIT_FUNC(type) \
7253 static noinline void sd_init_##type(struct sched_domain *sd) \ 7253 static noinline void sd_init_##type(struct sched_domain *sd) \
7254 { \ 7254 { \
7255 memset(sd, 0, sizeof(*sd)); \ 7255 memset(sd, 0, sizeof(*sd)); \
7256 *sd = SD_##type##_INIT; \ 7256 *sd = SD_##type##_INIT; \
7257 sd->level = SD_LV_##type; \ 7257 sd->level = SD_LV_##type; \
7258 SD_INIT_NAME(sd, type); \ 7258 SD_INIT_NAME(sd, type); \
7259 } 7259 }
7260 7260
7261 SD_INIT_FUNC(CPU) 7261 SD_INIT_FUNC(CPU)
7262 #ifdef CONFIG_NUMA 7262 #ifdef CONFIG_NUMA
7263 SD_INIT_FUNC(ALLNODES) 7263 SD_INIT_FUNC(ALLNODES)
7264 SD_INIT_FUNC(NODE) 7264 SD_INIT_FUNC(NODE)
7265 #endif 7265 #endif
7266 #ifdef CONFIG_SCHED_SMT 7266 #ifdef CONFIG_SCHED_SMT
7267 SD_INIT_FUNC(SIBLING) 7267 SD_INIT_FUNC(SIBLING)
7268 #endif 7268 #endif
7269 #ifdef CONFIG_SCHED_MC 7269 #ifdef CONFIG_SCHED_MC
7270 SD_INIT_FUNC(MC) 7270 SD_INIT_FUNC(MC)
7271 #endif 7271 #endif
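
For reference, SD_INIT(sd, CPU) resolves to a call to the generated function below; the name assignment exists only under CONFIG_SCHED_DEBUG:

static noinline void sd_init_CPU(struct sched_domain *sd)
{
	memset(sd, 0, sizeof(*sd));
	*sd = SD_CPU_INIT;	/* per-level template from the topology headers */
	sd->level = SD_LV_CPU;
	sd->name = "CPU";	/* SD_INIT_NAME(sd, CPU); debug builds only */
}
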
7272 7272
7273 /* 7273 /*
7274 * To minimize stack usage, kmalloc room for cpumasks and share the 7274 * To minimize stack usage, kmalloc room for cpumasks and share the
7275 * space as the usage in build_sched_domains() dictates. Used only 7275 * space as the usage in build_sched_domains() dictates. Used only
7276 * if the amount of space is significant. 7276 * if the amount of space is significant.
7277 */ 7277 */
7278 struct allmasks { 7278 struct allmasks {
7279 cpumask_t tmpmask; /* make this one first */ 7279 cpumask_t tmpmask; /* make this one first */
7280 union { 7280 union {
7281 cpumask_t nodemask; 7281 cpumask_t nodemask;
7282 cpumask_t this_sibling_map; 7282 cpumask_t this_sibling_map;
7283 cpumask_t this_core_map; 7283 cpumask_t this_core_map;
7284 }; 7284 };
7285 cpumask_t send_covered; 7285 cpumask_t send_covered;
7286 7286
7287 #ifdef CONFIG_NUMA 7287 #ifdef CONFIG_NUMA
7288 cpumask_t domainspan; 7288 cpumask_t domainspan;
7289 cpumask_t covered; 7289 cpumask_t covered;
7290 cpumask_t notcovered; 7290 cpumask_t notcovered;
7291 #endif 7291 #endif
7292 }; 7292 };
7293 7293
7294 #if NR_CPUS > 128 7294 #if NR_CPUS > 128
7295 #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v 7295 #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7296 static inline void sched_cpumask_alloc(struct allmasks **masks) 7296 static inline void sched_cpumask_alloc(struct allmasks **masks)
7297 { 7297 {
7298 *masks = kmalloc(sizeof(**masks), GFP_KERNEL); 7298 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7299 } 7299 }
7300 static inline void sched_cpumask_free(struct allmasks *masks) 7300 static inline void sched_cpumask_free(struct allmasks *masks)
7301 { 7301 {
7302 kfree(masks); 7302 kfree(masks);
7303 } 7303 }
7304 #else 7304 #else
7305 #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v 7305 #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7306 static inline void sched_cpumask_alloc(struct allmasks **masks) 7306 static inline void sched_cpumask_alloc(struct allmasks **masks)
7307 { } 7307 { }
7308 static inline void sched_cpumask_free(struct allmasks *masks) 7308 static inline void sched_cpumask_free(struct allmasks *masks)
7309 { } 7309 { }
7310 #endif 7310 #endif
7311 7311
7312 #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ 7312 #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7313 ((unsigned long)(a) + offsetof(struct allmasks, v)) 7313 ((unsigned long)(a) + offsetof(struct allmasks, v))
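
SCHED_CPUMASK_VAR is plain pointer arithmetic aimed at one member of the shared allmasks block, so SCHED_CPUMASK_VAR(nodemask, allmasks) expands to:

cpumask_t *nodemask = (cpumask_t *)
	((unsigned long)(allmasks) + offsetof(struct allmasks, nodemask));
/* i.e. the same address as &allmasks->nodemask */
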
7314 7314
7315 static int default_relax_domain_level = -1; 7315 static int default_relax_domain_level = -1;
7316 7316
7317 static int __init setup_relax_domain_level(char *str) 7317 static int __init setup_relax_domain_level(char *str)
7318 { 7318 {
7319 unsigned long val; 7319 unsigned long val;
7320 7320
7321 val = simple_strtoul(str, NULL, 0); 7321 val = simple_strtoul(str, NULL, 0);
7322 if (val < SD_LV_MAX) 7322 if (val < SD_LV_MAX)
7323 default_relax_domain_level = val; 7323 default_relax_domain_level = val;
7324 7324
7325 return 1; 7325 return 1;
7326 } 7326 }
7327 __setup("relax_domain_level=", setup_relax_domain_level); 7327 __setup("relax_domain_level=", setup_relax_domain_level);
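
Usage note: booting with e.g. relax_domain_level=1 makes set_domain_attribute() below strip SD_WAKE_IDLE/SD_BALANCE_NEWIDLE from every domain whose level exceeds the request, while domains at or below it keep (or gain) aggressive idle balancing; a per-cpuset attr->relax_domain_level still overrides this boot-time default.
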
7328 7328
7329 static void set_domain_attribute(struct sched_domain *sd, 7329 static void set_domain_attribute(struct sched_domain *sd,
7330 struct sched_domain_attr *attr) 7330 struct sched_domain_attr *attr)
7331 { 7331 {
7332 int request; 7332 int request;
7333 7333
7334 if (!attr || attr->relax_domain_level < 0) { 7334 if (!attr || attr->relax_domain_level < 0) {
7335 if (default_relax_domain_level < 0) 7335 if (default_relax_domain_level < 0)
7336 return; 7336 return;
7337 else 7337 else
7338 request = default_relax_domain_level; 7338 request = default_relax_domain_level;
7339 } else 7339 } else
7340 request = attr->relax_domain_level; 7340 request = attr->relax_domain_level;
7341 if (request < sd->level) { 7341 if (request < sd->level) {
7342 /* turn off idle balance on this domain */ 7342 /* turn off idle balance on this domain */
7343 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 7343 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
7344 } else { 7344 } else {
7345 /* turn on idle balance on this domain */ 7345 /* turn on idle balance on this domain */
7346 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 7346 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
7347 } 7347 }
7348 } 7348 }
7349 7349
7350 /* 7350 /*
7351 * Build sched domains for a given set of cpus and attach the sched domains 7351 * Build sched domains for a given set of cpus and attach the sched domains
7352 * to the individual cpus 7352 * to the individual cpus
7353 */ 7353 */
7354 static int __build_sched_domains(const cpumask_t *cpu_map, 7354 static int __build_sched_domains(const cpumask_t *cpu_map,
7355 struct sched_domain_attr *attr) 7355 struct sched_domain_attr *attr)
7356 { 7356 {
7357 int i; 7357 int i;
7358 struct root_domain *rd; 7358 struct root_domain *rd;
7359 SCHED_CPUMASK_DECLARE(allmasks); 7359 SCHED_CPUMASK_DECLARE(allmasks);
7360 cpumask_t *tmpmask; 7360 cpumask_t *tmpmask;
7361 #ifdef CONFIG_NUMA 7361 #ifdef CONFIG_NUMA
7362 struct sched_group **sched_group_nodes = NULL; 7362 struct sched_group **sched_group_nodes = NULL;
7363 int sd_allnodes = 0; 7363 int sd_allnodes = 0;
7364 7364
7365 /* 7365 /*
7366 * Allocate the per-node list of sched groups 7366 * Allocate the per-node list of sched groups
7367 */ 7367 */
7368 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 7368 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7369 GFP_KERNEL); 7369 GFP_KERNEL);
7370 if (!sched_group_nodes) { 7370 if (!sched_group_nodes) {
7371 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7371 printk(KERN_WARNING "Can not alloc sched group node list\n");
7372 return -ENOMEM; 7372 return -ENOMEM;
7373 } 7373 }
7374 #endif 7374 #endif
7375 7375
7376 rd = alloc_rootdomain(); 7376 rd = alloc_rootdomain();
7377 if (!rd) { 7377 if (!rd) {
7378 printk(KERN_WARNING "Cannot alloc root domain\n"); 7378 printk(KERN_WARNING "Cannot alloc root domain\n");
7379 #ifdef CONFIG_NUMA 7379 #ifdef CONFIG_NUMA
7380 kfree(sched_group_nodes); 7380 kfree(sched_group_nodes);
7381 #endif 7381 #endif
7382 return -ENOMEM; 7382 return -ENOMEM;
7383 } 7383 }
7384 7384
7385 /* get space for all scratch cpumask variables */ 7385 /* get space for all scratch cpumask variables */
7386 sched_cpumask_alloc(&allmasks); 7386 sched_cpumask_alloc(&allmasks);
7387 if (!allmasks) { 7387 if (!allmasks) {
7388 printk(KERN_WARNING "Cannot alloc cpumask array\n"); 7388 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7389 kfree(rd); 7389 kfree(rd);
7390 #ifdef CONFIG_NUMA 7390 #ifdef CONFIG_NUMA
7391 kfree(sched_group_nodes); 7391 kfree(sched_group_nodes);
7392 #endif 7392 #endif
7393 return -ENOMEM; 7393 return -ENOMEM;
7394 } 7394 }
7395 7395
7396 tmpmask = (cpumask_t *)allmasks; 7396 tmpmask = (cpumask_t *)allmasks;
7397 7397
7398 7398
7399 #ifdef CONFIG_NUMA 7399 #ifdef CONFIG_NUMA
7400 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 7400 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7401 #endif 7401 #endif
7402 7402
7403 /* 7403 /*
7404 * Set up domains for cpus specified by the cpu_map. 7404 * Set up domains for cpus specified by the cpu_map.
7405 */ 7405 */
7406 for_each_cpu_mask_nr(i, *cpu_map) { 7406 for_each_cpu_mask_nr(i, *cpu_map) {
7407 struct sched_domain *sd = NULL, *p; 7407 struct sched_domain *sd = NULL, *p;
7408 SCHED_CPUMASK_VAR(nodemask, allmasks); 7408 SCHED_CPUMASK_VAR(nodemask, allmasks);
7409 7409
7410 *nodemask = node_to_cpumask(cpu_to_node(i)); 7410 *nodemask = node_to_cpumask(cpu_to_node(i));
7411 cpus_and(*nodemask, *nodemask, *cpu_map); 7411 cpus_and(*nodemask, *nodemask, *cpu_map);
7412 7412
7413 #ifdef CONFIG_NUMA 7413 #ifdef CONFIG_NUMA
7414 if (cpus_weight(*cpu_map) > 7414 if (cpus_weight(*cpu_map) >
7415 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7415 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
7416 sd = &per_cpu(allnodes_domains, i); 7416 sd = &per_cpu(allnodes_domains, i);
7417 SD_INIT(sd, ALLNODES); 7417 SD_INIT(sd, ALLNODES);
7418 set_domain_attribute(sd, attr); 7418 set_domain_attribute(sd, attr);
7419 sd->span = *cpu_map; 7419 sd->span = *cpu_map;
7420 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7420 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7421 p = sd; 7421 p = sd;
7422 sd_allnodes = 1; 7422 sd_allnodes = 1;
7423 } else 7423 } else
7424 p = NULL; 7424 p = NULL;
7425 7425
7426 sd = &per_cpu(node_domains, i); 7426 sd = &per_cpu(node_domains, i);
7427 SD_INIT(sd, NODE); 7427 SD_INIT(sd, NODE);
7428 set_domain_attribute(sd, attr); 7428 set_domain_attribute(sd, attr);
7429 sched_domain_node_span(cpu_to_node(i), &sd->span); 7429 sched_domain_node_span(cpu_to_node(i), &sd->span);
7430 sd->parent = p; 7430 sd->parent = p;
7431 if (p) 7431 if (p)
7432 p->child = sd; 7432 p->child = sd;
7433 cpus_and(sd->span, sd->span, *cpu_map); 7433 cpus_and(sd->span, sd->span, *cpu_map);
7434 #endif 7434 #endif
7435 7435
7436 p = sd; 7436 p = sd;
7437 sd = &per_cpu(phys_domains, i); 7437 sd = &per_cpu(phys_domains, i);
7438 SD_INIT(sd, CPU); 7438 SD_INIT(sd, CPU);
7439 set_domain_attribute(sd, attr); 7439 set_domain_attribute(sd, attr);
7440 sd->span = *nodemask; 7440 sd->span = *nodemask;
7441 sd->parent = p; 7441 sd->parent = p;
7442 if (p) 7442 if (p)
7443 p->child = sd; 7443 p->child = sd;
7444 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 7444 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
7445 7445
7446 #ifdef CONFIG_SCHED_MC 7446 #ifdef CONFIG_SCHED_MC
7447 p = sd; 7447 p = sd;
7448 sd = &per_cpu(core_domains, i); 7448 sd = &per_cpu(core_domains, i);
7449 SD_INIT(sd, MC); 7449 SD_INIT(sd, MC);
7450 set_domain_attribute(sd, attr); 7450 set_domain_attribute(sd, attr);
7451 sd->span = cpu_coregroup_map(i); 7451 sd->span = cpu_coregroup_map(i);
7452 cpus_and(sd->span, sd->span, *cpu_map); 7452 cpus_and(sd->span, sd->span, *cpu_map);
7453 sd->parent = p; 7453 sd->parent = p;
7454 p->child = sd; 7454 p->child = sd;
7455 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7455 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
7456 #endif 7456 #endif
7457 7457
7458 #ifdef CONFIG_SCHED_SMT 7458 #ifdef CONFIG_SCHED_SMT
7459 p = sd; 7459 p = sd;
7460 sd = &per_cpu(cpu_domains, i); 7460 sd = &per_cpu(cpu_domains, i);
7461 SD_INIT(sd, SIBLING); 7461 SD_INIT(sd, SIBLING);
7462 set_domain_attribute(sd, attr); 7462 set_domain_attribute(sd, attr);
7463 sd->span = per_cpu(cpu_sibling_map, i); 7463 sd->span = per_cpu(cpu_sibling_map, i);
7464 cpus_and(sd->span, sd->span, *cpu_map); 7464 cpus_and(sd->span, sd->span, *cpu_map);
7465 sd->parent = p; 7465 sd->parent = p;
7466 p->child = sd; 7466 p->child = sd;
7467 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7467 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
7468 #endif 7468 #endif
7469 } 7469 }
7470 7470
7471 #ifdef CONFIG_SCHED_SMT 7471 #ifdef CONFIG_SCHED_SMT
7472 /* Set up CPU (sibling) groups */ 7472 /* Set up CPU (sibling) groups */
7473 for_each_cpu_mask_nr(i, *cpu_map) { 7473 for_each_cpu_mask_nr(i, *cpu_map) {
7474 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7474 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7475 SCHED_CPUMASK_VAR(send_covered, allmasks); 7475 SCHED_CPUMASK_VAR(send_covered, allmasks);
7476 7476
7477 *this_sibling_map = per_cpu(cpu_sibling_map, i); 7477 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7478 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); 7478 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7479 if (i != first_cpu(*this_sibling_map)) 7479 if (i != first_cpu(*this_sibling_map))
7480 continue; 7480 continue;
7481 7481
7482 init_sched_build_groups(this_sibling_map, cpu_map, 7482 init_sched_build_groups(this_sibling_map, cpu_map,
7483 &cpu_to_cpu_group, 7483 &cpu_to_cpu_group,
7484 send_covered, tmpmask); 7484 send_covered, tmpmask);
7485 } 7485 }
7486 #endif 7486 #endif
7487 7487
7488 #ifdef CONFIG_SCHED_MC 7488 #ifdef CONFIG_SCHED_MC
7489 /* Set up multi-core groups */ 7489 /* Set up multi-core groups */
7490 for_each_cpu_mask_nr(i, *cpu_map) { 7490 for_each_cpu_mask_nr(i, *cpu_map) {
7491 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7491 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7492 SCHED_CPUMASK_VAR(send_covered, allmasks); 7492 SCHED_CPUMASK_VAR(send_covered, allmasks);
7493 7493
7494 *this_core_map = cpu_coregroup_map(i); 7494 *this_core_map = cpu_coregroup_map(i);
7495 cpus_and(*this_core_map, *this_core_map, *cpu_map); 7495 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7496 if (i != first_cpu(*this_core_map)) 7496 if (i != first_cpu(*this_core_map))
7497 continue; 7497 continue;
7498 7498
7499 init_sched_build_groups(this_core_map, cpu_map, 7499 init_sched_build_groups(this_core_map, cpu_map,
7500 &cpu_to_core_group, 7500 &cpu_to_core_group,
7501 send_covered, tmpmask); 7501 send_covered, tmpmask);
7502 } 7502 }
7503 #endif 7503 #endif
7504 7504
7505 /* Set up physical groups */ 7505 /* Set up physical groups */
7506 for (i = 0; i < nr_node_ids; i++) { 7506 for (i = 0; i < nr_node_ids; i++) {
7507 SCHED_CPUMASK_VAR(nodemask, allmasks); 7507 SCHED_CPUMASK_VAR(nodemask, allmasks);
7508 SCHED_CPUMASK_VAR(send_covered, allmasks); 7508 SCHED_CPUMASK_VAR(send_covered, allmasks);
7509 7509
7510 *nodemask = node_to_cpumask(i); 7510 *nodemask = node_to_cpumask(i);
7511 cpus_and(*nodemask, *nodemask, *cpu_map); 7511 cpus_and(*nodemask, *nodemask, *cpu_map);
7512 if (cpus_empty(*nodemask)) 7512 if (cpus_empty(*nodemask))
7513 continue; 7513 continue;
7514 7514
7515 init_sched_build_groups(nodemask, cpu_map, 7515 init_sched_build_groups(nodemask, cpu_map,
7516 &cpu_to_phys_group, 7516 &cpu_to_phys_group,
7517 send_covered, tmpmask); 7517 send_covered, tmpmask);
7518 } 7518 }
7519 7519
7520 #ifdef CONFIG_NUMA 7520 #ifdef CONFIG_NUMA
7521 /* Set up node groups */ 7521 /* Set up node groups */
7522 if (sd_allnodes) { 7522 if (sd_allnodes) {
7523 SCHED_CPUMASK_VAR(send_covered, allmasks); 7523 SCHED_CPUMASK_VAR(send_covered, allmasks);
7524 7524
7525 init_sched_build_groups(cpu_map, cpu_map, 7525 init_sched_build_groups(cpu_map, cpu_map,
7526 &cpu_to_allnodes_group, 7526 &cpu_to_allnodes_group,
7527 send_covered, tmpmask); 7527 send_covered, tmpmask);
7528 } 7528 }
7529 7529
7530 for (i = 0; i < nr_node_ids; i++) { 7530 for (i = 0; i < nr_node_ids; i++) {
7531 /* Set up node groups */ 7531 /* Set up node groups */
7532 struct sched_group *sg, *prev; 7532 struct sched_group *sg, *prev;
7533 SCHED_CPUMASK_VAR(nodemask, allmasks); 7533 SCHED_CPUMASK_VAR(nodemask, allmasks);
7534 SCHED_CPUMASK_VAR(domainspan, allmasks); 7534 SCHED_CPUMASK_VAR(domainspan, allmasks);
7535 SCHED_CPUMASK_VAR(covered, allmasks); 7535 SCHED_CPUMASK_VAR(covered, allmasks);
7536 int j; 7536 int j;
7537 7537
7538 *nodemask = node_to_cpumask(i); 7538 *nodemask = node_to_cpumask(i);
7539 cpus_clear(*covered); 7539 cpus_clear(*covered);
7540 7540
7541 cpus_and(*nodemask, *nodemask, *cpu_map); 7541 cpus_and(*nodemask, *nodemask, *cpu_map);
7542 if (cpus_empty(*nodemask)) { 7542 if (cpus_empty(*nodemask)) {
7543 sched_group_nodes[i] = NULL; 7543 sched_group_nodes[i] = NULL;
7544 continue; 7544 continue;
7545 } 7545 }
7546 7546
7547 sched_domain_node_span(i, domainspan); 7547 sched_domain_node_span(i, domainspan);
7548 cpus_and(*domainspan, *domainspan, *cpu_map); 7548 cpus_and(*domainspan, *domainspan, *cpu_map);
7549 7549
7550 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7550 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
7551 if (!sg) { 7551 if (!sg) {
7552 printk(KERN_WARNING "Can not alloc domain group for " 7552 printk(KERN_WARNING "Can not alloc domain group for "
7553 "node %d\n", i); 7553 "node %d\n", i);
7554 goto error; 7554 goto error;
7555 } 7555 }
7556 sched_group_nodes[i] = sg; 7556 sched_group_nodes[i] = sg;
7557 for_each_cpu_mask_nr(j, *nodemask) { 7557 for_each_cpu_mask_nr(j, *nodemask) {
7558 struct sched_domain *sd; 7558 struct sched_domain *sd;
7559 7559
7560 sd = &per_cpu(node_domains, j); 7560 sd = &per_cpu(node_domains, j);
7561 sd->groups = sg; 7561 sd->groups = sg;
7562 } 7562 }
7563 sg->__cpu_power = 0; 7563 sg->__cpu_power = 0;
7564 sg->cpumask = *nodemask; 7564 sg->cpumask = *nodemask;
7565 sg->next = sg; 7565 sg->next = sg;
7566 cpus_or(*covered, *covered, *nodemask); 7566 cpus_or(*covered, *covered, *nodemask);
7567 prev = sg; 7567 prev = sg;
7568 7568
7569 for (j = 0; j < nr_node_ids; j++) { 7569 for (j = 0; j < nr_node_ids; j++) {
7570 SCHED_CPUMASK_VAR(notcovered, allmasks); 7570 SCHED_CPUMASK_VAR(notcovered, allmasks);
7571 int n = (i + j) % nr_node_ids; 7571 int n = (i + j) % nr_node_ids;
7572 node_to_cpumask_ptr(pnodemask, n); 7572 node_to_cpumask_ptr(pnodemask, n);
7573 7573
7574 cpus_complement(*notcovered, *covered); 7574 cpus_complement(*notcovered, *covered);
7575 cpus_and(*tmpmask, *notcovered, *cpu_map); 7575 cpus_and(*tmpmask, *notcovered, *cpu_map);
7576 cpus_and(*tmpmask, *tmpmask, *domainspan); 7576 cpus_and(*tmpmask, *tmpmask, *domainspan);
7577 if (cpus_empty(*tmpmask)) 7577 if (cpus_empty(*tmpmask))
7578 break; 7578 break;
7579 7579
7580 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7580 cpus_and(*tmpmask, *tmpmask, *pnodemask);
7581 if (cpus_empty(*tmpmask)) 7581 if (cpus_empty(*tmpmask))
7582 continue; 7582 continue;
7583 7583
7584 sg = kmalloc_node(sizeof(struct sched_group), 7584 sg = kmalloc_node(sizeof(struct sched_group),
7585 GFP_KERNEL, i); 7585 GFP_KERNEL, i);
7586 if (!sg) { 7586 if (!sg) {
7587 printk(KERN_WARNING 7587 printk(KERN_WARNING
7588 "Can not alloc domain group for node %d\n", j); 7588 "Can not alloc domain group for node %d\n", j);
7589 goto error; 7589 goto error;
7590 } 7590 }
7591 sg->__cpu_power = 0; 7591 sg->__cpu_power = 0;
7592 sg->cpumask = *tmpmask; 7592 sg->cpumask = *tmpmask;
7593 sg->next = prev->next; 7593 sg->next = prev->next;
7594 cpus_or(*covered, *covered, *tmpmask); 7594 cpus_or(*covered, *covered, *tmpmask);
7595 prev->next = sg; 7595 prev->next = sg;
7596 prev = sg; 7596 prev = sg;
7597 } 7597 }
7598 } 7598 }
7599 #endif 7599 #endif
7600 7600
7601 /* Calculate CPU power for physical packages and nodes */ 7601 /* Calculate CPU power for physical packages and nodes */
7602 #ifdef CONFIG_SCHED_SMT 7602 #ifdef CONFIG_SCHED_SMT
7603 for_each_cpu_mask_nr(i, *cpu_map) { 7603 for_each_cpu_mask_nr(i, *cpu_map) {
7604 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7604 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7605 7605
7606 init_sched_groups_power(i, sd); 7606 init_sched_groups_power(i, sd);
7607 } 7607 }
7608 #endif 7608 #endif
7609 #ifdef CONFIG_SCHED_MC 7609 #ifdef CONFIG_SCHED_MC
7610 for_each_cpu_mask_nr(i, *cpu_map) { 7610 for_each_cpu_mask_nr(i, *cpu_map) {
7611 struct sched_domain *sd = &per_cpu(core_domains, i); 7611 struct sched_domain *sd = &per_cpu(core_domains, i);
7612 7612
7613 init_sched_groups_power(i, sd); 7613 init_sched_groups_power(i, sd);
7614 } 7614 }
7615 #endif 7615 #endif
7616 7616
7617 for_each_cpu_mask_nr(i, *cpu_map) { 7617 for_each_cpu_mask_nr(i, *cpu_map) {
7618 struct sched_domain *sd = &per_cpu(phys_domains, i); 7618 struct sched_domain *sd = &per_cpu(phys_domains, i);
7619 7619
7620 init_sched_groups_power(i, sd); 7620 init_sched_groups_power(i, sd);
7621 } 7621 }
7622 7622
7623 #ifdef CONFIG_NUMA 7623 #ifdef CONFIG_NUMA
7624 for (i = 0; i < nr_node_ids; i++) 7624 for (i = 0; i < nr_node_ids; i++)
7625 init_numa_sched_groups_power(sched_group_nodes[i]); 7625 init_numa_sched_groups_power(sched_group_nodes[i]);
7626 7626
7627 if (sd_allnodes) { 7627 if (sd_allnodes) {
7628 struct sched_group *sg; 7628 struct sched_group *sg;
7629 7629
7630 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7630 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
7631 tmpmask); 7631 tmpmask);
7632 init_numa_sched_groups_power(sg); 7632 init_numa_sched_groups_power(sg);
7633 } 7633 }
7634 #endif 7634 #endif
7635 7635
7636 /* Attach the domains */ 7636 /* Attach the domains */
7637 for_each_cpu_mask_nr(i, *cpu_map) { 7637 for_each_cpu_mask_nr(i, *cpu_map) {
7638 struct sched_domain *sd; 7638 struct sched_domain *sd;
7639 #ifdef CONFIG_SCHED_SMT 7639 #ifdef CONFIG_SCHED_SMT
7640 sd = &per_cpu(cpu_domains, i); 7640 sd = &per_cpu(cpu_domains, i);
7641 #elif defined(CONFIG_SCHED_MC) 7641 #elif defined(CONFIG_SCHED_MC)
7642 sd = &per_cpu(core_domains, i); 7642 sd = &per_cpu(core_domains, i);
7643 #else 7643 #else
7644 sd = &per_cpu(phys_domains, i); 7644 sd = &per_cpu(phys_domains, i);
7645 #endif 7645 #endif
7646 cpu_attach_domain(sd, rd, i); 7646 cpu_attach_domain(sd, rd, i);
7647 } 7647 }
7648 7648
7649 sched_cpumask_free(allmasks); 7649 sched_cpumask_free(allmasks);
7650 return 0; 7650 return 0;
7651 7651
7652 #ifdef CONFIG_NUMA 7652 #ifdef CONFIG_NUMA
7653 error: 7653 error:
7654 free_sched_groups(cpu_map, tmpmask); 7654 free_sched_groups(cpu_map, tmpmask);
7655 sched_cpumask_free(allmasks); 7655 sched_cpumask_free(allmasks);
7656 kfree(rd); 7656 kfree(rd);
7657 return -ENOMEM; 7657 return -ENOMEM;
7658 #endif 7658 #endif
7659 } 7659 }
7660 7660
7661 static int build_sched_domains(const cpumask_t *cpu_map) 7661 static int build_sched_domains(const cpumask_t *cpu_map)
7662 { 7662 {
7663 return __build_sched_domains(cpu_map, NULL); 7663 return __build_sched_domains(cpu_map, NULL);
7664 } 7664 }
7665 7665
7666 static cpumask_t *doms_cur; /* current sched domains */ 7666 static cpumask_t *doms_cur; /* current sched domains */
7667 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7667 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7668 static struct sched_domain_attr *dattr_cur; 7668 static struct sched_domain_attr *dattr_cur;
7669 /* attributes of custom domains in 'doms_cur' */ 7669 /* attributes of custom domains in 'doms_cur' */
7670 7670
7671 /* 7671 /*
7672 * Special case: If a kmalloc of a doms_cur partition (array of 7672 * Special case: If a kmalloc of a doms_cur partition (array of
7673 * cpumask_t) fails, then fall back to a single sched domain, 7673 * cpumask_t) fails, then fall back to a single sched domain,
7674 * as determined by the single cpumask_t fallback_doms. 7674 * as determined by the single cpumask_t fallback_doms.
7675 */ 7675 */
7676 static cpumask_t fallback_doms; 7676 static cpumask_t fallback_doms;
7677 7677
7678 void __attribute__((weak)) arch_update_cpu_topology(void) 7678 /*
7679 * arch_update_cpu_topology lets virtualized architectures update the
7680 * cpu core maps. It is supposed to return 1 if the topology changed
7681 * or 0 if it stayed the same.
7682 */
7683 int __attribute__((weak)) arch_update_cpu_topology(void)
7679 { 7684 {
7685 return 0;
7680 } 7686 }
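
To make the new return-value contract concrete, here is a minimal sketch of an architecture-side override, assuming a hypothetical arch_read_topology() helper and snapshot structure (the real override for s390 lives in arch/s390/kernel/topology.c):

/* Sketch only: struct arch_topology_snapshot and arch_read_topology()
 * are hypothetical stand-ins, not kernel symbols. */
static struct arch_topology_snapshot prev_snapshot;

int arch_update_cpu_topology(void)
{
        struct arch_topology_snapshot cur;

        arch_read_topology(&cur);       /* refresh the cpu core maps */
        if (!memcmp(&prev_snapshot, &cur, sizeof(cur)))
                return 0;               /* topology stayed the same */
        prev_snapshot = cur;
        return 1;                       /* tell the scheduler to rebuild */
}

Because the generic definition is weak, an architecture definition like this one simply replaces it at link time.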
7681 7687
7682 /* 7688 /*
7683 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7689 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7684 * For now this just excludes isolated cpus, but could be used to 7690 * For now this just excludes isolated cpus, but could be used to
7685 * exclude other special cases in the future. 7691 * exclude other special cases in the future.
7686 */ 7692 */
7687 static int arch_init_sched_domains(const cpumask_t *cpu_map) 7693 static int arch_init_sched_domains(const cpumask_t *cpu_map)
7688 { 7694 {
7689 int err; 7695 int err;
7690 7696
7691 arch_update_cpu_topology(); 7697 arch_update_cpu_topology();
7692 ndoms_cur = 1; 7698 ndoms_cur = 1;
7693 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7699 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
7694 if (!doms_cur) 7700 if (!doms_cur)
7695 doms_cur = &fallback_doms; 7701 doms_cur = &fallback_doms;
7696 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7702 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
7697 dattr_cur = NULL; 7703 dattr_cur = NULL;
7698 err = build_sched_domains(doms_cur); 7704 err = build_sched_domains(doms_cur);
7699 register_sched_domain_sysctl(); 7705 register_sched_domain_sysctl();
7700 7706
7701 return err; 7707 return err;
7702 } 7708 }
7703 7709
7704 static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7710 static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7705 cpumask_t *tmpmask) 7711 cpumask_t *tmpmask)
7706 { 7712 {
7707 free_sched_groups(cpu_map, tmpmask); 7713 free_sched_groups(cpu_map, tmpmask);
7708 } 7714 }
7709 7715
7710 /* 7716 /*
7711 * Detach sched domains from a group of cpus specified in cpu_map 7717 * Detach sched domains from a group of cpus specified in cpu_map
7712 * These cpus will now be attached to the NULL domain 7718 * These cpus will now be attached to the NULL domain
7713 */ 7719 */
7714 static void detach_destroy_domains(const cpumask_t *cpu_map) 7720 static void detach_destroy_domains(const cpumask_t *cpu_map)
7715 { 7721 {
7716 cpumask_t tmpmask; 7722 cpumask_t tmpmask;
7717 int i; 7723 int i;
7718 7724
7719 for_each_cpu_mask_nr(i, *cpu_map) 7725 for_each_cpu_mask_nr(i, *cpu_map)
7720 cpu_attach_domain(NULL, &def_root_domain, i); 7726 cpu_attach_domain(NULL, &def_root_domain, i);
7721 synchronize_sched(); 7727 synchronize_sched();
7722 arch_destroy_sched_domains(cpu_map, &tmpmask); 7728 arch_destroy_sched_domains(cpu_map, &tmpmask);
7723 } 7729 }
7724 7730
7725 /* handle null as "default" */ 7731 /* handle null as "default" */
7726 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7732 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7727 struct sched_domain_attr *new, int idx_new) 7733 struct sched_domain_attr *new, int idx_new)
7728 { 7734 {
7729 struct sched_domain_attr tmp; 7735 struct sched_domain_attr tmp;
7730 7736
7731 /* fast path */ 7737 /* fast path */
7732 if (!new && !cur) 7738 if (!new && !cur)
7733 return 1; 7739 return 1;
7734 7740
7735 tmp = SD_ATTR_INIT; 7741 tmp = SD_ATTR_INIT;
7736 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7742 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7737 new ? (new + idx_new) : &tmp, 7743 new ? (new + idx_new) : &tmp,
7738 sizeof(struct sched_domain_attr)); 7744 sizeof(struct sched_domain_attr));
7739 } 7745 }
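
Since a NULL attribute array falls back to SD_ATTR_INIT on either side of the memcmp, the behaviour can be illustrated with a short sketch (dattrs_equal_example() is not part of this patch):

static void dattrs_equal_example(void)
{
        struct sched_domain_attr attrs[1];

        attrs[0] = SD_ATTR_INIT;
        /* Both sides fall back to SD_ATTR_INIT, so this holds. */
        WARN_ON(!dattrs_equal(NULL, 0, NULL, 0));
        /* A NULL array compares equal to an explicit SD_ATTR_INIT entry. */
        WARN_ON(!dattrs_equal(NULL, 0, attrs, 0));
}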
7740 7746
7741 /* 7747 /*
7742 * Partition sched domains as specified by the 'ndoms_new' 7748 * Partition sched domains as specified by the 'ndoms_new'
7743 * cpumasks in the array doms_new[]. This compares 7749 * cpumasks in the array doms_new[]. This compares
7744 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7750 * doms_new[] to the current sched domain partitioning, doms_cur[].
7745 * It destroys each deleted domain and builds each new domain. 7751 * It destroys each deleted domain and builds each new domain.
7746 * 7752 *
7747 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7753 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
7748 * The masks don't intersect (don't overlap). We should set up one 7754 * The masks don't intersect (don't overlap). We should set up one
7749 * sched domain for each mask. CPUs not in any of the cpumasks will 7755 * sched domain for each mask. CPUs not in any of the cpumasks will
7750 * not be load balanced. If the same cpumask appears both in the 7756 * not be load balanced. If the same cpumask appears both in the
7751 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7757 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7752 * it as it is. 7758 * it as it is.
7753 * 7759 *
7754 * The passed in 'doms_new' should be kmalloc'd. This routine takes 7760 * The passed in 'doms_new' should be kmalloc'd. This routine takes
7755 * ownership of it and will kfree it when done with it. If the caller 7761 * ownership of it and will kfree it when done with it. If the caller
7756 * failed the kmalloc call, then it can pass in doms_new == NULL && 7762 * failed the kmalloc call, then it can pass in doms_new == NULL &&
7757 * ndoms_new == 1, and partition_sched_domains() will fall back to 7763 * ndoms_new == 1, and partition_sched_domains() will fall back to
7758 * the single partition 'fallback_doms'; this also forces the domains 7764 * the single partition 'fallback_doms'; this also forces the domains
7759 * to be rebuilt. 7765 * to be rebuilt.
7760 * 7766 *
7761 * If doms_new == NULL it will be replaced with cpu_online_map. 7767 * If doms_new == NULL it will be replaced with cpu_online_map.
7762 * ndoms_new == 0 is a special case for destroying existing domains, 7768 * ndoms_new == 0 is a special case for destroying existing domains,
7763 * and it will not create the default domain. 7769 * and it will not create the default domain.
7764 * 7770 *
7765 * Call with hotplug lock held 7771 * Call with hotplug lock held
7766 */ 7772 */
7767 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7773 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7768 struct sched_domain_attr *dattr_new) 7774 struct sched_domain_attr *dattr_new)
7769 { 7775 {
7770 int i, j, n; 7776 int i, j, n;
7771 7777
7772 mutex_lock(&sched_domains_mutex); 7778 mutex_lock(&sched_domains_mutex);
7773 7779
7774 /* always unregister in case we don't destroy any domains */ 7780 /* always unregister in case we don't destroy any domains */
7775 unregister_sched_domain_sysctl(); 7781 unregister_sched_domain_sysctl();
7776 7782
7777 n = doms_new ? ndoms_new : 0; 7783 n = doms_new ? ndoms_new : 0;
7778 7784
7779 /* Destroy deleted domains */ 7785 /* Destroy deleted domains */
7780 for (i = 0; i < ndoms_cur; i++) { 7786 for (i = 0; i < ndoms_cur; i++) {
7781 for (j = 0; j < n; j++) { 7787 for (j = 0; j < n; j++) {
7782 if (cpus_equal(doms_cur[i], doms_new[j]) 7788 if (cpus_equal(doms_cur[i], doms_new[j])
7783 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7789 && dattrs_equal(dattr_cur, i, dattr_new, j))
7784 goto match1; 7790 goto match1;
7785 } 7791 }
7786 /* no match - a current sched domain not in new doms_new[] */ 7792 /* no match - a current sched domain not in new doms_new[] */
7787 detach_destroy_domains(doms_cur + i); 7793 detach_destroy_domains(doms_cur + i);
7788 match1: 7794 match1:
7789 ; 7795 ;
7790 } 7796 }
7791 7797
7792 if (doms_new == NULL) { 7798 if (doms_new == NULL) {
7793 ndoms_cur = 0; 7799 ndoms_cur = 0;
7794 doms_new = &fallback_doms; 7800 doms_new = &fallback_doms;
7795 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7801 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7796 WARN_ON_ONCE(dattr_new); 7802 WARN_ON_ONCE(dattr_new);
7797 } 7803 }
7798 7804
7799 /* Build new domains */ 7805 /* Build new domains */
7800 for (i = 0; i < ndoms_new; i++) { 7806 for (i = 0; i < ndoms_new; i++) {
7801 for (j = 0; j < ndoms_cur; j++) { 7807 for (j = 0; j < ndoms_cur; j++) {
7802 if (cpus_equal(doms_new[i], doms_cur[j]) 7808 if (cpus_equal(doms_new[i], doms_cur[j])
7803 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7809 && dattrs_equal(dattr_new, i, dattr_cur, j))
7804 goto match2; 7810 goto match2;
7805 } 7811 }
7806 /* no match - add a new doms_new */ 7812 /* no match - add a new doms_new */
7807 __build_sched_domains(doms_new + i, 7813 __build_sched_domains(doms_new + i,
7808 dattr_new ? dattr_new + i : NULL); 7814 dattr_new ? dattr_new + i : NULL);
7809 match2: 7815 match2:
7810 ; 7816 ;
7811 } 7817 }
7812 7818
7813 /* Remember the new sched domains */ 7819 /* Remember the new sched domains */
7814 if (doms_cur != &fallback_doms) 7820 if (doms_cur != &fallback_doms)
7815 kfree(doms_cur); 7821 kfree(doms_cur);
7816 kfree(dattr_cur); /* kfree(NULL) is safe */ 7822 kfree(dattr_cur); /* kfree(NULL) is safe */
7817 doms_cur = doms_new; 7823 doms_cur = doms_new;
7818 dattr_cur = dattr_new; 7824 dattr_cur = dattr_new;
7819 ndoms_cur = ndoms_new; 7825 ndoms_cur = ndoms_new;
7820 7826
7821 register_sched_domain_sysctl(); 7827 register_sched_domain_sysctl();
7822 7828
7823 mutex_unlock(&sched_domains_mutex); 7829 mutex_unlock(&sched_domains_mutex);
7824 } 7830 }
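
A hedged caller sketch for the rules documented above: the masks are disjoint, doms_new is kmalloc'd and ownership passes to partition_sched_domains(), and the hotplug lock is held around the call. example_repartition() and its two-way split are illustrative only, and assume at least one online, non-isolated cpu:

static void example_repartition(void)
{
        cpumask_t *doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);

        get_online_cpus();
        if (!doms) {
                /* doms_new == NULL && ndoms_new == 1 selects fallback_doms */
                partition_sched_domains(1, NULL, NULL);
        } else {
                cpus_andnot(doms[0], cpu_online_map, cpu_isolated_map);
                cpus_clear(doms[1]);
                /* move one cpu into its own partition; masks stay disjoint */
                cpu_set(first_cpu(doms[0]), doms[1]);
                cpu_clear(first_cpu(doms[0]), doms[0]);
                /* takes ownership of 'doms'; no kfree() here */
                partition_sched_domains(2, doms, NULL);
        }
        put_online_cpus();
}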
7825 7831
7826 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7832 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7827 int arch_reinit_sched_domains(void) 7833 int arch_reinit_sched_domains(void)
7828 { 7834 {
7829 get_online_cpus(); 7835 get_online_cpus();
7830 7836
7831 /* Destroy domains first to force the rebuild */ 7837 /* Destroy domains first to force the rebuild */
7832 partition_sched_domains(0, NULL, NULL); 7838 partition_sched_domains(0, NULL, NULL);
7833 7839
7834 rebuild_sched_domains(); 7840 rebuild_sched_domains();
7835 put_online_cpus(); 7841 put_online_cpus();
7836 7842
7837 return 0; 7843 return 0;
7838 } 7844 }
7839 7845
7840 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7846 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7841 { 7847 {
7842 int ret; 7848 int ret;
7843 7849
7844 if (buf[0] != '0' && buf[0] != '1') 7850 if (buf[0] != '0' && buf[0] != '1')
7845 return -EINVAL; 7851 return -EINVAL;
7846 7852
7847 if (smt) 7853 if (smt)
7848 sched_smt_power_savings = (buf[0] == '1'); 7854 sched_smt_power_savings = (buf[0] == '1');
7849 else 7855 else
7850 sched_mc_power_savings = (buf[0] == '1'); 7856 sched_mc_power_savings = (buf[0] == '1');
7851 7857
7852 ret = arch_reinit_sched_domains(); 7858 ret = arch_reinit_sched_domains();
7853 7859
7854 return ret ? ret : count; 7860 return ret ? ret : count;
7855 } 7861 }
7856 7862
7857 #ifdef CONFIG_SCHED_MC 7863 #ifdef CONFIG_SCHED_MC
7858 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7864 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7859 char *page) 7865 char *page)
7860 { 7866 {
7861 return sprintf(page, "%u\n", sched_mc_power_savings); 7867 return sprintf(page, "%u\n", sched_mc_power_savings);
7862 } 7868 }
7863 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7869 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7864 const char *buf, size_t count) 7870 const char *buf, size_t count)
7865 { 7871 {
7866 return sched_power_savings_store(buf, count, 0); 7872 return sched_power_savings_store(buf, count, 0);
7867 } 7873 }
7868 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7874 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7869 sched_mc_power_savings_show, 7875 sched_mc_power_savings_show,
7870 sched_mc_power_savings_store); 7876 sched_mc_power_savings_store);
7871 #endif 7877 #endif
7872 7878
7873 #ifdef CONFIG_SCHED_SMT 7879 #ifdef CONFIG_SCHED_SMT
7874 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7880 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7875 char *page) 7881 char *page)
7876 { 7882 {
7877 return sprintf(page, "%u\n", sched_smt_power_savings); 7883 return sprintf(page, "%u\n", sched_smt_power_savings);
7878 } 7884 }
7879 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7885 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7880 const char *buf, size_t count) 7886 const char *buf, size_t count)
7881 { 7887 {
7882 return sched_power_savings_store(buf, count, 1); 7888 return sched_power_savings_store(buf, count, 1);
7883 } 7889 }
7884 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7890 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7885 sched_smt_power_savings_show, 7891 sched_smt_power_savings_show,
7886 sched_smt_power_savings_store); 7892 sched_smt_power_savings_store);
7887 #endif 7893 #endif
7888 7894
7889 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7895 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7890 { 7896 {
7891 int err = 0; 7897 int err = 0;
7892 7898
7893 #ifdef CONFIG_SCHED_SMT 7899 #ifdef CONFIG_SCHED_SMT
7894 if (smt_capable()) 7900 if (smt_capable())
7895 err = sysfs_create_file(&cls->kset.kobj, 7901 err = sysfs_create_file(&cls->kset.kobj,
7896 &attr_sched_smt_power_savings.attr); 7902 &attr_sched_smt_power_savings.attr);
7897 #endif 7903 #endif
7898 #ifdef CONFIG_SCHED_MC 7904 #ifdef CONFIG_SCHED_MC
7899 if (!err && mc_capable()) 7905 if (!err && mc_capable())
7900 err = sysfs_create_file(&cls->kset.kobj, 7906 err = sysfs_create_file(&cls->kset.kobj,
7901 &attr_sched_mc_power_savings.attr); 7907 &attr_sched_mc_power_savings.attr);
7902 #endif 7908 #endif
7903 return err; 7909 return err;
7904 } 7910 }
7905 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7911 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7906 7912
7907 #ifndef CONFIG_CPUSETS 7913 #ifndef CONFIG_CPUSETS
7908 /* 7914 /*
7909 * Add online and remove offline CPUs from the scheduler domains. 7915 * Add online and remove offline CPUs from the scheduler domains.
7910 * When cpusets are enabled they take over this function. 7916 * When cpusets are enabled they take over this function.
7911 */ 7917 */
7912 static int update_sched_domains(struct notifier_block *nfb, 7918 static int update_sched_domains(struct notifier_block *nfb,
7913 unsigned long action, void *hcpu) 7919 unsigned long action, void *hcpu)
7914 { 7920 {
7915 switch (action) { 7921 switch (action) {
7916 case CPU_ONLINE: 7922 case CPU_ONLINE:
7917 case CPU_ONLINE_FROZEN: 7923 case CPU_ONLINE_FROZEN:
7918 case CPU_DEAD: 7924 case CPU_DEAD:
7919 case CPU_DEAD_FROZEN: 7925 case CPU_DEAD_FROZEN:
7920 partition_sched_domains(1, NULL, NULL); 7926 partition_sched_domains(1, NULL, NULL);
7921 return NOTIFY_OK; 7927 return NOTIFY_OK;
7922 7928
7923 default: 7929 default:
7924 return NOTIFY_DONE; 7930 return NOTIFY_DONE;
7925 } 7931 }
7926 } 7932 }
7927 #endif 7933 #endif
7928 7934
7929 static int update_runtime(struct notifier_block *nfb, 7935 static int update_runtime(struct notifier_block *nfb,
7930 unsigned long action, void *hcpu) 7936 unsigned long action, void *hcpu)
7931 { 7937 {
7932 int cpu = (int)(long)hcpu; 7938 int cpu = (int)(long)hcpu;
7933 7939
7934 switch (action) { 7940 switch (action) {
7935 case CPU_DOWN_PREPARE: 7941 case CPU_DOWN_PREPARE:
7936 case CPU_DOWN_PREPARE_FROZEN: 7942 case CPU_DOWN_PREPARE_FROZEN:
7937 disable_runtime(cpu_rq(cpu)); 7943 disable_runtime(cpu_rq(cpu));
7938 return NOTIFY_OK; 7944 return NOTIFY_OK;
7939 7945
7940 case CPU_DOWN_FAILED: 7946 case CPU_DOWN_FAILED:
7941 case CPU_DOWN_FAILED_FROZEN: 7947 case CPU_DOWN_FAILED_FROZEN:
7942 case CPU_ONLINE: 7948 case CPU_ONLINE:
7943 case CPU_ONLINE_FROZEN: 7949 case CPU_ONLINE_FROZEN:
7944 enable_runtime(cpu_rq(cpu)); 7950 enable_runtime(cpu_rq(cpu));
7945 return NOTIFY_OK; 7951 return NOTIFY_OK;
7946 7952
7947 default: 7953 default:
7948 return NOTIFY_DONE; 7954 return NOTIFY_DONE;
7949 } 7955 }
7950 } 7956 }
7951 7957
7952 void __init sched_init_smp(void) 7958 void __init sched_init_smp(void)
7953 { 7959 {
7954 cpumask_t non_isolated_cpus; 7960 cpumask_t non_isolated_cpus;
7955 7961
7956 #if defined(CONFIG_NUMA) 7962 #if defined(CONFIG_NUMA)
7957 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 7963 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7958 GFP_KERNEL); 7964 GFP_KERNEL);
7959 BUG_ON(sched_group_nodes_bycpu == NULL); 7965 BUG_ON(sched_group_nodes_bycpu == NULL);
7960 #endif 7966 #endif
7961 get_online_cpus(); 7967 get_online_cpus();
7962 mutex_lock(&sched_domains_mutex); 7968 mutex_lock(&sched_domains_mutex);
7963 arch_init_sched_domains(&cpu_online_map); 7969 arch_init_sched_domains(&cpu_online_map);
7964 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7970 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
7965 if (cpus_empty(non_isolated_cpus)) 7971 if (cpus_empty(non_isolated_cpus))
7966 cpu_set(smp_processor_id(), non_isolated_cpus); 7972 cpu_set(smp_processor_id(), non_isolated_cpus);
7967 mutex_unlock(&sched_domains_mutex); 7973 mutex_unlock(&sched_domains_mutex);
7968 put_online_cpus(); 7974 put_online_cpus();
7969 7975
7970 #ifndef CONFIG_CPUSETS 7976 #ifndef CONFIG_CPUSETS
7971 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7977 /* XXX: Theoretical race here - CPU may be hotplugged now */
7972 hotcpu_notifier(update_sched_domains, 0); 7978 hotcpu_notifier(update_sched_domains, 0);
7973 #endif 7979 #endif
7974 7980
7975 /* RT runtime code needs to handle some hotplug events */ 7981 /* RT runtime code needs to handle some hotplug events */
7976 hotcpu_notifier(update_runtime, 0); 7982 hotcpu_notifier(update_runtime, 0);
7977 7983
7978 init_hrtick(); 7984 init_hrtick();
7979 7985
7980 /* Move init over to a non-isolated CPU */ 7986 /* Move init over to a non-isolated CPU */
7981 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 7987 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
7982 BUG(); 7988 BUG();
7983 sched_init_granularity(); 7989 sched_init_granularity();
7984 } 7990 }
7985 #else 7991 #else
7986 void __init sched_init_smp(void) 7992 void __init sched_init_smp(void)
7987 { 7993 {
7988 sched_init_granularity(); 7994 sched_init_granularity();
7989 } 7995 }
7990 #endif /* CONFIG_SMP */ 7996 #endif /* CONFIG_SMP */
7991 7997
7992 int in_sched_functions(unsigned long addr) 7998 int in_sched_functions(unsigned long addr)
7993 { 7999 {
7994 return in_lock_functions(addr) || 8000 return in_lock_functions(addr) ||
7995 (addr >= (unsigned long)__sched_text_start 8001 (addr >= (unsigned long)__sched_text_start
7996 && addr < (unsigned long)__sched_text_end); 8002 && addr < (unsigned long)__sched_text_end);
7997 } 8003 }
7998 8004
7999 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 8005 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
8000 { 8006 {
8001 cfs_rq->tasks_timeline = RB_ROOT; 8007 cfs_rq->tasks_timeline = RB_ROOT;
8002 INIT_LIST_HEAD(&cfs_rq->tasks); 8008 INIT_LIST_HEAD(&cfs_rq->tasks);
8003 #ifdef CONFIG_FAIR_GROUP_SCHED 8009 #ifdef CONFIG_FAIR_GROUP_SCHED
8004 cfs_rq->rq = rq; 8010 cfs_rq->rq = rq;
8005 #endif 8011 #endif
8006 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8012 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8007 } 8013 }
8008 8014
8009 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 8015 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8010 { 8016 {
8011 struct rt_prio_array *array; 8017 struct rt_prio_array *array;
8012 int i; 8018 int i;
8013 8019
8014 array = &rt_rq->active; 8020 array = &rt_rq->active;
8015 for (i = 0; i < MAX_RT_PRIO; i++) { 8021 for (i = 0; i < MAX_RT_PRIO; i++) {
8016 INIT_LIST_HEAD(array->queue + i); 8022 INIT_LIST_HEAD(array->queue + i);
8017 __clear_bit(i, array->bitmap); 8023 __clear_bit(i, array->bitmap);
8018 } 8024 }
8019 /* delimiter for bitsearch: */ 8025 /* delimiter for bitsearch: */
8020 __set_bit(MAX_RT_PRIO, array->bitmap); 8026 __set_bit(MAX_RT_PRIO, array->bitmap);
8021 8027
8022 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8028 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8023 rt_rq->highest_prio = MAX_RT_PRIO; 8029 rt_rq->highest_prio = MAX_RT_PRIO;
8024 #endif 8030 #endif
8025 #ifdef CONFIG_SMP 8031 #ifdef CONFIG_SMP
8026 rt_rq->rt_nr_migratory = 0; 8032 rt_rq->rt_nr_migratory = 0;
8027 rt_rq->overloaded = 0; 8033 rt_rq->overloaded = 0;
8028 #endif 8034 #endif
8029 8035
8030 rt_rq->rt_time = 0; 8036 rt_rq->rt_time = 0;
8031 rt_rq->rt_throttled = 0; 8037 rt_rq->rt_throttled = 0;
8032 rt_rq->rt_runtime = 0; 8038 rt_rq->rt_runtime = 0;
8033 spin_lock_init(&rt_rq->rt_runtime_lock); 8039 spin_lock_init(&rt_rq->rt_runtime_lock);
8034 8040
8035 #ifdef CONFIG_RT_GROUP_SCHED 8041 #ifdef CONFIG_RT_GROUP_SCHED
8036 rt_rq->rt_nr_boosted = 0; 8042 rt_rq->rt_nr_boosted = 0;
8037 rt_rq->rq = rq; 8043 rt_rq->rq = rq;
8038 #endif 8044 #endif
8039 } 8045 }
8040 8046
8041 #ifdef CONFIG_FAIR_GROUP_SCHED 8047 #ifdef CONFIG_FAIR_GROUP_SCHED
8042 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 8048 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8043 struct sched_entity *se, int cpu, int add, 8049 struct sched_entity *se, int cpu, int add,
8044 struct sched_entity *parent) 8050 struct sched_entity *parent)
8045 { 8051 {
8046 struct rq *rq = cpu_rq(cpu); 8052 struct rq *rq = cpu_rq(cpu);
8047 tg->cfs_rq[cpu] = cfs_rq; 8053 tg->cfs_rq[cpu] = cfs_rq;
8048 init_cfs_rq(cfs_rq, rq); 8054 init_cfs_rq(cfs_rq, rq);
8049 cfs_rq->tg = tg; 8055 cfs_rq->tg = tg;
8050 if (add) 8056 if (add)
8051 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 8057 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8052 8058
8053 tg->se[cpu] = se; 8059 tg->se[cpu] = se;
8054 /* se could be NULL for init_task_group */ 8060 /* se could be NULL for init_task_group */
8055 if (!se) 8061 if (!se)
8056 return; 8062 return;
8057 8063
8058 if (!parent) 8064 if (!parent)
8059 se->cfs_rq = &rq->cfs; 8065 se->cfs_rq = &rq->cfs;
8060 else 8066 else
8061 se->cfs_rq = parent->my_q; 8067 se->cfs_rq = parent->my_q;
8062 8068
8063 se->my_q = cfs_rq; 8069 se->my_q = cfs_rq;
8064 se->load.weight = tg->shares; 8070 se->load.weight = tg->shares;
8065 se->load.inv_weight = 0; 8071 se->load.inv_weight = 0;
8066 se->parent = parent; 8072 se->parent = parent;
8067 } 8073 }
8068 #endif 8074 #endif
8069 8075
8070 #ifdef CONFIG_RT_GROUP_SCHED 8076 #ifdef CONFIG_RT_GROUP_SCHED
8071 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 8077 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8072 struct sched_rt_entity *rt_se, int cpu, int add, 8078 struct sched_rt_entity *rt_se, int cpu, int add,
8073 struct sched_rt_entity *parent) 8079 struct sched_rt_entity *parent)
8074 { 8080 {
8075 struct rq *rq = cpu_rq(cpu); 8081 struct rq *rq = cpu_rq(cpu);
8076 8082
8077 tg->rt_rq[cpu] = rt_rq; 8083 tg->rt_rq[cpu] = rt_rq;
8078 init_rt_rq(rt_rq, rq); 8084 init_rt_rq(rt_rq, rq);
8079 rt_rq->tg = tg; 8085 rt_rq->tg = tg;
8080 rt_rq->rt_se = rt_se; 8086 rt_rq->rt_se = rt_se;
8081 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 8087 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8082 if (add) 8088 if (add)
8083 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 8089 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8084 8090
8085 tg->rt_se[cpu] = rt_se; 8091 tg->rt_se[cpu] = rt_se;
8086 if (!rt_se) 8092 if (!rt_se)
8087 return; 8093 return;
8088 8094
8089 if (!parent) 8095 if (!parent)
8090 rt_se->rt_rq = &rq->rt; 8096 rt_se->rt_rq = &rq->rt;
8091 else 8097 else
8092 rt_se->rt_rq = parent->my_q; 8098 rt_se->rt_rq = parent->my_q;
8093 8099
8094 rt_se->my_q = rt_rq; 8100 rt_se->my_q = rt_rq;
8095 rt_se->parent = parent; 8101 rt_se->parent = parent;
8096 INIT_LIST_HEAD(&rt_se->run_list); 8102 INIT_LIST_HEAD(&rt_se->run_list);
8097 } 8103 }
8098 #endif 8104 #endif
8099 8105
8100 void __init sched_init(void) 8106 void __init sched_init(void)
8101 { 8107 {
8102 int i, j; 8108 int i, j;
8103 unsigned long alloc_size = 0, ptr; 8109 unsigned long alloc_size = 0, ptr;
8104 8110
8105 #ifdef CONFIG_FAIR_GROUP_SCHED 8111 #ifdef CONFIG_FAIR_GROUP_SCHED
8106 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 8112 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8107 #endif 8113 #endif
8108 #ifdef CONFIG_RT_GROUP_SCHED 8114 #ifdef CONFIG_RT_GROUP_SCHED
8109 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 8115 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8110 #endif 8116 #endif
8111 #ifdef CONFIG_USER_SCHED 8117 #ifdef CONFIG_USER_SCHED
8112 alloc_size *= 2; 8118 alloc_size *= 2;
8113 #endif 8119 #endif
8114 /* 8120 /*
8115 * As sched_init() is called before page_alloc is set up, 8121 * As sched_init() is called before page_alloc is set up,
8116 * we use alloc_bootmem(). 8122 * we use alloc_bootmem().
8117 */ 8123 */
8118 if (alloc_size) { 8124 if (alloc_size) {
8119 ptr = (unsigned long)alloc_bootmem(alloc_size); 8125 ptr = (unsigned long)alloc_bootmem(alloc_size);
8120 8126
8121 #ifdef CONFIG_FAIR_GROUP_SCHED 8127 #ifdef CONFIG_FAIR_GROUP_SCHED
8122 init_task_group.se = (struct sched_entity **)ptr; 8128 init_task_group.se = (struct sched_entity **)ptr;
8123 ptr += nr_cpu_ids * sizeof(void **); 8129 ptr += nr_cpu_ids * sizeof(void **);
8124 8130
8125 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 8131 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
8126 ptr += nr_cpu_ids * sizeof(void **); 8132 ptr += nr_cpu_ids * sizeof(void **);
8127 8133
8128 #ifdef CONFIG_USER_SCHED 8134 #ifdef CONFIG_USER_SCHED
8129 root_task_group.se = (struct sched_entity **)ptr; 8135 root_task_group.se = (struct sched_entity **)ptr;
8130 ptr += nr_cpu_ids * sizeof(void **); 8136 ptr += nr_cpu_ids * sizeof(void **);
8131 8137
8132 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8138 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8133 ptr += nr_cpu_ids * sizeof(void **); 8139 ptr += nr_cpu_ids * sizeof(void **);
8134 #endif /* CONFIG_USER_SCHED */ 8140 #endif /* CONFIG_USER_SCHED */
8135 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8141 #endif /* CONFIG_FAIR_GROUP_SCHED */
8136 #ifdef CONFIG_RT_GROUP_SCHED 8142 #ifdef CONFIG_RT_GROUP_SCHED
8137 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8143 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8138 ptr += nr_cpu_ids * sizeof(void **); 8144 ptr += nr_cpu_ids * sizeof(void **);
8139 8145
8140 init_task_group.rt_rq = (struct rt_rq **)ptr; 8146 init_task_group.rt_rq = (struct rt_rq **)ptr;
8141 ptr += nr_cpu_ids * sizeof(void **); 8147 ptr += nr_cpu_ids * sizeof(void **);
8142 8148
8143 #ifdef CONFIG_USER_SCHED 8149 #ifdef CONFIG_USER_SCHED
8144 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 8150 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8145 ptr += nr_cpu_ids * sizeof(void **); 8151 ptr += nr_cpu_ids * sizeof(void **);
8146 8152
8147 root_task_group.rt_rq = (struct rt_rq **)ptr; 8153 root_task_group.rt_rq = (struct rt_rq **)ptr;
8148 ptr += nr_cpu_ids * sizeof(void **); 8154 ptr += nr_cpu_ids * sizeof(void **);
8149 #endif /* CONFIG_USER_SCHED */ 8155 #endif /* CONFIG_USER_SCHED */
8150 #endif /* CONFIG_RT_GROUP_SCHED */ 8156 #endif /* CONFIG_RT_GROUP_SCHED */
8151 } 8157 }
8152 8158
8153 #ifdef CONFIG_SMP 8159 #ifdef CONFIG_SMP
8154 init_defrootdomain(); 8160 init_defrootdomain();
8155 #endif 8161 #endif
8156 8162
8157 init_rt_bandwidth(&def_rt_bandwidth, 8163 init_rt_bandwidth(&def_rt_bandwidth,
8158 global_rt_period(), global_rt_runtime()); 8164 global_rt_period(), global_rt_runtime());
8159 8165
8160 #ifdef CONFIG_RT_GROUP_SCHED 8166 #ifdef CONFIG_RT_GROUP_SCHED
8161 init_rt_bandwidth(&init_task_group.rt_bandwidth, 8167 init_rt_bandwidth(&init_task_group.rt_bandwidth,
8162 global_rt_period(), global_rt_runtime()); 8168 global_rt_period(), global_rt_runtime());
8163 #ifdef CONFIG_USER_SCHED 8169 #ifdef CONFIG_USER_SCHED
8164 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8170 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8165 global_rt_period(), RUNTIME_INF); 8171 global_rt_period(), RUNTIME_INF);
8166 #endif /* CONFIG_USER_SCHED */ 8172 #endif /* CONFIG_USER_SCHED */
8167 #endif /* CONFIG_RT_GROUP_SCHED */ 8173 #endif /* CONFIG_RT_GROUP_SCHED */
8168 8174
8169 #ifdef CONFIG_GROUP_SCHED 8175 #ifdef CONFIG_GROUP_SCHED
8170 list_add(&init_task_group.list, &task_groups); 8176 list_add(&init_task_group.list, &task_groups);
8171 INIT_LIST_HEAD(&init_task_group.children); 8177 INIT_LIST_HEAD(&init_task_group.children);
8172 8178
8173 #ifdef CONFIG_USER_SCHED 8179 #ifdef CONFIG_USER_SCHED
8174 INIT_LIST_HEAD(&root_task_group.children); 8180 INIT_LIST_HEAD(&root_task_group.children);
8175 init_task_group.parent = &root_task_group; 8181 init_task_group.parent = &root_task_group;
8176 list_add(&init_task_group.siblings, &root_task_group.children); 8182 list_add(&init_task_group.siblings, &root_task_group.children);
8177 #endif /* CONFIG_USER_SCHED */ 8183 #endif /* CONFIG_USER_SCHED */
8178 #endif /* CONFIG_GROUP_SCHED */ 8184 #endif /* CONFIG_GROUP_SCHED */
8179 8185
8180 for_each_possible_cpu(i) { 8186 for_each_possible_cpu(i) {
8181 struct rq *rq; 8187 struct rq *rq;
8182 8188
8183 rq = cpu_rq(i); 8189 rq = cpu_rq(i);
8184 spin_lock_init(&rq->lock); 8190 spin_lock_init(&rq->lock);
8185 rq->nr_running = 0; 8191 rq->nr_running = 0;
8186 init_cfs_rq(&rq->cfs, rq); 8192 init_cfs_rq(&rq->cfs, rq);
8187 init_rt_rq(&rq->rt, rq); 8193 init_rt_rq(&rq->rt, rq);
8188 #ifdef CONFIG_FAIR_GROUP_SCHED 8194 #ifdef CONFIG_FAIR_GROUP_SCHED
8189 init_task_group.shares = init_task_group_load; 8195 init_task_group.shares = init_task_group_load;
8190 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8196 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8191 #ifdef CONFIG_CGROUP_SCHED 8197 #ifdef CONFIG_CGROUP_SCHED
8192 /* 8198 /*
8193 * How much cpu bandwidth does init_task_group get? 8199 * How much cpu bandwidth does init_task_group get?
8194 * 8200 *
8195 * In case of task-groups formed through the cgroup filesystem, it 8201 * In case of task-groups formed through the cgroup filesystem, it
8196 * gets 100% of the cpu resources in the system. This overall 8202 * gets 100% of the cpu resources in the system. This overall
8197 * system cpu resource is divided among the tasks of 8203 * system cpu resource is divided among the tasks of
8198 * init_task_group and its child task-groups in a fair manner, 8204 * init_task_group and its child task-groups in a fair manner,
8199 * based on each entity's (task or task-group's) weight 8205 * based on each entity's (task or task-group's) weight
8200 * (se->load.weight). 8206 * (se->load.weight).
8201 * 8207 *
8202 * In other words, if init_task_group has 10 tasks of weight 8208 * In other words, if init_task_group has 10 tasks of weight
8203 * 1024 and two child groups A0 and A1 (of weight 1024 each), 8209 * 1024 and two child groups A0 and A1 (of weight 1024 each),
8204 * then A0's share of the cpu resource is: 8210 * then A0's share of the cpu resource is:
8205 * 8211 *
8206 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 8212 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8207 * 8213 *
8208 * We achieve this by letting init_task_group's tasks sit 8214 * We achieve this by letting init_task_group's tasks sit
8209 * directly in rq->cfs (i.e. init_task_group->se[] = NULL). 8215 * directly in rq->cfs (i.e. init_task_group->se[] = NULL).
8210 */ 8216 */
8211 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 8217 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
8212 #elif defined CONFIG_USER_SCHED 8218 #elif defined CONFIG_USER_SCHED
8213 root_task_group.shares = NICE_0_LOAD; 8219 root_task_group.shares = NICE_0_LOAD;
8214 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); 8220 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
8215 /* 8221 /*
8216 * In case of task-groups formed through the user id of tasks, 8222 * In case of task-groups formed through the user id of tasks,
8217 * init_task_group represents tasks belonging to root user. 8223 * init_task_group represents tasks belonging to root user.
8218 * Hence it forms a sibling of all subsequent groups formed. 8224 * Hence it forms a sibling of all subsequent groups formed.
8219 * In this case, init_task_group gets only a fraction of overall 8225 * In this case, init_task_group gets only a fraction of overall
8220 * system cpu resource, based on the weight assigned to root 8226 * system cpu resource, based on the weight assigned to root
8221 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 8227 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
8222 * by letting tasks of init_task_group sit in a separate cfs_rq 8228 * by letting tasks of init_task_group sit in a separate cfs_rq
8223 * (init_cfs_rq) and having one entity represent this group of 8229 * (init_cfs_rq) and having one entity represent this group of
8224 * tasks in rq->cfs (i.e. init_task_group->se[] != NULL). 8230 * tasks in rq->cfs (i.e. init_task_group->se[] != NULL).
8225 */ 8231 */
8226 init_tg_cfs_entry(&init_task_group, 8232 init_tg_cfs_entry(&init_task_group,
8227 &per_cpu(init_cfs_rq, i), 8233 &per_cpu(init_cfs_rq, i),
8228 &per_cpu(init_sched_entity, i), i, 1, 8234 &per_cpu(init_sched_entity, i), i, 1,
8229 root_task_group.se[i]); 8235 root_task_group.se[i]);
8230 8236
8231 #endif 8237 #endif
8232 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8238 #endif /* CONFIG_FAIR_GROUP_SCHED */
8233 8239
8234 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 8240 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8235 #ifdef CONFIG_RT_GROUP_SCHED 8241 #ifdef CONFIG_RT_GROUP_SCHED
8236 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 8242 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8237 #ifdef CONFIG_CGROUP_SCHED 8243 #ifdef CONFIG_CGROUP_SCHED
8238 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 8244 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8239 #elif defined CONFIG_USER_SCHED 8245 #elif defined CONFIG_USER_SCHED
8240 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 8246 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
8241 init_tg_rt_entry(&init_task_group, 8247 init_tg_rt_entry(&init_task_group,
8242 &per_cpu(init_rt_rq, i), 8248 &per_cpu(init_rt_rq, i),
8243 &per_cpu(init_sched_rt_entity, i), i, 1, 8249 &per_cpu(init_sched_rt_entity, i), i, 1,
8244 root_task_group.rt_se[i]); 8250 root_task_group.rt_se[i]);
8245 #endif 8251 #endif
8246 #endif 8252 #endif
8247 8253
8248 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 8254 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8249 rq->cpu_load[j] = 0; 8255 rq->cpu_load[j] = 0;
8250 #ifdef CONFIG_SMP 8256 #ifdef CONFIG_SMP
8251 rq->sd = NULL; 8257 rq->sd = NULL;
8252 rq->rd = NULL; 8258 rq->rd = NULL;
8253 rq->active_balance = 0; 8259 rq->active_balance = 0;
8254 rq->next_balance = jiffies; 8260 rq->next_balance = jiffies;
8255 rq->push_cpu = 0; 8261 rq->push_cpu = 0;
8256 rq->cpu = i; 8262 rq->cpu = i;
8257 rq->online = 0; 8263 rq->online = 0;
8258 rq->migration_thread = NULL; 8264 rq->migration_thread = NULL;
8259 INIT_LIST_HEAD(&rq->migration_queue); 8265 INIT_LIST_HEAD(&rq->migration_queue);
8260 rq_attach_root(rq, &def_root_domain); 8266 rq_attach_root(rq, &def_root_domain);
8261 #endif 8267 #endif
8262 init_rq_hrtick(rq); 8268 init_rq_hrtick(rq);
8263 atomic_set(&rq->nr_iowait, 0); 8269 atomic_set(&rq->nr_iowait, 0);
8264 } 8270 }
8265 8271
8266 set_load_weight(&init_task); 8272 set_load_weight(&init_task);
8267 8273
8268 #ifdef CONFIG_PREEMPT_NOTIFIERS 8274 #ifdef CONFIG_PREEMPT_NOTIFIERS
8269 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 8275 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8270 #endif 8276 #endif
8271 8277
8272 #ifdef CONFIG_SMP 8278 #ifdef CONFIG_SMP
8273 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 8279 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8274 #endif 8280 #endif
8275 8281
8276 #ifdef CONFIG_RT_MUTEXES 8282 #ifdef CONFIG_RT_MUTEXES
8277 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 8283 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
8278 #endif 8284 #endif
8279 8285
8280 /* 8286 /*
8281 * The boot idle thread does lazy MMU switching as well: 8287 * The boot idle thread does lazy MMU switching as well:
8282 */ 8288 */
8283 atomic_inc(&init_mm.mm_count); 8289 atomic_inc(&init_mm.mm_count);
8284 enter_lazy_tlb(&init_mm, current); 8290 enter_lazy_tlb(&init_mm, current);
8285 8291
8286 /* 8292 /*
8287 * Make us the idle thread. Technically, schedule() should not be 8293 * Make us the idle thread. Technically, schedule() should not be
8288 * called from this thread; however, somewhere below it might be, 8294 * called from this thread; however, somewhere below it might be,
8289 * but because we are the idle thread, we just pick up running again 8295 * but because we are the idle thread, we just pick up running again
8290 * when this runqueue becomes "idle". 8296 * when this runqueue becomes "idle".
8291 */ 8297 */
8292 init_idle(current, smp_processor_id()); 8298 init_idle(current, smp_processor_id());
8293 /* 8299 /*
8294 * During early bootup we pretend to be a normal task: 8300 * During early bootup we pretend to be a normal task:
8295 */ 8301 */
8296 current->sched_class = &fair_sched_class; 8302 current->sched_class = &fair_sched_class;
8297 8303
8298 scheduler_running = 1; 8304 scheduler_running = 1;
8299 } 8305 }
8300 8306
8301 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8307 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8302 void __might_sleep(char *file, int line) 8308 void __might_sleep(char *file, int line)
8303 { 8309 {
8304 #ifdef in_atomic 8310 #ifdef in_atomic
8305 static unsigned long prev_jiffy; /* ratelimiting */ 8311 static unsigned long prev_jiffy; /* ratelimiting */
8306 8312
8307 if ((!in_atomic() && !irqs_disabled()) || 8313 if ((!in_atomic() && !irqs_disabled()) ||
8308 system_state != SYSTEM_RUNNING || oops_in_progress) 8314 system_state != SYSTEM_RUNNING || oops_in_progress)
8309 return; 8315 return;
8310 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8316 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8311 return; 8317 return;
8312 prev_jiffy = jiffies; 8318 prev_jiffy = jiffies;
8313 8319
8314 printk(KERN_ERR 8320 printk(KERN_ERR
8315 "BUG: sleeping function called from invalid context at %s:%d\n", 8321 "BUG: sleeping function called from invalid context at %s:%d\n",
8316 file, line); 8322 file, line);
8317 printk(KERN_ERR 8323 printk(KERN_ERR
8318 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 8324 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8319 in_atomic(), irqs_disabled(), 8325 in_atomic(), irqs_disabled(),
8320 current->pid, current->comm); 8326 current->pid, current->comm);
8321 8327
8322 debug_show_held_locks(current); 8328 debug_show_held_locks(current);
8323 if (irqs_disabled()) 8329 if (irqs_disabled())
8324 print_irqtrace_events(current); 8330 print_irqtrace_events(current);
8325 dump_stack(); 8331 dump_stack();
8326 #endif 8332 #endif
8327 } 8333 }
8328 EXPORT_SYMBOL(__might_sleep); 8334 EXPORT_SYMBOL(__might_sleep);
8329 #endif 8335 #endif
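
For orientation, the check above fires for patterns like the following hedged sketch: mutex_lock() may sleep, so taking it inside a spinlocked (atomic) section makes in_atomic() true and produces the "sleeping function called from invalid context" report, given CONFIG_DEBUG_SPINLOCK_SLEEP and preempt counting. The lock names are hypothetical:

static DEFINE_SPINLOCK(example_spinlock);
static DEFINE_MUTEX(example_mutex);

static void example_bad_sleep(void)
{
        spin_lock(&example_spinlock);   /* enter atomic context */
        mutex_lock(&example_mutex);     /* may sleep: __might_sleep() warns */
        mutex_unlock(&example_mutex);
        spin_unlock(&example_spinlock);
}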
8330 8336
8331 #ifdef CONFIG_MAGIC_SYSRQ 8337 #ifdef CONFIG_MAGIC_SYSRQ
8332 static void normalize_task(struct rq *rq, struct task_struct *p) 8338 static void normalize_task(struct rq *rq, struct task_struct *p)
8333 { 8339 {
8334 int on_rq; 8340 int on_rq;
8335 8341
8336 update_rq_clock(rq); 8342 update_rq_clock(rq);
8337 on_rq = p->se.on_rq; 8343 on_rq = p->se.on_rq;
8338 if (on_rq) 8344 if (on_rq)
8339 deactivate_task(rq, p, 0); 8345 deactivate_task(rq, p, 0);
8340 __setscheduler(rq, p, SCHED_NORMAL, 0); 8346 __setscheduler(rq, p, SCHED_NORMAL, 0);
8341 if (on_rq) { 8347 if (on_rq) {
8342 activate_task(rq, p, 0); 8348 activate_task(rq, p, 0);
8343 resched_task(rq->curr); 8349 resched_task(rq->curr);
8344 } 8350 }
8345 } 8351 }
8346 8352
8347 void normalize_rt_tasks(void) 8353 void normalize_rt_tasks(void)
8348 { 8354 {
8349 struct task_struct *g, *p; 8355 struct task_struct *g, *p;
8350 unsigned long flags; 8356 unsigned long flags;
8351 struct rq *rq; 8357 struct rq *rq;
8352 8358
8353 read_lock_irqsave(&tasklist_lock, flags); 8359 read_lock_irqsave(&tasklist_lock, flags);
8354 do_each_thread(g, p) { 8360 do_each_thread(g, p) {
8355 /* 8361 /*
8356 * Only normalize user tasks: 8362 * Only normalize user tasks:
8357 */ 8363 */
8358 if (!p->mm) 8364 if (!p->mm)
8359 continue; 8365 continue;
8360 8366
8361 p->se.exec_start = 0; 8367 p->se.exec_start = 0;
8362 #ifdef CONFIG_SCHEDSTATS 8368 #ifdef CONFIG_SCHEDSTATS
8363 p->se.wait_start = 0; 8369 p->se.wait_start = 0;
8364 p->se.sleep_start = 0; 8370 p->se.sleep_start = 0;
8365 p->se.block_start = 0; 8371 p->se.block_start = 0;
8366 #endif 8372 #endif
8367 8373
8368 if (!rt_task(p)) { 8374 if (!rt_task(p)) {
8369 /* 8375 /*
8370 * Renice negative nice level userspace 8376 * Renice negative nice level userspace
8371 * tasks back to 0: 8377 * tasks back to 0:
8372 */ 8378 */
8373 if (TASK_NICE(p) < 0 && p->mm) 8379 if (TASK_NICE(p) < 0 && p->mm)
8374 set_user_nice(p, 0); 8380 set_user_nice(p, 0);
8375 continue; 8381 continue;
8376 } 8382 }
8377 8383
8378 spin_lock(&p->pi_lock); 8384 spin_lock(&p->pi_lock);
8379 rq = __task_rq_lock(p); 8385 rq = __task_rq_lock(p);
8380 8386
8381 normalize_task(rq, p); 8387 normalize_task(rq, p);
8382 8388
8383 __task_rq_unlock(rq); 8389 __task_rq_unlock(rq);
8384 spin_unlock(&p->pi_lock); 8390 spin_unlock(&p->pi_lock);
8385 } while_each_thread(g, p); 8391 } while_each_thread(g, p);
8386 8392
8387 read_unlock_irqrestore(&tasklist_lock, flags); 8393 read_unlock_irqrestore(&tasklist_lock, flags);
8388 } 8394 }
8389 8395
8390 #endif /* CONFIG_MAGIC_SYSRQ */ 8396 #endif /* CONFIG_MAGIC_SYSRQ */
8391 8397
8392 #ifdef CONFIG_IA64 8398 #ifdef CONFIG_IA64
8393 /* 8399 /*
8394 * These functions are only useful for the IA64 MCA handling. 8400 * These functions are only useful for the IA64 MCA handling.
8395 * 8401 *
8396 * They can only be called when the whole system has been 8402 * They can only be called when the whole system has been
8397 * stopped - every CPU needs to be quiescent, and no scheduling 8403 * stopped - every CPU needs to be quiescent, and no scheduling
8398 * activity can take place. Using them for anything else would 8404 * activity can take place. Using them for anything else would
8399 * be a serious bug, and as a result, they aren't even visible 8405 * be a serious bug, and as a result, they aren't even visible
8400 * under any other configuration. 8406 * under any other configuration.
8401 */ 8407 */
8402 8408
8403 /** 8409 /**
8404 * curr_task - return the current task for a given cpu. 8410 * curr_task - return the current task for a given cpu.
8405 * @cpu: the processor in question. 8411 * @cpu: the processor in question.
8406 * 8412 *
8407 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8413 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8408 */ 8414 */
8409 struct task_struct *curr_task(int cpu) 8415 struct task_struct *curr_task(int cpu)
8410 { 8416 {
8411 return cpu_curr(cpu); 8417 return cpu_curr(cpu);
8412 } 8418 }
8413 8419
8414 /** 8420 /**
8415 * set_curr_task - set the current task for a given cpu. 8421 * set_curr_task - set the current task for a given cpu.
8416 * @cpu: the processor in question. 8422 * @cpu: the processor in question.
8417 * @p: the task pointer to set. 8423 * @p: the task pointer to set.
8418 * 8424 *
8419 * Description: This function must only be used when non-maskable interrupts 8425 * Description: This function must only be used when non-maskable interrupts
8420 * are serviced on a separate stack. It allows the architecture to switch the 8426 * are serviced on a separate stack. It allows the architecture to switch the
8421 * notion of the current task on a cpu in a non-blocking manner. This function 8427 * notion of the current task on a cpu in a non-blocking manner. This function
8422 * must be called with all CPU's synchronized, and interrupts disabled, the 8428 * must be called with all CPU's synchronized, and interrupts disabled, the
8423 * and caller must save the original value of the current task (see 8429 * and caller must save the original value of the current task (see
8424 * curr_task() above) and restore that value before reenabling interrupts and 8430 * curr_task() above) and restore that value before reenabling interrupts and
8425 * re-starting the system. 8431 * re-starting the system.
8426 * 8432 *
8427 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 8433 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8428 */ 8434 */
8429 void set_curr_task(int cpu, struct task_struct *p) 8435 void set_curr_task(int cpu, struct task_struct *p)
8430 { 8436 {
8431 cpu_curr(cpu) = p; 8437 cpu_curr(cpu) = p;
8432 } 8438 }
8433 8439
8434 #endif 8440 #endif
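
The save/restore discipline those comments demand looks like this hedged sketch, with the MCA machinery elided and mca_task standing in for the handler's replacement task:

static void mca_switch_example(int cpu, struct task_struct *mca_task)
{
        /* whole system stopped, interrupts disabled */
        struct task_struct *saved = curr_task(cpu);

        set_curr_task(cpu, mca_task);   /* switch the notion of current */
        /* ... service the machine check on the separate stack ... */
        set_curr_task(cpu, saved);      /* restore before irqs come back on */
}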
8435 8441
8436 #ifdef CONFIG_FAIR_GROUP_SCHED 8442 #ifdef CONFIG_FAIR_GROUP_SCHED
8437 static void free_fair_sched_group(struct task_group *tg) 8443 static void free_fair_sched_group(struct task_group *tg)
8438 { 8444 {
8439 int i; 8445 int i;
8440 8446
8441 for_each_possible_cpu(i) { 8447 for_each_possible_cpu(i) {
8442 if (tg->cfs_rq) 8448 if (tg->cfs_rq)
8443 kfree(tg->cfs_rq[i]); 8449 kfree(tg->cfs_rq[i]);
8444 if (tg->se) 8450 if (tg->se)
8445 kfree(tg->se[i]); 8451 kfree(tg->se[i]);
8446 } 8452 }
8447 8453
8448 kfree(tg->cfs_rq); 8454 kfree(tg->cfs_rq);
8449 kfree(tg->se); 8455 kfree(tg->se);
8450 } 8456 }
8451 8457
8452 static 8458 static
8453 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8459 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8454 { 8460 {
8455 struct cfs_rq *cfs_rq; 8461 struct cfs_rq *cfs_rq;
8456 struct sched_entity *se; 8462 struct sched_entity *se;
8457 struct rq *rq; 8463 struct rq *rq;
8458 int i; 8464 int i;
8459 8465
8460 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8466 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8461 if (!tg->cfs_rq) 8467 if (!tg->cfs_rq)
8462 goto err; 8468 goto err;
8463 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8469 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8464 if (!tg->se) 8470 if (!tg->se)
8465 goto err; 8471 goto err;
8466 8472
8467 tg->shares = NICE_0_LOAD; 8473 tg->shares = NICE_0_LOAD;
8468 8474
8469 for_each_possible_cpu(i) { 8475 for_each_possible_cpu(i) {
8470 rq = cpu_rq(i); 8476 rq = cpu_rq(i);
8471 8477
8472 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8478 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8473 GFP_KERNEL, cpu_to_node(i)); 8479 GFP_KERNEL, cpu_to_node(i));
8474 if (!cfs_rq) 8480 if (!cfs_rq)
8475 goto err; 8481 goto err;
8476 8482
8477 se = kzalloc_node(sizeof(struct sched_entity), 8483 se = kzalloc_node(sizeof(struct sched_entity),
8478 GFP_KERNEL, cpu_to_node(i)); 8484 GFP_KERNEL, cpu_to_node(i));
8479 if (!se) 8485 if (!se)
8480 goto err; 8486 goto err;
8481 8487
8482 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8488 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8483 } 8489 }
8484 8490
8485 return 1; 8491 return 1;
8486 8492
8487 err: 8493 err:
8488 return 0; 8494 return 0;
8489 } 8495 }
8490 8496
8491 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8497 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8492 { 8498 {
8493 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 8499 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8494 &cpu_rq(cpu)->leaf_cfs_rq_list); 8500 &cpu_rq(cpu)->leaf_cfs_rq_list);
8495 } 8501 }
8496 8502
8497 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8503 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8498 { 8504 {
8499 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8505 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8500 } 8506 }
8501 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8507 #else /* !CONFIG_FAIR_GROUP_SCHED */
8502 static inline void free_fair_sched_group(struct task_group *tg) 8508 static inline void free_fair_sched_group(struct task_group *tg)
8503 { 8509 {
8504 } 8510 }
8505 8511
8506 static inline 8512 static inline
8507 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8513 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8508 { 8514 {
8509 return 1; 8515 return 1;
8510 } 8516 }
8511 8517
8512 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8518 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8513 { 8519 {
8514 } 8520 }
8515 8521
8516 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8522 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8517 { 8523 {
8518 } 8524 }
8519 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8525 #endif /* CONFIG_FAIR_GROUP_SCHED */
8520 8526
8521 #ifdef CONFIG_RT_GROUP_SCHED 8527 #ifdef CONFIG_RT_GROUP_SCHED
8522 static void free_rt_sched_group(struct task_group *tg) 8528 static void free_rt_sched_group(struct task_group *tg)
8523 { 8529 {
8524 int i; 8530 int i;
8525 8531
8526 destroy_rt_bandwidth(&tg->rt_bandwidth); 8532 destroy_rt_bandwidth(&tg->rt_bandwidth);
8527 8533
8528 for_each_possible_cpu(i) { 8534 for_each_possible_cpu(i) {
8529 if (tg->rt_rq) 8535 if (tg->rt_rq)
8530 kfree(tg->rt_rq[i]); 8536 kfree(tg->rt_rq[i]);
8531 if (tg->rt_se) 8537 if (tg->rt_se)
8532 kfree(tg->rt_se[i]); 8538 kfree(tg->rt_se[i]);
8533 } 8539 }
8534 8540
8535 kfree(tg->rt_rq); 8541 kfree(tg->rt_rq);
8536 kfree(tg->rt_se); 8542 kfree(tg->rt_se);
8537 } 8543 }
8538 8544
8539 static 8545 static
8540 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8546 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8541 { 8547 {
8542 struct rt_rq *rt_rq; 8548 struct rt_rq *rt_rq;
8543 struct sched_rt_entity *rt_se; 8549 struct sched_rt_entity *rt_se;
8544 struct rq *rq; 8550 struct rq *rq;
8545 int i; 8551 int i;
8546 8552
8547 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8553 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8548 if (!tg->rt_rq) 8554 if (!tg->rt_rq)
8549 goto err; 8555 goto err;
8550 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8556 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8551 if (!tg->rt_se) 8557 if (!tg->rt_se)
8552 goto err; 8558 goto err;
8553 8559
8554 init_rt_bandwidth(&tg->rt_bandwidth, 8560 init_rt_bandwidth(&tg->rt_bandwidth,
8555 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8561 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8556 8562
8557 for_each_possible_cpu(i) { 8563 for_each_possible_cpu(i) {
8558 rq = cpu_rq(i); 8564 rq = cpu_rq(i);
8559 8565
8560 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8566 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8561 GFP_KERNEL, cpu_to_node(i)); 8567 GFP_KERNEL, cpu_to_node(i));
8562 if (!rt_rq) 8568 if (!rt_rq)
8563 goto err; 8569 goto err;
8564 8570
8565 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8571 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8566 GFP_KERNEL, cpu_to_node(i)); 8572 GFP_KERNEL, cpu_to_node(i));
8567 if (!rt_se) 8573 if (!rt_se)
8568 goto err; 8574 goto err;
8569 8575
8570 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8576 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8571 } 8577 }
8572 8578
8573 return 1; 8579 return 1;
8574 8580
8575 err: 8581 err:
8576 return 0; 8582 return 0;
8577 } 8583 }
8578 8584
8579 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8585 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8580 { 8586 {
8581 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 8587 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8582 &cpu_rq(cpu)->leaf_rt_rq_list); 8588 &cpu_rq(cpu)->leaf_rt_rq_list);
8583 } 8589 }
8584 8590
8585 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8591 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8586 { 8592 {
8587 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8593 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8588 } 8594 }
8589 #else /* !CONFIG_RT_GROUP_SCHED */ 8595 #else /* !CONFIG_RT_GROUP_SCHED */
8590 static inline void free_rt_sched_group(struct task_group *tg) 8596 static inline void free_rt_sched_group(struct task_group *tg)
8591 { 8597 {
8592 } 8598 }
8593 8599
8594 static inline 8600 static inline
8595 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8601 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8596 { 8602 {
8597 return 1; 8603 return 1;
8598 } 8604 }
8599 8605
8600 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8606 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8601 { 8607 {
8602 } 8608 }
8603 8609
8604 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8610 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8605 { 8611 {
8606 } 8612 }
8607 #endif /* CONFIG_RT_GROUP_SCHED */ 8613 #endif /* CONFIG_RT_GROUP_SCHED */
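The empty inlines above follow a common kernel idiom: when a scheduling feature is configured out, stubs that do nothing (or report success) let callers such as sched_create_group() stay free of #ifdef clutter. A minimal userspace sketch of the idiom, with invented names (DEMO_FEATURE, alloc_feature), not kernel code:

    #include <stdio.h>

    #ifdef DEMO_FEATURE
    static int alloc_feature(void)
    {
            printf("real allocation\n");
            return 1;
    }
    #else
    /* feature compiled out: the stub reports success so the caller
     * can test the return value unconditionally */
    static inline int alloc_feature(void)
    {
            return 1;
    }
    #endif

    int main(void)
    {
            if (!alloc_feature())
                    return 1;
            printf("caller is unconditional\n");
            return 0;
    }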
8608 8614
8609 #ifdef CONFIG_GROUP_SCHED 8615 #ifdef CONFIG_GROUP_SCHED
8610 static void free_sched_group(struct task_group *tg) 8616 static void free_sched_group(struct task_group *tg)
8611 { 8617 {
8612 free_fair_sched_group(tg); 8618 free_fair_sched_group(tg);
8613 free_rt_sched_group(tg); 8619 free_rt_sched_group(tg);
8614 kfree(tg); 8620 kfree(tg);
8615 } 8621 }
8616 8622
8617 /* allocate runqueue etc for a new task group */ 8623 /* allocate runqueue etc for a new task group */
8618 struct task_group *sched_create_group(struct task_group *parent) 8624 struct task_group *sched_create_group(struct task_group *parent)
8619 { 8625 {
8620 struct task_group *tg; 8626 struct task_group *tg;
8621 unsigned long flags; 8627 unsigned long flags;
8622 int i; 8628 int i;
8623 8629
8624 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8630 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8625 if (!tg) 8631 if (!tg)
8626 return ERR_PTR(-ENOMEM); 8632 return ERR_PTR(-ENOMEM);
8627 8633
8628 if (!alloc_fair_sched_group(tg, parent)) 8634 if (!alloc_fair_sched_group(tg, parent))
8629 goto err; 8635 goto err;
8630 8636
8631 if (!alloc_rt_sched_group(tg, parent)) 8637 if (!alloc_rt_sched_group(tg, parent))
8632 goto err; 8638 goto err;
8633 8639
8634 spin_lock_irqsave(&task_group_lock, flags); 8640 spin_lock_irqsave(&task_group_lock, flags);
8635 for_each_possible_cpu(i) { 8641 for_each_possible_cpu(i) {
8636 register_fair_sched_group(tg, i); 8642 register_fair_sched_group(tg, i);
8637 register_rt_sched_group(tg, i); 8643 register_rt_sched_group(tg, i);
8638 } 8644 }
8639 list_add_rcu(&tg->list, &task_groups); 8645 list_add_rcu(&tg->list, &task_groups);
8640 8646
8641 WARN_ON(!parent); /* root should already exist */ 8647 WARN_ON(!parent); /* root should already exist */
8642 8648
8643 tg->parent = parent; 8649 tg->parent = parent;
8644 INIT_LIST_HEAD(&tg->children); 8650 INIT_LIST_HEAD(&tg->children);
8645 list_add_rcu(&tg->siblings, &parent->children); 8651 list_add_rcu(&tg->siblings, &parent->children);
8646 spin_unlock_irqrestore(&task_group_lock, flags); 8652 spin_unlock_irqrestore(&task_group_lock, flags);
8647 8653
8648 return tg; 8654 return tg;
8649 8655
8650 err: 8656 err:
8651 free_sched_group(tg); 8657 free_sched_group(tg);
8652 return ERR_PTR(-ENOMEM); 8658 return ERR_PTR(-ENOMEM);
8653 } 8659 }
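sched_create_group() relies on free_sched_group() tolerating a partially built group: kfree(NULL) is a no-op, so a single error path can unwind whatever prefix of the allocations succeeded. A hedged userspace sketch of that goto-unwind shape (create_demo is an invented name):

    #include <stdio.h>
    #include <stdlib.h>

    static int create_demo(void)
    {
            char *a = NULL, *b = NULL;

            a = malloc(16);
            if (!a)
                    goto err;
            b = malloc(16);
            if (!b)
                    goto err;

            printf("fully constructed\n");
            free(b);
            free(a);
            return 0;
    err:
            free(b);        /* free(NULL), like kfree(NULL), is a no-op */
            free(a);
            return -1;
    }

    int main(void)
    {
            return create_demo() ? 1 : 0;
    }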
8654 8660
8655 /* rcu callback to free various structures associated with a task group */ 8661 /* rcu callback to free various structures associated with a task group */
8656 static void free_sched_group_rcu(struct rcu_head *rhp) 8662 static void free_sched_group_rcu(struct rcu_head *rhp)
8657 { 8663 {
8658 /* now it should be safe to free those cfs_rqs */ 8664 /* now it should be safe to free those cfs_rqs */
8659 free_sched_group(container_of(rhp, struct task_group, rcu)); 8665 free_sched_group(container_of(rhp, struct task_group, rcu));
8660 } 8666 }
8661 8667
8662 /* Destroy runqueue etc associated with a task group */ 8668 /* Destroy runqueue etc associated with a task group */
8663 void sched_destroy_group(struct task_group *tg) 8669 void sched_destroy_group(struct task_group *tg)
8664 { 8670 {
8665 unsigned long flags; 8671 unsigned long flags;
8666 int i; 8672 int i;
8667 8673
8668 spin_lock_irqsave(&task_group_lock, flags); 8674 spin_lock_irqsave(&task_group_lock, flags);
8669 for_each_possible_cpu(i) { 8675 for_each_possible_cpu(i) {
8670 unregister_fair_sched_group(tg, i); 8676 unregister_fair_sched_group(tg, i);
8671 unregister_rt_sched_group(tg, i); 8677 unregister_rt_sched_group(tg, i);
8672 } 8678 }
8673 list_del_rcu(&tg->list); 8679 list_del_rcu(&tg->list);
8674 list_del_rcu(&tg->siblings); 8680 list_del_rcu(&tg->siblings);
8675 spin_unlock_irqrestore(&task_group_lock, flags); 8681 spin_unlock_irqrestore(&task_group_lock, flags);
8676 8682
8677 /* wait for possible concurrent references to cfs_rqs to complete */ 8683 /* wait for possible concurrent references to cfs_rqs to complete */
8678 call_rcu(&tg->rcu, free_sched_group_rcu); 8684 call_rcu(&tg->rcu, free_sched_group_rcu);
8679 } 8685 }
8680 8686
8681 /* change task's runqueue when it moves between groups. 8687 /* change task's runqueue when it moves between groups.
8682 * The caller of this function should have put the task in its new group 8688 * The caller of this function should have put the task in its new group
8683 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8689 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8684 * reflect its new group. 8690 * reflect its new group.
8685 */ 8691 */
8686 void sched_move_task(struct task_struct *tsk) 8692 void sched_move_task(struct task_struct *tsk)
8687 { 8693 {
8688 int on_rq, running; 8694 int on_rq, running;
8689 unsigned long flags; 8695 unsigned long flags;
8690 struct rq *rq; 8696 struct rq *rq;
8691 8697
8692 rq = task_rq_lock(tsk, &flags); 8698 rq = task_rq_lock(tsk, &flags);
8693 8699
8694 update_rq_clock(rq); 8700 update_rq_clock(rq);
8695 8701
8696 running = task_current(rq, tsk); 8702 running = task_current(rq, tsk);
8697 on_rq = tsk->se.on_rq; 8703 on_rq = tsk->se.on_rq;
8698 8704
8699 if (on_rq) 8705 if (on_rq)
8700 dequeue_task(rq, tsk, 0); 8706 dequeue_task(rq, tsk, 0);
8701 if (unlikely(running)) 8707 if (unlikely(running))
8702 tsk->sched_class->put_prev_task(rq, tsk); 8708 tsk->sched_class->put_prev_task(rq, tsk);
8703 8709
8704 set_task_rq(tsk, task_cpu(tsk)); 8710 set_task_rq(tsk, task_cpu(tsk));
8705 8711
8706 #ifdef CONFIG_FAIR_GROUP_SCHED 8712 #ifdef CONFIG_FAIR_GROUP_SCHED
8707 if (tsk->sched_class->moved_group) 8713 if (tsk->sched_class->moved_group)
8708 tsk->sched_class->moved_group(tsk); 8714 tsk->sched_class->moved_group(tsk);
8709 #endif 8715 #endif
8710 8716
8711 if (unlikely(running)) 8717 if (unlikely(running))
8712 tsk->sched_class->set_curr_task(rq); 8718 tsk->sched_class->set_curr_task(rq);
8713 if (on_rq) 8719 if (on_rq)
8714 enqueue_task(rq, tsk, 0); 8720 enqueue_task(rq, tsk, 0);
8715 8721
8716 task_rq_unlock(rq, &flags); 8722 task_rq_unlock(rq, &flags);
8717 } 8723 }
8718 #endif /* CONFIG_GROUP_SCHED */ 8724 #endif /* CONFIG_GROUP_SCHED */
8719 8725
8720 #ifdef CONFIG_FAIR_GROUP_SCHED 8726 #ifdef CONFIG_FAIR_GROUP_SCHED
8721 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8727 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8722 { 8728 {
8723 struct cfs_rq *cfs_rq = se->cfs_rq; 8729 struct cfs_rq *cfs_rq = se->cfs_rq;
8724 int on_rq; 8730 int on_rq;
8725 8731
8726 on_rq = se->on_rq; 8732 on_rq = se->on_rq;
8727 if (on_rq) 8733 if (on_rq)
8728 dequeue_entity(cfs_rq, se, 0); 8734 dequeue_entity(cfs_rq, se, 0);
8729 8735
8730 se->load.weight = shares; 8736 se->load.weight = shares;
8731 se->load.inv_weight = 0; 8737 se->load.inv_weight = 0;
8732 8738
8733 if (on_rq) 8739 if (on_rq)
8734 enqueue_entity(cfs_rq, se, 0); 8740 enqueue_entity(cfs_rq, se, 0);
8735 } 8741 }
8736 8742
8737 static void set_se_shares(struct sched_entity *se, unsigned long shares) 8743 static void set_se_shares(struct sched_entity *se, unsigned long shares)
8738 { 8744 {
8739 struct cfs_rq *cfs_rq = se->cfs_rq; 8745 struct cfs_rq *cfs_rq = se->cfs_rq;
8740 struct rq *rq = cfs_rq->rq; 8746 struct rq *rq = cfs_rq->rq;
8741 unsigned long flags; 8747 unsigned long flags;
8742 8748
8743 spin_lock_irqsave(&rq->lock, flags); 8749 spin_lock_irqsave(&rq->lock, flags);
8744 __set_se_shares(se, shares); 8750 __set_se_shares(se, shares);
8745 spin_unlock_irqrestore(&rq->lock, flags); 8751 spin_unlock_irqrestore(&rq->lock, flags);
8746 } 8752 }
8747 8753
8748 static DEFINE_MUTEX(shares_mutex); 8754 static DEFINE_MUTEX(shares_mutex);
8749 8755
8750 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8756 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8751 { 8757 {
8752 int i; 8758 int i;
8753 unsigned long flags; 8759 unsigned long flags;
8754 8760
8755 /* 8761 /*
8756 * We can't change the weight of the root cgroup. 8762 * We can't change the weight of the root cgroup.
8757 */ 8763 */
8758 if (!tg->se[0]) 8764 if (!tg->se[0])
8759 return -EINVAL; 8765 return -EINVAL;
8760 8766
8761 if (shares < MIN_SHARES) 8767 if (shares < MIN_SHARES)
8762 shares = MIN_SHARES; 8768 shares = MIN_SHARES;
8763 else if (shares > MAX_SHARES) 8769 else if (shares > MAX_SHARES)
8764 shares = MAX_SHARES; 8770 shares = MAX_SHARES;
8765 8771
8766 mutex_lock(&shares_mutex); 8772 mutex_lock(&shares_mutex);
8767 if (tg->shares == shares) 8773 if (tg->shares == shares)
8768 goto done; 8774 goto done;
8769 8775
8770 spin_lock_irqsave(&task_group_lock, flags); 8776 spin_lock_irqsave(&task_group_lock, flags);
8771 for_each_possible_cpu(i) 8777 for_each_possible_cpu(i)
8772 unregister_fair_sched_group(tg, i); 8778 unregister_fair_sched_group(tg, i);
8773 list_del_rcu(&tg->siblings); 8779 list_del_rcu(&tg->siblings);
8774 spin_unlock_irqrestore(&task_group_lock, flags); 8780 spin_unlock_irqrestore(&task_group_lock, flags);
8775 8781
8776 /* wait for any ongoing reference to this group to finish */ 8782 /* wait for any ongoing reference to this group to finish */
8777 synchronize_sched(); 8783 synchronize_sched();
8778 8784
8779 /* 8785 /*
8780 * Now we are free to modify the group's share on each cpu 8786 * Now we are free to modify the group's share on each cpu
8781 * w/o tripping rebalance_share or load_balance_fair. 8787 * w/o tripping rebalance_share or load_balance_fair.
8782 */ 8788 */
8783 tg->shares = shares; 8789 tg->shares = shares;
8784 for_each_possible_cpu(i) { 8790 for_each_possible_cpu(i) {
8785 /* 8791 /*
8786 * force a rebalance 8792 * force a rebalance
8787 */ 8793 */
8788 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8794 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8789 set_se_shares(tg->se[i], shares); 8795 set_se_shares(tg->se[i], shares);
8790 } 8796 }
8791 8797
8792 /* 8798 /*
8793 * Enable load balance activity on this group, by inserting it back on 8799 * Enable load balance activity on this group, by inserting it back on
8794 * each cpu's rq->leaf_cfs_rq_list. 8800 * each cpu's rq->leaf_cfs_rq_list.
8795 */ 8801 */
8796 spin_lock_irqsave(&task_group_lock, flags); 8802 spin_lock_irqsave(&task_group_lock, flags);
8797 for_each_possible_cpu(i) 8803 for_each_possible_cpu(i)
8798 register_fair_sched_group(tg, i); 8804 register_fair_sched_group(tg, i);
8799 list_add_rcu(&tg->siblings, &tg->parent->children); 8805 list_add_rcu(&tg->siblings, &tg->parent->children);
8800 spin_unlock_irqrestore(&task_group_lock, flags); 8806 spin_unlock_irqrestore(&task_group_lock, flags);
8801 done: 8807 done:
8802 mutex_unlock(&shares_mutex); 8808 mutex_unlock(&shares_mutex);
8803 return 0; 8809 return 0;
8804 } 8810 }
8805 8811
8806 unsigned long sched_group_shares(struct task_group *tg) 8812 unsigned long sched_group_shares(struct task_group *tg)
8807 { 8813 {
8808 return tg->shares; 8814 return tg->shares;
8809 } 8815 }
8810 #endif 8816 #endif
8811 8817
8812 #ifdef CONFIG_RT_GROUP_SCHED 8818 #ifdef CONFIG_RT_GROUP_SCHED
8813 /* 8819 /*
8814 * Ensure that the real time constraints are schedulable. 8820 * Ensure that the real time constraints are schedulable.
8815 */ 8821 */
8816 static DEFINE_MUTEX(rt_constraints_mutex); 8822 static DEFINE_MUTEX(rt_constraints_mutex);
8817 8823
8818 static unsigned long to_ratio(u64 period, u64 runtime) 8824 static unsigned long to_ratio(u64 period, u64 runtime)
8819 { 8825 {
8820 if (runtime == RUNTIME_INF) 8826 if (runtime == RUNTIME_INF)
8821 return 1ULL << 20; 8827 return 1ULL << 20;
8822 8828
8823 return div64_u64(runtime << 20, period); 8829 return div64_u64(runtime << 20, period);
8824 } 8830 }
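to_ratio() expresses runtime/period as a fixed-point fraction with 20 fractional bits, so full utilisation maps to 1 << 20; RUNTIME_INF is simply treated as exactly full utilisation. A userspace sketch of the arithmetic (to_ratio_demo is not a kernel symbol):

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t to_ratio_demo(uint64_t period, uint64_t runtime)
    {
            return (runtime << 20) / period;        /* div64_u64() in the kernel */
    }

    int main(void)
    {
            /* 950 ms runtime per 1 s period -> ~0.95 * 2^20 = 996147 */
            printf("%llu of %llu\n",
                   (unsigned long long)to_ratio_demo(1000000000ULL, 950000000ULL),
                   1ULL << 20);
            return 0;
    }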
8825 8831
8826 /* Must be called with tasklist_lock held */ 8832 /* Must be called with tasklist_lock held */
8827 static inline int tg_has_rt_tasks(struct task_group *tg) 8833 static inline int tg_has_rt_tasks(struct task_group *tg)
8828 { 8834 {
8829 struct task_struct *g, *p; 8835 struct task_struct *g, *p;
8830 8836
8831 do_each_thread(g, p) { 8837 do_each_thread(g, p) {
8832 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8838 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8833 return 1; 8839 return 1;
8834 } while_each_thread(g, p); 8840 } while_each_thread(g, p);
8835 8841
8836 return 0; 8842 return 0;
8837 } 8843 }
8838 8844
8839 struct rt_schedulable_data { 8845 struct rt_schedulable_data {
8840 struct task_group *tg; 8846 struct task_group *tg;
8841 u64 rt_period; 8847 u64 rt_period;
8842 u64 rt_runtime; 8848 u64 rt_runtime;
8843 }; 8849 };
8844 8850
8845 static int tg_schedulable(struct task_group *tg, void *data) 8851 static int tg_schedulable(struct task_group *tg, void *data)
8846 { 8852 {
8847 struct rt_schedulable_data *d = data; 8853 struct rt_schedulable_data *d = data;
8848 struct task_group *child; 8854 struct task_group *child;
8849 unsigned long total, sum = 0; 8855 unsigned long total, sum = 0;
8850 u64 period, runtime; 8856 u64 period, runtime;
8851 8857
8852 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8858 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8853 runtime = tg->rt_bandwidth.rt_runtime; 8859 runtime = tg->rt_bandwidth.rt_runtime;
8854 8860
8855 if (tg == d->tg) { 8861 if (tg == d->tg) {
8856 period = d->rt_period; 8862 period = d->rt_period;
8857 runtime = d->rt_runtime; 8863 runtime = d->rt_runtime;
8858 } 8864 }
8859 8865
8860 /* 8866 /*
8861 * Cannot have more runtime than the period. 8867 * Cannot have more runtime than the period.
8862 */ 8868 */
8863 if (runtime > period && runtime != RUNTIME_INF) 8869 if (runtime > period && runtime != RUNTIME_INF)
8864 return -EINVAL; 8870 return -EINVAL;
8865 8871
8866 /* 8872 /*
8867 * Ensure we don't starve existing RT tasks. 8873 * Ensure we don't starve existing RT tasks.
8868 */ 8874 */
8869 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8875 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8870 return -EBUSY; 8876 return -EBUSY;
8871 8877
8872 total = to_ratio(period, runtime); 8878 total = to_ratio(period, runtime);
8873 8879
8874 /* 8880 /*
8875 * Nobody can have more than the global setting allows. 8881 * Nobody can have more than the global setting allows.
8876 */ 8882 */
8877 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8883 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8878 return -EINVAL; 8884 return -EINVAL;
8879 8885
8880 /* 8886 /*
8881 * The sum of our children's runtime should not exceed our own. 8887 * The sum of our children's runtime should not exceed our own.
8882 */ 8888 */
8883 list_for_each_entry_rcu(child, &tg->children, siblings) { 8889 list_for_each_entry_rcu(child, &tg->children, siblings) {
8884 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8890 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8885 runtime = child->rt_bandwidth.rt_runtime; 8891 runtime = child->rt_bandwidth.rt_runtime;
8886 8892
8887 if (child == d->tg) { 8893 if (child == d->tg) {
8888 period = d->rt_period; 8894 period = d->rt_period;
8889 runtime = d->rt_runtime; 8895 runtime = d->rt_runtime;
8890 } 8896 }
8891 8897
8892 sum += to_ratio(period, runtime); 8898 sum += to_ratio(period, runtime);
8893 } 8899 }
8894 8900
8895 if (sum > total) 8901 if (sum > total)
8896 return -EINVAL; 8902 return -EINVAL;
8897 8903
8898 return 0; 8904 return 0;
8899 } 8905 }
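tg_schedulable() enforces two invariants per group: its own ratio may not exceed the global limit, and its children's ratios may not sum past its own. A small sketch of the second check, under assumed example numbers:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t ratio(uint64_t period, uint64_t runtime)
    {
            return (runtime << 20) / period;
    }

    int main(void)
    {
            uint64_t total = ratio(1000000, 500000);        /* parent:  0.50 */
            uint64_t sum   = ratio(1000000, 300000)         /* child A: 0.30 */
                           + ratio(1000000, 250000);        /* child B: 0.25 */

            /* mirrors the "if (sum > total) return -EINVAL;" test above */
            printf("sum %s total -> %s\n", sum > total ? ">" : "<=",
                   sum > total ? "reject" : "admit");
            return 0;
    }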
8900 8906
8901 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8907 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8902 { 8908 {
8903 struct rt_schedulable_data data = { 8909 struct rt_schedulable_data data = {
8904 .tg = tg, 8910 .tg = tg,
8905 .rt_period = period, 8911 .rt_period = period,
8906 .rt_runtime = runtime, 8912 .rt_runtime = runtime,
8907 }; 8913 };
8908 8914
8909 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8915 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8910 } 8916 }
8911 8917
8912 static int tg_set_bandwidth(struct task_group *tg, 8918 static int tg_set_bandwidth(struct task_group *tg,
8913 u64 rt_period, u64 rt_runtime) 8919 u64 rt_period, u64 rt_runtime)
8914 { 8920 {
8915 int i, err = 0; 8921 int i, err = 0;
8916 8922
8917 mutex_lock(&rt_constraints_mutex); 8923 mutex_lock(&rt_constraints_mutex);
8918 read_lock(&tasklist_lock); 8924 read_lock(&tasklist_lock);
8919 err = __rt_schedulable(tg, rt_period, rt_runtime); 8925 err = __rt_schedulable(tg, rt_period, rt_runtime);
8920 if (err) 8926 if (err)
8921 goto unlock; 8927 goto unlock;
8922 8928
8923 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8929 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8924 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8930 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8925 tg->rt_bandwidth.rt_runtime = rt_runtime; 8931 tg->rt_bandwidth.rt_runtime = rt_runtime;
8926 8932
8927 for_each_possible_cpu(i) { 8933 for_each_possible_cpu(i) {
8928 struct rt_rq *rt_rq = tg->rt_rq[i]; 8934 struct rt_rq *rt_rq = tg->rt_rq[i];
8929 8935
8930 spin_lock(&rt_rq->rt_runtime_lock); 8936 spin_lock(&rt_rq->rt_runtime_lock);
8931 rt_rq->rt_runtime = rt_runtime; 8937 rt_rq->rt_runtime = rt_runtime;
8932 spin_unlock(&rt_rq->rt_runtime_lock); 8938 spin_unlock(&rt_rq->rt_runtime_lock);
8933 } 8939 }
8934 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8940 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8935 unlock: 8941 unlock:
8936 read_unlock(&tasklist_lock); 8942 read_unlock(&tasklist_lock);
8937 mutex_unlock(&rt_constraints_mutex); 8943 mutex_unlock(&rt_constraints_mutex);
8938 8944
8939 return err; 8945 return err;
8940 } 8946 }
8941 8947
8942 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8948 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8943 { 8949 {
8944 u64 rt_runtime, rt_period; 8950 u64 rt_runtime, rt_period;
8945 8951
8946 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8952 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8947 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8953 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8948 if (rt_runtime_us < 0) 8954 if (rt_runtime_us < 0)
8949 rt_runtime = RUNTIME_INF; 8955 rt_runtime = RUNTIME_INF;
8950 8956
8951 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8957 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8952 } 8958 }
8953 8959
8954 long sched_group_rt_runtime(struct task_group *tg) 8960 long sched_group_rt_runtime(struct task_group *tg)
8955 { 8961 {
8956 u64 rt_runtime_us; 8962 u64 rt_runtime_us;
8957 8963
8958 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8964 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8959 return -1; 8965 return -1;
8960 8966
8961 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8967 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8962 do_div(rt_runtime_us, NSEC_PER_USEC); 8968 do_div(rt_runtime_us, NSEC_PER_USEC);
8963 return rt_runtime_us; 8969 return rt_runtime_us;
8964 } 8970 }
8965 8971
8966 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8972 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8967 { 8973 {
8968 u64 rt_runtime, rt_period; 8974 u64 rt_runtime, rt_period;
8969 8975
8970 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8976 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8971 rt_runtime = tg->rt_bandwidth.rt_runtime; 8977 rt_runtime = tg->rt_bandwidth.rt_runtime;
8972 8978
8973 if (rt_period == 0) 8979 if (rt_period == 0)
8974 return -EINVAL; 8980 return -EINVAL;
8975 8981
8976 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8982 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8977 } 8983 }
8978 8984
8979 long sched_group_rt_period(struct task_group *tg) 8985 long sched_group_rt_period(struct task_group *tg)
8980 { 8986 {
8981 u64 rt_period_us; 8987 u64 rt_period_us;
8982 8988
8983 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8989 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8984 do_div(rt_period_us, NSEC_PER_USEC); 8990 do_div(rt_period_us, NSEC_PER_USEC);
8985 return rt_period_us; 8991 return rt_period_us;
8986 } 8992 }
8987 8993
8988 static int sched_rt_global_constraints(void) 8994 static int sched_rt_global_constraints(void)
8989 { 8995 {
8990 u64 runtime, period; 8996 u64 runtime, period;
8991 int ret = 0; 8997 int ret = 0;
8992 8998
8993 if (sysctl_sched_rt_period <= 0) 8999 if (sysctl_sched_rt_period <= 0)
8994 return -EINVAL; 9000 return -EINVAL;
8995 9001
8996 runtime = global_rt_runtime(); 9002 runtime = global_rt_runtime();
8997 period = global_rt_period(); 9003 period = global_rt_period();
8998 9004
8999 /* 9005 /*
9000 * Sanity check on the sysctl variables. 9006 * Sanity check on the sysctl variables.
9001 */ 9007 */
9002 if (runtime > period && runtime != RUNTIME_INF) 9008 if (runtime > period && runtime != RUNTIME_INF)
9003 return -EINVAL; 9009 return -EINVAL;
9004 9010
9005 mutex_lock(&rt_constraints_mutex); 9011 mutex_lock(&rt_constraints_mutex);
9006 read_lock(&tasklist_lock); 9012 read_lock(&tasklist_lock);
9007 ret = __rt_schedulable(NULL, 0, 0); 9013 ret = __rt_schedulable(NULL, 0, 0);
9008 read_unlock(&tasklist_lock); 9014 read_unlock(&tasklist_lock);
9009 mutex_unlock(&rt_constraints_mutex); 9015 mutex_unlock(&rt_constraints_mutex);
9010 9016
9011 return ret; 9017 return ret;
9012 } 9018 }
9013 #else /* !CONFIG_RT_GROUP_SCHED */ 9019 #else /* !CONFIG_RT_GROUP_SCHED */
9014 static int sched_rt_global_constraints(void) 9020 static int sched_rt_global_constraints(void)
9015 { 9021 {
9016 unsigned long flags; 9022 unsigned long flags;
9017 int i; 9023 int i;
9018 9024
9019 if (sysctl_sched_rt_period <= 0) 9025 if (sysctl_sched_rt_period <= 0)
9020 return -EINVAL; 9026 return -EINVAL;
9021 9027
9022 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9028 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9023 for_each_possible_cpu(i) { 9029 for_each_possible_cpu(i) {
9024 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9030 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9025 9031
9026 spin_lock(&rt_rq->rt_runtime_lock); 9032 spin_lock(&rt_rq->rt_runtime_lock);
9027 rt_rq->rt_runtime = global_rt_runtime(); 9033 rt_rq->rt_runtime = global_rt_runtime();
9028 spin_unlock(&rt_rq->rt_runtime_lock); 9034 spin_unlock(&rt_rq->rt_runtime_lock);
9029 } 9035 }
9030 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 9036 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
9031 9037
9032 return 0; 9038 return 0;
9033 } 9039 }
9034 #endif /* CONFIG_RT_GROUP_SCHED */ 9040 #endif /* CONFIG_RT_GROUP_SCHED */
9035 9041
9036 int sched_rt_handler(struct ctl_table *table, int write, 9042 int sched_rt_handler(struct ctl_table *table, int write,
9037 struct file *filp, void __user *buffer, size_t *lenp, 9043 struct file *filp, void __user *buffer, size_t *lenp,
9038 loff_t *ppos) 9044 loff_t *ppos)
9039 { 9045 {
9040 int ret; 9046 int ret;
9041 int old_period, old_runtime; 9047 int old_period, old_runtime;
9042 static DEFINE_MUTEX(mutex); 9048 static DEFINE_MUTEX(mutex);
9043 9049
9044 mutex_lock(&mutex); 9050 mutex_lock(&mutex);
9045 old_period = sysctl_sched_rt_period; 9051 old_period = sysctl_sched_rt_period;
9046 old_runtime = sysctl_sched_rt_runtime; 9052 old_runtime = sysctl_sched_rt_runtime;
9047 9053
9048 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 9054 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
9049 9055
9050 if (!ret && write) { 9056 if (!ret && write) {
9051 ret = sched_rt_global_constraints(); 9057 ret = sched_rt_global_constraints();
9052 if (ret) { 9058 if (ret) {
9053 sysctl_sched_rt_period = old_period; 9059 sysctl_sched_rt_period = old_period;
9054 sysctl_sched_rt_runtime = old_runtime; 9060 sysctl_sched_rt_runtime = old_runtime;
9055 } else { 9061 } else {
9056 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 9062 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9057 def_rt_bandwidth.rt_period = 9063 def_rt_bandwidth.rt_period =
9058 ns_to_ktime(global_rt_period()); 9064 ns_to_ktime(global_rt_period());
9059 } 9065 }
9060 } 9066 }
9061 mutex_unlock(&mutex); 9067 mutex_unlock(&mutex);
9062 9068
9063 return ret; 9069 return ret;
9064 } 9070 }
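sched_rt_handler() snapshots both sysctls before proc_dointvec() lands the write, then re-runs the global admission check and restores the snapshot when it fails, so an invalid write leaves the tunables untouched. A sketch of that snapshot-and-roll-back shape, with invented names standing in for the sysctls:

    #include <stdio.h>

    static int period = 1000000, runtime = 950000;  /* pretend sysctls */

    static int write_tunables(int new_period, int new_runtime)
    {
            int old_period = period, old_runtime = runtime;

            period = new_period;                    /* the write lands... */
            runtime = new_runtime;
            if (runtime > period) {                 /* ...then is validated */
                    period = old_period;            /* roll back on failure */
                    runtime = old_runtime;
                    return -1;
            }
            return 0;
    }

    int main(void)
    {
            /* rejected: runtime stays at its old value */
            printf("ret=%d runtime=%d\n", write_tunables(1000, 2000), runtime);
            return 0;
    }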
9065 9071
9066 #ifdef CONFIG_CGROUP_SCHED 9072 #ifdef CONFIG_CGROUP_SCHED
9067 9073
9068 /* return corresponding task_group object of a cgroup */ 9074 /* return corresponding task_group object of a cgroup */
9069 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 9075 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
9070 { 9076 {
9071 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 9077 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9072 struct task_group, css); 9078 struct task_group, css);
9073 } 9079 }
9074 9080
9075 static struct cgroup_subsys_state * 9081 static struct cgroup_subsys_state *
9076 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 9082 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
9077 { 9083 {
9078 struct task_group *tg, *parent; 9084 struct task_group *tg, *parent;
9079 9085
9080 if (!cgrp->parent) { 9086 if (!cgrp->parent) {
9081 /* This is early initialization for the top cgroup */ 9087 /* This is early initialization for the top cgroup */
9082 return &init_task_group.css; 9088 return &init_task_group.css;
9083 } 9089 }
9084 9090
9085 parent = cgroup_tg(cgrp->parent); 9091 parent = cgroup_tg(cgrp->parent);
9086 tg = sched_create_group(parent); 9092 tg = sched_create_group(parent);
9087 if (IS_ERR(tg)) 9093 if (IS_ERR(tg))
9088 return ERR_PTR(-ENOMEM); 9094 return ERR_PTR(-ENOMEM);
9089 9095
9090 return &tg->css; 9096 return &tg->css;
9091 } 9097 }
9092 9098
9093 static void 9099 static void
9094 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 9100 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9095 { 9101 {
9096 struct task_group *tg = cgroup_tg(cgrp); 9102 struct task_group *tg = cgroup_tg(cgrp);
9097 9103
9098 sched_destroy_group(tg); 9104 sched_destroy_group(tg);
9099 } 9105 }
9100 9106
9101 static int 9107 static int
9102 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9108 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9103 struct task_struct *tsk) 9109 struct task_struct *tsk)
9104 { 9110 {
9105 #ifdef CONFIG_RT_GROUP_SCHED 9111 #ifdef CONFIG_RT_GROUP_SCHED
9106 /* Don't accept realtime tasks when there is no way for them to run */ 9112 /* Don't accept realtime tasks when there is no way for them to run */
9107 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) 9113 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9108 return -EINVAL; 9114 return -EINVAL;
9109 #else 9115 #else
9110 /* We don't support RT-tasks being in separate groups */ 9116 /* We don't support RT-tasks being in separate groups */
9111 if (tsk->sched_class != &fair_sched_class) 9117 if (tsk->sched_class != &fair_sched_class)
9112 return -EINVAL; 9118 return -EINVAL;
9113 #endif 9119 #endif
9114 9120
9115 return 0; 9121 return 0;
9116 } 9122 }
9117 9123
9118 static void 9124 static void
9119 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 9125 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9120 struct cgroup *old_cont, struct task_struct *tsk) 9126 struct cgroup *old_cont, struct task_struct *tsk)
9121 { 9127 {
9122 sched_move_task(tsk); 9128 sched_move_task(tsk);
9123 } 9129 }
9124 9130
9125 #ifdef CONFIG_FAIR_GROUP_SCHED 9131 #ifdef CONFIG_FAIR_GROUP_SCHED
9126 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 9132 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9127 u64 shareval) 9133 u64 shareval)
9128 { 9134 {
9129 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 9135 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
9130 } 9136 }
9131 9137
9132 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 9138 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9133 { 9139 {
9134 struct task_group *tg = cgroup_tg(cgrp); 9140 struct task_group *tg = cgroup_tg(cgrp);
9135 9141
9136 return (u64) tg->shares; 9142 return (u64) tg->shares;
9137 } 9143 }
9138 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9144 #endif /* CONFIG_FAIR_GROUP_SCHED */
9139 9145
9140 #ifdef CONFIG_RT_GROUP_SCHED 9146 #ifdef CONFIG_RT_GROUP_SCHED
9141 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9147 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
9142 s64 val) 9148 s64 val)
9143 { 9149 {
9144 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 9150 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
9145 } 9151 }
9146 9152
9147 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 9153 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
9148 { 9154 {
9149 return sched_group_rt_runtime(cgroup_tg(cgrp)); 9155 return sched_group_rt_runtime(cgroup_tg(cgrp));
9150 } 9156 }
9151 9157
9152 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 9158 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9153 u64 rt_period_us) 9159 u64 rt_period_us)
9154 { 9160 {
9155 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 9161 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9156 } 9162 }
9157 9163
9158 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 9164 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9159 { 9165 {
9160 return sched_group_rt_period(cgroup_tg(cgrp)); 9166 return sched_group_rt_period(cgroup_tg(cgrp));
9161 } 9167 }
9162 #endif /* CONFIG_RT_GROUP_SCHED */ 9168 #endif /* CONFIG_RT_GROUP_SCHED */
9163 9169
9164 static struct cftype cpu_files[] = { 9170 static struct cftype cpu_files[] = {
9165 #ifdef CONFIG_FAIR_GROUP_SCHED 9171 #ifdef CONFIG_FAIR_GROUP_SCHED
9166 { 9172 {
9167 .name = "shares", 9173 .name = "shares",
9168 .read_u64 = cpu_shares_read_u64, 9174 .read_u64 = cpu_shares_read_u64,
9169 .write_u64 = cpu_shares_write_u64, 9175 .write_u64 = cpu_shares_write_u64,
9170 }, 9176 },
9171 #endif 9177 #endif
9172 #ifdef CONFIG_RT_GROUP_SCHED 9178 #ifdef CONFIG_RT_GROUP_SCHED
9173 { 9179 {
9174 .name = "rt_runtime_us", 9180 .name = "rt_runtime_us",
9175 .read_s64 = cpu_rt_runtime_read, 9181 .read_s64 = cpu_rt_runtime_read,
9176 .write_s64 = cpu_rt_runtime_write, 9182 .write_s64 = cpu_rt_runtime_write,
9177 }, 9183 },
9178 { 9184 {
9179 .name = "rt_period_us", 9185 .name = "rt_period_us",
9180 .read_u64 = cpu_rt_period_read_uint, 9186 .read_u64 = cpu_rt_period_read_uint,
9181 .write_u64 = cpu_rt_period_write_uint, 9187 .write_u64 = cpu_rt_period_write_uint,
9182 }, 9188 },
9183 #endif 9189 #endif
9184 }; 9190 };
9185 9191
9186 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 9192 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9187 { 9193 {
9188 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 9194 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9189 } 9195 }
9190 9196
9191 struct cgroup_subsys cpu_cgroup_subsys = { 9197 struct cgroup_subsys cpu_cgroup_subsys = {
9192 .name = "cpu", 9198 .name = "cpu",
9193 .create = cpu_cgroup_create, 9199 .create = cpu_cgroup_create,
9194 .destroy = cpu_cgroup_destroy, 9200 .destroy = cpu_cgroup_destroy,
9195 .can_attach = cpu_cgroup_can_attach, 9201 .can_attach = cpu_cgroup_can_attach,
9196 .attach = cpu_cgroup_attach, 9202 .attach = cpu_cgroup_attach,
9197 .populate = cpu_cgroup_populate, 9203 .populate = cpu_cgroup_populate,
9198 .subsys_id = cpu_cgroup_subsys_id, 9204 .subsys_id = cpu_cgroup_subsys_id,
9199 .early_init = 1, 9205 .early_init = 1,
9200 }; 9206 };
9201 9207
9202 #endif /* CONFIG_CGROUP_SCHED */ 9208 #endif /* CONFIG_CGROUP_SCHED */
9203 9209
9204 #ifdef CONFIG_CGROUP_CPUACCT 9210 #ifdef CONFIG_CGROUP_CPUACCT
9205 9211
9206 /* 9212 /*
9207 * CPU accounting code for task groups. 9213 * CPU accounting code for task groups.
9208 * 9214 *
9209 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 9215 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9210 * (balbir@in.ibm.com). 9216 * (balbir@in.ibm.com).
9211 */ 9217 */
9212 9218
9213 /* track cpu usage of a group of tasks and its child groups */ 9219 /* track cpu usage of a group of tasks and its child groups */
9214 struct cpuacct { 9220 struct cpuacct {
9215 struct cgroup_subsys_state css; 9221 struct cgroup_subsys_state css;
9216 /* cpuusage holds pointer to a u64-type object on every cpu */ 9222 /* cpuusage holds pointer to a u64-type object on every cpu */
9217 u64 *cpuusage; 9223 u64 *cpuusage;
9218 struct cpuacct *parent; 9224 struct cpuacct *parent;
9219 }; 9225 };
9220 9226
9221 struct cgroup_subsys cpuacct_subsys; 9227 struct cgroup_subsys cpuacct_subsys;
9222 9228
9223 /* return cpu accounting group corresponding to this container */ 9229 /* return cpu accounting group corresponding to this container */
9224 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 9230 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9225 { 9231 {
9226 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 9232 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9227 struct cpuacct, css); 9233 struct cpuacct, css);
9228 } 9234 }
9229 9235
9230 /* return cpu accounting group to which this task belongs */ 9236 /* return cpu accounting group to which this task belongs */
9231 static inline struct cpuacct *task_ca(struct task_struct *tsk) 9237 static inline struct cpuacct *task_ca(struct task_struct *tsk)
9232 { 9238 {
9233 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 9239 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9234 struct cpuacct, css); 9240 struct cpuacct, css);
9235 } 9241 }
9236 9242
9237 /* create a new cpu accounting group */ 9243 /* create a new cpu accounting group */
9238 static struct cgroup_subsys_state *cpuacct_create( 9244 static struct cgroup_subsys_state *cpuacct_create(
9239 struct cgroup_subsys *ss, struct cgroup *cgrp) 9245 struct cgroup_subsys *ss, struct cgroup *cgrp)
9240 { 9246 {
9241 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 9247 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9242 9248
9243 if (!ca) 9249 if (!ca)
9244 return ERR_PTR(-ENOMEM); 9250 return ERR_PTR(-ENOMEM);
9245 9251
9246 ca->cpuusage = alloc_percpu(u64); 9252 ca->cpuusage = alloc_percpu(u64);
9247 if (!ca->cpuusage) { 9253 if (!ca->cpuusage) {
9248 kfree(ca); 9254 kfree(ca);
9249 return ERR_PTR(-ENOMEM); 9255 return ERR_PTR(-ENOMEM);
9250 } 9256 }
9251 9257
9252 if (cgrp->parent) 9258 if (cgrp->parent)
9253 ca->parent = cgroup_ca(cgrp->parent); 9259 ca->parent = cgroup_ca(cgrp->parent);
9254 9260
9255 return &ca->css; 9261 return &ca->css;
9256 } 9262 }
9257 9263
9258 /* destroy an existing cpu accounting group */ 9264 /* destroy an existing cpu accounting group */
9259 static void 9265 static void
9260 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 9266 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9261 { 9267 {
9262 struct cpuacct *ca = cgroup_ca(cgrp); 9268 struct cpuacct *ca = cgroup_ca(cgrp);
9263 9269
9264 free_percpu(ca->cpuusage); 9270 free_percpu(ca->cpuusage);
9265 kfree(ca); 9271 kfree(ca);
9266 } 9272 }
9267 9273
9268 /* return total cpu usage (in nanoseconds) of a group */ 9274 /* return total cpu usage (in nanoseconds) of a group */
9269 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9275 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9270 { 9276 {
9271 struct cpuacct *ca = cgroup_ca(cgrp); 9277 struct cpuacct *ca = cgroup_ca(cgrp);
9272 u64 totalcpuusage = 0; 9278 u64 totalcpuusage = 0;
9273 int i; 9279 int i;
9274 9280
9275 for_each_possible_cpu(i) { 9281 for_each_possible_cpu(i) {
9276 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9282 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9277 9283
9278 /* 9284 /*
9279 * Take rq->lock to make 64-bit addition safe on 32-bit 9285 * Take rq->lock to make 64-bit addition safe on 32-bit
9280 * platforms. 9286 * platforms.
9281 */ 9287 */
9282 spin_lock_irq(&cpu_rq(i)->lock); 9288 spin_lock_irq(&cpu_rq(i)->lock);
9283 totalcpuusage += *cpuusage; 9289 totalcpuusage += *cpuusage;
9284 spin_unlock_irq(&cpu_rq(i)->lock); 9290 spin_unlock_irq(&cpu_rq(i)->lock);
9285 } 9291 }
9286 9292
9287 return totalcpuusage; 9293 return totalcpuusage;
9288 } 9294 }
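cpuusage_read() takes each rq->lock because on 32-bit machines a u64 access is two word-sized operations, so an unlocked reader could observe a torn value. A hedged userspace analogue, with a pthread mutex standing in for the runqueue lock (compile with -lpthread):

    #include <stdio.h>
    #include <stdint.h>
    #include <pthread.h>

    static uint64_t counter;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static uint64_t read_counter(void)
    {
            uint64_t v;

            pthread_mutex_lock(&lock);
            v = counter;            /* whole u64 read under the lock */
            pthread_mutex_unlock(&lock);
            return v;
    }

    int main(void)
    {
            pthread_mutex_lock(&lock);
            counter += 12345;       /* writer side, also under the lock */
            pthread_mutex_unlock(&lock);
            printf("%llu\n", (unsigned long long)read_counter());
            return 0;
    }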
9289 9295
9290 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 9296 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9291 u64 reset) 9297 u64 reset)
9292 { 9298 {
9293 struct cpuacct *ca = cgroup_ca(cgrp); 9299 struct cpuacct *ca = cgroup_ca(cgrp);
9294 int err = 0; 9300 int err = 0;
9295 int i; 9301 int i;
9296 9302
9297 if (reset) { 9303 if (reset) {
9298 err = -EINVAL; 9304 err = -EINVAL;
9299 goto out; 9305 goto out;
9300 } 9306 }
9301 9307
9302 for_each_possible_cpu(i) { 9308 for_each_possible_cpu(i) {
9303 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9309 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
9304 9310
9305 spin_lock_irq(&cpu_rq(i)->lock); 9311 spin_lock_irq(&cpu_rq(i)->lock);
9306 *cpuusage = 0; 9312 *cpuusage = 0;
9307 spin_unlock_irq(&cpu_rq(i)->lock); 9313 spin_unlock_irq(&cpu_rq(i)->lock);
9308 } 9314 }
9309 out: 9315 out:
9310 return err; 9316 return err;
9311 } 9317 }
9312 9318
9313 static struct cftype files[] = { 9319 static struct cftype files[] = {
9314 { 9320 {
9315 .name = "usage", 9321 .name = "usage",
9316 .read_u64 = cpuusage_read, 9322 .read_u64 = cpuusage_read,
9317 .write_u64 = cpuusage_write, 9323 .write_u64 = cpuusage_write,
9318 }, 9324 },
9319 }; 9325 };
9320 9326
9321 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9327 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9322 { 9328 {
9323 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 9329 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9324 } 9330 }
9325 9331
9326 /* 9332 /*
9327 * charge this task's execution time to its accounting group. 9333 * charge this task's execution time to its accounting group.
9328 * 9334 *
9329 * called with rq->lock held. 9335 * called with rq->lock held.
9330 */ 9336 */
9331 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9337 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9332 { 9338 {
9333 struct cpuacct *ca; 9339 struct cpuacct *ca;
9334 int cpu; 9340 int cpu;
9335 9341
9336 if (!cpuacct_subsys.active) 9342 if (!cpuacct_subsys.active)
9337 return; 9343 return;
9338 9344
9339 cpu = task_cpu(tsk); 9345 cpu = task_cpu(tsk);
9340 ca = task_ca(tsk); 9346 ca = task_ca(tsk);
9341 9347
9342 for (; ca; ca = ca->parent) { 9348 for (; ca; ca = ca->parent) {
9343 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9349 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9344 *cpuusage += cputime; 9350 *cpuusage += cputime;
9345 } 9351 }
9346 } 9352 }
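cpuacct_charge() walks ca->parent so that time charged to a task's group also lands in every ancestor; a parent's counter is therefore never less than what any one child contributed. A small sketch of that walk, with invented names (struct group, charge):

    #include <stdio.h>
    #include <stdint.h>

    struct group {
            const char *name;
            uint64_t usage;
            struct group *parent;
    };

    static void charge(struct group *g, uint64_t ns)
    {
            for (; g; g = g->parent)        /* same loop shape as above */
                    g->usage += ns;
    }

    int main(void)
    {
            struct group root  = { "root",  0, NULL  };
            struct group child = { "child", 0, &root };

            charge(&child, 1000);   /* task in "child" ran for 1000 ns */
            printf("%s=%llu %s=%llu\n",
                   child.name, (unsigned long long)child.usage,
                   root.name,  (unsigned long long)root.usage);
            return 0;
    }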
9347 9353
9348 struct cgroup_subsys cpuacct_subsys = { 9354 struct cgroup_subsys cpuacct_subsys = {
9349 .name = "cpuacct", 9355 .name = "cpuacct",
9350 .create = cpuacct_create, 9356 .create = cpuacct_create,
9351 .destroy = cpuacct_destroy, 9357 .destroy = cpuacct_destroy,
9352 .populate = cpuacct_populate, 9358 .populate = cpuacct_populate,
9353 .subsys_id = cpuacct_subsys_id, 9359 .subsys_id = cpuacct_subsys_id,
9354 }; 9360 };
9355 #endif /* CONFIG_CGROUP_CPUACCT */ 9361 #endif /* CONFIG_CGROUP_CPUACCT */
9356 9362