Commit 4301065920b0cbde3986519582347e883b166f3e

Authored by Peter Williams
Committed by Ingo Molnar
1 parent f1a438d813

sched: simplify move_tasks()

The move_tasks() function currently multiplexes two distinct
capabilities:

1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.

The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both cases the return value of
move_tasks() is used purely to decide whether any tasks/load were moved;
the actual number of tasks moved is not taken into account.

The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task; as before,
the return value is used only as an indicator of success or failure.
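
For reference, the shape of the old interface described above can be
sketched as follows.  This is a standalone illustration written for this
changelog, not the pre-patch source; the parameter names and types are
assumptions.

struct rq;
struct sched_domain;

/*
 * Old, multiplexed form (assumed signature): one function takes both a
 * task-count cap and a weighted-load cap, and its "number of tasks moved"
 * return value is only ever tested against zero by its callers.
 */
int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
               unsigned long max_nr_move, unsigned long max_load_move,
               struct sched_domain *sd, int idle, int *all_pinned);

/*
 * load_balance()/load_balance_newidle(): pass the load target and an
 * effectively unlimited task count, then only check for a non-zero result.
 * active_load_balance(): pass max_nr_move == 1 with an unlimited load
 * target, again treating the result as a simple success/failure flag.
 */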

This multiplexing of move_tasks() was introduced by me as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move a specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()).  However, the modular design
of the new CFS scheduler allows a simpler solution, which this patch
adopts by:

1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single-purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure
(see the sketch below).
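
The split might look roughly like the sketch below.  It is hand-written
from the description in this changelog rather than copied from the patch;
in particular the parameter lists, the sched_class_highest/->next
iteration and the ULONG_MAX "no limit" sentinel are assumptions.

#include <limits.h>

struct rq;
struct sched_domain;
enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_MAX_IDLE_TYPES };

struct sched_class {
        struct sched_class *next;
        unsigned long (*load_balance)(struct rq *this_rq, int this_cpu,
                                      struct rq *busiest,
                                      unsigned long max_nr_move,
                                      unsigned long max_load_move,
                                      struct sched_domain *sd,
                                      enum cpu_idle_type idle,
                                      int *all_pinned);
};

extern struct sched_class *sched_class_highest;

/* Try to move up to max_load_move weighted load; 1 on success, 0 on failure. */
int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
               unsigned long max_load_move, struct sched_domain *sd,
               enum cpu_idle_type idle, int *all_pinned)
{
        unsigned long total_load_moved = 0;
        struct sched_class *class = sched_class_highest;

        do {
                total_load_moved +=
                        class->load_balance(this_rq, this_cpu, busiest,
                                            ULONG_MAX,
                                            max_load_move - total_load_moved,
                                            sd, idle, all_pinned);
                class = class->next;
        } while (class && max_load_move > total_load_moved);

        return total_load_moved > 0;
}

/* Try to move exactly one task, for active_load_balance(). */
int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  struct sched_domain *sd, enum cpu_idle_type idle)
{
        struct sched_class *class;

        for (class = sched_class_highest; class; class = class->next)
                if (class->load_balance(this_rq, this_cpu, busiest,
                                        1, ULONG_MAX, sd, idle, NULL) > 0)
                        return 1;

        return 0;
}

Because each class reports back how much weighted load it actually moved
(next paragraph), move_tasks() can simply accumulate the per-class results
until the requested amount has been reached.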

One consequence of these changes is that neither move_one_task() nor
the new move_tasks() cares how many tasks sched_class.load_balance()
moves, and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list.  This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
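
Expressed as function-pointer types, the interface change amounts to
something like the following.  Again, this is a reconstruction from the
text above, not the actual header diff; the typedef names are purely for
illustration and the exact argument list is an assumption.

struct rq;
struct sched_domain;

/* Before: the amount of weighted load moved came back via an out-parameter. */
typedef int (*load_balance_old_t)(struct rq *this_rq, int this_cpu,
                                  struct rq *busiest,
                                  unsigned long max_nr_move,
                                  unsigned long max_load_move,
                                  struct sched_domain *sd, int idle,
                                  int *all_pinned,
                                  unsigned long *load_moved);

/* After: the moved weighted load is simply the return value. */
typedef unsigned long (*load_balance_new_t)(struct rq *this_rq, int this_cpu,
                                            struct rq *busiest,
                                            unsigned long max_nr_move,
                                            unsigned long max_load_move,
                                            struct sched_domain *sd, int idle,
                                            int *all_pinned);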

Further simplifications, e.g. changes to balance_tasks(), are possible
but are (slightly) complicated by the special needs of
load_balance_fair(), so I've left them to a later patch (if this one
gets accepted).

NB: Since move_tasks() gets called with two run queue locks held, even
small reductions in overhead are worthwhile.

[ mingo@elte.hu ]

this change also reduces code size nicely:

   text    data     bss     dec     hex filename
   39216    3618      24   42858    a76a sched.o.before
   39173    3618      24   42815    a73f sched.o.after

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 58 additions and 49 deletions

include/linux/sched.h
1 #ifndef _LINUX_SCHED_H 1 #ifndef _LINUX_SCHED_H
2 #define _LINUX_SCHED_H 2 #define _LINUX_SCHED_H
3 3
4 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ 4 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
5 5
6 /* 6 /*
7 * cloning flags: 7 * cloning flags:
8 */ 8 */
9 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ 9 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
10 #define CLONE_VM 0x00000100 /* set if VM shared between processes */ 10 #define CLONE_VM 0x00000100 /* set if VM shared between processes */
11 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ 11 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */
12 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ 12 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */
13 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ 13 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
14 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ 14 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
15 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ 15 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
16 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ 16 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
17 #define CLONE_THREAD 0x00010000 /* Same thread group? */ 17 #define CLONE_THREAD 0x00010000 /* Same thread group? */
18 #define CLONE_NEWNS 0x00020000 /* New namespace group? */ 18 #define CLONE_NEWNS 0x00020000 /* New namespace group? */
19 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ 19 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
20 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ 20 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
21 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ 21 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
22 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ 22 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
23 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 23 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */
24 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 24 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
25 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 25 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
26 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ 26 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */
27 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 27 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
28 #define CLONE_NEWIPC 0x08000000 /* New ipcs */ 28 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
29 #define CLONE_NEWUSER 0x10000000 /* New user namespace */ 29 #define CLONE_NEWUSER 0x10000000 /* New user namespace */
30 30
31 /* 31 /*
32 * Scheduling policies 32 * Scheduling policies
33 */ 33 */
34 #define SCHED_NORMAL 0 34 #define SCHED_NORMAL 0
35 #define SCHED_FIFO 1 35 #define SCHED_FIFO 1
36 #define SCHED_RR 2 36 #define SCHED_RR 2
37 #define SCHED_BATCH 3 37 #define SCHED_BATCH 3
38 /* SCHED_ISO: reserved but not implemented yet */ 38 /* SCHED_ISO: reserved but not implemented yet */
39 #define SCHED_IDLE 5 39 #define SCHED_IDLE 5
40 40
41 #ifdef __KERNEL__ 41 #ifdef __KERNEL__
42 42
43 struct sched_param { 43 struct sched_param {
44 int sched_priority; 44 int sched_priority;
45 }; 45 };
46 46
47 #include <asm/param.h> /* for HZ */ 47 #include <asm/param.h> /* for HZ */
48 48
49 #include <linux/capability.h> 49 #include <linux/capability.h>
50 #include <linux/threads.h> 50 #include <linux/threads.h>
51 #include <linux/kernel.h> 51 #include <linux/kernel.h>
52 #include <linux/types.h> 52 #include <linux/types.h>
53 #include <linux/timex.h> 53 #include <linux/timex.h>
54 #include <linux/jiffies.h> 54 #include <linux/jiffies.h>
55 #include <linux/rbtree.h> 55 #include <linux/rbtree.h>
56 #include <linux/thread_info.h> 56 #include <linux/thread_info.h>
57 #include <linux/cpumask.h> 57 #include <linux/cpumask.h>
58 #include <linux/errno.h> 58 #include <linux/errno.h>
59 #include <linux/nodemask.h> 59 #include <linux/nodemask.h>
60 60
61 #include <asm/system.h> 61 #include <asm/system.h>
62 #include <asm/semaphore.h> 62 #include <asm/semaphore.h>
63 #include <asm/page.h> 63 #include <asm/page.h>
64 #include <asm/ptrace.h> 64 #include <asm/ptrace.h>
65 #include <asm/mmu.h> 65 #include <asm/mmu.h>
66 #include <asm/cputime.h> 66 #include <asm/cputime.h>
67 67
68 #include <linux/smp.h> 68 #include <linux/smp.h>
69 #include <linux/sem.h> 69 #include <linux/sem.h>
70 #include <linux/signal.h> 70 #include <linux/signal.h>
71 #include <linux/securebits.h> 71 #include <linux/securebits.h>
72 #include <linux/fs_struct.h> 72 #include <linux/fs_struct.h>
73 #include <linux/compiler.h> 73 #include <linux/compiler.h>
74 #include <linux/completion.h> 74 #include <linux/completion.h>
75 #include <linux/pid.h> 75 #include <linux/pid.h>
76 #include <linux/percpu.h> 76 #include <linux/percpu.h>
77 #include <linux/topology.h> 77 #include <linux/topology.h>
78 #include <linux/seccomp.h> 78 #include <linux/seccomp.h>
79 #include <linux/rcupdate.h> 79 #include <linux/rcupdate.h>
80 #include <linux/futex.h> 80 #include <linux/futex.h>
81 #include <linux/rtmutex.h> 81 #include <linux/rtmutex.h>
82 82
83 #include <linux/time.h> 83 #include <linux/time.h>
84 #include <linux/param.h> 84 #include <linux/param.h>
85 #include <linux/resource.h> 85 #include <linux/resource.h>
86 #include <linux/timer.h> 86 #include <linux/timer.h>
87 #include <linux/hrtimer.h> 87 #include <linux/hrtimer.h>
88 #include <linux/task_io_accounting.h> 88 #include <linux/task_io_accounting.h>
89 89
90 #include <asm/processor.h> 90 #include <asm/processor.h>
91 91
92 struct exec_domain; 92 struct exec_domain;
93 struct futex_pi_state; 93 struct futex_pi_state;
94 struct bio; 94 struct bio;
95 95
96 /* 96 /*
97 * List of flags we want to share for kernel threads, 97 * List of flags we want to share for kernel threads,
98 * if only because they are not used by them anyway. 98 * if only because they are not used by them anyway.
99 */ 99 */
100 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) 100 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
101 101
102 /* 102 /*
103 * These are the constant used to fake the fixed-point load-average 103 * These are the constant used to fake the fixed-point load-average
104 * counting. Some notes: 104 * counting. Some notes:
105 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 105 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
106 * a load-average precision of 10 bits integer + 11 bits fractional 106 * a load-average precision of 10 bits integer + 11 bits fractional
107 * - if you want to count load-averages more often, you need more 107 * - if you want to count load-averages more often, you need more
108 * precision, or rounding will get you. With 2-second counting freq, 108 * precision, or rounding will get you. With 2-second counting freq,
109 * the EXP_n values would be 1981, 2034 and 2043 if still using only 109 * the EXP_n values would be 1981, 2034 and 2043 if still using only
110 * 11 bit fractions. 110 * 11 bit fractions.
111 */ 111 */
112 extern unsigned long avenrun[]; /* Load averages */ 112 extern unsigned long avenrun[]; /* Load averages */
113 113
114 #define FSHIFT 11 /* nr of bits of precision */ 114 #define FSHIFT 11 /* nr of bits of precision */
115 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 115 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
116 #define LOAD_FREQ (5*HZ) /* 5 sec intervals */ 116 #define LOAD_FREQ (5*HZ) /* 5 sec intervals */
117 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ 117 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
118 #define EXP_5 2014 /* 1/exp(5sec/5min) */ 118 #define EXP_5 2014 /* 1/exp(5sec/5min) */
119 #define EXP_15 2037 /* 1/exp(5sec/15min) */ 119 #define EXP_15 2037 /* 1/exp(5sec/15min) */
120 120
121 #define CALC_LOAD(load,exp,n) \ 121 #define CALC_LOAD(load,exp,n) \
122 load *= exp; \ 122 load *= exp; \
123 load += n*(FIXED_1-exp); \ 123 load += n*(FIXED_1-exp); \
124 load >>= FSHIFT; 124 load >>= FSHIFT;
125 125
126 extern unsigned long total_forks; 126 extern unsigned long total_forks;
127 extern int nr_threads; 127 extern int nr_threads;
128 DECLARE_PER_CPU(unsigned long, process_counts); 128 DECLARE_PER_CPU(unsigned long, process_counts);
129 extern int nr_processes(void); 129 extern int nr_processes(void);
130 extern unsigned long nr_running(void); 130 extern unsigned long nr_running(void);
131 extern unsigned long nr_uninterruptible(void); 131 extern unsigned long nr_uninterruptible(void);
132 extern unsigned long nr_active(void); 132 extern unsigned long nr_active(void);
133 extern unsigned long nr_iowait(void); 133 extern unsigned long nr_iowait(void);
134 extern unsigned long weighted_cpuload(const int cpu); 134 extern unsigned long weighted_cpuload(const int cpu);
135 135
136 struct seq_file; 136 struct seq_file;
137 struct cfs_rq; 137 struct cfs_rq;
138 #ifdef CONFIG_SCHED_DEBUG 138 #ifdef CONFIG_SCHED_DEBUG
139 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 139 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
140 extern void proc_sched_set_task(struct task_struct *p); 140 extern void proc_sched_set_task(struct task_struct *p);
141 extern void 141 extern void
142 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); 142 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
143 #else 143 #else
144 static inline void 144 static inline void
145 proc_sched_show_task(struct task_struct *p, struct seq_file *m) 145 proc_sched_show_task(struct task_struct *p, struct seq_file *m)
146 { 146 {
147 } 147 }
148 static inline void proc_sched_set_task(struct task_struct *p) 148 static inline void proc_sched_set_task(struct task_struct *p)
149 { 149 {
150 } 150 }
151 static inline void 151 static inline void
152 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) 152 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
153 { 153 {
154 } 154 }
155 #endif 155 #endif
156 156
157 /* 157 /*
158 * Task state bitmask. NOTE! These bits are also 158 * Task state bitmask. NOTE! These bits are also
159 * encoded in fs/proc/array.c: get_task_state(). 159 * encoded in fs/proc/array.c: get_task_state().
160 * 160 *
161 * We have two separate sets of flags: task->state 161 * We have two separate sets of flags: task->state
162 * is about runnability, while task->exit_state are 162 * is about runnability, while task->exit_state are
163 * about the task exiting. Confusing, but this way 163 * about the task exiting. Confusing, but this way
164 * modifying one set can't modify the other one by 164 * modifying one set can't modify the other one by
165 * mistake. 165 * mistake.
166 */ 166 */
167 #define TASK_RUNNING 0 167 #define TASK_RUNNING 0
168 #define TASK_INTERRUPTIBLE 1 168 #define TASK_INTERRUPTIBLE 1
169 #define TASK_UNINTERRUPTIBLE 2 169 #define TASK_UNINTERRUPTIBLE 2
170 #define TASK_STOPPED 4 170 #define TASK_STOPPED 4
171 #define TASK_TRACED 8 171 #define TASK_TRACED 8
172 /* in tsk->exit_state */ 172 /* in tsk->exit_state */
173 #define EXIT_ZOMBIE 16 173 #define EXIT_ZOMBIE 16
174 #define EXIT_DEAD 32 174 #define EXIT_DEAD 32
175 /* in tsk->state again */ 175 /* in tsk->state again */
176 #define TASK_NONINTERACTIVE 64 176 #define TASK_NONINTERACTIVE 64
177 #define TASK_DEAD 128 177 #define TASK_DEAD 128
178 178
179 #define __set_task_state(tsk, state_value) \ 179 #define __set_task_state(tsk, state_value) \
180 do { (tsk)->state = (state_value); } while (0) 180 do { (tsk)->state = (state_value); } while (0)
181 #define set_task_state(tsk, state_value) \ 181 #define set_task_state(tsk, state_value) \
182 set_mb((tsk)->state, (state_value)) 182 set_mb((tsk)->state, (state_value))
183 183
184 /* 184 /*
185 * set_current_state() includes a barrier so that the write of current->state 185 * set_current_state() includes a barrier so that the write of current->state
186 * is correctly serialised wrt the caller's subsequent test of whether to 186 * is correctly serialised wrt the caller's subsequent test of whether to
187 * actually sleep: 187 * actually sleep:
188 * 188 *
189 * set_current_state(TASK_UNINTERRUPTIBLE); 189 * set_current_state(TASK_UNINTERRUPTIBLE);
190 * if (do_i_need_to_sleep()) 190 * if (do_i_need_to_sleep())
191 * schedule(); 191 * schedule();
192 * 192 *
193 * If the caller does not need such serialisation then use __set_current_state() 193 * If the caller does not need such serialisation then use __set_current_state()
194 */ 194 */
195 #define __set_current_state(state_value) \ 195 #define __set_current_state(state_value) \
196 do { current->state = (state_value); } while (0) 196 do { current->state = (state_value); } while (0)
197 #define set_current_state(state_value) \ 197 #define set_current_state(state_value) \
198 set_mb(current->state, (state_value)) 198 set_mb(current->state, (state_value))
199 199
200 /* Task command name length */ 200 /* Task command name length */
201 #define TASK_COMM_LEN 16 201 #define TASK_COMM_LEN 16
202 202
203 #include <linux/spinlock.h> 203 #include <linux/spinlock.h>
204 204
205 /* 205 /*
206 * This serializes "schedule()" and also protects 206 * This serializes "schedule()" and also protects
207 * the run-queue from deletions/modifications (but 207 * the run-queue from deletions/modifications (but
208 * _adding_ to the beginning of the run-queue has 208 * _adding_ to the beginning of the run-queue has
209 * a separate lock). 209 * a separate lock).
210 */ 210 */
211 extern rwlock_t tasklist_lock; 211 extern rwlock_t tasklist_lock;
212 extern spinlock_t mmlist_lock; 212 extern spinlock_t mmlist_lock;
213 213
214 struct task_struct; 214 struct task_struct;
215 215
216 extern void sched_init(void); 216 extern void sched_init(void);
217 extern void sched_init_smp(void); 217 extern void sched_init_smp(void);
218 extern void init_idle(struct task_struct *idle, int cpu); 218 extern void init_idle(struct task_struct *idle, int cpu);
219 extern void init_idle_bootup_task(struct task_struct *idle); 219 extern void init_idle_bootup_task(struct task_struct *idle);
220 220
221 extern cpumask_t nohz_cpu_mask; 221 extern cpumask_t nohz_cpu_mask;
222 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 222 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
223 extern int select_nohz_load_balancer(int cpu); 223 extern int select_nohz_load_balancer(int cpu);
224 #else 224 #else
225 static inline int select_nohz_load_balancer(int cpu) 225 static inline int select_nohz_load_balancer(int cpu)
226 { 226 {
227 return 0; 227 return 0;
228 } 228 }
229 #endif 229 #endif
230 230
231 /* 231 /*
232 * Only dump TASK_* tasks. (0 for all tasks) 232 * Only dump TASK_* tasks. (0 for all tasks)
233 */ 233 */
234 extern void show_state_filter(unsigned long state_filter); 234 extern void show_state_filter(unsigned long state_filter);
235 235
236 static inline void show_state(void) 236 static inline void show_state(void)
237 { 237 {
238 show_state_filter(0); 238 show_state_filter(0);
239 } 239 }
240 240
241 extern void show_regs(struct pt_regs *); 241 extern void show_regs(struct pt_regs *);
242 242
243 /* 243 /*
244 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current 244 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
245 * task), SP is the stack pointer of the first frame that should be shown in the back 245 * task), SP is the stack pointer of the first frame that should be shown in the back
246 * trace (or NULL if the entire call-chain of the task should be shown). 246 * trace (or NULL if the entire call-chain of the task should be shown).
247 */ 247 */
248 extern void show_stack(struct task_struct *task, unsigned long *sp); 248 extern void show_stack(struct task_struct *task, unsigned long *sp);
249 249
250 void io_schedule(void); 250 void io_schedule(void);
251 long io_schedule_timeout(long timeout); 251 long io_schedule_timeout(long timeout);
252 252
253 extern void cpu_init (void); 253 extern void cpu_init (void);
254 extern void trap_init(void); 254 extern void trap_init(void);
255 extern void update_process_times(int user); 255 extern void update_process_times(int user);
256 extern void scheduler_tick(void); 256 extern void scheduler_tick(void);
257 257
258 #ifdef CONFIG_DETECT_SOFTLOCKUP 258 #ifdef CONFIG_DETECT_SOFTLOCKUP
259 extern void softlockup_tick(void); 259 extern void softlockup_tick(void);
260 extern void spawn_softlockup_task(void); 260 extern void spawn_softlockup_task(void);
261 extern void touch_softlockup_watchdog(void); 261 extern void touch_softlockup_watchdog(void);
262 extern void touch_all_softlockup_watchdogs(void); 262 extern void touch_all_softlockup_watchdogs(void);
263 #else 263 #else
264 static inline void softlockup_tick(void) 264 static inline void softlockup_tick(void)
265 { 265 {
266 } 266 }
267 static inline void spawn_softlockup_task(void) 267 static inline void spawn_softlockup_task(void)
268 { 268 {
269 } 269 }
270 static inline void touch_softlockup_watchdog(void) 270 static inline void touch_softlockup_watchdog(void)
271 { 271 {
272 } 272 }
273 static inline void touch_all_softlockup_watchdogs(void) 273 static inline void touch_all_softlockup_watchdogs(void)
274 { 274 {
275 } 275 }
276 #endif 276 #endif
277 277
278 278
279 /* Attach to any functions which should be ignored in wchan output. */ 279 /* Attach to any functions which should be ignored in wchan output. */
280 #define __sched __attribute__((__section__(".sched.text"))) 280 #define __sched __attribute__((__section__(".sched.text")))
281 /* Is this address in the __sched functions? */ 281 /* Is this address in the __sched functions? */
282 extern int in_sched_functions(unsigned long addr); 282 extern int in_sched_functions(unsigned long addr);
283 283
284 #define MAX_SCHEDULE_TIMEOUT LONG_MAX 284 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
285 extern signed long FASTCALL(schedule_timeout(signed long timeout)); 285 extern signed long FASTCALL(schedule_timeout(signed long timeout));
286 extern signed long schedule_timeout_interruptible(signed long timeout); 286 extern signed long schedule_timeout_interruptible(signed long timeout);
287 extern signed long schedule_timeout_uninterruptible(signed long timeout); 287 extern signed long schedule_timeout_uninterruptible(signed long timeout);
288 asmlinkage void schedule(void); 288 asmlinkage void schedule(void);
289 289
290 struct nsproxy; 290 struct nsproxy;
291 struct user_namespace; 291 struct user_namespace;
292 292
293 /* Maximum number of active map areas.. This is a random (large) number */ 293 /* Maximum number of active map areas.. This is a random (large) number */
294 #define DEFAULT_MAX_MAP_COUNT 65536 294 #define DEFAULT_MAX_MAP_COUNT 65536
295 295
296 extern int sysctl_max_map_count; 296 extern int sysctl_max_map_count;
297 297
298 #include <linux/aio.h> 298 #include <linux/aio.h>
299 299
300 extern unsigned long 300 extern unsigned long
301 arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 301 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
302 unsigned long, unsigned long); 302 unsigned long, unsigned long);
303 extern unsigned long 303 extern unsigned long
304 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 304 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
305 unsigned long len, unsigned long pgoff, 305 unsigned long len, unsigned long pgoff,
306 unsigned long flags); 306 unsigned long flags);
307 extern void arch_unmap_area(struct mm_struct *, unsigned long); 307 extern void arch_unmap_area(struct mm_struct *, unsigned long);
308 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 308 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
309 309
310 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 310 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
311 /* 311 /*
312 * The mm counters are not protected by its page_table_lock, 312 * The mm counters are not protected by its page_table_lock,
313 * so must be incremented atomically. 313 * so must be incremented atomically.
314 */ 314 */
315 #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) 315 #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
316 #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) 316 #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
317 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) 317 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
318 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) 318 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
319 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) 319 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
320 typedef atomic_long_t mm_counter_t; 320 typedef atomic_long_t mm_counter_t;
321 321
322 #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 322 #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
323 /* 323 /*
324 * The mm counters are protected by its page_table_lock, 324 * The mm counters are protected by its page_table_lock,
325 * so can be incremented directly. 325 * so can be incremented directly.
326 */ 326 */
327 #define set_mm_counter(mm, member, value) (mm)->_##member = (value) 327 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
328 #define get_mm_counter(mm, member) ((mm)->_##member) 328 #define get_mm_counter(mm, member) ((mm)->_##member)
329 #define add_mm_counter(mm, member, value) (mm)->_##member += (value) 329 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
330 #define inc_mm_counter(mm, member) (mm)->_##member++ 330 #define inc_mm_counter(mm, member) (mm)->_##member++
331 #define dec_mm_counter(mm, member) (mm)->_##member-- 331 #define dec_mm_counter(mm, member) (mm)->_##member--
332 typedef unsigned long mm_counter_t; 332 typedef unsigned long mm_counter_t;
333 333
334 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 334 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
335 335
336 #define get_mm_rss(mm) \ 336 #define get_mm_rss(mm) \
337 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) 337 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
338 #define update_hiwater_rss(mm) do { \ 338 #define update_hiwater_rss(mm) do { \
339 unsigned long _rss = get_mm_rss(mm); \ 339 unsigned long _rss = get_mm_rss(mm); \
340 if ((mm)->hiwater_rss < _rss) \ 340 if ((mm)->hiwater_rss < _rss) \
341 (mm)->hiwater_rss = _rss; \ 341 (mm)->hiwater_rss = _rss; \
342 } while (0) 342 } while (0)
343 #define update_hiwater_vm(mm) do { \ 343 #define update_hiwater_vm(mm) do { \
344 if ((mm)->hiwater_vm < (mm)->total_vm) \ 344 if ((mm)->hiwater_vm < (mm)->total_vm) \
345 (mm)->hiwater_vm = (mm)->total_vm; \ 345 (mm)->hiwater_vm = (mm)->total_vm; \
346 } while (0) 346 } while (0)
347 347
348 extern void set_dumpable(struct mm_struct *mm, int value); 348 extern void set_dumpable(struct mm_struct *mm, int value);
349 extern int get_dumpable(struct mm_struct *mm); 349 extern int get_dumpable(struct mm_struct *mm);
350 350
351 /* mm flags */ 351 /* mm flags */
352 /* dumpable bits */ 352 /* dumpable bits */
353 #define MMF_DUMPABLE 0 /* core dump is permitted */ 353 #define MMF_DUMPABLE 0 /* core dump is permitted */
354 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ 354 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
355 #define MMF_DUMPABLE_BITS 2 355 #define MMF_DUMPABLE_BITS 2
356 356
357 /* coredump filter bits */ 357 /* coredump filter bits */
358 #define MMF_DUMP_ANON_PRIVATE 2 358 #define MMF_DUMP_ANON_PRIVATE 2
359 #define MMF_DUMP_ANON_SHARED 3 359 #define MMF_DUMP_ANON_SHARED 3
360 #define MMF_DUMP_MAPPED_PRIVATE 4 360 #define MMF_DUMP_MAPPED_PRIVATE 4
361 #define MMF_DUMP_MAPPED_SHARED 5 361 #define MMF_DUMP_MAPPED_SHARED 5
362 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS 362 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
363 #define MMF_DUMP_FILTER_BITS 4 363 #define MMF_DUMP_FILTER_BITS 4
364 #define MMF_DUMP_FILTER_MASK \ 364 #define MMF_DUMP_FILTER_MASK \
365 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) 365 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
366 #define MMF_DUMP_FILTER_DEFAULT \ 366 #define MMF_DUMP_FILTER_DEFAULT \
367 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) 367 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
368 368
369 struct mm_struct { 369 struct mm_struct {
370 struct vm_area_struct * mmap; /* list of VMAs */ 370 struct vm_area_struct * mmap; /* list of VMAs */
371 struct rb_root mm_rb; 371 struct rb_root mm_rb;
372 struct vm_area_struct * mmap_cache; /* last find_vma result */ 372 struct vm_area_struct * mmap_cache; /* last find_vma result */
373 unsigned long (*get_unmapped_area) (struct file *filp, 373 unsigned long (*get_unmapped_area) (struct file *filp,
374 unsigned long addr, unsigned long len, 374 unsigned long addr, unsigned long len,
375 unsigned long pgoff, unsigned long flags); 375 unsigned long pgoff, unsigned long flags);
376 void (*unmap_area) (struct mm_struct *mm, unsigned long addr); 376 void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
377 unsigned long mmap_base; /* base of mmap area */ 377 unsigned long mmap_base; /* base of mmap area */
378 unsigned long task_size; /* size of task vm space */ 378 unsigned long task_size; /* size of task vm space */
379 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ 379 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
380 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ 380 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
381 pgd_t * pgd; 381 pgd_t * pgd;
382 atomic_t mm_users; /* How many users with user space? */ 382 atomic_t mm_users; /* How many users with user space? */
383 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ 383 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
384 int map_count; /* number of VMAs */ 384 int map_count; /* number of VMAs */
385 struct rw_semaphore mmap_sem; 385 struct rw_semaphore mmap_sem;
386 spinlock_t page_table_lock; /* Protects page tables and some counters */ 386 spinlock_t page_table_lock; /* Protects page tables and some counters */
387 387
388 struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung 388 struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
389 * together off init_mm.mmlist, and are protected 389 * together off init_mm.mmlist, and are protected
390 * by mmlist_lock 390 * by mmlist_lock
391 */ 391 */
392 392
393 /* Special counters, in some configurations protected by the 393 /* Special counters, in some configurations protected by the
394 * page_table_lock, in other configurations by being atomic. 394 * page_table_lock, in other configurations by being atomic.
395 */ 395 */
396 mm_counter_t _file_rss; 396 mm_counter_t _file_rss;
397 mm_counter_t _anon_rss; 397 mm_counter_t _anon_rss;
398 398
399 unsigned long hiwater_rss; /* High-watermark of RSS usage */ 399 unsigned long hiwater_rss; /* High-watermark of RSS usage */
400 unsigned long hiwater_vm; /* High-water virtual memory usage */ 400 unsigned long hiwater_vm; /* High-water virtual memory usage */
401 401
402 unsigned long total_vm, locked_vm, shared_vm, exec_vm; 402 unsigned long total_vm, locked_vm, shared_vm, exec_vm;
403 unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; 403 unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
404 unsigned long start_code, end_code, start_data, end_data; 404 unsigned long start_code, end_code, start_data, end_data;
405 unsigned long start_brk, brk, start_stack; 405 unsigned long start_brk, brk, start_stack;
406 unsigned long arg_start, arg_end, env_start, env_end; 406 unsigned long arg_start, arg_end, env_start, env_end;
407 407
408 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 408 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
409 409
410 cpumask_t cpu_vm_mask; 410 cpumask_t cpu_vm_mask;
411 411
412 /* Architecture-specific MM context */ 412 /* Architecture-specific MM context */
413 mm_context_t context; 413 mm_context_t context;
414 414
415 /* Swap token stuff */ 415 /* Swap token stuff */
416 /* 416 /*
417 * Last value of global fault stamp as seen by this process. 417 * Last value of global fault stamp as seen by this process.
418 * In other words, this value gives an indication of how long 418 * In other words, this value gives an indication of how long
419 * it has been since this task got the token. 419 * it has been since this task got the token.
420 * Look at mm/thrash.c 420 * Look at mm/thrash.c
421 */ 421 */
422 unsigned int faultstamp; 422 unsigned int faultstamp;
423 unsigned int token_priority; 423 unsigned int token_priority;
424 unsigned int last_interval; 424 unsigned int last_interval;
425 425
426 unsigned long flags; /* Must use atomic bitops to access the bits */ 426 unsigned long flags; /* Must use atomic bitops to access the bits */
427 427
428 /* coredumping support */ 428 /* coredumping support */
429 int core_waiters; 429 int core_waiters;
430 struct completion *core_startup_done, core_done; 430 struct completion *core_startup_done, core_done;
431 431
432 /* aio bits */ 432 /* aio bits */
433 rwlock_t ioctx_list_lock; 433 rwlock_t ioctx_list_lock;
434 struct kioctx *ioctx_list; 434 struct kioctx *ioctx_list;
435 }; 435 };
436 436
437 struct sighand_struct { 437 struct sighand_struct {
438 atomic_t count; 438 atomic_t count;
439 struct k_sigaction action[_NSIG]; 439 struct k_sigaction action[_NSIG];
440 spinlock_t siglock; 440 spinlock_t siglock;
441 struct list_head signalfd_list; 441 struct list_head signalfd_list;
442 }; 442 };
443 443
444 struct pacct_struct { 444 struct pacct_struct {
445 int ac_flag; 445 int ac_flag;
446 long ac_exitcode; 446 long ac_exitcode;
447 unsigned long ac_mem; 447 unsigned long ac_mem;
448 cputime_t ac_utime, ac_stime; 448 cputime_t ac_utime, ac_stime;
449 unsigned long ac_minflt, ac_majflt; 449 unsigned long ac_minflt, ac_majflt;
450 }; 450 };
451 451
452 /* 452 /*
453 * NOTE! "signal_struct" does not have it's own 453 * NOTE! "signal_struct" does not have it's own
454 * locking, because a shared signal_struct always 454 * locking, because a shared signal_struct always
455 * implies a shared sighand_struct, so locking 455 * implies a shared sighand_struct, so locking
456 * sighand_struct is always a proper superset of 456 * sighand_struct is always a proper superset of
457 * the locking of signal_struct. 457 * the locking of signal_struct.
458 */ 458 */
459 struct signal_struct { 459 struct signal_struct {
460 atomic_t count; 460 atomic_t count;
461 atomic_t live; 461 atomic_t live;
462 462
463 wait_queue_head_t wait_chldexit; /* for wait4() */ 463 wait_queue_head_t wait_chldexit; /* for wait4() */
464 464
465 /* current thread group signal load-balancing target: */ 465 /* current thread group signal load-balancing target: */
466 struct task_struct *curr_target; 466 struct task_struct *curr_target;
467 467
468 /* shared signal handling: */ 468 /* shared signal handling: */
469 struct sigpending shared_pending; 469 struct sigpending shared_pending;
470 470
471 /* thread group exit support */ 471 /* thread group exit support */
472 int group_exit_code; 472 int group_exit_code;
473 /* overloaded: 473 /* overloaded:
474 * - notify group_exit_task when ->count is equal to notify_count 474 * - notify group_exit_task when ->count is equal to notify_count
475 * - everyone except group_exit_task is stopped during signal delivery 475 * - everyone except group_exit_task is stopped during signal delivery
476 * of fatal signals, group_exit_task processes the signal. 476 * of fatal signals, group_exit_task processes the signal.
477 */ 477 */
478 struct task_struct *group_exit_task; 478 struct task_struct *group_exit_task;
479 int notify_count; 479 int notify_count;
480 480
481 /* thread group stop support, overloads group_exit_code too */ 481 /* thread group stop support, overloads group_exit_code too */
482 int group_stop_count; 482 int group_stop_count;
483 unsigned int flags; /* see SIGNAL_* flags below */ 483 unsigned int flags; /* see SIGNAL_* flags below */
484 484
485 /* POSIX.1b Interval Timers */ 485 /* POSIX.1b Interval Timers */
486 struct list_head posix_timers; 486 struct list_head posix_timers;
487 487
488 /* ITIMER_REAL timer for the process */ 488 /* ITIMER_REAL timer for the process */
489 struct hrtimer real_timer; 489 struct hrtimer real_timer;
490 struct task_struct *tsk; 490 struct task_struct *tsk;
491 ktime_t it_real_incr; 491 ktime_t it_real_incr;
492 492
493 /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ 493 /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
494 cputime_t it_prof_expires, it_virt_expires; 494 cputime_t it_prof_expires, it_virt_expires;
495 cputime_t it_prof_incr, it_virt_incr; 495 cputime_t it_prof_incr, it_virt_incr;
496 496
497 /* job control IDs */ 497 /* job control IDs */
498 pid_t pgrp; 498 pid_t pgrp;
499 struct pid *tty_old_pgrp; 499 struct pid *tty_old_pgrp;
500 500
501 union { 501 union {
502 pid_t session __deprecated; 502 pid_t session __deprecated;
503 pid_t __session; 503 pid_t __session;
504 }; 504 };
505 505
506 /* boolean value for session group leader */ 506 /* boolean value for session group leader */
507 int leader; 507 int leader;
508 508
509 struct tty_struct *tty; /* NULL if no tty */ 509 struct tty_struct *tty; /* NULL if no tty */
510 510
511 /* 511 /*
512 * Cumulative resource counters for dead threads in the group, 512 * Cumulative resource counters for dead threads in the group,
513 * and for reaped dead child processes forked by this group. 513 * and for reaped dead child processes forked by this group.
514 * Live threads maintain their own counters and add to these 514 * Live threads maintain their own counters and add to these
515 * in __exit_signal, except for the group leader. 515 * in __exit_signal, except for the group leader.
516 */ 516 */
517 cputime_t utime, stime, cutime, cstime; 517 cputime_t utime, stime, cutime, cstime;
518 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 518 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
519 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 519 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
520 unsigned long inblock, oublock, cinblock, coublock; 520 unsigned long inblock, oublock, cinblock, coublock;
521 521
522 /* 522 /*
523 * Cumulative ns of scheduled CPU time for dead threads in the 523 * Cumulative ns of scheduled CPU time for dead threads in the
524 * group, not including a zombie group leader. (This only differs 524 * group, not including a zombie group leader. (This only differs
525 * from jiffies_to_ns(utime + stime) if sched_clock uses something 525 * from jiffies_to_ns(utime + stime) if sched_clock uses something
526 * other than jiffies.) 526 * other than jiffies.)
527 */ 527 */
528 unsigned long long sum_sched_runtime; 528 unsigned long long sum_sched_runtime;
529 529
530 /* 530 /*
531 * We don't bother to synchronize most readers of this at all, 531 * We don't bother to synchronize most readers of this at all,
532 * because there is no reader checking a limit that actually needs 532 * because there is no reader checking a limit that actually needs
533 * to get both rlim_cur and rlim_max atomically, and either one 533 * to get both rlim_cur and rlim_max atomically, and either one
534 * alone is a single word that can safely be read normally. 534 * alone is a single word that can safely be read normally.
535 * getrlimit/setrlimit use task_lock(current->group_leader) to 535 * getrlimit/setrlimit use task_lock(current->group_leader) to
536 * protect this instead of the siglock, because they really 536 * protect this instead of the siglock, because they really
537 * have no need to disable irqs. 537 * have no need to disable irqs.
538 */ 538 */
539 struct rlimit rlim[RLIM_NLIMITS]; 539 struct rlimit rlim[RLIM_NLIMITS];
540 540
541 struct list_head cpu_timers[3]; 541 struct list_head cpu_timers[3];
542 542
543 /* keep the process-shared keyrings here so that they do the right 543 /* keep the process-shared keyrings here so that they do the right
544 * thing in threads created with CLONE_THREAD */ 544 * thing in threads created with CLONE_THREAD */
545 #ifdef CONFIG_KEYS 545 #ifdef CONFIG_KEYS
546 struct key *session_keyring; /* keyring inherited over fork */ 546 struct key *session_keyring; /* keyring inherited over fork */
547 struct key *process_keyring; /* keyring private to this process */ 547 struct key *process_keyring; /* keyring private to this process */
548 #endif 548 #endif
549 #ifdef CONFIG_BSD_PROCESS_ACCT 549 #ifdef CONFIG_BSD_PROCESS_ACCT
550 struct pacct_struct pacct; /* per-process accounting information */ 550 struct pacct_struct pacct; /* per-process accounting information */
551 #endif 551 #endif
552 #ifdef CONFIG_TASKSTATS 552 #ifdef CONFIG_TASKSTATS
553 struct taskstats *stats; 553 struct taskstats *stats;
554 #endif 554 #endif
555 #ifdef CONFIG_AUDIT 555 #ifdef CONFIG_AUDIT
556 unsigned audit_tty; 556 unsigned audit_tty;
557 struct tty_audit_buf *tty_audit_buf; 557 struct tty_audit_buf *tty_audit_buf;
558 #endif 558 #endif
559 }; 559 };
560 560
561 /* Context switch must be unlocked if interrupts are to be enabled */ 561 /* Context switch must be unlocked if interrupts are to be enabled */
562 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 562 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
563 # define __ARCH_WANT_UNLOCKED_CTXSW 563 # define __ARCH_WANT_UNLOCKED_CTXSW
564 #endif 564 #endif
565 565
566 /* 566 /*
567 * Bits in flags field of signal_struct. 567 * Bits in flags field of signal_struct.
568 */ 568 */
569 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ 569 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
570 #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ 570 #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */
571 #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ 571 #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
572 #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ 572 #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
573 573
574 /* 574 /*
575 * Some day this will be a full-fledged user tracking system.. 575 * Some day this will be a full-fledged user tracking system..
576 */ 576 */
577 struct user_struct { 577 struct user_struct {
578 atomic_t __count; /* reference count */ 578 atomic_t __count; /* reference count */
579 atomic_t processes; /* How many processes does this user have? */ 579 atomic_t processes; /* How many processes does this user have? */
580 atomic_t files; /* How many open files does this user have? */ 580 atomic_t files; /* How many open files does this user have? */
581 atomic_t sigpending; /* How many pending signals does this user have? */ 581 atomic_t sigpending; /* How many pending signals does this user have? */
582 #ifdef CONFIG_INOTIFY_USER 582 #ifdef CONFIG_INOTIFY_USER
583 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 583 atomic_t inotify_watches; /* How many inotify watches does this user have? */
584 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 584 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
585 #endif 585 #endif
586 /* protected by mq_lock */ 586 /* protected by mq_lock */
587 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 587 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
588 unsigned long locked_shm; /* How many pages of mlocked shm ? */ 588 unsigned long locked_shm; /* How many pages of mlocked shm ? */
589 589
590 #ifdef CONFIG_KEYS 590 #ifdef CONFIG_KEYS
591 struct key *uid_keyring; /* UID specific keyring */ 591 struct key *uid_keyring; /* UID specific keyring */
592 struct key *session_keyring; /* UID's default session keyring */ 592 struct key *session_keyring; /* UID's default session keyring */
593 #endif 593 #endif
594 594
595 /* Hash table maintenance information */ 595 /* Hash table maintenance information */
596 struct list_head uidhash_list; 596 struct list_head uidhash_list;
597 uid_t uid; 597 uid_t uid;
598 }; 598 };
599 599
600 extern struct user_struct *find_user(uid_t); 600 extern struct user_struct *find_user(uid_t);
601 601
602 extern struct user_struct root_user; 602 extern struct user_struct root_user;
603 #define INIT_USER (&root_user) 603 #define INIT_USER (&root_user)
604 604
605 struct backing_dev_info; 605 struct backing_dev_info;
606 struct reclaim_state; 606 struct reclaim_state;
607 607
608 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 608 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
609 struct sched_info { 609 struct sched_info {
610 /* cumulative counters */ 610 /* cumulative counters */
611 unsigned long pcnt; /* # of times run on this cpu */ 611 unsigned long pcnt; /* # of times run on this cpu */
612 unsigned long long cpu_time, /* time spent on the cpu */ 612 unsigned long long cpu_time, /* time spent on the cpu */
613 run_delay; /* time spent waiting on a runqueue */ 613 run_delay; /* time spent waiting on a runqueue */
614 614
615 /* timestamps */ 615 /* timestamps */
616 unsigned long long last_arrival,/* when we last ran on a cpu */ 616 unsigned long long last_arrival,/* when we last ran on a cpu */
617 last_queued; /* when we were last queued to run */ 617 last_queued; /* when we were last queued to run */
618 }; 618 };
619 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 619 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
620 620
621 #ifdef CONFIG_SCHEDSTATS 621 #ifdef CONFIG_SCHEDSTATS
622 extern const struct file_operations proc_schedstat_operations; 622 extern const struct file_operations proc_schedstat_operations;
623 #endif /* CONFIG_SCHEDSTATS */ 623 #endif /* CONFIG_SCHEDSTATS */
624 624
625 #ifdef CONFIG_TASK_DELAY_ACCT 625 #ifdef CONFIG_TASK_DELAY_ACCT
626 struct task_delay_info { 626 struct task_delay_info {
627 spinlock_t lock; 627 spinlock_t lock;
628 unsigned int flags; /* Private per-task flags */ 628 unsigned int flags; /* Private per-task flags */
629 629
630 /* For each stat XXX, add following, aligned appropriately 630 /* For each stat XXX, add following, aligned appropriately
631 * 631 *
632 * struct timespec XXX_start, XXX_end; 632 * struct timespec XXX_start, XXX_end;
633 * u64 XXX_delay; 633 * u64 XXX_delay;
634 * u32 XXX_count; 634 * u32 XXX_count;
635 * 635 *
636 * Atomicity of updates to XXX_delay, XXX_count protected by 636 * Atomicity of updates to XXX_delay, XXX_count protected by
637 * single lock above (split into XXX_lock if contention is an issue). 637 * single lock above (split into XXX_lock if contention is an issue).
638 */ 638 */
639 639
640 /* 640 /*
641 * XXX_count is incremented on every XXX operation, the delay 641 * XXX_count is incremented on every XXX operation, the delay
642 * associated with the operation is added to XXX_delay. 642 * associated with the operation is added to XXX_delay.
643 * XXX_delay contains the accumulated delay time in nanoseconds. 643 * XXX_delay contains the accumulated delay time in nanoseconds.
644 */ 644 */
645 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ 645 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
646 u64 blkio_delay; /* wait for sync block io completion */ 646 u64 blkio_delay; /* wait for sync block io completion */
647 u64 swapin_delay; /* wait for swapin block io completion */ 647 u64 swapin_delay; /* wait for swapin block io completion */
648 u32 blkio_count; /* total count of the number of sync block */ 648 u32 blkio_count; /* total count of the number of sync block */
649 /* io operations performed */ 649 /* io operations performed */
650 u32 swapin_count; /* total count of the number of swapin block */ 650 u32 swapin_count; /* total count of the number of swapin block */
651 /* io operations performed */ 651 /* io operations performed */
652 }; 652 };
653 #endif /* CONFIG_TASK_DELAY_ACCT */ 653 #endif /* CONFIG_TASK_DELAY_ACCT */
654 654
655 static inline int sched_info_on(void) 655 static inline int sched_info_on(void)
656 { 656 {
657 #ifdef CONFIG_SCHEDSTATS 657 #ifdef CONFIG_SCHEDSTATS
658 return 1; 658 return 1;
659 #elif defined(CONFIG_TASK_DELAY_ACCT) 659 #elif defined(CONFIG_TASK_DELAY_ACCT)
660 extern int delayacct_on; 660 extern int delayacct_on;
661 return delayacct_on; 661 return delayacct_on;
662 #else 662 #else
663 return 0; 663 return 0;
664 #endif 664 #endif
665 } 665 }
666 666
667 enum cpu_idle_type { 667 enum cpu_idle_type {
668 CPU_IDLE, 668 CPU_IDLE,
669 CPU_NOT_IDLE, 669 CPU_NOT_IDLE,
670 CPU_NEWLY_IDLE, 670 CPU_NEWLY_IDLE,
671 CPU_MAX_IDLE_TYPES 671 CPU_MAX_IDLE_TYPES
672 }; 672 };
673 673
674 /* 674 /*
675 * sched-domains (multiprocessor balancing) declarations: 675 * sched-domains (multiprocessor balancing) declarations:
676 */ 676 */
677 677
678 /* 678 /*
679 * Increase resolution of nice-level calculations: 679 * Increase resolution of nice-level calculations:
680 */ 680 */
681 #define SCHED_LOAD_SHIFT 10 681 #define SCHED_LOAD_SHIFT 10
682 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 682 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
683 683
684 #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) 684 #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1)
685 685
686 #ifdef CONFIG_SMP 686 #ifdef CONFIG_SMP
687 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 687 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
688 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ 688 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
689 #define SD_BALANCE_EXEC 4 /* Balance on exec */ 689 #define SD_BALANCE_EXEC 4 /* Balance on exec */
690 #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ 690 #define SD_BALANCE_FORK 8 /* Balance on fork, clone */
691 #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ 691 #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
692 #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ 692 #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
693 #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ 693 #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
694 #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ 694 #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
695 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ 695 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
696 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ 696 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
697 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ 697 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */
698 698
699 #define BALANCE_FOR_MC_POWER \ 699 #define BALANCE_FOR_MC_POWER \
700 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) 700 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
701 701
702 #define BALANCE_FOR_PKG_POWER \ 702 #define BALANCE_FOR_PKG_POWER \
703 ((sched_mc_power_savings || sched_smt_power_savings) ? \ 703 ((sched_mc_power_savings || sched_smt_power_savings) ? \
704 SD_POWERSAVINGS_BALANCE : 0) 704 SD_POWERSAVINGS_BALANCE : 0)
705 705
706 #define test_sd_parent(sd, flag) ((sd->parent && \ 706 #define test_sd_parent(sd, flag) ((sd->parent && \
707 (sd->parent->flags & flag)) ? 1 : 0) 707 (sd->parent->flags & flag)) ? 1 : 0)
708 708
709 709
710 struct sched_group { 710 struct sched_group {
711 struct sched_group *next; /* Must be a circular list */ 711 struct sched_group *next; /* Must be a circular list */
712 cpumask_t cpumask; 712 cpumask_t cpumask;
713 713
714 /* 714 /*
715 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 715 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
716 * single CPU. This is read only (except for setup, hotplug CPU). 716 * single CPU. This is read only (except for setup, hotplug CPU).
717 * Note : Never change cpu_power without recompute its reciprocal 717 * Note : Never change cpu_power without recompute its reciprocal
718 */ 718 */
719 unsigned int __cpu_power; 719 unsigned int __cpu_power;
720 /* 720 /*
721 * reciprocal value of cpu_power to avoid expensive divides 721 * reciprocal value of cpu_power to avoid expensive divides
722 * (see include/linux/reciprocal_div.h) 722 * (see include/linux/reciprocal_div.h)
723 */ 723 */
724 u32 reciprocal_cpu_power; 724 u32 reciprocal_cpu_power;
725 }; 725 };
726 726
727 struct sched_domain { 727 struct sched_domain {
728 /* These fields must be setup */ 728 /* These fields must be setup */
729 struct sched_domain *parent; /* top domain must be null terminated */ 729 struct sched_domain *parent; /* top domain must be null terminated */
730 struct sched_domain *child; /* bottom domain must be null terminated */ 730 struct sched_domain *child; /* bottom domain must be null terminated */
731 struct sched_group *groups; /* the balancing groups of the domain */ 731 struct sched_group *groups; /* the balancing groups of the domain */
732 cpumask_t span; /* span of all CPUs in this domain */ 732 cpumask_t span; /* span of all CPUs in this domain */
733 unsigned long min_interval; /* Minimum balance interval ms */ 733 unsigned long min_interval; /* Minimum balance interval ms */
734 unsigned long max_interval; /* Maximum balance interval ms */ 734 unsigned long max_interval; /* Maximum balance interval ms */
735 unsigned int busy_factor; /* less balancing by factor if busy */ 735 unsigned int busy_factor; /* less balancing by factor if busy */
736 unsigned int imbalance_pct; /* No balance until over watermark */ 736 unsigned int imbalance_pct; /* No balance until over watermark */
737 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 737 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
738 unsigned int busy_idx; 738 unsigned int busy_idx;
739 unsigned int idle_idx; 739 unsigned int idle_idx;
740 unsigned int newidle_idx; 740 unsigned int newidle_idx;
741 unsigned int wake_idx; 741 unsigned int wake_idx;
742 unsigned int forkexec_idx; 742 unsigned int forkexec_idx;
743 int flags; /* See SD_* */ 743 int flags; /* See SD_* */
744 744
745 /* Runtime fields. */ 745 /* Runtime fields. */
746 unsigned long last_balance; /* init to jiffies. units in jiffies */ 746 unsigned long last_balance; /* init to jiffies. units in jiffies */
747 unsigned int balance_interval; /* initialise to 1. units in ms. */ 747 unsigned int balance_interval; /* initialise to 1. units in ms. */
748 unsigned int nr_balance_failed; /* initialise to 0 */ 748 unsigned int nr_balance_failed; /* initialise to 0 */
749 749
750 #ifdef CONFIG_SCHEDSTATS 750 #ifdef CONFIG_SCHEDSTATS
751 /* load_balance() stats */ 751 /* load_balance() stats */
752 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; 752 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
753 unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; 753 unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
754 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; 754 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
755 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; 755 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
756 unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; 756 unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
757 unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; 757 unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
758 unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; 758 unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
759 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; 759 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
760 760
761 /* Active load balancing */ 761 /* Active load balancing */
762 unsigned long alb_cnt; 762 unsigned long alb_cnt;
763 unsigned long alb_failed; 763 unsigned long alb_failed;
764 unsigned long alb_pushed; 764 unsigned long alb_pushed;
765 765
766 /* SD_BALANCE_EXEC stats */ 766 /* SD_BALANCE_EXEC stats */
767 unsigned long sbe_cnt; 767 unsigned long sbe_cnt;
768 unsigned long sbe_balanced; 768 unsigned long sbe_balanced;
769 unsigned long sbe_pushed; 769 unsigned long sbe_pushed;
770 770
771 /* SD_BALANCE_FORK stats */ 771 /* SD_BALANCE_FORK stats */
772 unsigned long sbf_cnt; 772 unsigned long sbf_cnt;
773 unsigned long sbf_balanced; 773 unsigned long sbf_balanced;
774 unsigned long sbf_pushed; 774 unsigned long sbf_pushed;
775 775
776 /* try_to_wake_up() stats */ 776 /* try_to_wake_up() stats */
777 unsigned long ttwu_wake_remote; 777 unsigned long ttwu_wake_remote;
778 unsigned long ttwu_move_affine; 778 unsigned long ttwu_move_affine;
779 unsigned long ttwu_move_balance; 779 unsigned long ttwu_move_balance;
780 #endif 780 #endif
781 }; 781 };
782 782
783 extern int partition_sched_domains(cpumask_t *partition1, 783 extern int partition_sched_domains(cpumask_t *partition1,
784 cpumask_t *partition2); 784 cpumask_t *partition2);
785 785
786 #endif /* CONFIG_SMP */ 786 #endif /* CONFIG_SMP */
787 787
788 /* 788 /*
789 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of 789 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
790 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a 790 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
791 * task of nice 0 or enough lower priority tasks to bring up the 791 * task of nice 0 or enough lower priority tasks to bring up the
792 * weighted_cpuload 792 * weighted_cpuload
793 */ 793 */
794 static inline int above_background_load(void) 794 static inline int above_background_load(void)
795 { 795 {
796 unsigned long cpu; 796 unsigned long cpu;
797 797
798 for_each_online_cpu(cpu) { 798 for_each_online_cpu(cpu) {
799 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) 799 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
800 return 1; 800 return 1;
801 } 801 }
802 return 0; 802 return 0;
803 } 803 }
804 804
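A minimal usage sketch of the helper above (not part of this diff): background housekeeping can poll above_background_load() and back off whenever any online CPU carries at least one nice-0 task's worth of weighted load. The function names maybe_do_background_work() and do_background_work() are made up for illustration.

/* Sketch: a hypothetical caller backing off while the system is loaded. */
static void maybe_do_background_work(void)
{
        if (above_background_load())
                return;                 /* some CPU has >= one nice-0 task of load */

        do_background_work();           /* hypothetical helper */
}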
805 struct io_context; /* See blkdev.h */ 805 struct io_context; /* See blkdev.h */
806 struct cpuset; 806 struct cpuset;
807 807
808 #define NGROUPS_SMALL 32 808 #define NGROUPS_SMALL 32
809 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) 809 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
810 struct group_info { 810 struct group_info {
811 int ngroups; 811 int ngroups;
812 atomic_t usage; 812 atomic_t usage;
813 gid_t small_block[NGROUPS_SMALL]; 813 gid_t small_block[NGROUPS_SMALL];
814 int nblocks; 814 int nblocks;
815 gid_t *blocks[0]; 815 gid_t *blocks[0];
816 }; 816 };
817 817
818 /* 818 /*
819 * get_group_info() must be called with the owning task locked (via task_lock()) 819 * get_group_info() must be called with the owning task locked (via task_lock())
820 * when task != current. The reason being that the vast majority of callers are 820 * when task != current. The reason being that the vast majority of callers are
821 * looking at current->group_info, which can not be changed except by the 821 * looking at current->group_info, which can not be changed except by the
822 * current task. Changing current->group_info requires the task lock, too. 822 * current task. Changing current->group_info requires the task lock, too.
823 */ 823 */
824 #define get_group_info(group_info) do { \ 824 #define get_group_info(group_info) do { \
825 atomic_inc(&(group_info)->usage); \ 825 atomic_inc(&(group_info)->usage); \
826 } while (0) 826 } while (0)
827 827
828 #define put_group_info(group_info) do { \ 828 #define put_group_info(group_info) do { \
829 if (atomic_dec_and_test(&(group_info)->usage)) \ 829 if (atomic_dec_and_test(&(group_info)->usage)) \
830 groups_free(group_info); \ 830 groups_free(group_info); \
831 } while (0) 831 } while (0)
832 832
833 extern struct group_info *groups_alloc(int gidsetsize); 833 extern struct group_info *groups_alloc(int gidsetsize);
834 extern void groups_free(struct group_info *group_info); 834 extern void groups_free(struct group_info *group_info);
835 extern int set_current_groups(struct group_info *group_info); 835 extern int set_current_groups(struct group_info *group_info);
836 extern int groups_search(struct group_info *group_info, gid_t grp); 836 extern int groups_search(struct group_info *group_info, gid_t grp);
837 /* access the groups "array" with this macro */ 837 /* access the groups "array" with this macro */
838 #define GROUP_AT(gi, i) \ 838 #define GROUP_AT(gi, i) \
839 ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) 839 ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
840 840
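A hedged sketch of the locking rule spelled out in the get_group_info() comment, for a task other than current: take task_lock(), pin the list with get_group_info(), drop the lock, walk the entries via GROUP_AT(), and release with put_group_info(). The function task_in_group() is a made-up name; task_lock()/task_unlock() are the usual alloc_lock helpers.

/* Sketch: safely walk another task's supplementary groups. */
static int task_in_group(struct task_struct *tsk, gid_t grp)
{
        struct group_info *gi;
        int i, found = 0;

        task_lock(tsk);                 /* required when tsk != current */
        gi = tsk->group_info;
        get_group_info(gi);             /* pin it across the unlock */
        task_unlock(tsk);

        for (i = 0; i < gi->ngroups; i++) {
                if (GROUP_AT(gi, i) == grp) {
                        found = 1;
                        break;
                }
        }

        put_group_info(gi);             /* may free it */
        return found;
}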
841 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK 841 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
842 extern void prefetch_stack(struct task_struct *t); 842 extern void prefetch_stack(struct task_struct *t);
843 #else 843 #else
844 static inline void prefetch_stack(struct task_struct *t) { } 844 static inline void prefetch_stack(struct task_struct *t) { }
845 #endif 845 #endif
846 846
847 struct audit_context; /* See audit.c */ 847 struct audit_context; /* See audit.c */
848 struct mempolicy; 848 struct mempolicy;
849 struct pipe_inode_info; 849 struct pipe_inode_info;
850 struct uts_namespace; 850 struct uts_namespace;
851 851
852 struct rq; 852 struct rq;
853 struct sched_domain; 853 struct sched_domain;
854 854
855 struct sched_class { 855 struct sched_class {
856 struct sched_class *next; 856 struct sched_class *next;
857 857
858 void (*enqueue_task) (struct rq *rq, struct task_struct *p, 858 void (*enqueue_task) (struct rq *rq, struct task_struct *p,
859 int wakeup, u64 now); 859 int wakeup, u64 now);
860 void (*dequeue_task) (struct rq *rq, struct task_struct *p, 860 void (*dequeue_task) (struct rq *rq, struct task_struct *p,
861 int sleep, u64 now); 861 int sleep, u64 now);
862 void (*yield_task) (struct rq *rq, struct task_struct *p); 862 void (*yield_task) (struct rq *rq, struct task_struct *p);
863 863
864 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); 864 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
865 865
866 struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); 866 struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
867 void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); 867 void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
868 868
869 int (*load_balance) (struct rq *this_rq, int this_cpu, 869 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
870 struct rq *busiest, 870 struct rq *busiest,
871 unsigned long max_nr_move, unsigned long max_load_move, 871 unsigned long max_nr_move, unsigned long max_load_move,
872 struct sched_domain *sd, enum cpu_idle_type idle, 872 struct sched_domain *sd, enum cpu_idle_type idle,
873 int *all_pinned, unsigned long *total_load_moved); 873 int *all_pinned);
874 874
875 void (*set_curr_task) (struct rq *rq); 875 void (*set_curr_task) (struct rq *rq);
876 void (*task_tick) (struct rq *rq, struct task_struct *p); 876 void (*task_tick) (struct rq *rq, struct task_struct *p);
877 void (*task_new) (struct rq *rq, struct task_struct *p, u64 now); 877 void (*task_new) (struct rq *rq, struct task_struct *p, u64 now);
878 }; 878 };
879 879
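The change in this hunk is the load_balance() hook above: its return type becomes unsigned long (the weighted load actually moved) and the total_load_moved pointer disappears from its argument list. A hedged skeleton of an implementation against the new prototype follows; the function name and the elided body are placeholders, not code from this patch.

/* Sketch of an implementation of the reworked hook. */
static unsigned long
load_balance_dummy(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_nr_move, unsigned long max_load_move,
                   struct sched_domain *sd, enum cpu_idle_type idle,
                   int *all_pinned)
{
        unsigned long load_moved = 0;

        /*
         * ... migrate tasks from busiest to this_rq, adding each moved
         * task's se.load.weight to load_moved, until max_nr_move or
         * max_load_move is reached ...
         */

        return load_moved;      /* the weighted load moved, no task count */
}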
880 struct load_weight { 880 struct load_weight {
881 unsigned long weight, inv_weight; 881 unsigned long weight, inv_weight;
882 }; 882 };
883 883
884 /* 884 /*
885 * CFS stats for a schedulable entity (task, task-group etc) 885 * CFS stats for a schedulable entity (task, task-group etc)
886 * 886 *
887 * Current field usage histogram: 887 * Current field usage histogram:
888 * 888 *
889 * 4 se->block_start 889 * 4 se->block_start
890 * 4 se->run_node 890 * 4 se->run_node
891 * 4 se->sleep_start 891 * 4 se->sleep_start
892 * 4 se->sleep_start_fair 892 * 4 se->sleep_start_fair
893 * 6 se->load.weight 893 * 6 se->load.weight
894 * 7 se->delta_fair 894 * 7 se->delta_fair
895 * 15 se->wait_runtime 895 * 15 se->wait_runtime
896 */ 896 */
897 struct sched_entity { 897 struct sched_entity {
898 long wait_runtime; 898 long wait_runtime;
899 unsigned long delta_fair_run; 899 unsigned long delta_fair_run;
900 unsigned long delta_fair_sleep; 900 unsigned long delta_fair_sleep;
901 unsigned long delta_exec; 901 unsigned long delta_exec;
902 s64 fair_key; 902 s64 fair_key;
903 struct load_weight load; /* for load-balancing */ 903 struct load_weight load; /* for load-balancing */
904 struct rb_node run_node; 904 struct rb_node run_node;
905 unsigned int on_rq; 905 unsigned int on_rq;
906 906
907 u64 exec_start; 907 u64 exec_start;
908 u64 sum_exec_runtime; 908 u64 sum_exec_runtime;
909 u64 wait_start_fair; 909 u64 wait_start_fair;
910 u64 sleep_start_fair; 910 u64 sleep_start_fair;
911 911
912 #ifdef CONFIG_SCHEDSTATS 912 #ifdef CONFIG_SCHEDSTATS
913 u64 wait_start; 913 u64 wait_start;
914 u64 wait_max; 914 u64 wait_max;
915 s64 sum_wait_runtime; 915 s64 sum_wait_runtime;
916 916
917 u64 sleep_start; 917 u64 sleep_start;
918 u64 sleep_max; 918 u64 sleep_max;
919 s64 sum_sleep_runtime; 919 s64 sum_sleep_runtime;
920 920
921 u64 block_start; 921 u64 block_start;
922 u64 block_max; 922 u64 block_max;
923 u64 exec_max; 923 u64 exec_max;
924 924
925 unsigned long wait_runtime_overruns; 925 unsigned long wait_runtime_overruns;
926 unsigned long wait_runtime_underruns; 926 unsigned long wait_runtime_underruns;
927 #endif 927 #endif
928 928
929 #ifdef CONFIG_FAIR_GROUP_SCHED 929 #ifdef CONFIG_FAIR_GROUP_SCHED
930 struct sched_entity *parent; 930 struct sched_entity *parent;
931 /* rq on which this entity is (to be) queued: */ 931 /* rq on which this entity is (to be) queued: */
932 struct cfs_rq *cfs_rq; 932 struct cfs_rq *cfs_rq;
933 /* rq "owned" by this entity/group: */ 933 /* rq "owned" by this entity/group: */
934 struct cfs_rq *my_q; 934 struct cfs_rq *my_q;
935 #endif 935 #endif
936 }; 936 };
937 937
938 struct task_struct { 938 struct task_struct {
939 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 939 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
940 void *stack; 940 void *stack;
941 atomic_t usage; 941 atomic_t usage;
942 unsigned int flags; /* per process flags, defined below */ 942 unsigned int flags; /* per process flags, defined below */
943 unsigned int ptrace; 943 unsigned int ptrace;
944 944
945 int lock_depth; /* BKL lock depth */ 945 int lock_depth; /* BKL lock depth */
946 946
947 #ifdef CONFIG_SMP 947 #ifdef CONFIG_SMP
948 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 948 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
949 int oncpu; 949 int oncpu;
950 #endif 950 #endif
951 #endif 951 #endif
952 952
953 int prio, static_prio, normal_prio; 953 int prio, static_prio, normal_prio;
954 struct list_head run_list; 954 struct list_head run_list;
955 struct sched_class *sched_class; 955 struct sched_class *sched_class;
956 struct sched_entity se; 956 struct sched_entity se;
957 957
958 #ifdef CONFIG_PREEMPT_NOTIFIERS 958 #ifdef CONFIG_PREEMPT_NOTIFIERS
959 /* list of struct preempt_notifier: */ 959 /* list of struct preempt_notifier: */
960 struct hlist_head preempt_notifiers; 960 struct hlist_head preempt_notifiers;
961 #endif 961 #endif
962 962
963 unsigned short ioprio; 963 unsigned short ioprio;
964 #ifdef CONFIG_BLK_DEV_IO_TRACE 964 #ifdef CONFIG_BLK_DEV_IO_TRACE
965 unsigned int btrace_seq; 965 unsigned int btrace_seq;
966 #endif 966 #endif
967 967
968 unsigned int policy; 968 unsigned int policy;
969 cpumask_t cpus_allowed; 969 cpumask_t cpus_allowed;
970 unsigned int time_slice; 970 unsigned int time_slice;
971 971
972 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 972 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
973 struct sched_info sched_info; 973 struct sched_info sched_info;
974 #endif 974 #endif
975 975
976 struct list_head tasks; 976 struct list_head tasks;
977 /* 977 /*
978 * ptrace_list/ptrace_children forms the list of my children 978 * ptrace_list/ptrace_children forms the list of my children
979 * that were stolen by a ptracer. 979 * that were stolen by a ptracer.
980 */ 980 */
981 struct list_head ptrace_children; 981 struct list_head ptrace_children;
982 struct list_head ptrace_list; 982 struct list_head ptrace_list;
983 983
984 struct mm_struct *mm, *active_mm; 984 struct mm_struct *mm, *active_mm;
985 985
986 /* task state */ 986 /* task state */
987 struct linux_binfmt *binfmt; 987 struct linux_binfmt *binfmt;
988 int exit_state; 988 int exit_state;
989 int exit_code, exit_signal; 989 int exit_code, exit_signal;
990 int pdeath_signal; /* The signal sent when the parent dies */ 990 int pdeath_signal; /* The signal sent when the parent dies */
991 /* ??? */ 991 /* ??? */
992 unsigned int personality; 992 unsigned int personality;
993 unsigned did_exec:1; 993 unsigned did_exec:1;
994 pid_t pid; 994 pid_t pid;
995 pid_t tgid; 995 pid_t tgid;
996 996
997 #ifdef CONFIG_CC_STACKPROTECTOR 997 #ifdef CONFIG_CC_STACKPROTECTOR
998 /* Canary value for the -fstack-protector gcc feature */ 998 /* Canary value for the -fstack-protector gcc feature */
999 unsigned long stack_canary; 999 unsigned long stack_canary;
1000 #endif 1000 #endif
1001 /* 1001 /*
1002 * pointers to (original) parent process, youngest child, younger sibling, 1002 * pointers to (original) parent process, youngest child, younger sibling,
1003 * older sibling, respectively. (p->father can be replaced with 1003 * older sibling, respectively. (p->father can be replaced with
1004 * p->parent->pid) 1004 * p->parent->pid)
1005 */ 1005 */
1006 struct task_struct *real_parent; /* real parent process (when being debugged) */ 1006 struct task_struct *real_parent; /* real parent process (when being debugged) */
1007 struct task_struct *parent; /* parent process */ 1007 struct task_struct *parent; /* parent process */
1008 /* 1008 /*
1009 * children/sibling forms the list of my children plus the 1009 * children/sibling forms the list of my children plus the
1010 * tasks I'm ptracing. 1010 * tasks I'm ptracing.
1011 */ 1011 */
1012 struct list_head children; /* list of my children */ 1012 struct list_head children; /* list of my children */
1013 struct list_head sibling; /* linkage in my parent's children list */ 1013 struct list_head sibling; /* linkage in my parent's children list */
1014 struct task_struct *group_leader; /* threadgroup leader */ 1014 struct task_struct *group_leader; /* threadgroup leader */
1015 1015
1016 /* PID/PID hash table linkage. */ 1016 /* PID/PID hash table linkage. */
1017 struct pid_link pids[PIDTYPE_MAX]; 1017 struct pid_link pids[PIDTYPE_MAX];
1018 struct list_head thread_group; 1018 struct list_head thread_group;
1019 1019
1020 struct completion *vfork_done; /* for vfork() */ 1020 struct completion *vfork_done; /* for vfork() */
1021 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1021 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1022 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1022 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1023 1023
1024 unsigned int rt_priority; 1024 unsigned int rt_priority;
1025 cputime_t utime, stime; 1025 cputime_t utime, stime;
1026 unsigned long nvcsw, nivcsw; /* context switch counts */ 1026 unsigned long nvcsw, nivcsw; /* context switch counts */
1027 struct timespec start_time; /* monotonic time */ 1027 struct timespec start_time; /* monotonic time */
1028 struct timespec real_start_time; /* boot based time */ 1028 struct timespec real_start_time; /* boot based time */
1029 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1029 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1030 unsigned long min_flt, maj_flt; 1030 unsigned long min_flt, maj_flt;
1031 1031
1032 cputime_t it_prof_expires, it_virt_expires; 1032 cputime_t it_prof_expires, it_virt_expires;
1033 unsigned long long it_sched_expires; 1033 unsigned long long it_sched_expires;
1034 struct list_head cpu_timers[3]; 1034 struct list_head cpu_timers[3];
1035 1035
1036 /* process credentials */ 1036 /* process credentials */
1037 uid_t uid,euid,suid,fsuid; 1037 uid_t uid,euid,suid,fsuid;
1038 gid_t gid,egid,sgid,fsgid; 1038 gid_t gid,egid,sgid,fsgid;
1039 struct group_info *group_info; 1039 struct group_info *group_info;
1040 kernel_cap_t cap_effective, cap_inheritable, cap_permitted; 1040 kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
1041 unsigned keep_capabilities:1; 1041 unsigned keep_capabilities:1;
1042 struct user_struct *user; 1042 struct user_struct *user;
1043 #ifdef CONFIG_KEYS 1043 #ifdef CONFIG_KEYS
1044 struct key *request_key_auth; /* assumed request_key authority */ 1044 struct key *request_key_auth; /* assumed request_key authority */
1045 struct key *thread_keyring; /* keyring private to this thread */ 1045 struct key *thread_keyring; /* keyring private to this thread */
1046 unsigned char jit_keyring; /* default keyring to attach requested keys to */ 1046 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1047 #endif 1047 #endif
1048 /* 1048 /*
1049 * fpu_counter contains the number of consecutive context switches 1049 * fpu_counter contains the number of consecutive context switches
1050 * that the FPU is used. If this is over a threshold, the lazy fpu 1050 * that the FPU is used. If this is over a threshold, the lazy fpu
1051 * saving becomes unlazy to save the trap. This is an unsigned char 1051 * saving becomes unlazy to save the trap. This is an unsigned char
1052 * so that after 256 times the counter wraps and the behavior turns 1052 * so that after 256 times the counter wraps and the behavior turns
1053 * lazy again; this to deal with bursty apps that only use FPU for 1053 * lazy again; this to deal with bursty apps that only use FPU for
1054 * a short time 1054 * a short time
1055 */ 1055 */
1056 unsigned char fpu_counter; 1056 unsigned char fpu_counter;
1057 int oomkilladj; /* OOM kill score adjustment (bit shift). */ 1057 int oomkilladj; /* OOM kill score adjustment (bit shift). */
1058 char comm[TASK_COMM_LEN]; /* executable name excluding path 1058 char comm[TASK_COMM_LEN]; /* executable name excluding path
1059 - access with [gs]et_task_comm (which lock 1059 - access with [gs]et_task_comm (which lock
1060 it with task_lock()) 1060 it with task_lock())
1061 - initialized normally by flush_old_exec */ 1061 - initialized normally by flush_old_exec */
1062 /* file system info */ 1062 /* file system info */
1063 int link_count, total_link_count; 1063 int link_count, total_link_count;
1064 #ifdef CONFIG_SYSVIPC 1064 #ifdef CONFIG_SYSVIPC
1065 /* ipc stuff */ 1065 /* ipc stuff */
1066 struct sysv_sem sysvsem; 1066 struct sysv_sem sysvsem;
1067 #endif 1067 #endif
1068 /* CPU-specific state of this task */ 1068 /* CPU-specific state of this task */
1069 struct thread_struct thread; 1069 struct thread_struct thread;
1070 /* filesystem information */ 1070 /* filesystem information */
1071 struct fs_struct *fs; 1071 struct fs_struct *fs;
1072 /* open file information */ 1072 /* open file information */
1073 struct files_struct *files; 1073 struct files_struct *files;
1074 /* namespaces */ 1074 /* namespaces */
1075 struct nsproxy *nsproxy; 1075 struct nsproxy *nsproxy;
1076 /* signal handlers */ 1076 /* signal handlers */
1077 struct signal_struct *signal; 1077 struct signal_struct *signal;
1078 struct sighand_struct *sighand; 1078 struct sighand_struct *sighand;
1079 1079
1080 sigset_t blocked, real_blocked; 1080 sigset_t blocked, real_blocked;
1081 sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ 1081 sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
1082 struct sigpending pending; 1082 struct sigpending pending;
1083 1083
1084 unsigned long sas_ss_sp; 1084 unsigned long sas_ss_sp;
1085 size_t sas_ss_size; 1085 size_t sas_ss_size;
1086 int (*notifier)(void *priv); 1086 int (*notifier)(void *priv);
1087 void *notifier_data; 1087 void *notifier_data;
1088 sigset_t *notifier_mask; 1088 sigset_t *notifier_mask;
1089 1089
1090 void *security; 1090 void *security;
1091 struct audit_context *audit_context; 1091 struct audit_context *audit_context;
1092 seccomp_t seccomp; 1092 seccomp_t seccomp;
1093 1093
1094 /* Thread group tracking */ 1094 /* Thread group tracking */
1095 u32 parent_exec_id; 1095 u32 parent_exec_id;
1096 u32 self_exec_id; 1096 u32 self_exec_id;
1097 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ 1097 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
1098 spinlock_t alloc_lock; 1098 spinlock_t alloc_lock;
1099 1099
1100 /* Protection of the PI data structures: */ 1100 /* Protection of the PI data structures: */
1101 spinlock_t pi_lock; 1101 spinlock_t pi_lock;
1102 1102
1103 #ifdef CONFIG_RT_MUTEXES 1103 #ifdef CONFIG_RT_MUTEXES
1104 /* PI waiters blocked on a rt_mutex held by this task */ 1104 /* PI waiters blocked on a rt_mutex held by this task */
1105 struct plist_head pi_waiters; 1105 struct plist_head pi_waiters;
1106 /* Deadlock detection and priority inheritance handling */ 1106 /* Deadlock detection and priority inheritance handling */
1107 struct rt_mutex_waiter *pi_blocked_on; 1107 struct rt_mutex_waiter *pi_blocked_on;
1108 #endif 1108 #endif
1109 1109
1110 #ifdef CONFIG_DEBUG_MUTEXES 1110 #ifdef CONFIG_DEBUG_MUTEXES
1111 /* mutex deadlock detection */ 1111 /* mutex deadlock detection */
1112 struct mutex_waiter *blocked_on; 1112 struct mutex_waiter *blocked_on;
1113 #endif 1113 #endif
1114 #ifdef CONFIG_TRACE_IRQFLAGS 1114 #ifdef CONFIG_TRACE_IRQFLAGS
1115 unsigned int irq_events; 1115 unsigned int irq_events;
1116 int hardirqs_enabled; 1116 int hardirqs_enabled;
1117 unsigned long hardirq_enable_ip; 1117 unsigned long hardirq_enable_ip;
1118 unsigned int hardirq_enable_event; 1118 unsigned int hardirq_enable_event;
1119 unsigned long hardirq_disable_ip; 1119 unsigned long hardirq_disable_ip;
1120 unsigned int hardirq_disable_event; 1120 unsigned int hardirq_disable_event;
1121 int softirqs_enabled; 1121 int softirqs_enabled;
1122 unsigned long softirq_disable_ip; 1122 unsigned long softirq_disable_ip;
1123 unsigned int softirq_disable_event; 1123 unsigned int softirq_disable_event;
1124 unsigned long softirq_enable_ip; 1124 unsigned long softirq_enable_ip;
1125 unsigned int softirq_enable_event; 1125 unsigned int softirq_enable_event;
1126 int hardirq_context; 1126 int hardirq_context;
1127 int softirq_context; 1127 int softirq_context;
1128 #endif 1128 #endif
1129 #ifdef CONFIG_LOCKDEP 1129 #ifdef CONFIG_LOCKDEP
1130 # define MAX_LOCK_DEPTH 30UL 1130 # define MAX_LOCK_DEPTH 30UL
1131 u64 curr_chain_key; 1131 u64 curr_chain_key;
1132 int lockdep_depth; 1132 int lockdep_depth;
1133 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1133 struct held_lock held_locks[MAX_LOCK_DEPTH];
1134 unsigned int lockdep_recursion; 1134 unsigned int lockdep_recursion;
1135 #endif 1135 #endif
1136 1136
1137 /* journalling filesystem info */ 1137 /* journalling filesystem info */
1138 void *journal_info; 1138 void *journal_info;
1139 1139
1140 /* stacked block device info */ 1140 /* stacked block device info */
1141 struct bio *bio_list, **bio_tail; 1141 struct bio *bio_list, **bio_tail;
1142 1142
1143 /* VM state */ 1143 /* VM state */
1144 struct reclaim_state *reclaim_state; 1144 struct reclaim_state *reclaim_state;
1145 1145
1146 struct backing_dev_info *backing_dev_info; 1146 struct backing_dev_info *backing_dev_info;
1147 1147
1148 struct io_context *io_context; 1148 struct io_context *io_context;
1149 1149
1150 unsigned long ptrace_message; 1150 unsigned long ptrace_message;
1151 siginfo_t *last_siginfo; /* For ptrace use. */ 1151 siginfo_t *last_siginfo; /* For ptrace use. */
1152 /* 1152 /*
1153 * current io wait handle: wait queue entry to use for io waits 1153 * current io wait handle: wait queue entry to use for io waits
1154 * If this thread is processing aio, this points at the waitqueue 1154 * If this thread is processing aio, this points at the waitqueue
1155 * inside the currently handled kiocb. It may be NULL (i.e. default 1155 * inside the currently handled kiocb. It may be NULL (i.e. default
1156 * to a stack based synchronous wait) if it's doing sync IO. 1156 * to a stack based synchronous wait) if it's doing sync IO.
1157 */ 1157 */
1158 wait_queue_t *io_wait; 1158 wait_queue_t *io_wait;
1159 #ifdef CONFIG_TASK_XACCT 1159 #ifdef CONFIG_TASK_XACCT
1160 /* i/o counters (bytes read/written, #syscalls) */ 1160 /* i/o counters (bytes read/written, #syscalls) */
1161 u64 rchar, wchar, syscr, syscw; 1161 u64 rchar, wchar, syscr, syscw;
1162 #endif 1162 #endif
1163 struct task_io_accounting ioac; 1163 struct task_io_accounting ioac;
1164 #if defined(CONFIG_TASK_XACCT) 1164 #if defined(CONFIG_TASK_XACCT)
1165 u64 acct_rss_mem1; /* accumulated rss usage */ 1165 u64 acct_rss_mem1; /* accumulated rss usage */
1166 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1166 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1167 cputime_t acct_stimexpd;/* stime since last update */ 1167 cputime_t acct_stimexpd;/* stime since last update */
1168 #endif 1168 #endif
1169 #ifdef CONFIG_NUMA 1169 #ifdef CONFIG_NUMA
1170 struct mempolicy *mempolicy; 1170 struct mempolicy *mempolicy;
1171 short il_next; 1171 short il_next;
1172 #endif 1172 #endif
1173 #ifdef CONFIG_CPUSETS 1173 #ifdef CONFIG_CPUSETS
1174 struct cpuset *cpuset; 1174 struct cpuset *cpuset;
1175 nodemask_t mems_allowed; 1175 nodemask_t mems_allowed;
1176 int cpuset_mems_generation; 1176 int cpuset_mems_generation;
1177 int cpuset_mem_spread_rotor; 1177 int cpuset_mem_spread_rotor;
1178 #endif 1178 #endif
1179 struct robust_list_head __user *robust_list; 1179 struct robust_list_head __user *robust_list;
1180 #ifdef CONFIG_COMPAT 1180 #ifdef CONFIG_COMPAT
1181 struct compat_robust_list_head __user *compat_robust_list; 1181 struct compat_robust_list_head __user *compat_robust_list;
1182 #endif 1182 #endif
1183 struct list_head pi_state_list; 1183 struct list_head pi_state_list;
1184 struct futex_pi_state *pi_state_cache; 1184 struct futex_pi_state *pi_state_cache;
1185 1185
1186 atomic_t fs_excl; /* holding fs exclusive resources */ 1186 atomic_t fs_excl; /* holding fs exclusive resources */
1187 struct rcu_head rcu; 1187 struct rcu_head rcu;
1188 1188
1189 /* 1189 /*
1190 * cache last used pipe for splice 1190 * cache last used pipe for splice
1191 */ 1191 */
1192 struct pipe_inode_info *splice_pipe; 1192 struct pipe_inode_info *splice_pipe;
1193 #ifdef CONFIG_TASK_DELAY_ACCT 1193 #ifdef CONFIG_TASK_DELAY_ACCT
1194 struct task_delay_info *delays; 1194 struct task_delay_info *delays;
1195 #endif 1195 #endif
1196 #ifdef CONFIG_FAULT_INJECTION 1196 #ifdef CONFIG_FAULT_INJECTION
1197 int make_it_fail; 1197 int make_it_fail;
1198 #endif 1198 #endif
1199 }; 1199 };
1200 1200
1201 /* 1201 /*
1202 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 1202 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1203 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 1203 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1204 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 1204 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1205 * values are inverted: lower p->prio value means higher priority. 1205 * values are inverted: lower p->prio value means higher priority.
1206 * 1206 *
1207 * The MAX_USER_RT_PRIO value allows the actual maximum 1207 * The MAX_USER_RT_PRIO value allows the actual maximum
1208 * RT priority to be separate from the value exported to 1208 * RT priority to be separate from the value exported to
1209 * user-space. This allows kernel threads to set their 1209 * user-space. This allows kernel threads to set their
1210 * priority to a value higher than any user task. Note: 1210 * priority to a value higher than any user task. Note:
1211 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 1211 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1212 */ 1212 */
1213 1213
1214 #define MAX_USER_RT_PRIO 100 1214 #define MAX_USER_RT_PRIO 100
1215 #define MAX_RT_PRIO MAX_USER_RT_PRIO 1215 #define MAX_RT_PRIO MAX_USER_RT_PRIO
1216 1216
1217 #define MAX_PRIO (MAX_RT_PRIO + 40) 1217 #define MAX_PRIO (MAX_RT_PRIO + 40)
1218 #define DEFAULT_PRIO (MAX_RT_PRIO + 20) 1218 #define DEFAULT_PRIO (MAX_RT_PRIO + 20)
1219 1219
1220 static inline int rt_prio(int prio) 1220 static inline int rt_prio(int prio)
1221 { 1221 {
1222 if (unlikely(prio < MAX_RT_PRIO)) 1222 if (unlikely(prio < MAX_RT_PRIO))
1223 return 1; 1223 return 1;
1224 return 0; 1224 return 0;
1225 } 1225 }
1226 1226
1227 static inline int rt_task(struct task_struct *p) 1227 static inline int rt_task(struct task_struct *p)
1228 { 1228 {
1229 return rt_prio(p->prio); 1229 return rt_prio(p->prio);
1230 } 1230 }
1231 1231
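A small worked illustration of the ranges described in the comment above (the helper below is illustrative, not part of this header): with MAX_RT_PRIO = 100 and MAX_PRIO = 140, nice values -20..19 land on p->prio 100..139, DEFAULT_PRIO (120) corresponds to nice 0, and anything below 100 is real-time.

/* Illustrative only: where a SCHED_NORMAL task's nice value lands. */
static inline int nice_to_prio_example(long nice)
{
        return MAX_RT_PRIO + 20 + nice;         /* nice -20..19 -> prio 100..139 */
}

/*
 * nice_to_prio_example(0)   == DEFAULT_PRIO (120)
 * nice_to_prio_example(-20) == MAX_RT_PRIO  (100), lowest non-RT prio value
 * rt_prio(99) == 1, rt_prio(100) == 0
 */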
1232 static inline pid_t process_group(struct task_struct *tsk) 1232 static inline pid_t process_group(struct task_struct *tsk)
1233 { 1233 {
1234 return tsk->signal->pgrp; 1234 return tsk->signal->pgrp;
1235 } 1235 }
1236 1236
1237 static inline pid_t signal_session(struct signal_struct *sig) 1237 static inline pid_t signal_session(struct signal_struct *sig)
1238 { 1238 {
1239 return sig->__session; 1239 return sig->__session;
1240 } 1240 }
1241 1241
1242 static inline pid_t process_session(struct task_struct *tsk) 1242 static inline pid_t process_session(struct task_struct *tsk)
1243 { 1243 {
1244 return signal_session(tsk->signal); 1244 return signal_session(tsk->signal);
1245 } 1245 }
1246 1246
1247 static inline void set_signal_session(struct signal_struct *sig, pid_t session) 1247 static inline void set_signal_session(struct signal_struct *sig, pid_t session)
1248 { 1248 {
1249 sig->__session = session; 1249 sig->__session = session;
1250 } 1250 }
1251 1251
1252 static inline struct pid *task_pid(struct task_struct *task) 1252 static inline struct pid *task_pid(struct task_struct *task)
1253 { 1253 {
1254 return task->pids[PIDTYPE_PID].pid; 1254 return task->pids[PIDTYPE_PID].pid;
1255 } 1255 }
1256 1256
1257 static inline struct pid *task_tgid(struct task_struct *task) 1257 static inline struct pid *task_tgid(struct task_struct *task)
1258 { 1258 {
1259 return task->group_leader->pids[PIDTYPE_PID].pid; 1259 return task->group_leader->pids[PIDTYPE_PID].pid;
1260 } 1260 }
1261 1261
1262 static inline struct pid *task_pgrp(struct task_struct *task) 1262 static inline struct pid *task_pgrp(struct task_struct *task)
1263 { 1263 {
1264 return task->group_leader->pids[PIDTYPE_PGID].pid; 1264 return task->group_leader->pids[PIDTYPE_PGID].pid;
1265 } 1265 }
1266 1266
1267 static inline struct pid *task_session(struct task_struct *task) 1267 static inline struct pid *task_session(struct task_struct *task)
1268 { 1268 {
1269 return task->group_leader->pids[PIDTYPE_SID].pid; 1269 return task->group_leader->pids[PIDTYPE_SID].pid;
1270 } 1270 }
1271 1271
1272 /** 1272 /**
1273 * pid_alive - check that a task structure is not stale 1273 * pid_alive - check that a task structure is not stale
1274 * @p: Task structure to be checked. 1274 * @p: Task structure to be checked.
1275 * 1275 *
1276 * Test if a process is not yet dead (at most zombie state) 1276 * Test if a process is not yet dead (at most zombie state)
1277 * If pid_alive fails, then pointers within the task structure 1277 * If pid_alive fails, then pointers within the task structure
1278 * can be stale and must not be dereferenced. 1278 * can be stale and must not be dereferenced.
1279 */ 1279 */
1280 static inline int pid_alive(struct task_struct *p) 1280 static inline int pid_alive(struct task_struct *p)
1281 { 1281 {
1282 return p->pids[PIDTYPE_PID].pid != NULL; 1282 return p->pids[PIDTYPE_PID].pid != NULL;
1283 } 1283 }
1284 1284
1285 /** 1285 /**
1286 * is_init - check if a task structure is init 1286 * is_init - check if a task structure is init
1287 * @tsk: Task structure to be checked. 1287 * @tsk: Task structure to be checked.
1288 * 1288 *
1289 * Check if a task structure is the first user space task the kernel created. 1289 * Check if a task structure is the first user space task the kernel created.
1290 */ 1290 */
1291 static inline int is_init(struct task_struct *tsk) 1291 static inline int is_init(struct task_struct *tsk)
1292 { 1292 {
1293 return tsk->pid == 1; 1293 return tsk->pid == 1;
1294 } 1294 }
1295 1295
1296 extern struct pid *cad_pid; 1296 extern struct pid *cad_pid;
1297 1297
1298 extern void free_task(struct task_struct *tsk); 1298 extern void free_task(struct task_struct *tsk);
1299 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 1299 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1300 1300
1301 extern void __put_task_struct(struct task_struct *t); 1301 extern void __put_task_struct(struct task_struct *t);
1302 1302
1303 static inline void put_task_struct(struct task_struct *t) 1303 static inline void put_task_struct(struct task_struct *t)
1304 { 1304 {
1305 if (atomic_dec_and_test(&t->usage)) 1305 if (atomic_dec_and_test(&t->usage))
1306 __put_task_struct(t); 1306 __put_task_struct(t);
1307 } 1307 }
1308 1308
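A minimal sketch of the reference pattern behind get_task_struct()/put_task_struct(): look the task up under the tasklist lock, bump its usage count before dropping the lock, and have the caller finish with put_task_struct(). The function name grab_task() is made up; find_task_by_pid() is the lookup declared further down in this header.

/* Sketch: pin a task_struct so it outlives the tasklist lock. */
static struct task_struct *grab_task(pid_t pid)
{
        struct task_struct *p;

        read_lock(&tasklist_lock);
        p = find_task_by_pid(pid);
        if (p)
                get_task_struct(p);     /* usage count, not PID liveness */
        read_unlock(&tasklist_lock);

        return p;                       /* caller ends with put_task_struct(p) */
}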
1309 /* 1309 /*
1310 * Per process flags 1310 * Per process flags
1311 */ 1311 */
1312 #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ 1312 #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
1313 /* Not implemented yet, only for 486*/ 1313 /* Not implemented yet, only for 486*/
1314 #define PF_STARTING 0x00000002 /* being created */ 1314 #define PF_STARTING 0x00000002 /* being created */
1315 #define PF_EXITING 0x00000004 /* getting shut down */ 1315 #define PF_EXITING 0x00000004 /* getting shut down */
1316 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1316 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1317 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1317 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1318 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1318 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1319 #define PF_DUMPCORE 0x00000200 /* dumped core */ 1319 #define PF_DUMPCORE 0x00000200 /* dumped core */
1320 #define PF_SIGNALED 0x00000400 /* killed by a signal */ 1320 #define PF_SIGNALED 0x00000400 /* killed by a signal */
1321 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1321 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1322 #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ 1322 #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
1323 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ 1323 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
1324 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ 1324 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
1325 #define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1325 #define PF_FROZEN 0x00010000 /* frozen for system suspend */
1326 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1326 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1327 #define PF_KSWAPD 0x00040000 /* I am kswapd */ 1327 #define PF_KSWAPD 0x00040000 /* I am kswapd */
1328 #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ 1328 #define PF_SWAPOFF 0x00080000 /* I am in swapoff */
1329 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1329 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1330 #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ 1330 #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */
1331 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1331 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
1332 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1332 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1333 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1333 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1334 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1334 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1335 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1335 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1336 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1336 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1337 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1337 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
1338 1338
1339 /* 1339 /*
1340 * Only the _current_ task can read/write to tsk->flags, but other 1340 * Only the _current_ task can read/write to tsk->flags, but other
1341 * tasks can access tsk->flags in readonly mode for example 1341 * tasks can access tsk->flags in readonly mode for example
1342 * with tsk_used_math (like during threaded core dumping). 1342 * with tsk_used_math (like during threaded core dumping).
1343 * There is however an exception to this rule during ptrace 1343 * There is however an exception to this rule during ptrace
1344 * or during fork: the ptracer task is allowed to write to the 1344 * or during fork: the ptracer task is allowed to write to the
1345 * child->flags of its traced child (same goes for fork, the parent 1345 * child->flags of its traced child (same goes for fork, the parent
1346 * can write to the child->flags), because we're guaranteed the 1346 * can write to the child->flags), because we're guaranteed the
1347 * child is not running and in turn not changing child->flags 1347 * child is not running and in turn not changing child->flags
1348 * at the same time the parent does it. 1348 * at the same time the parent does it.
1349 */ 1349 */
1350 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1350 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1351 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1351 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1352 #define clear_used_math() clear_stopped_child_used_math(current) 1352 #define clear_used_math() clear_stopped_child_used_math(current)
1353 #define set_used_math() set_stopped_child_used_math(current) 1353 #define set_used_math() set_stopped_child_used_math(current)
1354 #define conditional_stopped_child_used_math(condition, child) \ 1354 #define conditional_stopped_child_used_math(condition, child) \
1355 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) 1355 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1356 #define conditional_used_math(condition) \ 1356 #define conditional_used_math(condition) \
1357 conditional_stopped_child_used_math(condition, current) 1357 conditional_stopped_child_used_math(condition, current)
1358 #define copy_to_stopped_child_used_math(child) \ 1358 #define copy_to_stopped_child_used_math(child) \
1359 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1359 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1360 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1360 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1361 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1361 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1362 #define used_math() tsk_used_math(current) 1362 #define used_math() tsk_used_math(current)
1363 1363
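A short sketch of the rule in the comment above: the owning task flips its own PF_USED_MATH bit through the current-task macros, while other tasks only ever read it. Both function names below are made up for illustration.

/* Sketch: only 'current' writes its own flags; others read. */
static void fpu_first_use(void)                 /* runs as 'current' */
{
        if (!used_math())
                set_used_math();                /* PF_USED_MATH on current */
}

static int task_needs_fpu_save(struct task_struct *p)
{
        return tsk_used_math(p) != 0;           /* read-only peek is fine */
}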
1364 #ifdef CONFIG_SMP 1364 #ifdef CONFIG_SMP
1365 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); 1365 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask);
1366 #else 1366 #else
1367 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1367 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1368 { 1368 {
1369 if (!cpu_isset(0, new_mask)) 1369 if (!cpu_isset(0, new_mask))
1370 return -EINVAL; 1370 return -EINVAL;
1371 return 0; 1371 return 0;
1372 } 1372 }
1373 #endif 1373 #endif
1374 1374
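A minimal usage sketch: pin a task to a single CPU, matching the UP stub above which only accepts masks containing CPU 0. The wrapper name pin_task_to_cpu() is made up; cpumask_of_cpu() is the single-CPU mask helper of this era from linux/cpumask.h.

/* Sketch: restrict a task to one CPU. */
static int pin_task_to_cpu(struct task_struct *p, unsigned int cpu)
{
        return set_cpus_allowed(p, cpumask_of_cpu(cpu));  /* 0 or -EINVAL */
}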
1375 extern unsigned long long sched_clock(void); 1375 extern unsigned long long sched_clock(void);
1376 1376
1377 /* 1377 /*
1378 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 1378 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
1379 * clock constructed from sched_clock(): 1379 * clock constructed from sched_clock():
1380 */ 1380 */
1381 extern unsigned long long cpu_clock(int cpu); 1381 extern unsigned long long cpu_clock(int cpu);
1382 1382
1383 extern unsigned long long 1383 extern unsigned long long
1384 task_sched_runtime(struct task_struct *task); 1384 task_sched_runtime(struct task_struct *task);
1385 1385
1386 /* sched_exec is called by processes performing an exec */ 1386 /* sched_exec is called by processes performing an exec */
1387 #ifdef CONFIG_SMP 1387 #ifdef CONFIG_SMP
1388 extern void sched_exec(void); 1388 extern void sched_exec(void);
1389 #else 1389 #else
1390 #define sched_exec() {} 1390 #define sched_exec() {}
1391 #endif 1391 #endif
1392 1392
1393 extern void sched_clock_unstable_event(void); 1393 extern void sched_clock_unstable_event(void);
1394 1394
1395 #ifdef CONFIG_HOTPLUG_CPU 1395 #ifdef CONFIG_HOTPLUG_CPU
1396 extern void idle_task_exit(void); 1396 extern void idle_task_exit(void);
1397 #else 1397 #else
1398 static inline void idle_task_exit(void) {} 1398 static inline void idle_task_exit(void) {}
1399 #endif 1399 #endif
1400 1400
1401 extern void sched_idle_next(void); 1401 extern void sched_idle_next(void);
1402 1402
1403 extern unsigned int sysctl_sched_granularity; 1403 extern unsigned int sysctl_sched_granularity;
1404 extern unsigned int sysctl_sched_wakeup_granularity; 1404 extern unsigned int sysctl_sched_wakeup_granularity;
1405 extern unsigned int sysctl_sched_batch_wakeup_granularity; 1405 extern unsigned int sysctl_sched_batch_wakeup_granularity;
1406 extern unsigned int sysctl_sched_stat_granularity; 1406 extern unsigned int sysctl_sched_stat_granularity;
1407 extern unsigned int sysctl_sched_runtime_limit; 1407 extern unsigned int sysctl_sched_runtime_limit;
1408 extern unsigned int sysctl_sched_child_runs_first; 1408 extern unsigned int sysctl_sched_child_runs_first;
1409 extern unsigned int sysctl_sched_features; 1409 extern unsigned int sysctl_sched_features;
1410 1410
1411 #ifdef CONFIG_RT_MUTEXES 1411 #ifdef CONFIG_RT_MUTEXES
1412 extern int rt_mutex_getprio(struct task_struct *p); 1412 extern int rt_mutex_getprio(struct task_struct *p);
1413 extern void rt_mutex_setprio(struct task_struct *p, int prio); 1413 extern void rt_mutex_setprio(struct task_struct *p, int prio);
1414 extern void rt_mutex_adjust_pi(struct task_struct *p); 1414 extern void rt_mutex_adjust_pi(struct task_struct *p);
1415 #else 1415 #else
1416 static inline int rt_mutex_getprio(struct task_struct *p) 1416 static inline int rt_mutex_getprio(struct task_struct *p)
1417 { 1417 {
1418 return p->normal_prio; 1418 return p->normal_prio;
1419 } 1419 }
1420 # define rt_mutex_adjust_pi(p) do { } while (0) 1420 # define rt_mutex_adjust_pi(p) do { } while (0)
1421 #endif 1421 #endif
1422 1422
1423 extern void set_user_nice(struct task_struct *p, long nice); 1423 extern void set_user_nice(struct task_struct *p, long nice);
1424 extern int task_prio(const struct task_struct *p); 1424 extern int task_prio(const struct task_struct *p);
1425 extern int task_nice(const struct task_struct *p); 1425 extern int task_nice(const struct task_struct *p);
1426 extern int can_nice(const struct task_struct *p, const int nice); 1426 extern int can_nice(const struct task_struct *p, const int nice);
1427 extern int task_curr(const struct task_struct *p); 1427 extern int task_curr(const struct task_struct *p);
1428 extern int idle_cpu(int cpu); 1428 extern int idle_cpu(int cpu);
1429 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1429 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
1430 extern struct task_struct *idle_task(int cpu); 1430 extern struct task_struct *idle_task(int cpu);
1431 extern struct task_struct *curr_task(int cpu); 1431 extern struct task_struct *curr_task(int cpu);
1432 extern void set_curr_task(int cpu, struct task_struct *p); 1432 extern void set_curr_task(int cpu, struct task_struct *p);
1433 1433
1434 void yield(void); 1434 void yield(void);
1435 1435
1436 /* 1436 /*
1437 * The default (Linux) execution domain. 1437 * The default (Linux) execution domain.
1438 */ 1438 */
1439 extern struct exec_domain default_exec_domain; 1439 extern struct exec_domain default_exec_domain;
1440 1440
1441 union thread_union { 1441 union thread_union {
1442 struct thread_info thread_info; 1442 struct thread_info thread_info;
1443 unsigned long stack[THREAD_SIZE/sizeof(long)]; 1443 unsigned long stack[THREAD_SIZE/sizeof(long)];
1444 }; 1444 };
1445 1445
1446 #ifndef __HAVE_ARCH_KSTACK_END 1446 #ifndef __HAVE_ARCH_KSTACK_END
1447 static inline int kstack_end(void *addr) 1447 static inline int kstack_end(void *addr)
1448 { 1448 {
1449 /* Reliable end of stack detection: 1449 /* Reliable end of stack detection:
1450 * Some APM bios versions misalign the stack 1450 * Some APM bios versions misalign the stack
1451 */ 1451 */
1452 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); 1452 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
1453 } 1453 }
1454 #endif 1454 #endif
1455 1455
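A hedged sketch of how stack walkers of this era use kstack_end(): scan machine words upward from a stack pointer until the helper reports the aligned end of the THREAD_SIZE stack. The printing is illustrative only.

/* Sketch: dump raw stack words up to the end of the kernel stack. */
static void dump_raw_stack(unsigned long *sp)
{
        while (!kstack_end(sp))
                printk(" %08lx", *sp++);
        printk("\n");
}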
1456 extern union thread_union init_thread_union; 1456 extern union thread_union init_thread_union;
1457 extern struct task_struct init_task; 1457 extern struct task_struct init_task;
1458 1458
1459 extern struct mm_struct init_mm; 1459 extern struct mm_struct init_mm;
1460 1460
1461 #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) 1461 #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr)
1462 extern struct task_struct *find_task_by_pid_type(int type, int pid); 1462 extern struct task_struct *find_task_by_pid_type(int type, int pid);
1463 extern void __set_special_pids(pid_t session, pid_t pgrp); 1463 extern void __set_special_pids(pid_t session, pid_t pgrp);
1464 1464
1465 /* per-UID process charging. */ 1465 /* per-UID process charging. */
1466 extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); 1466 extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
1467 static inline struct user_struct *get_uid(struct user_struct *u) 1467 static inline struct user_struct *get_uid(struct user_struct *u)
1468 { 1468 {
1469 atomic_inc(&u->__count); 1469 atomic_inc(&u->__count);
1470 return u; 1470 return u;
1471 } 1471 }
1472 extern void free_uid(struct user_struct *); 1472 extern void free_uid(struct user_struct *);
1473 extern void switch_uid(struct user_struct *); 1473 extern void switch_uid(struct user_struct *);
1474 1474
1475 #include <asm/current.h> 1475 #include <asm/current.h>
1476 1476
1477 extern void do_timer(unsigned long ticks); 1477 extern void do_timer(unsigned long ticks);
1478 1478
1479 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); 1479 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
1480 extern int FASTCALL(wake_up_process(struct task_struct * tsk)); 1480 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
1481 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, 1481 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
1482 unsigned long clone_flags)); 1482 unsigned long clone_flags));
1483 #ifdef CONFIG_SMP 1483 #ifdef CONFIG_SMP
1484 extern void kick_process(struct task_struct *tsk); 1484 extern void kick_process(struct task_struct *tsk);
1485 #else 1485 #else
1486 static inline void kick_process(struct task_struct *tsk) { } 1486 static inline void kick_process(struct task_struct *tsk) { }
1487 #endif 1487 #endif
1488 extern void sched_fork(struct task_struct *p, int clone_flags); 1488 extern void sched_fork(struct task_struct *p, int clone_flags);
1489 extern void sched_dead(struct task_struct *p); 1489 extern void sched_dead(struct task_struct *p);
1490 1490
1491 extern int in_group_p(gid_t); 1491 extern int in_group_p(gid_t);
1492 extern int in_egroup_p(gid_t); 1492 extern int in_egroup_p(gid_t);
1493 1493
1494 extern void proc_caches_init(void); 1494 extern void proc_caches_init(void);
1495 extern void flush_signals(struct task_struct *); 1495 extern void flush_signals(struct task_struct *);
1496 extern void ignore_signals(struct task_struct *); 1496 extern void ignore_signals(struct task_struct *);
1497 extern void flush_signal_handlers(struct task_struct *, int force_default); 1497 extern void flush_signal_handlers(struct task_struct *, int force_default);
1498 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 1498 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
1499 1499
1500 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 1500 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
1501 { 1501 {
1502 unsigned long flags; 1502 unsigned long flags;
1503 int ret; 1503 int ret;
1504 1504
1505 spin_lock_irqsave(&tsk->sighand->siglock, flags); 1505 spin_lock_irqsave(&tsk->sighand->siglock, flags);
1506 ret = dequeue_signal(tsk, mask, info); 1506 ret = dequeue_signal(tsk, mask, info);
1507 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 1507 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
1508 1508
1509 return ret; 1509 return ret;
1510 } 1510 }
1511 1511
1512 extern void block_all_signals(int (*notifier)(void *priv), void *priv, 1512 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
1513 sigset_t *mask); 1513 sigset_t *mask);
1514 extern void unblock_all_signals(void); 1514 extern void unblock_all_signals(void);
1515 extern void release_task(struct task_struct * p); 1515 extern void release_task(struct task_struct * p);
1516 extern int send_sig_info(int, struct siginfo *, struct task_struct *); 1516 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
1517 extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); 1517 extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
1518 extern int force_sigsegv(int, struct task_struct *); 1518 extern int force_sigsegv(int, struct task_struct *);
1519 extern int force_sig_info(int, struct siginfo *, struct task_struct *); 1519 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
1520 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 1520 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
1521 extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 1521 extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
1522 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); 1522 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
1523 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); 1523 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
1524 extern int kill_pgrp(struct pid *pid, int sig, int priv); 1524 extern int kill_pgrp(struct pid *pid, int sig, int priv);
1525 extern int kill_pid(struct pid *pid, int sig, int priv); 1525 extern int kill_pid(struct pid *pid, int sig, int priv);
1526 extern int kill_proc_info(int, struct siginfo *, pid_t); 1526 extern int kill_proc_info(int, struct siginfo *, pid_t);
1527 extern void do_notify_parent(struct task_struct *, int); 1527 extern void do_notify_parent(struct task_struct *, int);
1528 extern void force_sig(int, struct task_struct *); 1528 extern void force_sig(int, struct task_struct *);
1529 extern void force_sig_specific(int, struct task_struct *); 1529 extern void force_sig_specific(int, struct task_struct *);
1530 extern int send_sig(int, struct task_struct *, int); 1530 extern int send_sig(int, struct task_struct *, int);
1531 extern void zap_other_threads(struct task_struct *p); 1531 extern void zap_other_threads(struct task_struct *p);
1532 extern int kill_proc(pid_t, int, int); 1532 extern int kill_proc(pid_t, int, int);
1533 extern struct sigqueue *sigqueue_alloc(void); 1533 extern struct sigqueue *sigqueue_alloc(void);
1534 extern void sigqueue_free(struct sigqueue *); 1534 extern void sigqueue_free(struct sigqueue *);
1535 extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); 1535 extern int send_sigqueue(int, struct sigqueue *, struct task_struct *);
1536 extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); 1536 extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *);
1537 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); 1537 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
1538 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); 1538 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
1539 1539
1540 static inline int kill_cad_pid(int sig, int priv) 1540 static inline int kill_cad_pid(int sig, int priv)
1541 { 1541 {
1542 return kill_pid(cad_pid, sig, priv); 1542 return kill_pid(cad_pid, sig, priv);
1543 } 1543 }
1544 1544
1545 /* These can be the second arg to send_sig_info/send_group_sig_info. */ 1545 /* These can be the second arg to send_sig_info/send_group_sig_info. */
1546 #define SEND_SIG_NOINFO ((struct siginfo *) 0) 1546 #define SEND_SIG_NOINFO ((struct siginfo *) 0)
1547 #define SEND_SIG_PRIV ((struct siginfo *) 1) 1547 #define SEND_SIG_PRIV ((struct siginfo *) 1)
1548 #define SEND_SIG_FORCED ((struct siginfo *) 2) 1548 #define SEND_SIG_FORCED ((struct siginfo *) 2)
1549 1549
1550 static inline int is_si_special(const struct siginfo *info) 1550 static inline int is_si_special(const struct siginfo *info)
1551 { 1551 {
1552 return info <= SEND_SIG_FORCED; 1552 return info <= SEND_SIG_FORCED;
1553 } 1553 }
1554 1554
1555 /* True if we are on the alternate signal stack. */ 1555 /* True if we are on the alternate signal stack. */
1556 1556
1557 static inline int on_sig_stack(unsigned long sp) 1557 static inline int on_sig_stack(unsigned long sp)
1558 { 1558 {
1559 return (sp - current->sas_ss_sp < current->sas_ss_size); 1559 return (sp - current->sas_ss_sp < current->sas_ss_size);
1560 } 1560 }
1561 1561
1562 static inline int sas_ss_flags(unsigned long sp) 1562 static inline int sas_ss_flags(unsigned long sp)
1563 { 1563 {
1564 return (current->sas_ss_size == 0 ? SS_DISABLE 1564 return (current->sas_ss_size == 0 ? SS_DISABLE
1565 : on_sig_stack(sp) ? SS_ONSTACK : 0); 1565 : on_sig_stack(sp) ? SS_ONSTACK : 0);
1566 } 1566 }
1567 1567
1568 /* 1568 /*
1569 * Routines for handling mm_structs 1569 * Routines for handling mm_structs
1570 */ 1570 */
1571 extern struct mm_struct * mm_alloc(void); 1571 extern struct mm_struct * mm_alloc(void);
1572 1572
1573 /* mmdrop drops the mm and the page tables */ 1573 /* mmdrop drops the mm and the page tables */
1574 extern void FASTCALL(__mmdrop(struct mm_struct *)); 1574 extern void FASTCALL(__mmdrop(struct mm_struct *));
1575 static inline void mmdrop(struct mm_struct * mm) 1575 static inline void mmdrop(struct mm_struct * mm)
1576 { 1576 {
1577 if (unlikely(atomic_dec_and_test(&mm->mm_count))) 1577 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
1578 __mmdrop(mm); 1578 __mmdrop(mm);
1579 } 1579 }
1580 1580
1581 /* mmput gets rid of the mappings and all user-space */ 1581 /* mmput gets rid of the mappings and all user-space */
1582 extern void mmput(struct mm_struct *); 1582 extern void mmput(struct mm_struct *);
1583 /* Grab a reference to a task's mm, if it is not already going away */ 1583 /* Grab a reference to a task's mm, if it is not already going away */
1584 extern struct mm_struct *get_task_mm(struct task_struct *task); 1584 extern struct mm_struct *get_task_mm(struct task_struct *task);
1585 /* Remove the current task's stale references to the old mm_struct */ 1585 /* Remove the current task's stale references to the old mm_struct */
1586 extern void mm_release(struct task_struct *, struct mm_struct *); 1586 extern void mm_release(struct task_struct *, struct mm_struct *);
1587 1587
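A minimal sketch of the mm reference pattern behind these helpers: get_task_mm() returns NULL if the task's address space is already going away, and every successful call must be paired with mmput(). The function name task_total_vm() is made up for illustration.

/* Sketch: borrow another task's mm safely. */
static unsigned long task_total_vm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        unsigned long total_vm = 0;

        if (mm) {
                total_vm = mm->total_vm;        /* pages mapped by the task */
                mmput(mm);                      /* drop the mm_users reference */
        }
        return total_vm;
}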
1588 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); 1588 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
1589 extern void flush_thread(void); 1589 extern void flush_thread(void);
1590 extern void exit_thread(void); 1590 extern void exit_thread(void);
1591 1591
1592 extern void exit_files(struct task_struct *); 1592 extern void exit_files(struct task_struct *);
1593 extern void __cleanup_signal(struct signal_struct *); 1593 extern void __cleanup_signal(struct signal_struct *);
1594 extern void __cleanup_sighand(struct sighand_struct *); 1594 extern void __cleanup_sighand(struct sighand_struct *);
1595 extern void exit_itimers(struct signal_struct *); 1595 extern void exit_itimers(struct signal_struct *);
1596 1596
1597 extern NORET_TYPE void do_group_exit(int); 1597 extern NORET_TYPE void do_group_exit(int);
1598 1598
1599 extern void daemonize(const char *, ...); 1599 extern void daemonize(const char *, ...);
1600 extern int allow_signal(int); 1600 extern int allow_signal(int);
1601 extern int disallow_signal(int); 1601 extern int disallow_signal(int);
1602 1602
1603 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); 1603 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
1604 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); 1604 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
1605 struct task_struct *fork_idle(int); 1605 struct task_struct *fork_idle(int);
1606 1606
1607 extern void set_task_comm(struct task_struct *tsk, char *from); 1607 extern void set_task_comm(struct task_struct *tsk, char *from);
1608 extern void get_task_comm(char *to, struct task_struct *tsk); 1608 extern void get_task_comm(char *to, struct task_struct *tsk);
1609 1609
1610 #ifdef CONFIG_SMP 1610 #ifdef CONFIG_SMP
1611 extern void wait_task_inactive(struct task_struct * p); 1611 extern void wait_task_inactive(struct task_struct * p);
1612 #else 1612 #else
1613 #define wait_task_inactive(p) do { } while (0) 1613 #define wait_task_inactive(p) do { } while (0)
1614 #endif 1614 #endif
1615 1615
1616 #define remove_parent(p) list_del_init(&(p)->sibling) 1616 #define remove_parent(p) list_del_init(&(p)->sibling)
1617 #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) 1617 #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
1618 1618
1619 #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) 1619 #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
1620 1620
1621 #define for_each_process(p) \ 1621 #define for_each_process(p) \
1622 for (p = &init_task ; (p = next_task(p)) != &init_task ; ) 1622 for (p = &init_task ; (p = next_task(p)) != &init_task ; )
1623 1623
1624 /* 1624 /*
1625 * Careful: do_each_thread/while_each_thread is a double loop so 1625 * Careful: do_each_thread/while_each_thread is a double loop so
1626 * 'break' will not work as expected - use goto instead. 1626 * 'break' will not work as expected - use goto instead.
1627 */ 1627 */
1628 #define do_each_thread(g, t) \ 1628 #define do_each_thread(g, t) \
1629 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do 1629 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
1630 1630
1631 #define while_each_thread(g, t) \ 1631 #define while_each_thread(g, t) \
1632 while ((t = next_thread(t)) != g) 1632 while ((t = next_thread(t)) != g)
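Because do_each_thread()/while_each_thread() expand to a nested loop, a plain break only leaves the inner loop; below is a sketch of the goto idiom the comment above asks for (illustrative only; the function and its match callback are hypothetical):

static pid_t find_matching_thread(int (*match)(struct task_struct *))
{
	struct task_struct *g, *t;
	pid_t pid = 0;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (match(t)) {
			pid = t->pid;
			goto out;	/* 'break' would only exit the inner loop */
		}
	} while_each_thread(g, t);
out:
	read_unlock(&tasklist_lock);

	return pid;
}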
1633 1633
1634 /* de_thread depends on thread_group_leader not being a pid based check */ 1634 /* de_thread depends on thread_group_leader not being a pid based check */
1635 #define thread_group_leader(p) (p == p->group_leader) 1635 #define thread_group_leader(p) (p == p->group_leader)
1636 1636
1637 /* Due to the insanities of de_thread it is possible for a process 1637 /* Due to the insanities of de_thread it is possible for a process
1638 * to have the pid of the thread group leader without actually being 1638 * to have the pid of the thread group leader without actually being
1639 * the thread group leader. For iteration through the pids in proc 1639 * the thread group leader. For iteration through the pids in proc
1640 * all we care about is that we have a task with the appropriate 1640 * all we care about is that we have a task with the appropriate
1641 * pid; we don't actually care if we have the right task. 1641 * pid; we don't actually care if we have the right task.
1642 */ 1642 */
1643 static inline int has_group_leader_pid(struct task_struct *p) 1643 static inline int has_group_leader_pid(struct task_struct *p)
1644 { 1644 {
1645 return p->pid == p->tgid; 1645 return p->pid == p->tgid;
1646 } 1646 }
1647 1647
1648 static inline struct task_struct *next_thread(const struct task_struct *p) 1648 static inline struct task_struct *next_thread(const struct task_struct *p)
1649 { 1649 {
1650 return list_entry(rcu_dereference(p->thread_group.next), 1650 return list_entry(rcu_dereference(p->thread_group.next),
1651 struct task_struct, thread_group); 1651 struct task_struct, thread_group);
1652 } 1652 }
1653 1653
1654 static inline int thread_group_empty(struct task_struct *p) 1654 static inline int thread_group_empty(struct task_struct *p)
1655 { 1655 {
1656 return list_empty(&p->thread_group); 1656 return list_empty(&p->thread_group);
1657 } 1657 }
1658 1658
1659 #define delay_group_leader(p) \ 1659 #define delay_group_leader(p) \
1660 (thread_group_leader(p) && !thread_group_empty(p)) 1660 (thread_group_leader(p) && !thread_group_empty(p))
1661 1661
1662 /* 1662 /*
1663 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 1663 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
1664 * subscriptions and synchronises with wait4(). Also used in procfs. Also 1664 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1665 * pins the final release of task.io_context. Also protects ->cpuset. 1665 * pins the final release of task.io_context. Also protects ->cpuset.
1666 * 1666 *
1667 * Nests both inside and outside of read_lock(&tasklist_lock). 1667 * Nests both inside and outside of read_lock(&tasklist_lock).
1668 * It must not be nested with write_lock_irq(&tasklist_lock), 1668 * It must not be nested with write_lock_irq(&tasklist_lock),
1669 * neither inside nor outside. 1669 * neither inside nor outside.
1670 */ 1670 */
1671 static inline void task_lock(struct task_struct *p) 1671 static inline void task_lock(struct task_struct *p)
1672 { 1672 {
1673 spin_lock(&p->alloc_lock); 1673 spin_lock(&p->alloc_lock);
1674 } 1674 }
1675 1675
1676 static inline void task_unlock(struct task_struct *p) 1676 static inline void task_unlock(struct task_struct *p)
1677 { 1677 {
1678 spin_unlock(&p->alloc_lock); 1678 spin_unlock(&p->alloc_lock);
1679 } 1679 }
1680 1680
1681 extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, 1681 extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
1682 unsigned long *flags); 1682 unsigned long *flags);
1683 1683
1684 static inline void unlock_task_sighand(struct task_struct *tsk, 1684 static inline void unlock_task_sighand(struct task_struct *tsk,
1685 unsigned long *flags) 1685 unsigned long *flags)
1686 { 1686 {
1687 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 1687 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
1688 } 1688 }
1689 1689
1690 #ifndef __HAVE_THREAD_FUNCTIONS 1690 #ifndef __HAVE_THREAD_FUNCTIONS
1691 1691
1692 #define task_thread_info(task) ((struct thread_info *)(task)->stack) 1692 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
1693 #define task_stack_page(task) ((task)->stack) 1693 #define task_stack_page(task) ((task)->stack)
1694 1694
1695 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 1695 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
1696 { 1696 {
1697 *task_thread_info(p) = *task_thread_info(org); 1697 *task_thread_info(p) = *task_thread_info(org);
1698 task_thread_info(p)->task = p; 1698 task_thread_info(p)->task = p;
1699 } 1699 }
1700 1700
1701 static inline unsigned long *end_of_stack(struct task_struct *p) 1701 static inline unsigned long *end_of_stack(struct task_struct *p)
1702 { 1702 {
1703 return (unsigned long *)(task_thread_info(p) + 1); 1703 return (unsigned long *)(task_thread_info(p) + 1);
1704 } 1704 }
1705 1705
1706 #endif 1706 #endif
1707 1707
1708 /* set thread flags in other task's structures 1708 /* set thread flags in other task's structures
1709 * - see asm/thread_info.h for TIF_xxxx flags available 1709 * - see asm/thread_info.h for TIF_xxxx flags available
1710 */ 1710 */
1711 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 1711 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
1712 { 1712 {
1713 set_ti_thread_flag(task_thread_info(tsk), flag); 1713 set_ti_thread_flag(task_thread_info(tsk), flag);
1714 } 1714 }
1715 1715
1716 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 1716 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
1717 { 1717 {
1718 clear_ti_thread_flag(task_thread_info(tsk), flag); 1718 clear_ti_thread_flag(task_thread_info(tsk), flag);
1719 } 1719 }
1720 1720
1721 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 1721 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
1722 { 1722 {
1723 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 1723 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
1724 } 1724 }
1725 1725
1726 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 1726 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
1727 { 1727 {
1728 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 1728 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
1729 } 1729 }
1730 1730
1731 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 1731 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
1732 { 1732 {
1733 return test_ti_thread_flag(task_thread_info(tsk), flag); 1733 return test_ti_thread_flag(task_thread_info(tsk), flag);
1734 } 1734 }
1735 1735
1736 static inline void set_tsk_need_resched(struct task_struct *tsk) 1736 static inline void set_tsk_need_resched(struct task_struct *tsk)
1737 { 1737 {
1738 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 1738 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1739 } 1739 }
1740 1740
1741 static inline void clear_tsk_need_resched(struct task_struct *tsk) 1741 static inline void clear_tsk_need_resched(struct task_struct *tsk)
1742 { 1742 {
1743 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 1743 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1744 } 1744 }
1745 1745
1746 static inline int signal_pending(struct task_struct *p) 1746 static inline int signal_pending(struct task_struct *p)
1747 { 1747 {
1748 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 1748 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
1749 } 1749 }
1750 1750
1751 static inline int need_resched(void) 1751 static inline int need_resched(void)
1752 { 1752 {
1753 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 1753 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
1754 } 1754 }
1755 1755
1756 /* 1756 /*
1757 * cond_resched() and cond_resched_lock(): latency reduction via 1757 * cond_resched() and cond_resched_lock(): latency reduction via
1758 * explicit rescheduling in places that are safe. The return 1758 * explicit rescheduling in places that are safe. The return
1759 * value indicates whether a reschedule was done in fact. 1759 * value indicates whether a reschedule was done in fact.
1760 * cond_resched_lock() will drop the spinlock before scheduling, 1760 * cond_resched_lock() will drop the spinlock before scheduling,
1761 * cond_resched_softirq() will enable bhs before scheduling. 1761 * cond_resched_softirq() will enable bhs before scheduling.
1762 */ 1762 */
1763 extern int cond_resched(void); 1763 extern int cond_resched(void);
1764 extern int cond_resched_lock(spinlock_t * lock); 1764 extern int cond_resched_lock(spinlock_t * lock);
1765 extern int cond_resched_softirq(void); 1765 extern int cond_resched_softirq(void);
1766 1766
1767 /* 1767 /*
1768 * Does a critical section need to be broken due to another 1768 * Does a critical section need to be broken due to another
1769 * task waiting?: 1769 * task waiting?:
1770 */ 1770 */
1771 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 1771 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
1772 # define need_lockbreak(lock) ((lock)->break_lock) 1772 # define need_lockbreak(lock) ((lock)->break_lock)
1773 #else 1773 #else
1774 # define need_lockbreak(lock) 0 1774 # define need_lockbreak(lock) 0
1775 #endif 1775 #endif
1776 1776
1777 /* 1777 /*
1778 * Does a critical section need to be broken due to another 1778 * Does a critical section need to be broken due to another
1779 * task waiting or preemption being signalled: 1779 * task waiting or preemption being signalled:
1780 */ 1780 */
1781 static inline int lock_need_resched(spinlock_t *lock) 1781 static inline int lock_need_resched(spinlock_t *lock)
1782 { 1782 {
1783 if (need_lockbreak(lock) || need_resched()) 1783 if (need_lockbreak(lock) || need_resched())
1784 return 1; 1784 return 1;
1785 return 0; 1785 return 0;
1786 } 1786 }
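A sketch of how these primitives can be combined to keep a long critical section preemption-friendly (illustrative only; process_some() and its resume convention are hypothetical):

/*
 * Do a bounded amount of work under 'lock', backing off early when another
 * task is spinning on the lock or a reschedule is pending.  Returns the
 * index to resume from; the caller can cond_resched() and call again.
 */
static int process_some(spinlock_t *lock, int *table, int start, int nr)
{
	int i;

	spin_lock(lock);
	for (i = start; i < nr; i++) {
		table[i]++;			/* stand-in for real work */
		if (lock_need_resched(lock)) {
			i++;			/* table[i] is already done */
			break;
		}
	}
	spin_unlock(lock);

	return i;
}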
1787 1787
1788 /* 1788 /*
1789 * Reevaluate whether the task has signals pending delivery. 1789 * Reevaluate whether the task has signals pending delivery.
1790 * Wake the task if so. 1790 * Wake the task if so.
1791 * This is required every time the blocked sigset_t changes. 1791 * This is required every time the blocked sigset_t changes.
1792 * callers must hold sighand->siglock. 1792 * callers must hold sighand->siglock.
1793 */ 1793 */
1794 extern void recalc_sigpending_and_wake(struct task_struct *t); 1794 extern void recalc_sigpending_and_wake(struct task_struct *t);
1795 extern void recalc_sigpending(void); 1795 extern void recalc_sigpending(void);
1796 1796
1797 extern void signal_wake_up(struct task_struct *t, int resume_stopped); 1797 extern void signal_wake_up(struct task_struct *t, int resume_stopped);
1798 1798
1799 /* 1799 /*
1800 * Wrappers for p->thread_info->cpu access. No-op on UP. 1800 * Wrappers for p->thread_info->cpu access. No-op on UP.
1801 */ 1801 */
1802 #ifdef CONFIG_SMP 1802 #ifdef CONFIG_SMP
1803 1803
1804 static inline unsigned int task_cpu(const struct task_struct *p) 1804 static inline unsigned int task_cpu(const struct task_struct *p)
1805 { 1805 {
1806 return task_thread_info(p)->cpu; 1806 return task_thread_info(p)->cpu;
1807 } 1807 }
1808 1808
1809 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 1809 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
1810 1810
1811 #else 1811 #else
1812 1812
1813 static inline unsigned int task_cpu(const struct task_struct *p) 1813 static inline unsigned int task_cpu(const struct task_struct *p)
1814 { 1814 {
1815 return 0; 1815 return 0;
1816 } 1816 }
1817 1817
1818 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 1818 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
1819 { 1819 {
1820 } 1820 }
1821 1821
1822 #endif /* CONFIG_SMP */ 1822 #endif /* CONFIG_SMP */
1823 1823
1824 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1824 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1825 extern void arch_pick_mmap_layout(struct mm_struct *mm); 1825 extern void arch_pick_mmap_layout(struct mm_struct *mm);
1826 #else 1826 #else
1827 static inline void arch_pick_mmap_layout(struct mm_struct *mm) 1827 static inline void arch_pick_mmap_layout(struct mm_struct *mm)
1828 { 1828 {
1829 mm->mmap_base = TASK_UNMAPPED_BASE; 1829 mm->mmap_base = TASK_UNMAPPED_BASE;
1830 mm->get_unmapped_area = arch_get_unmapped_area; 1830 mm->get_unmapped_area = arch_get_unmapped_area;
1831 mm->unmap_area = arch_unmap_area; 1831 mm->unmap_area = arch_unmap_area;
1832 } 1832 }
1833 #endif 1833 #endif
1834 1834
1835 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); 1835 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
1836 extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 1836 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
1837 1837
1838 extern int sched_mc_power_savings, sched_smt_power_savings; 1838 extern int sched_mc_power_savings, sched_smt_power_savings;
1839 1839
1840 extern void normalize_rt_tasks(void); 1840 extern void normalize_rt_tasks(void);
1841 1841
1842 #ifdef CONFIG_TASK_XACCT 1842 #ifdef CONFIG_TASK_XACCT
1843 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 1843 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
1844 { 1844 {
1845 tsk->rchar += amt; 1845 tsk->rchar += amt;
1846 } 1846 }
1847 1847
1848 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 1848 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
1849 { 1849 {
1850 tsk->wchar += amt; 1850 tsk->wchar += amt;
1851 } 1851 }
1852 1852
1853 static inline void inc_syscr(struct task_struct *tsk) 1853 static inline void inc_syscr(struct task_struct *tsk)
1854 { 1854 {
1855 tsk->syscr++; 1855 tsk->syscr++;
1856 } 1856 }
1857 1857
1858 static inline void inc_syscw(struct task_struct *tsk) 1858 static inline void inc_syscw(struct task_struct *tsk)
1859 { 1859 {
1860 tsk->syscw++; 1860 tsk->syscw++;
1861 } 1861 }
1862 #else 1862 #else
1863 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 1863 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
1864 { 1864 {
1865 } 1865 }
1866 1866
1867 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 1867 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
1868 { 1868 {
1869 } 1869 }
1870 1870
1871 static inline void inc_syscr(struct task_struct *tsk) 1871 static inline void inc_syscr(struct task_struct *tsk)
1872 { 1872 {
1873 } 1873 }
1874 1874
1875 static inline void inc_syscw(struct task_struct *tsk) 1875 static inline void inc_syscw(struct task_struct *tsk)
1876 { 1876 {
1877 } 1877 }
1878 #endif 1878 #endif
1879 1879
1880 #endif /* __KERNEL__ */ 1880 #endif /* __KERNEL__ */
1881 1881
1882 #endif 1882 #endif
1883 1883
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 */ 25 */
26 26
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/nmi.h> 29 #include <linux/nmi.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/uaccess.h> 31 #include <linux/uaccess.h>
32 #include <linux/highmem.h> 32 #include <linux/highmem.h>
33 #include <linux/smp_lock.h> 33 #include <linux/smp_lock.h>
34 #include <asm/mmu_context.h> 34 #include <asm/mmu_context.h>
35 #include <linux/interrupt.h> 35 #include <linux/interrupt.h>
36 #include <linux/capability.h> 36 #include <linux/capability.h>
37 #include <linux/completion.h> 37 #include <linux/completion.h>
38 #include <linux/kernel_stat.h> 38 #include <linux/kernel_stat.h>
39 #include <linux/debug_locks.h> 39 #include <linux/debug_locks.h>
40 #include <linux/security.h> 40 #include <linux/security.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/profile.h> 42 #include <linux/profile.h>
43 #include <linux/freezer.h> 43 #include <linux/freezer.h>
44 #include <linux/vmalloc.h> 44 #include <linux/vmalloc.h>
45 #include <linux/blkdev.h> 45 #include <linux/blkdev.h>
46 #include <linux/delay.h> 46 #include <linux/delay.h>
47 #include <linux/smp.h> 47 #include <linux/smp.h>
48 #include <linux/threads.h> 48 #include <linux/threads.h>
49 #include <linux/timer.h> 49 #include <linux/timer.h>
50 #include <linux/rcupdate.h> 50 #include <linux/rcupdate.h>
51 #include <linux/cpu.h> 51 #include <linux/cpu.h>
52 #include <linux/cpuset.h> 52 #include <linux/cpuset.h>
53 #include <linux/percpu.h> 53 #include <linux/percpu.h>
54 #include <linux/kthread.h> 54 #include <linux/kthread.h>
55 #include <linux/seq_file.h> 55 #include <linux/seq_file.h>
56 #include <linux/sysctl.h> 56 #include <linux/sysctl.h>
57 #include <linux/syscalls.h> 57 #include <linux/syscalls.h>
58 #include <linux/times.h> 58 #include <linux/times.h>
59 #include <linux/tsacct_kern.h> 59 #include <linux/tsacct_kern.h>
60 #include <linux/kprobes.h> 60 #include <linux/kprobes.h>
61 #include <linux/delayacct.h> 61 #include <linux/delayacct.h>
62 #include <linux/reciprocal_div.h> 62 #include <linux/reciprocal_div.h>
63 #include <linux/unistd.h> 63 #include <linux/unistd.h>
64 64
65 #include <asm/tlb.h> 65 #include <asm/tlb.h>
66 66
67 /* 67 /*
68 * Scheduler clock - returns current time in nanosec units. 68 * Scheduler clock - returns current time in nanosec units.
69 * This is the default implementation. 69 * This is the default implementation.
70 * Architectures and sub-architectures can override this. 70 * Architectures and sub-architectures can override this.
71 */ 71 */
72 unsigned long long __attribute__((weak)) sched_clock(void) 72 unsigned long long __attribute__((weak)) sched_clock(void)
73 { 73 {
74 return (unsigned long long)jiffies * (1000000000 / HZ); 74 return (unsigned long long)jiffies * (1000000000 / HZ);
75 } 75 }
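The resolution of this fallback is one jiffy; as a quick worked example (assuming common HZ values):

	1000000000 / HZ  =  1,000,000 ns per tick   (HZ == 1000)
	                 =  4,000,000 ns per tick   (HZ ==  250)

which is why architectures with a fine-grained cycle counter are expected to override it.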
76 76
77 /* 77 /*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
80 * and back. 80 * and back.
81 */ 81 */
82 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 82 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
83 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 83 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
84 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 84 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
85 85
86 /* 86 /*
87 * 'User priority' is the nice value converted to something we 87 * 'User priority' is the nice value converted to something we
88 * can work with better when scaling various scheduler parameters, 88 * can work with better when scaling various scheduler parameters,
89 * it's a [ 0 ... 39 ] range. 89 * it's a [ 0 ... 39 ] range.
90 */ 90 */
91 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 91 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
92 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 92 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
93 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 93 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
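For concreteness, assuming the conventional values MAX_RT_PRIO == 100 and MAX_PRIO == 140 (an assumption of this illustration, not something set by this patch), the macros above give:

	NICE_TO_PRIO(-20) == 100	/* best non-RT static priority */
	NICE_TO_PRIO(0)   == 120	/* the default */
	NICE_TO_PRIO(19)  == 139	/* worst static priority */
	USER_PRIO(120)    == 20		/* nice 0 in the [0..39] range */
	MAX_USER_PRIO     == 40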
94 94
95 /* 95 /*
96 * Some helpers for converting nanosecond timing to jiffy resolution 96 * Some helpers for converting nanosecond timing to jiffy resolution
97 */ 97 */
98 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 98 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
99 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 99 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
100 100
101 #define NICE_0_LOAD SCHED_LOAD_SCALE 101 #define NICE_0_LOAD SCHED_LOAD_SCALE
102 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 102 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
103 103
104 /* 104 /*
105 * These are the 'tuning knobs' of the scheduler: 105 * These are the 'tuning knobs' of the scheduler:
106 * 106 *
107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
109 * Timeslices get refilled after they expire. 109 * Timeslices get refilled after they expire.
110 */ 110 */
111 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 111 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
112 #define DEF_TIMESLICE (100 * HZ / 1000) 112 #define DEF_TIMESLICE (100 * HZ / 1000)
113 113
114 #ifdef CONFIG_SMP 114 #ifdef CONFIG_SMP
115 /* 115 /*
116 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 116 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
117 * Since cpu_power is a 'constant', we can use a reciprocal divide. 117 * Since cpu_power is a 'constant', we can use a reciprocal divide.
118 */ 118 */
119 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) 119 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
120 { 120 {
121 return reciprocal_divide(load, sg->reciprocal_cpu_power); 121 return reciprocal_divide(load, sg->reciprocal_cpu_power);
122 } 122 }
123 123
124 /* 124 /*
125 * Each time a sched group cpu_power is changed, 125 * Each time a sched group cpu_power is changed,
126 * we must compute its reciprocal value 126 * we must compute its reciprocal value
127 */ 127 */
128 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) 128 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
129 { 129 {
130 sg->__cpu_power += val; 130 sg->__cpu_power += val;
131 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); 131 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
132 } 132 }
133 #endif 133 #endif
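The reciprocal trick turns the per-call division by sg->__cpu_power into one 64-bit multiply plus a shift. A stand-alone sketch of the same idea (illustrative only; the real helpers are reciprocal_value()/reciprocal_divide() from <linux/reciprocal_div.h>, whose rounding may differ by one for some inputs):

#include <stdint.h>

/* Precompute roughly 2^32 / divisor once, whenever the divisor changes. */
static inline uint32_t recip_value(uint32_t divisor)	/* assumes divisor > 1 */
{
	return (uint32_t)(0xffffffffULL / divisor + 1);
}

/* Then a / divisor becomes (a * recip) >> 32, with no division at all. */
static inline uint32_t recip_divide(uint32_t a, uint32_t recip)
{
	return (uint32_t)(((uint64_t)a * recip) >> 32);
}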
134 134
135 #define SCALE_PRIO(x, prio) \ 135 #define SCALE_PRIO(x, prio) \
136 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 136 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
137 137
138 /* 138 /*
139 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 139 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
140 * to time slice values: [800ms ... 100ms ... 5ms] 140 * to time slice values: [800ms ... 100ms ... 5ms]
141 */ 141 */
142 static unsigned int static_prio_timeslice(int static_prio) 142 static unsigned int static_prio_timeslice(int static_prio)
143 { 143 {
144 if (static_prio == NICE_TO_PRIO(19)) 144 if (static_prio == NICE_TO_PRIO(19))
145 return 1; 145 return 1;
146 146
147 if (static_prio < NICE_TO_PRIO(0)) 147 if (static_prio < NICE_TO_PRIO(0))
148 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); 148 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
149 else 149 else
150 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 150 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
151 } 151 }
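Worked examples (illustrative arithmetic, assuming MAX_PRIO == 140 and MAX_USER_PRIO == 40, with DEF_TIMESLICE corresponding to 100 ms):

	nice -20: SCALE_PRIO(400 ms, 100) = 400 * 40 / 20 = 800 ms
	nice   0: SCALE_PRIO(100 ms, 120) = 100 * 20 / 20 = 100 ms
	nice  10: SCALE_PRIO(100 ms, 130) = 100 * 10 / 20 =  50 ms
	nice  19: 1 jiffy (the explicit special case above)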
152 152
153 static inline int rt_policy(int policy) 153 static inline int rt_policy(int policy)
154 { 154 {
155 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 155 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
156 return 1; 156 return 1;
157 return 0; 157 return 0;
158 } 158 }
159 159
160 static inline int task_has_rt_policy(struct task_struct *p) 160 static inline int task_has_rt_policy(struct task_struct *p)
161 { 161 {
162 return rt_policy(p->policy); 162 return rt_policy(p->policy);
163 } 163 }
164 164
165 /* 165 /*
166 * This is the priority-queue data structure of the RT scheduling class: 166 * This is the priority-queue data structure of the RT scheduling class:
167 */ 167 */
168 struct rt_prio_array { 168 struct rt_prio_array {
169 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 169 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
170 struct list_head queue[MAX_RT_PRIO]; 170 struct list_head queue[MAX_RT_PRIO];
171 }; 171 };
172 172
173 struct load_stat { 173 struct load_stat {
174 struct load_weight load; 174 struct load_weight load;
175 u64 load_update_start, load_update_last; 175 u64 load_update_start, load_update_last;
176 unsigned long delta_fair, delta_exec, delta_stat; 176 unsigned long delta_fair, delta_exec, delta_stat;
177 }; 177 };
178 178
179 /* CFS-related fields in a runqueue */ 179 /* CFS-related fields in a runqueue */
180 struct cfs_rq { 180 struct cfs_rq {
181 struct load_weight load; 181 struct load_weight load;
182 unsigned long nr_running; 182 unsigned long nr_running;
183 183
184 s64 fair_clock; 184 s64 fair_clock;
185 u64 exec_clock; 185 u64 exec_clock;
186 s64 wait_runtime; 186 s64 wait_runtime;
187 u64 sleeper_bonus; 187 u64 sleeper_bonus;
188 unsigned long wait_runtime_overruns, wait_runtime_underruns; 188 unsigned long wait_runtime_overruns, wait_runtime_underruns;
189 189
190 struct rb_root tasks_timeline; 190 struct rb_root tasks_timeline;
191 struct rb_node *rb_leftmost; 191 struct rb_node *rb_leftmost;
192 struct rb_node *rb_load_balance_curr; 192 struct rb_node *rb_load_balance_curr;
193 #ifdef CONFIG_FAIR_GROUP_SCHED 193 #ifdef CONFIG_FAIR_GROUP_SCHED
194 /* 'curr' points to currently running entity on this cfs_rq. 194 /* 'curr' points to currently running entity on this cfs_rq.
195 * It is set to NULL otherwise (i.e. when none are currently running). 195 * It is set to NULL otherwise (i.e. when none are currently running).
196 */ 196 */
197 struct sched_entity *curr; 197 struct sched_entity *curr;
198 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 198 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
199 199
200 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 200 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
201 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 201 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
202 * (like users, containers etc.) 202 * (like users, containers etc.)
203 * 203 *
204 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 204 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
205 * list is used during load balance. 205 * list is used during load balance.
206 */ 206 */
207 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ 207 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
208 #endif 208 #endif
209 }; 209 };
210 210
211 /* Real-Time classes' related field in a runqueue: */ 211 /* Real-Time classes' related field in a runqueue: */
212 struct rt_rq { 212 struct rt_rq {
213 struct rt_prio_array active; 213 struct rt_prio_array active;
214 int rt_load_balance_idx; 214 int rt_load_balance_idx;
215 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 215 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
216 }; 216 };
217 217
218 /* 218 /*
219 * This is the main, per-CPU runqueue data structure. 219 * This is the main, per-CPU runqueue data structure.
220 * 220 *
221 * Locking rule: in places that want to lock multiple runqueues 221 * Locking rule: in places that want to lock multiple runqueues
222 * (such as the load balancing or the thread migration code), lock 222 * (such as the load balancing or the thread migration code), lock
223 * acquire operations must be ordered by ascending &runqueue. 223 * acquire operations must be ordered by ascending &runqueue.
224 */ 224 */
225 struct rq { 225 struct rq {
226 spinlock_t lock; /* runqueue lock */ 226 spinlock_t lock; /* runqueue lock */
227 227
228 /* 228 /*
229 * nr_running and cpu_load should be in the same cacheline because 229 * nr_running and cpu_load should be in the same cacheline because
230 * remote CPUs use both these fields when doing load calculation. 230 * remote CPUs use both these fields when doing load calculation.
231 */ 231 */
232 unsigned long nr_running; 232 unsigned long nr_running;
233 #define CPU_LOAD_IDX_MAX 5 233 #define CPU_LOAD_IDX_MAX 5
234 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 234 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
235 unsigned char idle_at_tick; 235 unsigned char idle_at_tick;
236 #ifdef CONFIG_NO_HZ 236 #ifdef CONFIG_NO_HZ
237 unsigned char in_nohz_recently; 237 unsigned char in_nohz_recently;
238 #endif 238 #endif
239 struct load_stat ls; /* capture load from *all* tasks on this cpu */ 239 struct load_stat ls; /* capture load from *all* tasks on this cpu */
240 unsigned long nr_load_updates; 240 unsigned long nr_load_updates;
241 u64 nr_switches; 241 u64 nr_switches;
242 242
243 struct cfs_rq cfs; 243 struct cfs_rq cfs;
244 #ifdef CONFIG_FAIR_GROUP_SCHED 244 #ifdef CONFIG_FAIR_GROUP_SCHED
245 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ 245 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
246 #endif 246 #endif
247 struct rt_rq rt; 247 struct rt_rq rt;
248 248
249 /* 249 /*
250 * This is part of a global counter where only the total sum 250 * This is part of a global counter where only the total sum
251 * over all CPUs matters. A task can increase this counter on 251 * over all CPUs matters. A task can increase this counter on
252 * one CPU and if it got migrated afterwards it may decrease 252 * one CPU and if it got migrated afterwards it may decrease
253 * it on another CPU. Always updated under the runqueue lock: 253 * it on another CPU. Always updated under the runqueue lock:
254 */ 254 */
255 unsigned long nr_uninterruptible; 255 unsigned long nr_uninterruptible;
256 256
257 struct task_struct *curr, *idle; 257 struct task_struct *curr, *idle;
258 unsigned long next_balance; 258 unsigned long next_balance;
259 struct mm_struct *prev_mm; 259 struct mm_struct *prev_mm;
260 260
261 u64 clock, prev_clock_raw; 261 u64 clock, prev_clock_raw;
262 s64 clock_max_delta; 262 s64 clock_max_delta;
263 263
264 unsigned int clock_warps, clock_overflows; 264 unsigned int clock_warps, clock_overflows;
265 unsigned int clock_unstable_events; 265 unsigned int clock_unstable_events;
266 266
267 atomic_t nr_iowait; 267 atomic_t nr_iowait;
268 268
269 #ifdef CONFIG_SMP 269 #ifdef CONFIG_SMP
270 struct sched_domain *sd; 270 struct sched_domain *sd;
271 271
272 /* For active balancing */ 272 /* For active balancing */
273 int active_balance; 273 int active_balance;
274 int push_cpu; 274 int push_cpu;
275 int cpu; /* cpu of this runqueue */ 275 int cpu; /* cpu of this runqueue */
276 276
277 struct task_struct *migration_thread; 277 struct task_struct *migration_thread;
278 struct list_head migration_queue; 278 struct list_head migration_queue;
279 #endif 279 #endif
280 280
281 #ifdef CONFIG_SCHEDSTATS 281 #ifdef CONFIG_SCHEDSTATS
282 /* latency stats */ 282 /* latency stats */
283 struct sched_info rq_sched_info; 283 struct sched_info rq_sched_info;
284 284
285 /* sys_sched_yield() stats */ 285 /* sys_sched_yield() stats */
286 unsigned long yld_exp_empty; 286 unsigned long yld_exp_empty;
287 unsigned long yld_act_empty; 287 unsigned long yld_act_empty;
288 unsigned long yld_both_empty; 288 unsigned long yld_both_empty;
289 unsigned long yld_cnt; 289 unsigned long yld_cnt;
290 290
291 /* schedule() stats */ 291 /* schedule() stats */
292 unsigned long sched_switch; 292 unsigned long sched_switch;
293 unsigned long sched_cnt; 293 unsigned long sched_cnt;
294 unsigned long sched_goidle; 294 unsigned long sched_goidle;
295 295
296 /* try_to_wake_up() stats */ 296 /* try_to_wake_up() stats */
297 unsigned long ttwu_cnt; 297 unsigned long ttwu_cnt;
298 unsigned long ttwu_local; 298 unsigned long ttwu_local;
299 #endif 299 #endif
300 struct lock_class_key rq_lock_key; 300 struct lock_class_key rq_lock_key;
301 }; 301 };
302 302
303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
304 static DEFINE_MUTEX(sched_hotcpu_mutex); 304 static DEFINE_MUTEX(sched_hotcpu_mutex);
305 305
306 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 306 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
307 { 307 {
308 rq->curr->sched_class->check_preempt_curr(rq, p); 308 rq->curr->sched_class->check_preempt_curr(rq, p);
309 } 309 }
310 310
311 static inline int cpu_of(struct rq *rq) 311 static inline int cpu_of(struct rq *rq)
312 { 312 {
313 #ifdef CONFIG_SMP 313 #ifdef CONFIG_SMP
314 return rq->cpu; 314 return rq->cpu;
315 #else 315 #else
316 return 0; 316 return 0;
317 #endif 317 #endif
318 } 318 }
319 319
320 /* 320 /*
321 * Per-runqueue clock, as finegrained as the platform can give us: 321 * Per-runqueue clock, as finegrained as the platform can give us:
322 */ 322 */
323 static unsigned long long __rq_clock(struct rq *rq) 323 static unsigned long long __rq_clock(struct rq *rq)
324 { 324 {
325 u64 prev_raw = rq->prev_clock_raw; 325 u64 prev_raw = rq->prev_clock_raw;
326 u64 now = sched_clock(); 326 u64 now = sched_clock();
327 s64 delta = now - prev_raw; 327 s64 delta = now - prev_raw;
328 u64 clock = rq->clock; 328 u64 clock = rq->clock;
329 329
330 /* 330 /*
331 * Protect against sched_clock() occasionally going backwards: 331 * Protect against sched_clock() occasionally going backwards:
332 */ 332 */
333 if (unlikely(delta < 0)) { 333 if (unlikely(delta < 0)) {
334 clock++; 334 clock++;
335 rq->clock_warps++; 335 rq->clock_warps++;
336 } else { 336 } else {
337 /* 337 /*
338 * Catch too large forward jumps too: 338 * Catch too large forward jumps too:
339 */ 339 */
340 if (unlikely(delta > 2*TICK_NSEC)) { 340 if (unlikely(delta > 2*TICK_NSEC)) {
341 clock++; 341 clock++;
342 rq->clock_overflows++; 342 rq->clock_overflows++;
343 } else { 343 } else {
344 if (unlikely(delta > rq->clock_max_delta)) 344 if (unlikely(delta > rq->clock_max_delta))
345 rq->clock_max_delta = delta; 345 rq->clock_max_delta = delta;
346 clock += delta; 346 clock += delta;
347 } 347 }
348 } 348 }
349 349
350 rq->prev_clock_raw = now; 350 rq->prev_clock_raw = now;
351 rq->clock = clock; 351 rq->clock = clock;
352 352
353 return clock; 353 return clock;
354 } 354 }
355 355
356 static inline unsigned long long rq_clock(struct rq *rq) 356 static inline unsigned long long rq_clock(struct rq *rq)
357 { 357 {
358 int this_cpu = smp_processor_id(); 358 int this_cpu = smp_processor_id();
359 359
360 if (this_cpu == cpu_of(rq)) 360 if (this_cpu == cpu_of(rq))
361 return __rq_clock(rq); 361 return __rq_clock(rq);
362 362
363 return rq->clock; 363 return rq->clock;
364 } 364 }
365 365
366 /* 366 /*
367 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 367 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
368 * See detach_destroy_domains: synchronize_sched for details. 368 * See detach_destroy_domains: synchronize_sched for details.
369 * 369 *
370 * The domain tree of any CPU may only be accessed from within 370 * The domain tree of any CPU may only be accessed from within
371 * preempt-disabled sections. 371 * preempt-disabled sections.
372 */ 372 */
373 #define for_each_domain(cpu, __sd) \ 373 #define for_each_domain(cpu, __sd) \
374 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 374 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
375 375
376 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 376 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
377 #define this_rq() (&__get_cpu_var(runqueues)) 377 #define this_rq() (&__get_cpu_var(runqueues))
378 #define task_rq(p) cpu_rq(task_cpu(p)) 378 #define task_rq(p) cpu_rq(task_cpu(p))
379 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 379 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
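A sketch of walking a CPU's domain hierarchy with for_each_domain(); per the comment above, the walk must run inside a preempt-disabled section (illustrative only; the function name is hypothetical):

static int nr_sched_domains(int cpu)
{
	struct sched_domain *sd;
	int nr = 0;

	preempt_disable();
	for_each_domain(cpu, sd)
		nr++;
	preempt_enable();

	return nr;
}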
380 380
381 /* 381 /*
382 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 382 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
383 * clock constructed from sched_clock(): 383 * clock constructed from sched_clock():
384 */ 384 */
385 unsigned long long cpu_clock(int cpu) 385 unsigned long long cpu_clock(int cpu)
386 { 386 {
387 unsigned long long now; 387 unsigned long long now;
388 unsigned long flags; 388 unsigned long flags;
389 389
390 local_irq_save(flags); 390 local_irq_save(flags);
391 now = rq_clock(cpu_rq(cpu)); 391 now = rq_clock(cpu_rq(cpu));
392 local_irq_restore(flags); 392 local_irq_restore(flags);
393 393
394 return now; 394 return now;
395 } 395 }
396 396
397 #ifdef CONFIG_FAIR_GROUP_SCHED 397 #ifdef CONFIG_FAIR_GROUP_SCHED
398 /* Change a task's ->cfs_rq if it moves across CPUs */ 398 /* Change a task's ->cfs_rq if it moves across CPUs */
399 static inline void set_task_cfs_rq(struct task_struct *p) 399 static inline void set_task_cfs_rq(struct task_struct *p)
400 { 400 {
401 p->se.cfs_rq = &task_rq(p)->cfs; 401 p->se.cfs_rq = &task_rq(p)->cfs;
402 } 402 }
403 #else 403 #else
404 static inline void set_task_cfs_rq(struct task_struct *p) 404 static inline void set_task_cfs_rq(struct task_struct *p)
405 { 405 {
406 } 406 }
407 #endif 407 #endif
408 408
409 #ifndef prepare_arch_switch 409 #ifndef prepare_arch_switch
410 # define prepare_arch_switch(next) do { } while (0) 410 # define prepare_arch_switch(next) do { } while (0)
411 #endif 411 #endif
412 #ifndef finish_arch_switch 412 #ifndef finish_arch_switch
413 # define finish_arch_switch(prev) do { } while (0) 413 # define finish_arch_switch(prev) do { } while (0)
414 #endif 414 #endif
415 415
416 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 416 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
417 static inline int task_running(struct rq *rq, struct task_struct *p) 417 static inline int task_running(struct rq *rq, struct task_struct *p)
418 { 418 {
419 return rq->curr == p; 419 return rq->curr == p;
420 } 420 }
421 421
422 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 422 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
423 { 423 {
424 } 424 }
425 425
426 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 426 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
427 { 427 {
428 #ifdef CONFIG_DEBUG_SPINLOCK 428 #ifdef CONFIG_DEBUG_SPINLOCK
429 /* this is a valid case when another task releases the spinlock */ 429 /* this is a valid case when another task releases the spinlock */
430 rq->lock.owner = current; 430 rq->lock.owner = current;
431 #endif 431 #endif
432 /* 432 /*
433 * If we are tracking spinlock dependencies then we have to 433 * If we are tracking spinlock dependencies then we have to
434 * fix up the runqueue lock - which gets 'carried over' from 434 * fix up the runqueue lock - which gets 'carried over' from
435 * prev into current: 435 * prev into current:
436 */ 436 */
437 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 437 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
438 438
439 spin_unlock_irq(&rq->lock); 439 spin_unlock_irq(&rq->lock);
440 } 440 }
441 441
442 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 442 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
443 static inline int task_running(struct rq *rq, struct task_struct *p) 443 static inline int task_running(struct rq *rq, struct task_struct *p)
444 { 444 {
445 #ifdef CONFIG_SMP 445 #ifdef CONFIG_SMP
446 return p->oncpu; 446 return p->oncpu;
447 #else 447 #else
448 return rq->curr == p; 448 return rq->curr == p;
449 #endif 449 #endif
450 } 450 }
451 451
452 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 452 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
453 { 453 {
454 #ifdef CONFIG_SMP 454 #ifdef CONFIG_SMP
455 /* 455 /*
456 * We can optimise this out completely for !SMP, because the 456 * We can optimise this out completely for !SMP, because the
457 * SMP rebalancing from interrupt is the only thing that cares 457 * SMP rebalancing from interrupt is the only thing that cares
458 * here. 458 * here.
459 */ 459 */
460 next->oncpu = 1; 460 next->oncpu = 1;
461 #endif 461 #endif
462 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 462 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
463 spin_unlock_irq(&rq->lock); 463 spin_unlock_irq(&rq->lock);
464 #else 464 #else
465 spin_unlock(&rq->lock); 465 spin_unlock(&rq->lock);
466 #endif 466 #endif
467 } 467 }
468 468
469 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 469 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
470 { 470 {
471 #ifdef CONFIG_SMP 471 #ifdef CONFIG_SMP
472 /* 472 /*
473 * After ->oncpu is cleared, the task can be moved to a different CPU. 473 * After ->oncpu is cleared, the task can be moved to a different CPU.
474 * We must ensure this doesn't happen until the switch is completely 474 * We must ensure this doesn't happen until the switch is completely
475 * finished. 475 * finished.
476 */ 476 */
477 smp_wmb(); 477 smp_wmb();
478 prev->oncpu = 0; 478 prev->oncpu = 0;
479 #endif 479 #endif
480 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 480 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
481 local_irq_enable(); 481 local_irq_enable();
482 #endif 482 #endif
483 } 483 }
484 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 484 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
485 485
486 /* 486 /*
487 * __task_rq_lock - lock the runqueue a given task resides on. 487 * __task_rq_lock - lock the runqueue a given task resides on.
488 * Must be called with interrupts disabled. 488 * Must be called with interrupts disabled.
489 */ 489 */
490 static inline struct rq *__task_rq_lock(struct task_struct *p) 490 static inline struct rq *__task_rq_lock(struct task_struct *p)
491 __acquires(rq->lock) 491 __acquires(rq->lock)
492 { 492 {
493 struct rq *rq; 493 struct rq *rq;
494 494
495 repeat_lock_task: 495 repeat_lock_task:
496 rq = task_rq(p); 496 rq = task_rq(p);
497 spin_lock(&rq->lock); 497 spin_lock(&rq->lock);
498 if (unlikely(rq != task_rq(p))) { 498 if (unlikely(rq != task_rq(p))) {
499 spin_unlock(&rq->lock); 499 spin_unlock(&rq->lock);
500 goto repeat_lock_task; 500 goto repeat_lock_task;
501 } 501 }
502 return rq; 502 return rq;
503 } 503 }
504 504
505 /* 505 /*
506 * task_rq_lock - lock the runqueue a given task resides on and disable 506 * task_rq_lock - lock the runqueue a given task resides on and disable
507 * interrupts. Note the ordering: we can safely look up the task_rq without 507 * interrupts. Note the ordering: we can safely look up the task_rq without
508 * explicitly disabling preemption. 508 * explicitly disabling preemption.
509 */ 509 */
510 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 510 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
511 __acquires(rq->lock) 511 __acquires(rq->lock)
512 { 512 {
513 struct rq *rq; 513 struct rq *rq;
514 514
515 repeat_lock_task: 515 repeat_lock_task:
516 local_irq_save(*flags); 516 local_irq_save(*flags);
517 rq = task_rq(p); 517 rq = task_rq(p);
518 spin_lock(&rq->lock); 518 spin_lock(&rq->lock);
519 if (unlikely(rq != task_rq(p))) { 519 if (unlikely(rq != task_rq(p))) {
520 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
521 goto repeat_lock_task; 521 goto repeat_lock_task;
522 } 522 }
523 return rq; 523 return rq;
524 } 524 }
525 525
526 static inline void __task_rq_unlock(struct rq *rq) 526 static inline void __task_rq_unlock(struct rq *rq)
527 __releases(rq->lock) 527 __releases(rq->lock)
528 { 528 {
529 spin_unlock(&rq->lock); 529 spin_unlock(&rq->lock);
530 } 530 }
531 531
532 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 532 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
533 __releases(rq->lock) 533 __releases(rq->lock)
534 { 534 {
535 spin_unlock_irqrestore(&rq->lock, *flags); 535 spin_unlock_irqrestore(&rq->lock, *flags);
536 } 536 }
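The lock/recheck/retry dance above is needed because a task may migrate between the task_rq(p) lookup and spin_lock(); once the lock is held and the recheck passes, the task is pinned to that runqueue. A minimal usage sketch (illustrative only; the function name is hypothetical):

static int task_cpu_pinned(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	int cpu;

	rq = task_rq_lock(p, &flags);	/* p cannot switch runqueues while held */
	cpu = cpu_of(rq);
	task_rq_unlock(rq, &flags);

	return cpu;
}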
537 537
538 /* 538 /*
539 * this_rq_lock - lock this runqueue and disable interrupts. 539 * this_rq_lock - lock this runqueue and disable interrupts.
540 */ 540 */
541 static inline struct rq *this_rq_lock(void) 541 static inline struct rq *this_rq_lock(void)
542 __acquires(rq->lock) 542 __acquires(rq->lock)
543 { 543 {
544 struct rq *rq; 544 struct rq *rq;
545 545
546 local_irq_disable(); 546 local_irq_disable();
547 rq = this_rq(); 547 rq = this_rq();
548 spin_lock(&rq->lock); 548 spin_lock(&rq->lock);
549 549
550 return rq; 550 return rq;
551 } 551 }
552 552
553 /* 553 /*
554 * CPU frequency is/was unstable - start anew by setting prev_clock_raw: 554 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
555 */ 555 */
556 void sched_clock_unstable_event(void) 556 void sched_clock_unstable_event(void)
557 { 557 {
558 unsigned long flags; 558 unsigned long flags;
559 struct rq *rq; 559 struct rq *rq;
560 560
561 rq = task_rq_lock(current, &flags); 561 rq = task_rq_lock(current, &flags);
562 rq->prev_clock_raw = sched_clock(); 562 rq->prev_clock_raw = sched_clock();
563 rq->clock_unstable_events++; 563 rq->clock_unstable_events++;
564 task_rq_unlock(rq, &flags); 564 task_rq_unlock(rq, &flags);
565 } 565 }
566 566
567 /* 567 /*
568 * resched_task - mark a task 'to be rescheduled now'. 568 * resched_task - mark a task 'to be rescheduled now'.
569 * 569 *
570 * On UP this means the setting of the need_resched flag, on SMP it 570 * On UP this means the setting of the need_resched flag, on SMP it
571 * might also involve a cross-CPU call to trigger the scheduler on 571 * might also involve a cross-CPU call to trigger the scheduler on
572 * the target CPU. 572 * the target CPU.
573 */ 573 */
574 #ifdef CONFIG_SMP 574 #ifdef CONFIG_SMP
575 575
576 #ifndef tsk_is_polling 576 #ifndef tsk_is_polling
577 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 577 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
578 #endif 578 #endif
579 579
580 static void resched_task(struct task_struct *p) 580 static void resched_task(struct task_struct *p)
581 { 581 {
582 int cpu; 582 int cpu;
583 583
584 assert_spin_locked(&task_rq(p)->lock); 584 assert_spin_locked(&task_rq(p)->lock);
585 585
586 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 586 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
587 return; 587 return;
588 588
589 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 589 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
590 590
591 cpu = task_cpu(p); 591 cpu = task_cpu(p);
592 if (cpu == smp_processor_id()) 592 if (cpu == smp_processor_id())
593 return; 593 return;
594 594
595 /* NEED_RESCHED must be visible before we test polling */ 595 /* NEED_RESCHED must be visible before we test polling */
596 smp_mb(); 596 smp_mb();
597 if (!tsk_is_polling(p)) 597 if (!tsk_is_polling(p))
598 smp_send_reschedule(cpu); 598 smp_send_reschedule(cpu);
599 } 599 }
600 600
601 static void resched_cpu(int cpu) 601 static void resched_cpu(int cpu)
602 { 602 {
603 struct rq *rq = cpu_rq(cpu); 603 struct rq *rq = cpu_rq(cpu);
604 unsigned long flags; 604 unsigned long flags;
605 605
606 if (!spin_trylock_irqsave(&rq->lock, flags)) 606 if (!spin_trylock_irqsave(&rq->lock, flags))
607 return; 607 return;
608 resched_task(cpu_curr(cpu)); 608 resched_task(cpu_curr(cpu));
609 spin_unlock_irqrestore(&rq->lock, flags); 609 spin_unlock_irqrestore(&rq->lock, flags);
610 } 610 }
611 #else 611 #else
612 static inline void resched_task(struct task_struct *p) 612 static inline void resched_task(struct task_struct *p)
613 { 613 {
614 assert_spin_locked(&task_rq(p)->lock); 614 assert_spin_locked(&task_rq(p)->lock);
615 set_tsk_need_resched(p); 615 set_tsk_need_resched(p);
616 } 616 }
617 #endif 617 #endif
618 618
619 static u64 div64_likely32(u64 divident, unsigned long divisor) 619 static u64 div64_likely32(u64 divident, unsigned long divisor)
620 { 620 {
621 #if BITS_PER_LONG == 32 621 #if BITS_PER_LONG == 32
622 if (likely(divident <= 0xffffffffULL)) 622 if (likely(divident <= 0xffffffffULL))
623 return (u32)divident / divisor; 623 return (u32)divident / divisor;
624 do_div(divident, divisor); 624 do_div(divident, divisor);
625 625
626 return divident; 626 return divident;
627 #else 627 #else
628 return divident / divisor; 628 return divident / divisor;
629 #endif 629 #endif
630 } 630 }
631 631
632 #if BITS_PER_LONG == 32 632 #if BITS_PER_LONG == 32
633 # define WMULT_CONST (~0UL) 633 # define WMULT_CONST (~0UL)
634 #else 634 #else
635 # define WMULT_CONST (1UL << 32) 635 # define WMULT_CONST (1UL << 32)
636 #endif 636 #endif
637 637
638 #define WMULT_SHIFT 32 638 #define WMULT_SHIFT 32
639 639
640 static unsigned long 640 static unsigned long
641 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 641 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
642 struct load_weight *lw) 642 struct load_weight *lw)
643 { 643 {
644 u64 tmp; 644 u64 tmp;
645 645
646 if (unlikely(!lw->inv_weight)) 646 if (unlikely(!lw->inv_weight))
647 lw->inv_weight = WMULT_CONST / lw->weight; 647 lw->inv_weight = WMULT_CONST / lw->weight;
648 648
649 tmp = (u64)delta_exec * weight; 649 tmp = (u64)delta_exec * weight;
650 /* 650 /*
651 * Check whether we'd overflow the 64-bit multiplication: 651 * Check whether we'd overflow the 64-bit multiplication:
652 */ 652 */
653 if (unlikely(tmp > WMULT_CONST)) { 653 if (unlikely(tmp > WMULT_CONST)) {
654 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) 654 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
655 >> (WMULT_SHIFT/2); 655 >> (WMULT_SHIFT/2);
656 } else { 656 } else {
657 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; 657 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
658 } 658 }
659 659
660 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 660 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
661 } 661 }
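In effect this computes delta_exec * weight / lw->weight via the cached 2^32 / weight inverse. A worked example with illustrative numbers: a nice-0 entity (weight 1024) on a runqueue whose total load is 2048 (two nice-0 tasks), charged delta_exec = 1000000 ns:

	inv_weight = 2^32 / 2048              = 2097152
	tmp        = 1000000 * 1024           = 1024000000   (below WMULT_CONST, so no overflow path)
	result     = (tmp * inv_weight) >> 32 = 1024000000 / 2048 = 500000 ns

i.e. the entity is credited half of the elapsed time, matching its half share of the runqueue weight.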
662 662
663 static inline unsigned long 663 static inline unsigned long
664 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) 664 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
665 { 665 {
666 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); 666 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
667 } 667 }
668 668
669 static void update_load_add(struct load_weight *lw, unsigned long inc) 669 static void update_load_add(struct load_weight *lw, unsigned long inc)
670 { 670 {
671 lw->weight += inc; 671 lw->weight += inc;
672 lw->inv_weight = 0; 672 lw->inv_weight = 0;
673 } 673 }
674 674
675 static void update_load_sub(struct load_weight *lw, unsigned long dec) 675 static void update_load_sub(struct load_weight *lw, unsigned long dec)
676 { 676 {
677 lw->weight -= dec; 677 lw->weight -= dec;
678 lw->inv_weight = 0; 678 lw->inv_weight = 0;
679 } 679 }
680 680
681 /* 681 /*
682 * To aid in avoiding the subversion of "niceness" due to uneven distribution 682 * To aid in avoiding the subversion of "niceness" due to uneven distribution
683 * of tasks with abnormal "nice" values across CPUs, the contribution that 683 * of tasks with abnormal "nice" values across CPUs, the contribution that
684 * each task makes to its run queue's load is weighted according to its 684 * each task makes to its run queue's load is weighted according to its
685 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 685 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
686 * scaled version of the new time slice allocation that they receive on time 686 * scaled version of the new time slice allocation that they receive on time
687 * slice expiry etc. 687 * slice expiry etc.
688 */ 688 */
689 689
690 #define WEIGHT_IDLEPRIO 2 690 #define WEIGHT_IDLEPRIO 2
691 #define WMULT_IDLEPRIO (1 << 31) 691 #define WMULT_IDLEPRIO (1 << 31)
692 692
693 /* 693 /*
694 * Nice levels are multiplicative, with a gentle 10% change for every 694 * Nice levels are multiplicative, with a gentle 10% change for every
695 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 695 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
696 * nice 1, it will get ~10% less CPU time than another CPU-bound task 696 * nice 1, it will get ~10% less CPU time than another CPU-bound task
697 * that remained on nice 0. 697 * that remained on nice 0.
698 * 698 *
699 * The "10% effect" is relative and cumulative: from _any_ nice level, 699 * The "10% effect" is relative and cumulative: from _any_ nice level,
700 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 700 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
701 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 701 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
702 * If a task goes up by ~10% and another task goes down by ~10% then 702 * If a task goes up by ~10% and another task goes down by ~10% then
703 * the relative distance between them is ~25%.) 703 * the relative distance between them is ~25%.)
704 */ 704 */
705 static const int prio_to_weight[40] = { 705 static const int prio_to_weight[40] = {
706 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, 706 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
707 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, 707 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
708 /* 0 */ NICE_0_LOAD /* 1024 */, 708 /* 0 */ NICE_0_LOAD /* 1024 */,
709 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, 709 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
710 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, 710 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
711 }; 711 };
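
The ~1.25 step between adjacent entries is easy to verify from the nice-0 anchor
(a quick worked check, rounding to the nearest table value):

    1024 * 1.25  = 1280   (nice -1)
    1024 / 1.25 ~=  819   (nice +1)
     819 / 1.25 ~=  655   (nice +2)
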
712 712
713 /* 713 /*
714 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 714 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
715 * 715 *
716 * In cases where the weight does not change often, we can use the 716 * In cases where the weight does not change often, we can use the
717 * precalculated inverse to speed up arithmetics by turning divisions 717 * precalculated inverse to speed up arithmetics by turning divisions
718 * into multiplications: 718 * into multiplications:
719 */ 719 */
720 static const u32 prio_to_wmult[40] = { 720 static const u32 prio_to_wmult[40] = {
721 /* -20 */ 48356, 60446, 75558, 94446, 118058, 721 /* -20 */ 48356, 60446, 75558, 94446, 118058,
722 /* -15 */ 147573, 184467, 230589, 288233, 360285, 722 /* -15 */ 147573, 184467, 230589, 288233, 360285,
723 /* -10 */ 450347, 562979, 703746, 879575, 1099582, 723 /* -10 */ 450347, 562979, 703746, 879575, 1099582,
724 /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, 724 /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
725 /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, 725 /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
726 /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, 726 /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
727 /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, 727 /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
728 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 728 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
729 }; 729 };
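
Each entry is simply 2^32 divided by the matching prio_to_weight[] entry, e.g.
2^32 / 1024 = 4194304 for nice 0. A small userspace spot-check over a few entries
(illustration only, not part of this patch):

    /* Illustration only: spot-check prio_to_wmult[i] == 2^32 / prio_to_weight[i]. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint32_t weight[] = { 88818, 1024, 15 };           /* nice -20, 0, +19 */
        const uint32_t wmult[]  = { 48356, 4194304, 286331153 };
        int i;

        for (i = 0; i < 3; i++)
            assert(wmult[i] == (uint32_t)((1ULL << 32) / weight[i]));
        return 0;
    }
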
730 730
731 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 731 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
732 732
733 /* 733 /*
734 * runqueue iterator, to support SMP load-balancing between different 734 * runqueue iterator, to support SMP load-balancing between different
735 * scheduling classes, without having to expose their internal data 735 * scheduling classes, without having to expose their internal data
736 * structures to the load-balancing proper: 736 * structures to the load-balancing proper:
737 */ 737 */
738 struct rq_iterator { 738 struct rq_iterator {
739 void *arg; 739 void *arg;
740 struct task_struct *(*start)(void *); 740 struct task_struct *(*start)(void *);
741 struct task_struct *(*next)(void *); 741 struct task_struct *(*next)(void *);
742 }; 742 };
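
balance_tasks() only ever calls start() once and then next() until it returns NULL,
threading arg through both callbacks, so a scheduling class can walk any private
structure it likes. A standalone sketch of that calling pattern in plain C follows;
the names and types are invented for illustration (the real iterators live in
sched_fair.c and sched_rt.c):

    /* Illustration only: the start()/next() iterator pattern, userspace C. */
    #include <stdio.h>
    #include <stddef.h>

    struct item { int weight; };

    struct iter {
        void *arg;
        struct item *(*start)(void *);
        struct item *(*next)(void *);
    };

    struct array_queue { struct item *items; size_t len, pos; };

    static struct item *array_next(void *arg)
    {
        struct array_queue *q = arg;
        return q->pos < q->len ? &q->items[q->pos++] : NULL;
    }

    static struct item *array_start(void *arg)
    {
        struct array_queue *q = arg;
        q->pos = 0;
        return array_next(arg);
    }

    int main(void)
    {
        struct item items[] = { { 1024 }, { 820 }, { 15 } };
        struct array_queue q = { items, 3, 0 };
        struct iter it = { &q, array_start, array_next };
        struct item *p;

        /* A balance_tasks()-style consumer walks until next() returns NULL: */
        for (p = it.start(it.arg); p; p = it.next(it.arg))
            printf("weight %d\n", p->weight);
        return 0;
    }
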
743 743
744 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 744 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
745 unsigned long max_nr_move, unsigned long max_load_move, 745 unsigned long max_nr_move, unsigned long max_load_move,
746 struct sched_domain *sd, enum cpu_idle_type idle, 746 struct sched_domain *sd, enum cpu_idle_type idle,
747 int *all_pinned, unsigned long *load_moved, 747 int *all_pinned, unsigned long *load_moved,
748 int this_best_prio, int best_prio, int best_prio_seen, 748 int this_best_prio, int best_prio, int best_prio_seen,
749 struct rq_iterator *iterator); 749 struct rq_iterator *iterator);
750 750
751 #include "sched_stats.h" 751 #include "sched_stats.h"
752 #include "sched_rt.c" 752 #include "sched_rt.c"
753 #include "sched_fair.c" 753 #include "sched_fair.c"
754 #include "sched_idletask.c" 754 #include "sched_idletask.c"
755 #ifdef CONFIG_SCHED_DEBUG 755 #ifdef CONFIG_SCHED_DEBUG
756 # include "sched_debug.c" 756 # include "sched_debug.c"
757 #endif 757 #endif
758 758
759 #define sched_class_highest (&rt_sched_class) 759 #define sched_class_highest (&rt_sched_class)
760 760
761 static void __update_curr_load(struct rq *rq, struct load_stat *ls) 761 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
762 { 762 {
763 if (rq->curr != rq->idle && ls->load.weight) { 763 if (rq->curr != rq->idle && ls->load.weight) {
764 ls->delta_exec += ls->delta_stat; 764 ls->delta_exec += ls->delta_stat;
765 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); 765 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
766 ls->delta_stat = 0; 766 ls->delta_stat = 0;
767 } 767 }
768 } 768 }
769 769
770 /* 770 /*
771 * Update delta_exec, delta_fair fields for rq. 771 * Update delta_exec, delta_fair fields for rq.
772 * 772 *
773 * delta_fair clock advances at a rate inversely proportional to 773 * delta_fair clock advances at a rate inversely proportional to
774 * total load (rq->ls.load.weight) on the runqueue, while 774 * total load (rq->ls.load.weight) on the runqueue, while
775 * delta_exec advances at the same rate as wall-clock (provided 775 * delta_exec advances at the same rate as wall-clock (provided
776 * cpu is not idle). 776 * cpu is not idle).
777 * 777 *
778 * delta_exec / delta_fair is a measure of the (smoothened) load on this 778 * delta_exec / delta_fair is a measure of the (smoothened) load on this
779 * runqueue over any given interval. This (smoothened) load is used 779 * runqueue over any given interval. This (smoothened) load is used
780 * during load balance. 780 * during load balance.
781 * 781 *
782 * This function is called /before/ updating rq->ls.load 782 * This function is called /before/ updating rq->ls.load
783 * and when switching tasks. 783 * and when switching tasks.
784 */ 784 */
785 static void update_curr_load(struct rq *rq, u64 now) 785 static void update_curr_load(struct rq *rq, u64 now)
786 { 786 {
787 struct load_stat *ls = &rq->ls; 787 struct load_stat *ls = &rq->ls;
788 u64 start; 788 u64 start;
789 789
790 start = ls->load_update_start; 790 start = ls->load_update_start;
791 ls->load_update_start = now; 791 ls->load_update_start = now;
792 ls->delta_stat += now - start; 792 ls->delta_stat += now - start;
793 /* 793 /*
794 * Stagger updates to ls->delta_fair. Very frequent updates 794 * Stagger updates to ls->delta_fair. Very frequent updates
795 * can be expensive. 795 * can be expensive.
796 */ 796 */
797 if (ls->delta_stat >= sysctl_sched_stat_granularity) 797 if (ls->delta_stat >= sysctl_sched_stat_granularity)
798 __update_curr_load(rq, ls); 798 __update_curr_load(rq, ls);
799 } 799 }
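
Put differently, delta_fair is delta_exec scaled by NICE_0_LOAD / load.weight, so the
ratio delta_exec / delta_fair recovers the average runqueue weight in units of
NICE_0_LOAD. A worked example with hypothetical numbers:

    delta_exec  = 10,000,000 ns                    (one busy 10 ms interval)
    load.weight = 2048 = 2 * NICE_0_LOAD           (two nice-0 tasks queued)
    delta_fair ~= 10,000,000 * 1024 / 2048 = 5,000,000 ns
    delta_exec / delta_fair ~= 2                   (i.e. the load of two nice-0 tasks)
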
800 800
801 static inline void 801 static inline void
802 inc_load(struct rq *rq, const struct task_struct *p, u64 now) 802 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
803 { 803 {
804 update_curr_load(rq, now); 804 update_curr_load(rq, now);
805 update_load_add(&rq->ls.load, p->se.load.weight); 805 update_load_add(&rq->ls.load, p->se.load.weight);
806 } 806 }
807 807
808 static inline void 808 static inline void
809 dec_load(struct rq *rq, const struct task_struct *p, u64 now) 809 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
810 { 810 {
811 update_curr_load(rq, now); 811 update_curr_load(rq, now);
812 update_load_sub(&rq->ls.load, p->se.load.weight); 812 update_load_sub(&rq->ls.load, p->se.load.weight);
813 } 813 }
814 814
815 static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) 815 static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
816 { 816 {
817 rq->nr_running++; 817 rq->nr_running++;
818 inc_load(rq, p, now); 818 inc_load(rq, p, now);
819 } 819 }
820 820
821 static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) 821 static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
822 { 822 {
823 rq->nr_running--; 823 rq->nr_running--;
824 dec_load(rq, p, now); 824 dec_load(rq, p, now);
825 } 825 }
826 826
827 static void set_load_weight(struct task_struct *p) 827 static void set_load_weight(struct task_struct *p)
828 { 828 {
829 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; 829 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
830 p->se.wait_runtime = 0; 830 p->se.wait_runtime = 0;
831 831
832 if (task_has_rt_policy(p)) { 832 if (task_has_rt_policy(p)) {
833 p->se.load.weight = prio_to_weight[0] * 2; 833 p->se.load.weight = prio_to_weight[0] * 2;
834 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 834 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
835 return; 835 return;
836 } 836 }
837 837
838 /* 838 /*
839 * SCHED_IDLE tasks get minimal weight: 839 * SCHED_IDLE tasks get minimal weight:
840 */ 840 */
841 if (p->policy == SCHED_IDLE) { 841 if (p->policy == SCHED_IDLE) {
842 p->se.load.weight = WEIGHT_IDLEPRIO; 842 p->se.load.weight = WEIGHT_IDLEPRIO;
843 p->se.load.inv_weight = WMULT_IDLEPRIO; 843 p->se.load.inv_weight = WMULT_IDLEPRIO;
844 return; 844 return;
845 } 845 }
846 846
847 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 847 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
848 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 848 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
849 } 849 }
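
The array index is static_prio - MAX_RT_PRIO, i.e. 20 + nice with the usual
MAX_RT_PRIO of 100, so the lookups cover exactly the 40 nice levels. Worked values,
read straight from the tables above (the RT case doubles the heaviest entry):

    nice -20: index  0 -> weight 88818, inv_weight 48356
    nice   0: index 20 -> weight  1024, inv_weight 4194304
    nice +19: index 39 -> weight    15, inv_weight 286331153
    RT tasks:            weight 2 * 88818 = 177636, inv_weight 48356 / 2 = 24178
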
850 850
851 static void 851 static void
852 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) 852 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
853 { 853 {
854 sched_info_queued(p); 854 sched_info_queued(p);
855 p->sched_class->enqueue_task(rq, p, wakeup, now); 855 p->sched_class->enqueue_task(rq, p, wakeup, now);
856 p->se.on_rq = 1; 856 p->se.on_rq = 1;
857 } 857 }
858 858
859 static void 859 static void
860 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) 860 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
861 { 861 {
862 p->sched_class->dequeue_task(rq, p, sleep, now); 862 p->sched_class->dequeue_task(rq, p, sleep, now);
863 p->se.on_rq = 0; 863 p->se.on_rq = 0;
864 } 864 }
865 865
866 /* 866 /*
867 * __normal_prio - return the priority that is based on the static prio 867 * __normal_prio - return the priority that is based on the static prio
868 */ 868 */
869 static inline int __normal_prio(struct task_struct *p) 869 static inline int __normal_prio(struct task_struct *p)
870 { 870 {
871 return p->static_prio; 871 return p->static_prio;
872 } 872 }
873 873
874 /* 874 /*
875 * Calculate the expected normal priority: i.e. priority 875 * Calculate the expected normal priority: i.e. priority
876 * without taking RT-inheritance into account. Might be 876 * without taking RT-inheritance into account. Might be
877 * boosted by interactivity modifiers. Changes upon fork, 877 * boosted by interactivity modifiers. Changes upon fork,
878 * setprio syscalls, and whenever the interactivity 878 * setprio syscalls, and whenever the interactivity
879 * estimator recalculates. 879 * estimator recalculates.
880 */ 880 */
881 static inline int normal_prio(struct task_struct *p) 881 static inline int normal_prio(struct task_struct *p)
882 { 882 {
883 int prio; 883 int prio;
884 884
885 if (task_has_rt_policy(p)) 885 if (task_has_rt_policy(p))
886 prio = MAX_RT_PRIO-1 - p->rt_priority; 886 prio = MAX_RT_PRIO-1 - p->rt_priority;
887 else 887 else
888 prio = __normal_prio(p); 888 prio = __normal_prio(p);
889 return prio; 889 return prio;
890 } 890 }
891 891
892 /* 892 /*
893 * Calculate the current priority, i.e. the priority 893 * Calculate the current priority, i.e. the priority
894 * taken into account by the scheduler. This value might 894 * taken into account by the scheduler. This value might
895 * be boosted by RT tasks, or might be boosted by 895 * be boosted by RT tasks, or might be boosted by
896 * interactivity modifiers. Will be RT if the task got 896 * interactivity modifiers. Will be RT if the task got
897 * RT-boosted. If not then it returns p->normal_prio. 897 * RT-boosted. If not then it returns p->normal_prio.
898 */ 898 */
899 static int effective_prio(struct task_struct *p) 899 static int effective_prio(struct task_struct *p)
900 { 900 {
901 p->normal_prio = normal_prio(p); 901 p->normal_prio = normal_prio(p);
902 /* 902 /*
903 * If we are RT tasks or we were boosted to RT priority, 903 * If we are RT tasks or we were boosted to RT priority,
904 * keep the priority unchanged. Otherwise, update priority 904 * keep the priority unchanged. Otherwise, update priority
905 * to the normal priority: 905 * to the normal priority:
906 */ 906 */
907 if (!rt_prio(p->prio)) 907 if (!rt_prio(p->prio))
908 return p->normal_prio; 908 return p->normal_prio;
909 return p->prio; 909 return p->prio;
910 } 910 }
911 911
912 /* 912 /*
913 * activate_task - move a task to the runqueue. 913 * activate_task - move a task to the runqueue.
914 */ 914 */
915 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 915 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
916 { 916 {
917 u64 now = rq_clock(rq); 917 u64 now = rq_clock(rq);
918 918
919 if (p->state == TASK_UNINTERRUPTIBLE) 919 if (p->state == TASK_UNINTERRUPTIBLE)
920 rq->nr_uninterruptible--; 920 rq->nr_uninterruptible--;
921 921
922 enqueue_task(rq, p, wakeup, now); 922 enqueue_task(rq, p, wakeup, now);
923 inc_nr_running(p, rq, now); 923 inc_nr_running(p, rq, now);
924 } 924 }
925 925
926 /* 926 /*
927 * activate_idle_task - move the idle task to the _front_ of the runqueue. 927 * activate_idle_task - move the idle task to the _front_ of the runqueue.
928 */ 928 */
929 static inline void activate_idle_task(struct task_struct *p, struct rq *rq) 929 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
930 { 930 {
931 u64 now = rq_clock(rq); 931 u64 now = rq_clock(rq);
932 932
933 if (p->state == TASK_UNINTERRUPTIBLE) 933 if (p->state == TASK_UNINTERRUPTIBLE)
934 rq->nr_uninterruptible--; 934 rq->nr_uninterruptible--;
935 935
936 enqueue_task(rq, p, 0, now); 936 enqueue_task(rq, p, 0, now);
937 inc_nr_running(p, rq, now); 937 inc_nr_running(p, rq, now);
938 } 938 }
939 939
940 /* 940 /*
941 * deactivate_task - remove a task from the runqueue. 941 * deactivate_task - remove a task from the runqueue.
942 */ 942 */
943 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 943 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
944 { 944 {
945 u64 now = rq_clock(rq); 945 u64 now = rq_clock(rq);
946 946
947 if (p->state == TASK_UNINTERRUPTIBLE) 947 if (p->state == TASK_UNINTERRUPTIBLE)
948 rq->nr_uninterruptible++; 948 rq->nr_uninterruptible++;
949 949
950 dequeue_task(rq, p, sleep, now); 950 dequeue_task(rq, p, sleep, now);
951 dec_nr_running(p, rq, now); 951 dec_nr_running(p, rq, now);
952 } 952 }
953 953
954 /** 954 /**
955 * task_curr - is this task currently executing on a CPU? 955 * task_curr - is this task currently executing on a CPU?
956 * @p: the task in question. 956 * @p: the task in question.
957 */ 957 */
958 inline int task_curr(const struct task_struct *p) 958 inline int task_curr(const struct task_struct *p)
959 { 959 {
960 return cpu_curr(task_cpu(p)) == p; 960 return cpu_curr(task_cpu(p)) == p;
961 } 961 }
962 962
963 /* Used instead of source_load when we know the type == 0 */ 963 /* Used instead of source_load when we know the type == 0 */
964 unsigned long weighted_cpuload(const int cpu) 964 unsigned long weighted_cpuload(const int cpu)
965 { 965 {
966 return cpu_rq(cpu)->ls.load.weight; 966 return cpu_rq(cpu)->ls.load.weight;
967 } 967 }
968 968
969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
970 { 970 {
971 #ifdef CONFIG_SMP 971 #ifdef CONFIG_SMP
972 task_thread_info(p)->cpu = cpu; 972 task_thread_info(p)->cpu = cpu;
973 set_task_cfs_rq(p); 973 set_task_cfs_rq(p);
974 #endif 974 #endif
975 } 975 }
976 976
977 #ifdef CONFIG_SMP 977 #ifdef CONFIG_SMP
978 978
979 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 979 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
980 { 980 {
981 int old_cpu = task_cpu(p); 981 int old_cpu = task_cpu(p);
982 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 982 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
983 u64 clock_offset, fair_clock_offset; 983 u64 clock_offset, fair_clock_offset;
984 984
985 clock_offset = old_rq->clock - new_rq->clock; 985 clock_offset = old_rq->clock - new_rq->clock;
986 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; 986 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
987 987
988 if (p->se.wait_start_fair) 988 if (p->se.wait_start_fair)
989 p->se.wait_start_fair -= fair_clock_offset; 989 p->se.wait_start_fair -= fair_clock_offset;
990 if (p->se.sleep_start_fair) 990 if (p->se.sleep_start_fair)
991 p->se.sleep_start_fair -= fair_clock_offset; 991 p->se.sleep_start_fair -= fair_clock_offset;
992 992
993 #ifdef CONFIG_SCHEDSTATS 993 #ifdef CONFIG_SCHEDSTATS
994 if (p->se.wait_start) 994 if (p->se.wait_start)
995 p->se.wait_start -= clock_offset; 995 p->se.wait_start -= clock_offset;
996 if (p->se.sleep_start) 996 if (p->se.sleep_start)
997 p->se.sleep_start -= clock_offset; 997 p->se.sleep_start -= clock_offset;
998 if (p->se.block_start) 998 if (p->se.block_start)
999 p->se.block_start -= clock_offset; 999 p->se.block_start -= clock_offset;
1000 #endif 1000 #endif
1001 1001
1002 __set_task_cpu(p, new_cpu); 1002 __set_task_cpu(p, new_cpu);
1003 } 1003 }
1004 1004
1005 struct migration_req { 1005 struct migration_req {
1006 struct list_head list; 1006 struct list_head list;
1007 1007
1008 struct task_struct *task; 1008 struct task_struct *task;
1009 int dest_cpu; 1009 int dest_cpu;
1010 1010
1011 struct completion done; 1011 struct completion done;
1012 }; 1012 };
1013 1013
1014 /* 1014 /*
1015 * The task's runqueue lock must be held. 1015 * The task's runqueue lock must be held.
1016 * Returns true if you have to wait for migration thread. 1016 * Returns true if you have to wait for migration thread.
1017 */ 1017 */
1018 static int 1018 static int
1019 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 1019 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1020 { 1020 {
1021 struct rq *rq = task_rq(p); 1021 struct rq *rq = task_rq(p);
1022 1022
1023 /* 1023 /*
1024 * If the task is not on a runqueue (and not running), then 1024 * If the task is not on a runqueue (and not running), then
1025 * it is sufficient to simply update the task's cpu field. 1025 * it is sufficient to simply update the task's cpu field.
1026 */ 1026 */
1027 if (!p->se.on_rq && !task_running(rq, p)) { 1027 if (!p->se.on_rq && !task_running(rq, p)) {
1028 set_task_cpu(p, dest_cpu); 1028 set_task_cpu(p, dest_cpu);
1029 return 0; 1029 return 0;
1030 } 1030 }
1031 1031
1032 init_completion(&req->done); 1032 init_completion(&req->done);
1033 req->task = p; 1033 req->task = p;
1034 req->dest_cpu = dest_cpu; 1034 req->dest_cpu = dest_cpu;
1035 list_add(&req->list, &rq->migration_queue); 1035 list_add(&req->list, &rq->migration_queue);
1036 1036
1037 return 1; 1037 return 1;
1038 } 1038 }
1039 1039
1040 /* 1040 /*
1041 * wait_task_inactive - wait for a thread to unschedule. 1041 * wait_task_inactive - wait for a thread to unschedule.
1042 * 1042 *
1043 * The caller must ensure that the task *will* unschedule sometime soon, 1043 * The caller must ensure that the task *will* unschedule sometime soon,
1044 * else this function might spin for a *long* time. This function can't 1044 * else this function might spin for a *long* time. This function can't
1045 * be called with interrupts off, or it may introduce deadlock with 1045 * be called with interrupts off, or it may introduce deadlock with
1046 * smp_call_function() if an IPI is sent by the same process we are 1046 * smp_call_function() if an IPI is sent by the same process we are
1047 * waiting to become inactive. 1047 * waiting to become inactive.
1048 */ 1048 */
1049 void wait_task_inactive(struct task_struct *p) 1049 void wait_task_inactive(struct task_struct *p)
1050 { 1050 {
1051 unsigned long flags; 1051 unsigned long flags;
1052 int running, on_rq; 1052 int running, on_rq;
1053 struct rq *rq; 1053 struct rq *rq;
1054 1054
1055 repeat: 1055 repeat:
1056 /* 1056 /*
1057 * We do the initial early heuristics without holding 1057 * We do the initial early heuristics without holding
1058 * any task-queue locks at all. We'll only try to get 1058 * any task-queue locks at all. We'll only try to get
1059 * the runqueue lock when things look like they will 1059 * the runqueue lock when things look like they will
1060 * work out! 1060 * work out!
1061 */ 1061 */
1062 rq = task_rq(p); 1062 rq = task_rq(p);
1063 1063
1064 /* 1064 /*
1065 * If the task is actively running on another CPU 1065 * If the task is actively running on another CPU
1066 * still, just relax and busy-wait without holding 1066 * still, just relax and busy-wait without holding
1067 * any locks. 1067 * any locks.
1068 * 1068 *
1069 * NOTE! Since we don't hold any locks, it's not 1069 * NOTE! Since we don't hold any locks, it's not
1070 * even certain that "rq" stays the right runqueue! 1070 * even certain that "rq" stays the right runqueue!
1071 * But we don't care, since "task_running()" will 1071 * But we don't care, since "task_running()" will
1072 * return false if the runqueue has changed and p 1072 * return false if the runqueue has changed and p
1073 * is actually now running somewhere else! 1073 * is actually now running somewhere else!
1074 */ 1074 */
1075 while (task_running(rq, p)) 1075 while (task_running(rq, p))
1076 cpu_relax(); 1076 cpu_relax();
1077 1077
1078 /* 1078 /*
1079 * Ok, time to look more closely! We need the rq 1079 * Ok, time to look more closely! We need the rq
1080 * lock now, to be *sure*. If we're wrong, we'll 1080 * lock now, to be *sure*. If we're wrong, we'll
1081 * just go back and repeat. 1081 * just go back and repeat.
1082 */ 1082 */
1083 rq = task_rq_lock(p, &flags); 1083 rq = task_rq_lock(p, &flags);
1084 running = task_running(rq, p); 1084 running = task_running(rq, p);
1085 on_rq = p->se.on_rq; 1085 on_rq = p->se.on_rq;
1086 task_rq_unlock(rq, &flags); 1086 task_rq_unlock(rq, &flags);
1087 1087
1088 /* 1088 /*
1089 * Was it really running after all now that we 1089 * Was it really running after all now that we
1090 * checked with the proper locks actually held? 1090 * checked with the proper locks actually held?
1091 * 1091 *
1092 * Oops. Go back and try again.. 1092 * Oops. Go back and try again..
1093 */ 1093 */
1094 if (unlikely(running)) { 1094 if (unlikely(running)) {
1095 cpu_relax(); 1095 cpu_relax();
1096 goto repeat; 1096 goto repeat;
1097 } 1097 }
1098 1098
1099 /* 1099 /*
1100 * It's not enough that it's not actively running, 1100 * It's not enough that it's not actively running,
1101 * it must be off the runqueue _entirely_, and not 1101 * it must be off the runqueue _entirely_, and not
1102 * preempted! 1102 * preempted!
1103 * 1103 *
1104 * So if it was still runnable (but just not actively 1104 * So if it was still runnable (but just not actively
1105 * running right now), it's preempted, and we should 1105 * running right now), it's preempted, and we should
1106 * yield - it could be a while. 1106 * yield - it could be a while.
1107 */ 1107 */
1108 if (unlikely(on_rq)) { 1108 if (unlikely(on_rq)) {
1109 yield(); 1109 yield();
1110 goto repeat; 1110 goto repeat;
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Ahh, all good. It wasn't running, and it wasn't 1114 * Ahh, all good. It wasn't running, and it wasn't
1115 * runnable, which means that it will never become 1115 * runnable, which means that it will never become
1116 * running in the future either. We're all done! 1116 * running in the future either. We're all done!
1117 */ 1117 */
1118 } 1118 }
1119 1119
1120 /*** 1120 /***
1121 * kick_process - kick a running thread to enter/exit the kernel 1121 * kick_process - kick a running thread to enter/exit the kernel
1122 * @p: the to-be-kicked thread 1122 * @p: the to-be-kicked thread
1123 * 1123 *
1124 * Cause a process which is running on another CPU to enter 1124 * Cause a process which is running on another CPU to enter
1125 * kernel-mode, without any delay. (to get signals handled.) 1125 * kernel-mode, without any delay. (to get signals handled.)
1126 * 1126 *
1127 * NOTE: this function doesn't have to take the runqueue lock, 1127 * NOTE: this function doesn't have to take the runqueue lock,
1128 * because all it wants to ensure is that the remote task enters 1128 * because all it wants to ensure is that the remote task enters
1129 * the kernel. If the IPI races and the task has been migrated 1129 * the kernel. If the IPI races and the task has been migrated
1130 * to another CPU then no harm is done and the purpose has been 1130 * to another CPU then no harm is done and the purpose has been
1131 * achieved as well. 1131 * achieved as well.
1132 */ 1132 */
1133 void kick_process(struct task_struct *p) 1133 void kick_process(struct task_struct *p)
1134 { 1134 {
1135 int cpu; 1135 int cpu;
1136 1136
1137 preempt_disable(); 1137 preempt_disable();
1138 cpu = task_cpu(p); 1138 cpu = task_cpu(p);
1139 if ((cpu != smp_processor_id()) && task_curr(p)) 1139 if ((cpu != smp_processor_id()) && task_curr(p))
1140 smp_send_reschedule(cpu); 1140 smp_send_reschedule(cpu);
1141 preempt_enable(); 1141 preempt_enable();
1142 } 1142 }
1143 1143
1144 /* 1144 /*
1145 * Return a low guess at the load of a migration-source cpu weighted 1145 * Return a low guess at the load of a migration-source cpu weighted
1146 * according to the scheduling class and "nice" value. 1146 * according to the scheduling class and "nice" value.
1147 * 1147 *
1148 * We want to under-estimate the load of migration sources, to 1148 * We want to under-estimate the load of migration sources, to
1149 * balance conservatively. 1149 * balance conservatively.
1150 */ 1150 */
1151 static inline unsigned long source_load(int cpu, int type) 1151 static inline unsigned long source_load(int cpu, int type)
1152 { 1152 {
1153 struct rq *rq = cpu_rq(cpu); 1153 struct rq *rq = cpu_rq(cpu);
1154 unsigned long total = weighted_cpuload(cpu); 1154 unsigned long total = weighted_cpuload(cpu);
1155 1155
1156 if (type == 0) 1156 if (type == 0)
1157 return total; 1157 return total;
1158 1158
1159 return min(rq->cpu_load[type-1], total); 1159 return min(rq->cpu_load[type-1], total);
1160 } 1160 }
1161 1161
1162 /* 1162 /*
1163 * Return a high guess at the load of a migration-target cpu weighted 1163 * Return a high guess at the load of a migration-target cpu weighted
1164 * according to the scheduling class and "nice" value. 1164 * according to the scheduling class and "nice" value.
1165 */ 1165 */
1166 static inline unsigned long target_load(int cpu, int type) 1166 static inline unsigned long target_load(int cpu, int type)
1167 { 1167 {
1168 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu); 1169 unsigned long total = weighted_cpuload(cpu);
1170 1170
1171 if (type == 0) 1171 if (type == 0)
1172 return total; 1172 return total;
1173 1173
1174 return max(rq->cpu_load[type-1], total); 1174 return max(rq->cpu_load[type-1], total);
1175 } 1175 }
1176 1176
1177 /* 1177 /*
1178 * Return the average load per task on the cpu's run queue 1178 * Return the average load per task on the cpu's run queue
1179 */ 1179 */
1180 static inline unsigned long cpu_avg_load_per_task(int cpu) 1180 static inline unsigned long cpu_avg_load_per_task(int cpu)
1181 { 1181 {
1182 struct rq *rq = cpu_rq(cpu); 1182 struct rq *rq = cpu_rq(cpu);
1183 unsigned long total = weighted_cpuload(cpu); 1183 unsigned long total = weighted_cpuload(cpu);
1184 unsigned long n = rq->nr_running; 1184 unsigned long n = rq->nr_running;
1185 1185
1186 return n ? total / n : SCHED_LOAD_SCALE; 1186 return n ? total / n : SCHED_LOAD_SCALE;
1187 } 1187 }
1188 1188
1189 /* 1189 /*
1190 * find_idlest_group finds and returns the least busy CPU group within the 1190 * find_idlest_group finds and returns the least busy CPU group within the
1191 * domain. 1191 * domain.
1192 */ 1192 */
1193 static struct sched_group * 1193 static struct sched_group *
1194 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 1194 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1195 { 1195 {
1196 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1196 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1197 unsigned long min_load = ULONG_MAX, this_load = 0; 1197 unsigned long min_load = ULONG_MAX, this_load = 0;
1198 int load_idx = sd->forkexec_idx; 1198 int load_idx = sd->forkexec_idx;
1199 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1199 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1200 1200
1201 do { 1201 do {
1202 unsigned long load, avg_load; 1202 unsigned long load, avg_load;
1203 int local_group; 1203 int local_group;
1204 int i; 1204 int i;
1205 1205
1206 /* Skip over this group if it has no CPUs allowed */ 1206 /* Skip over this group if it has no CPUs allowed */
1207 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1207 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1208 goto nextgroup; 1208 goto nextgroup;
1209 1209
1210 local_group = cpu_isset(this_cpu, group->cpumask); 1210 local_group = cpu_isset(this_cpu, group->cpumask);
1211 1211
1212 /* Tally up the load of all CPUs in the group */ 1212 /* Tally up the load of all CPUs in the group */
1213 avg_load = 0; 1213 avg_load = 0;
1214 1214
1215 for_each_cpu_mask(i, group->cpumask) { 1215 for_each_cpu_mask(i, group->cpumask) {
1216 /* Bias balancing toward cpus of our domain */ 1216 /* Bias balancing toward cpus of our domain */
1217 if (local_group) 1217 if (local_group)
1218 load = source_load(i, load_idx); 1218 load = source_load(i, load_idx);
1219 else 1219 else
1220 load = target_load(i, load_idx); 1220 load = target_load(i, load_idx);
1221 1221
1222 avg_load += load; 1222 avg_load += load;
1223 } 1223 }
1224 1224
1225 /* Adjust by relative CPU power of the group */ 1225 /* Adjust by relative CPU power of the group */
1226 avg_load = sg_div_cpu_power(group, 1226 avg_load = sg_div_cpu_power(group,
1227 avg_load * SCHED_LOAD_SCALE); 1227 avg_load * SCHED_LOAD_SCALE);
1228 1228
1229 if (local_group) { 1229 if (local_group) {
1230 this_load = avg_load; 1230 this_load = avg_load;
1231 this = group; 1231 this = group;
1232 } else if (avg_load < min_load) { 1232 } else if (avg_load < min_load) {
1233 min_load = avg_load; 1233 min_load = avg_load;
1234 idlest = group; 1234 idlest = group;
1235 } 1235 }
1236 nextgroup: 1236 nextgroup:
1237 group = group->next; 1237 group = group->next;
1238 } while (group != sd->groups); 1238 } while (group != sd->groups);
1239 1239
1240 if (!idlest || 100*this_load < imbalance*min_load) 1240 if (!idlest || 100*this_load < imbalance*min_load)
1241 return NULL; 1241 return NULL;
1242 return idlest; 1242 return idlest;
1243 } 1243 }
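
The closing test keeps the task in the local group unless that group is clearly busier:
with a typical imbalance_pct of 125 the threshold becomes imbalance = 100 + 25/2 = 112,
so NULL ("stay local") is returned whenever 100 * this_load < 112 * min_load. A quick
worked example with made-up loads:

    imbalance_pct = 125  ->  imbalance = 112
    this_load = 2200, min_load = 2000:  220000 <  224000  -> return NULL (stay local)
    this_load = 2300, min_load = 2000:  230000 >= 224000  -> return the idlest group
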
1244 1244
1245 /* 1245 /*
1246 * find_idlest_cpu - find the idlest cpu among the cpus in group. 1246 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1247 */ 1247 */
1248 static int 1248 static int
1249 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1249 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1250 { 1250 {
1251 cpumask_t tmp; 1251 cpumask_t tmp;
1252 unsigned long load, min_load = ULONG_MAX; 1252 unsigned long load, min_load = ULONG_MAX;
1253 int idlest = -1; 1253 int idlest = -1;
1254 int i; 1254 int i;
1255 1255
1256 /* Traverse only the allowed CPUs */ 1256 /* Traverse only the allowed CPUs */
1257 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1257 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1258 1258
1259 for_each_cpu_mask(i, tmp) { 1259 for_each_cpu_mask(i, tmp) {
1260 load = weighted_cpuload(i); 1260 load = weighted_cpuload(i);
1261 1261
1262 if (load < min_load || (load == min_load && i == this_cpu)) { 1262 if (load < min_load || (load == min_load && i == this_cpu)) {
1263 min_load = load; 1263 min_load = load;
1264 idlest = i; 1264 idlest = i;
1265 } 1265 }
1266 } 1266 }
1267 1267
1268 return idlest; 1268 return idlest;
1269 } 1269 }
1270 1270
1271 /* 1271 /*
1272 * sched_balance_self: balance the current task (running on cpu) in domains 1272 * sched_balance_self: balance the current task (running on cpu) in domains
1273 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1273 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1274 * SD_BALANCE_EXEC. 1274 * SD_BALANCE_EXEC.
1275 * 1275 *
1276 * Balance, ie. select the least loaded group. 1276 * Balance, ie. select the least loaded group.
1277 * 1277 *
1278 * Returns the target CPU number, or the same CPU if no balancing is needed. 1278 * Returns the target CPU number, or the same CPU if no balancing is needed.
1279 * 1279 *
1280 * preempt must be disabled. 1280 * preempt must be disabled.
1281 */ 1281 */
1282 static int sched_balance_self(int cpu, int flag) 1282 static int sched_balance_self(int cpu, int flag)
1283 { 1283 {
1284 struct task_struct *t = current; 1284 struct task_struct *t = current;
1285 struct sched_domain *tmp, *sd = NULL; 1285 struct sched_domain *tmp, *sd = NULL;
1286 1286
1287 for_each_domain(cpu, tmp) { 1287 for_each_domain(cpu, tmp) {
1288 /* 1288 /*
1289 * If power savings logic is enabled for a domain, stop there. 1289 * If power savings logic is enabled for a domain, stop there.
1290 */ 1290 */
1291 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1291 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1292 break; 1292 break;
1293 if (tmp->flags & flag) 1293 if (tmp->flags & flag)
1294 sd = tmp; 1294 sd = tmp;
1295 } 1295 }
1296 1296
1297 while (sd) { 1297 while (sd) {
1298 cpumask_t span; 1298 cpumask_t span;
1299 struct sched_group *group; 1299 struct sched_group *group;
1300 int new_cpu, weight; 1300 int new_cpu, weight;
1301 1301
1302 if (!(sd->flags & flag)) { 1302 if (!(sd->flags & flag)) {
1303 sd = sd->child; 1303 sd = sd->child;
1304 continue; 1304 continue;
1305 } 1305 }
1306 1306
1307 span = sd->span; 1307 span = sd->span;
1308 group = find_idlest_group(sd, t, cpu); 1308 group = find_idlest_group(sd, t, cpu);
1309 if (!group) { 1309 if (!group) {
1310 sd = sd->child; 1310 sd = sd->child;
1311 continue; 1311 continue;
1312 } 1312 }
1313 1313
1314 new_cpu = find_idlest_cpu(group, t, cpu); 1314 new_cpu = find_idlest_cpu(group, t, cpu);
1315 if (new_cpu == -1 || new_cpu == cpu) { 1315 if (new_cpu == -1 || new_cpu == cpu) {
1316 /* Now try balancing at a lower domain level of cpu */ 1316 /* Now try balancing at a lower domain level of cpu */
1317 sd = sd->child; 1317 sd = sd->child;
1318 continue; 1318 continue;
1319 } 1319 }
1320 1320
1321 /* Now try balancing at a lower domain level of new_cpu */ 1321 /* Now try balancing at a lower domain level of new_cpu */
1322 cpu = new_cpu; 1322 cpu = new_cpu;
1323 sd = NULL; 1323 sd = NULL;
1324 weight = cpus_weight(span); 1324 weight = cpus_weight(span);
1325 for_each_domain(cpu, tmp) { 1325 for_each_domain(cpu, tmp) {
1326 if (weight <= cpus_weight(tmp->span)) 1326 if (weight <= cpus_weight(tmp->span))
1327 break; 1327 break;
1328 if (tmp->flags & flag) 1328 if (tmp->flags & flag)
1329 sd = tmp; 1329 sd = tmp;
1330 } 1330 }
1331 /* while loop will break here if sd == NULL */ 1331 /* while loop will break here if sd == NULL */
1332 } 1332 }
1333 1333
1334 return cpu; 1334 return cpu;
1335 } 1335 }
1336 1336
1337 #endif /* CONFIG_SMP */ 1337 #endif /* CONFIG_SMP */
1338 1338
1339 /* 1339 /*
1340 * wake_idle() will wake a task on an idle cpu if task->cpu is 1340 * wake_idle() will wake a task on an idle cpu if task->cpu is
1341 * not idle and an idle cpu is available. The span of cpus to 1341 * not idle and an idle cpu is available. The span of cpus to
1342 * search starts with cpus closest then further out as needed, 1342 * search starts with cpus closest then further out as needed,
1343 * so we always favor a closer, idle cpu. 1343 * so we always favor a closer, idle cpu.
1344 * 1344 *
1345 * Returns the CPU we should wake onto. 1345 * Returns the CPU we should wake onto.
1346 */ 1346 */
1347 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1347 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1348 static int wake_idle(int cpu, struct task_struct *p) 1348 static int wake_idle(int cpu, struct task_struct *p)
1349 { 1349 {
1350 cpumask_t tmp; 1350 cpumask_t tmp;
1351 struct sched_domain *sd; 1351 struct sched_domain *sd;
1352 int i; 1352 int i;
1353 1353
1354 /* 1354 /*
1355 * If it is idle, then it is the best cpu to run this task. 1355 * If it is idle, then it is the best cpu to run this task.
1356 * 1356 *
1357 * This cpu is also the best, if it has more than one task already. 1357 * This cpu is also the best, if it has more than one task already.
1358 * Siblings must also be busy (in most cases) as they didn't already 1358 * Siblings must also be busy (in most cases) as they didn't already
1359 * pick up the extra load from this cpu and hence we need not check 1359 * pick up the extra load from this cpu and hence we need not check
1360 * sibling runqueue info. This will avoid the checks and cache miss 1360 * sibling runqueue info. This will avoid the checks and cache miss
1361 * penalties associated with that. 1361 * penalties associated with that.
1362 */ 1362 */
1363 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1363 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1364 return cpu; 1364 return cpu;
1365 1365
1366 for_each_domain(cpu, sd) { 1366 for_each_domain(cpu, sd) {
1367 if (sd->flags & SD_WAKE_IDLE) { 1367 if (sd->flags & SD_WAKE_IDLE) {
1368 cpus_and(tmp, sd->span, p->cpus_allowed); 1368 cpus_and(tmp, sd->span, p->cpus_allowed);
1369 for_each_cpu_mask(i, tmp) { 1369 for_each_cpu_mask(i, tmp) {
1370 if (idle_cpu(i)) 1370 if (idle_cpu(i))
1371 return i; 1371 return i;
1372 } 1372 }
1373 } else { 1373 } else {
1374 break; 1374 break;
1375 } 1375 }
1376 } 1376 }
1377 return cpu; 1377 return cpu;
1378 } 1378 }
1379 #else 1379 #else
1380 static inline int wake_idle(int cpu, struct task_struct *p) 1380 static inline int wake_idle(int cpu, struct task_struct *p)
1381 { 1381 {
1382 return cpu; 1382 return cpu;
1383 } 1383 }
1384 #endif 1384 #endif
1385 1385
1386 /*** 1386 /***
1387 * try_to_wake_up - wake up a thread 1387 * try_to_wake_up - wake up a thread
1388 * @p: the to-be-woken-up thread 1388 * @p: the to-be-woken-up thread
1389 * @state: the mask of task states that can be woken 1389 * @state: the mask of task states that can be woken
1390 * @sync: do a synchronous wakeup? 1390 * @sync: do a synchronous wakeup?
1391 * 1391 *
1392 * Put it on the run-queue if it's not already there. The "current" 1392 * Put it on the run-queue if it's not already there. The "current"
1393 * thread is always on the run-queue (except when the actual 1393 * thread is always on the run-queue (except when the actual
1394 * re-schedule is in progress), and as such you're allowed to do 1394 * re-schedule is in progress), and as such you're allowed to do
1395 * the simpler "current->state = TASK_RUNNING" to mark yourself 1395 * the simpler "current->state = TASK_RUNNING" to mark yourself
1396 * runnable without the overhead of this. 1396 * runnable without the overhead of this.
1397 * 1397 *
1398 * returns failure only if the task is already active. 1398 * returns failure only if the task is already active.
1399 */ 1399 */
1400 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1400 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1401 { 1401 {
1402 int cpu, this_cpu, success = 0; 1402 int cpu, this_cpu, success = 0;
1403 unsigned long flags; 1403 unsigned long flags;
1404 long old_state; 1404 long old_state;
1405 struct rq *rq; 1405 struct rq *rq;
1406 #ifdef CONFIG_SMP 1406 #ifdef CONFIG_SMP
1407 struct sched_domain *sd, *this_sd = NULL; 1407 struct sched_domain *sd, *this_sd = NULL;
1408 unsigned long load, this_load; 1408 unsigned long load, this_load;
1409 int new_cpu; 1409 int new_cpu;
1410 #endif 1410 #endif
1411 1411
1412 rq = task_rq_lock(p, &flags); 1412 rq = task_rq_lock(p, &flags);
1413 old_state = p->state; 1413 old_state = p->state;
1414 if (!(old_state & state)) 1414 if (!(old_state & state))
1415 goto out; 1415 goto out;
1416 1416
1417 if (p->se.on_rq) 1417 if (p->se.on_rq)
1418 goto out_running; 1418 goto out_running;
1419 1419
1420 cpu = task_cpu(p); 1420 cpu = task_cpu(p);
1421 this_cpu = smp_processor_id(); 1421 this_cpu = smp_processor_id();
1422 1422
1423 #ifdef CONFIG_SMP 1423 #ifdef CONFIG_SMP
1424 if (unlikely(task_running(rq, p))) 1424 if (unlikely(task_running(rq, p)))
1425 goto out_activate; 1425 goto out_activate;
1426 1426
1427 new_cpu = cpu; 1427 new_cpu = cpu;
1428 1428
1429 schedstat_inc(rq, ttwu_cnt); 1429 schedstat_inc(rq, ttwu_cnt);
1430 if (cpu == this_cpu) { 1430 if (cpu == this_cpu) {
1431 schedstat_inc(rq, ttwu_local); 1431 schedstat_inc(rq, ttwu_local);
1432 goto out_set_cpu; 1432 goto out_set_cpu;
1433 } 1433 }
1434 1434
1435 for_each_domain(this_cpu, sd) { 1435 for_each_domain(this_cpu, sd) {
1436 if (cpu_isset(cpu, sd->span)) { 1436 if (cpu_isset(cpu, sd->span)) {
1437 schedstat_inc(sd, ttwu_wake_remote); 1437 schedstat_inc(sd, ttwu_wake_remote);
1438 this_sd = sd; 1438 this_sd = sd;
1439 break; 1439 break;
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1443 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1444 goto out_set_cpu; 1444 goto out_set_cpu;
1445 1445
1446 /* 1446 /*
1447 * Check for affine wakeup and passive balancing possibilities. 1447 * Check for affine wakeup and passive balancing possibilities.
1448 */ 1448 */
1449 if (this_sd) { 1449 if (this_sd) {
1450 int idx = this_sd->wake_idx; 1450 int idx = this_sd->wake_idx;
1451 unsigned int imbalance; 1451 unsigned int imbalance;
1452 1452
1453 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1453 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1454 1454
1455 load = source_load(cpu, idx); 1455 load = source_load(cpu, idx);
1456 this_load = target_load(this_cpu, idx); 1456 this_load = target_load(this_cpu, idx);
1457 1457
1458 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1458 new_cpu = this_cpu; /* Wake to this CPU if we can */
1459 1459
1460 if (this_sd->flags & SD_WAKE_AFFINE) { 1460 if (this_sd->flags & SD_WAKE_AFFINE) {
1461 unsigned long tl = this_load; 1461 unsigned long tl = this_load;
1462 unsigned long tl_per_task; 1462 unsigned long tl_per_task;
1463 1463
1464 tl_per_task = cpu_avg_load_per_task(this_cpu); 1464 tl_per_task = cpu_avg_load_per_task(this_cpu);
1465 1465
1466 /* 1466 /*
1467 * If sync wakeup then subtract the (maximum possible) 1467 * If sync wakeup then subtract the (maximum possible)
1468 * effect of the currently running task from the load 1468 * effect of the currently running task from the load
1469 * of the current CPU: 1469 * of the current CPU:
1470 */ 1470 */
1471 if (sync) 1471 if (sync)
1472 tl -= current->se.load.weight; 1472 tl -= current->se.load.weight;
1473 1473
1474 if ((tl <= load && 1474 if ((tl <= load &&
1475 tl + target_load(cpu, idx) <= tl_per_task) || 1475 tl + target_load(cpu, idx) <= tl_per_task) ||
1476 100*(tl + p->se.load.weight) <= imbalance*load) { 1476 100*(tl + p->se.load.weight) <= imbalance*load) {
1477 /* 1477 /*
1478 * This domain has SD_WAKE_AFFINE and 1478 * This domain has SD_WAKE_AFFINE and
1479 * p is cache cold in this domain, and 1479 * p is cache cold in this domain, and
1480 * there is no bad imbalance. 1480 * there is no bad imbalance.
1481 */ 1481 */
1482 schedstat_inc(this_sd, ttwu_move_affine); 1482 schedstat_inc(this_sd, ttwu_move_affine);
1483 goto out_set_cpu; 1483 goto out_set_cpu;
1484 } 1484 }
1485 } 1485 }
1486 1486
1487 /* 1487 /*
1488 * Start passive balancing when half the imbalance_pct 1488 * Start passive balancing when half the imbalance_pct
1489 * limit is reached. 1489 * limit is reached.
1490 */ 1490 */
1491 if (this_sd->flags & SD_WAKE_BALANCE) { 1491 if (this_sd->flags & SD_WAKE_BALANCE) {
1492 if (imbalance*this_load <= 100*load) { 1492 if (imbalance*this_load <= 100*load) {
1493 schedstat_inc(this_sd, ttwu_move_balance); 1493 schedstat_inc(this_sd, ttwu_move_balance);
1494 goto out_set_cpu; 1494 goto out_set_cpu;
1495 } 1495 }
1496 } 1496 }
1497 } 1497 }
1498 1498
1499 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1499 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1500 out_set_cpu: 1500 out_set_cpu:
1501 new_cpu = wake_idle(new_cpu, p); 1501 new_cpu = wake_idle(new_cpu, p);
1502 if (new_cpu != cpu) { 1502 if (new_cpu != cpu) {
1503 set_task_cpu(p, new_cpu); 1503 set_task_cpu(p, new_cpu);
1504 task_rq_unlock(rq, &flags); 1504 task_rq_unlock(rq, &flags);
1505 /* might preempt at this point */ 1505 /* might preempt at this point */
1506 rq = task_rq_lock(p, &flags); 1506 rq = task_rq_lock(p, &flags);
1507 old_state = p->state; 1507 old_state = p->state;
1508 if (!(old_state & state)) 1508 if (!(old_state & state))
1509 goto out; 1509 goto out;
1510 if (p->se.on_rq) 1510 if (p->se.on_rq)
1511 goto out_running; 1511 goto out_running;
1512 1512
1513 this_cpu = smp_processor_id(); 1513 this_cpu = smp_processor_id();
1514 cpu = task_cpu(p); 1514 cpu = task_cpu(p);
1515 } 1515 }
1516 1516
1517 out_activate: 1517 out_activate:
1518 #endif /* CONFIG_SMP */ 1518 #endif /* CONFIG_SMP */
1519 activate_task(rq, p, 1); 1519 activate_task(rq, p, 1);
1520 /* 1520 /*
1521 * Sync wakeups (i.e. those types of wakeups where the waker 1521 * Sync wakeups (i.e. those types of wakeups where the waker
1522 * has indicated that it will leave the CPU in short order) 1522 * has indicated that it will leave the CPU in short order)
1523 * don't trigger a preemption, if the woken up task will run on 1523 * don't trigger a preemption, if the woken up task will run on
1524 * this cpu. (in this case the 'I will reschedule' promise of 1524 * this cpu. (in this case the 'I will reschedule' promise of
1525 * the waker guarantees that the freshly woken up task is going 1525 * the waker guarantees that the freshly woken up task is going
1526 * to be considered on this CPU.) 1526 * to be considered on this CPU.)
1527 */ 1527 */
1528 if (!sync || cpu != this_cpu) 1528 if (!sync || cpu != this_cpu)
1529 check_preempt_curr(rq, p); 1529 check_preempt_curr(rq, p);
1530 success = 1; 1530 success = 1;
1531 1531
1532 out_running: 1532 out_running:
1533 p->state = TASK_RUNNING; 1533 p->state = TASK_RUNNING;
1534 out: 1534 out:
1535 task_rq_unlock(rq, &flags); 1535 task_rq_unlock(rq, &flags);
1536 1536
1537 return success; 1537 return success;
1538 } 1538 }
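
For the SD_WAKE_AFFINE branch above, the second of the two sufficient conditions is
100 * (tl + p->se.load.weight) <= imbalance * load: pull the wakeup to this CPU only if
doing so stays within the imbalance_pct limit. A worked example with hypothetical loads
(imbalance = 112, a nice-0 task of weight 1024):

    tl (this CPU)         = 1024
    load (task's old CPU) = 2048
    100 * (1024 + 1024) = 204800  <=  112 * 2048 = 229376  -> wake affine to this CPU
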
1539 1539
1540 int fastcall wake_up_process(struct task_struct *p) 1540 int fastcall wake_up_process(struct task_struct *p)
1541 { 1541 {
1542 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1542 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1543 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1543 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1544 } 1544 }
1545 EXPORT_SYMBOL(wake_up_process); 1545 EXPORT_SYMBOL(wake_up_process);
1546 1546
1547 int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1547 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1548 { 1548 {
1549 return try_to_wake_up(p, state, 0); 1549 return try_to_wake_up(p, state, 0);
1550 } 1550 }
1551 1551
1552 /* 1552 /*
1553 * Perform scheduler related setup for a newly forked process p. 1553 * Perform scheduler related setup for a newly forked process p.
1554 * p is forked by current. 1554 * p is forked by current.
1555 * 1555 *
1556 * __sched_fork() is basic setup used by init_idle() too: 1556 * __sched_fork() is basic setup used by init_idle() too:
1557 */ 1557 */
1558 static void __sched_fork(struct task_struct *p) 1558 static void __sched_fork(struct task_struct *p)
1559 { 1559 {
1560 p->se.wait_start_fair = 0; 1560 p->se.wait_start_fair = 0;
1561 p->se.exec_start = 0; 1561 p->se.exec_start = 0;
1562 p->se.sum_exec_runtime = 0; 1562 p->se.sum_exec_runtime = 0;
1563 p->se.delta_exec = 0; 1563 p->se.delta_exec = 0;
1564 p->se.delta_fair_run = 0; 1564 p->se.delta_fair_run = 0;
1565 p->se.delta_fair_sleep = 0; 1565 p->se.delta_fair_sleep = 0;
1566 p->se.wait_runtime = 0; 1566 p->se.wait_runtime = 0;
1567 p->se.sleep_start_fair = 0; 1567 p->se.sleep_start_fair = 0;
1568 1568
1569 #ifdef CONFIG_SCHEDSTATS 1569 #ifdef CONFIG_SCHEDSTATS
1570 p->se.wait_start = 0; 1570 p->se.wait_start = 0;
1571 p->se.sum_wait_runtime = 0; 1571 p->se.sum_wait_runtime = 0;
1572 p->se.sum_sleep_runtime = 0; 1572 p->se.sum_sleep_runtime = 0;
1573 p->se.sleep_start = 0; 1573 p->se.sleep_start = 0;
1574 p->se.block_start = 0; 1574 p->se.block_start = 0;
1575 p->se.sleep_max = 0; 1575 p->se.sleep_max = 0;
1576 p->se.block_max = 0; 1576 p->se.block_max = 0;
1577 p->se.exec_max = 0; 1577 p->se.exec_max = 0;
1578 p->se.wait_max = 0; 1578 p->se.wait_max = 0;
1579 p->se.wait_runtime_overruns = 0; 1579 p->se.wait_runtime_overruns = 0;
1580 p->se.wait_runtime_underruns = 0; 1580 p->se.wait_runtime_underruns = 0;
1581 #endif 1581 #endif
1582 1582
1583 INIT_LIST_HEAD(&p->run_list); 1583 INIT_LIST_HEAD(&p->run_list);
1584 p->se.on_rq = 0; 1584 p->se.on_rq = 0;
1585 1585
1586 #ifdef CONFIG_PREEMPT_NOTIFIERS 1586 #ifdef CONFIG_PREEMPT_NOTIFIERS
1587 INIT_HLIST_HEAD(&p->preempt_notifiers); 1587 INIT_HLIST_HEAD(&p->preempt_notifiers);
1588 #endif 1588 #endif
1589 1589
1590 /* 1590 /*
1591 * We mark the process as running here, but have not actually 1591 * We mark the process as running here, but have not actually
1592 * inserted it onto the runqueue yet. This guarantees that 1592 * inserted it onto the runqueue yet. This guarantees that
1593 * nobody will actually run it, and a signal or other external 1593 * nobody will actually run it, and a signal or other external
1594 * event cannot wake it up and insert it on the runqueue either. 1594 * event cannot wake it up and insert it on the runqueue either.
1595 */ 1595 */
1596 p->state = TASK_RUNNING; 1596 p->state = TASK_RUNNING;
1597 } 1597 }
1598 1598
1599 /* 1599 /*
1600 * fork()/clone()-time setup: 1600 * fork()/clone()-time setup:
1601 */ 1601 */
1602 void sched_fork(struct task_struct *p, int clone_flags) 1602 void sched_fork(struct task_struct *p, int clone_flags)
1603 { 1603 {
1604 int cpu = get_cpu(); 1604 int cpu = get_cpu();
1605 1605
1606 __sched_fork(p); 1606 __sched_fork(p);
1607 1607
1608 #ifdef CONFIG_SMP 1608 #ifdef CONFIG_SMP
1609 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1609 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1610 #endif 1610 #endif
1611 __set_task_cpu(p, cpu); 1611 __set_task_cpu(p, cpu);
1612 1612
1613 /* 1613 /*
1614 * Make sure we do not leak PI boosting priority to the child: 1614 * Make sure we do not leak PI boosting priority to the child:
1615 */ 1615 */
1616 p->prio = current->normal_prio; 1616 p->prio = current->normal_prio;
1617 1617
1618 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1618 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1619 if (likely(sched_info_on())) 1619 if (likely(sched_info_on()))
1620 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1620 memset(&p->sched_info, 0, sizeof(p->sched_info));
1621 #endif 1621 #endif
1622 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1622 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1623 p->oncpu = 0; 1623 p->oncpu = 0;
1624 #endif 1624 #endif
1625 #ifdef CONFIG_PREEMPT 1625 #ifdef CONFIG_PREEMPT
1626 /* Want to start with kernel preemption disabled. */ 1626 /* Want to start with kernel preemption disabled. */
1627 task_thread_info(p)->preempt_count = 1; 1627 task_thread_info(p)->preempt_count = 1;
1628 #endif 1628 #endif
1629 put_cpu(); 1629 put_cpu();
1630 } 1630 }
1631 1631
1632 /* 1632 /*
1633 * After fork, the child runs first (default). If set to 0 then 1633 * After fork, the child runs first (default). If set to 0 then
1634 * the parent will (try to) run first. 1634 * the parent will (try to) run first.
1635 */ 1635 */
1636 unsigned int __read_mostly sysctl_sched_child_runs_first = 1; 1636 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1637 1637
1638 /* 1638 /*
1639 * wake_up_new_task - wake up a newly created task for the first time. 1639 * wake_up_new_task - wake up a newly created task for the first time.
1640 * 1640 *
1641 * This function will do some initial scheduler statistics housekeeping 1641 * This function will do some initial scheduler statistics housekeeping
1642 * that must be done for every newly created context, then puts the task 1642 * that must be done for every newly created context, then puts the task
1643 * on the runqueue and wakes it. 1643 * on the runqueue and wakes it.
1644 */ 1644 */
1645 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1645 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1646 { 1646 {
1647 unsigned long flags; 1647 unsigned long flags;
1648 struct rq *rq; 1648 struct rq *rq;
1649 int this_cpu; 1649 int this_cpu;
1650 u64 now; 1650 u64 now;
1651 1651
1652 rq = task_rq_lock(p, &flags); 1652 rq = task_rq_lock(p, &flags);
1653 BUG_ON(p->state != TASK_RUNNING); 1653 BUG_ON(p->state != TASK_RUNNING);
1654 this_cpu = smp_processor_id(); /* parent's CPU */ 1654 this_cpu = smp_processor_id(); /* parent's CPU */
1655 now = rq_clock(rq); 1655 now = rq_clock(rq);
1656 1656
1657 p->prio = effective_prio(p); 1657 p->prio = effective_prio(p);
1658 1658
1659 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || 1659 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1660 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || 1660 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1661 !current->se.on_rq) { 1661 !current->se.on_rq) {
1662 1662
1663 activate_task(rq, p, 0); 1663 activate_task(rq, p, 0);
1664 } else { 1664 } else {
1665 /* 1665 /*
1666 * Let the scheduling class do new task startup 1666 * Let the scheduling class do new task startup
1667 * management (if any): 1667 * management (if any):
1668 */ 1668 */
1669 p->sched_class->task_new(rq, p, now); 1669 p->sched_class->task_new(rq, p, now);
1670 inc_nr_running(p, rq, now); 1670 inc_nr_running(p, rq, now);
1671 } 1671 }
1672 check_preempt_curr(rq, p); 1672 check_preempt_curr(rq, p);
1673 task_rq_unlock(rq, &flags); 1673 task_rq_unlock(rq, &flags);
1674 } 1674 }
1675 1675
1676 #ifdef CONFIG_PREEMPT_NOTIFIERS 1676 #ifdef CONFIG_PREEMPT_NOTIFIERS
1677 1677
1678 /** 1678 /**
1679 * preempt_notifier_register - tell me when current is being preempted & rescheduled 1679 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1680 * @notifier: notifier struct to register 1680 * @notifier: notifier struct to register
1681 */ 1681 */
1682 void preempt_notifier_register(struct preempt_notifier *notifier) 1682 void preempt_notifier_register(struct preempt_notifier *notifier)
1683 { 1683 {
1684 hlist_add_head(&notifier->link, &current->preempt_notifiers); 1684 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1685 } 1685 }
1686 EXPORT_SYMBOL_GPL(preempt_notifier_register); 1686 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1687 1687
1688 /** 1688 /**
1689 * preempt_notifier_unregister - no longer interested in preemption notifications 1689 * preempt_notifier_unregister - no longer interested in preemption notifications
1690 * @notifier: notifier struct to unregister 1690 * @notifier: notifier struct to unregister
1691 * 1691 *
1692 * This is safe to call from within a preemption notifier. 1692 * This is safe to call from within a preemption notifier.
1693 */ 1693 */
1694 void preempt_notifier_unregister(struct preempt_notifier *notifier) 1694 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1695 { 1695 {
1696 hlist_del(&notifier->link); 1696 hlist_del(&notifier->link);
1697 } 1697 }
1698 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 1698 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1699 1699
1700 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1700 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1701 { 1701 {
1702 struct preempt_notifier *notifier; 1702 struct preempt_notifier *notifier;
1703 struct hlist_node *node; 1703 struct hlist_node *node;
1704 1704
1705 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1705 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1706 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1706 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1707 } 1707 }
1708 1708
1709 static void 1709 static void
1710 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1710 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1711 struct task_struct *next) 1711 struct task_struct *next)
1712 { 1712 {
1713 struct preempt_notifier *notifier; 1713 struct preempt_notifier *notifier;
1714 struct hlist_node *node; 1714 struct hlist_node *node;
1715 1715
1716 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1716 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1717 notifier->ops->sched_out(notifier, next); 1717 notifier->ops->sched_out(notifier, next);
1718 } 1718 }
1719 1719
1720 #else 1720 #else
1721 1721
1722 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1722 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1723 { 1723 {
1724 } 1724 }
1725 1725
1726 static void 1726 static void
1727 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1727 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1728 struct task_struct *next) 1728 struct task_struct *next)
1729 { 1729 {
1730 } 1730 }
1731 1731
1732 #endif 1732 #endif
1733 1733
1734 /** 1734 /**
1735 * prepare_task_switch - prepare to switch tasks 1735 * prepare_task_switch - prepare to switch tasks
1736 * @rq: the runqueue preparing to switch 1736 * @rq: the runqueue preparing to switch
1737 * @prev: the current task that is being switched out 1737 * @prev: the current task that is being switched out
1738 * @next: the task we are going to switch to. 1738 * @next: the task we are going to switch to.
1739 * 1739 *
1740 * This is called with the rq lock held and interrupts off. It must 1740 * This is called with the rq lock held and interrupts off. It must
1741 * be paired with a subsequent finish_task_switch after the context 1741 * be paired with a subsequent finish_task_switch after the context
1742 * switch. 1742 * switch.
1743 * 1743 *
1744 * prepare_task_switch sets up locking and calls architecture specific 1744 * prepare_task_switch sets up locking and calls architecture specific
1745 * hooks. 1745 * hooks.
1746 */ 1746 */
1747 static inline void 1747 static inline void
1748 prepare_task_switch(struct rq *rq, struct task_struct *prev, 1748 prepare_task_switch(struct rq *rq, struct task_struct *prev,
1749 struct task_struct *next) 1749 struct task_struct *next)
1750 { 1750 {
1751 fire_sched_out_preempt_notifiers(prev, next); 1751 fire_sched_out_preempt_notifiers(prev, next);
1752 prepare_lock_switch(rq, next); 1752 prepare_lock_switch(rq, next);
1753 prepare_arch_switch(next); 1753 prepare_arch_switch(next);
1754 } 1754 }
1755 1755
1756 /** 1756 /**
1757 * finish_task_switch - clean up after a task-switch 1757 * finish_task_switch - clean up after a task-switch
1758 * @rq: runqueue associated with task-switch 1758 * @rq: runqueue associated with task-switch
1759 * @prev: the thread we just switched away from. 1759 * @prev: the thread we just switched away from.
1760 * 1760 *
1761 * finish_task_switch must be called after the context switch, paired 1761 * finish_task_switch must be called after the context switch, paired
1762 * with a prepare_task_switch call before the context switch. 1762 * with a prepare_task_switch call before the context switch.
1763 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1763 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1764 * and do any other architecture-specific cleanup actions. 1764 * and do any other architecture-specific cleanup actions.
1765 * 1765 *
1766 * Note that we may have delayed dropping an mm in context_switch(). If 1766 * Note that we may have delayed dropping an mm in context_switch(). If
1767 * so, we finish that here outside of the runqueue lock. (Doing it 1767 * so, we finish that here outside of the runqueue lock. (Doing it
1768 * with the lock held can cause deadlocks; see schedule() for 1768 * with the lock held can cause deadlocks; see schedule() for
1769 * details.) 1769 * details.)
1770 */ 1770 */
1771 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1771 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1772 __releases(rq->lock) 1772 __releases(rq->lock)
1773 { 1773 {
1774 struct mm_struct *mm = rq->prev_mm; 1774 struct mm_struct *mm = rq->prev_mm;
1775 long prev_state; 1775 long prev_state;
1776 1776
1777 rq->prev_mm = NULL; 1777 rq->prev_mm = NULL;
1778 1778
1779 /* 1779 /*
1780 * A task struct has one reference for its use as "current". 1780 * A task struct has one reference for its use as "current".
1781 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1781 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1782 * schedule one last time. The schedule call will never return, and 1782 * schedule one last time. The schedule call will never return, and
1783 * the scheduled task must drop that reference. 1783 * the scheduled task must drop that reference.
1784 * The test for TASK_DEAD must occur while the runqueue locks are 1784 * The test for TASK_DEAD must occur while the runqueue locks are
1785 * still held, otherwise prev could be scheduled on another cpu, die 1785 * still held, otherwise prev could be scheduled on another cpu, die
1786 * there before we look at prev->state, and then the reference would 1786 * there before we look at prev->state, and then the reference would
1787 * be dropped twice. 1787 * be dropped twice.
1788 * Manfred Spraul <manfred@colorfullife.com> 1788 * Manfred Spraul <manfred@colorfullife.com>
1789 */ 1789 */
1790 prev_state = prev->state; 1790 prev_state = prev->state;
1791 finish_arch_switch(prev); 1791 finish_arch_switch(prev);
1792 finish_lock_switch(rq, prev); 1792 finish_lock_switch(rq, prev);
1793 fire_sched_in_preempt_notifiers(current); 1793 fire_sched_in_preempt_notifiers(current);
1794 if (mm) 1794 if (mm)
1795 mmdrop(mm); 1795 mmdrop(mm);
1796 if (unlikely(prev_state == TASK_DEAD)) { 1796 if (unlikely(prev_state == TASK_DEAD)) {
1797 /* 1797 /*
1798 * Remove function-return probe instances associated with this 1798 * Remove function-return probe instances associated with this
1799 * task and put them back on the free list. 1799 * task and put them back on the free list.
1800 */ 1800 */
1801 kprobe_flush_task(prev); 1801 kprobe_flush_task(prev);
1802 put_task_struct(prev); 1802 put_task_struct(prev);
1803 } 1803 }
1804 } 1804 }
1805 1805
1806 /** 1806 /**
1807 * schedule_tail - first thing a freshly forked thread must call. 1807 * schedule_tail - first thing a freshly forked thread must call.
1808 * @prev: the thread we just switched away from. 1808 * @prev: the thread we just switched away from.
1809 */ 1809 */
1810 asmlinkage void schedule_tail(struct task_struct *prev) 1810 asmlinkage void schedule_tail(struct task_struct *prev)
1811 __releases(rq->lock) 1811 __releases(rq->lock)
1812 { 1812 {
1813 struct rq *rq = this_rq(); 1813 struct rq *rq = this_rq();
1814 1814
1815 finish_task_switch(rq, prev); 1815 finish_task_switch(rq, prev);
1816 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1816 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1817 /* In this case, finish_task_switch does not reenable preemption */ 1817 /* In this case, finish_task_switch does not reenable preemption */
1818 preempt_enable(); 1818 preempt_enable();
1819 #endif 1819 #endif
1820 if (current->set_child_tid) 1820 if (current->set_child_tid)
1821 put_user(current->pid, current->set_child_tid); 1821 put_user(current->pid, current->set_child_tid);
1822 } 1822 }
1823 1823
1824 /* 1824 /*
1825 * context_switch - switch to the new MM and the new 1825 * context_switch - switch to the new MM and the new
1826 * thread's register state. 1826 * thread's register state.
1827 */ 1827 */
1828 static inline void 1828 static inline void
1829 context_switch(struct rq *rq, struct task_struct *prev, 1829 context_switch(struct rq *rq, struct task_struct *prev,
1830 struct task_struct *next) 1830 struct task_struct *next)
1831 { 1831 {
1832 struct mm_struct *mm, *oldmm; 1832 struct mm_struct *mm, *oldmm;
1833 1833
1834 prepare_task_switch(rq, prev, next); 1834 prepare_task_switch(rq, prev, next);
1835 mm = next->mm; 1835 mm = next->mm;
1836 oldmm = prev->active_mm; 1836 oldmm = prev->active_mm;
1837 /* 1837 /*
1838 * For paravirt, this is coupled with an exit in switch_to to 1838 * For paravirt, this is coupled with an exit in switch_to to
1839 * combine the page table reload and the switch backend into 1839 * combine the page table reload and the switch backend into
1840 * one hypercall. 1840 * one hypercall.
1841 */ 1841 */
1842 arch_enter_lazy_cpu_mode(); 1842 arch_enter_lazy_cpu_mode();
1843 1843
1844 if (unlikely(!mm)) { 1844 if (unlikely(!mm)) {
1845 next->active_mm = oldmm; 1845 next->active_mm = oldmm;
1846 atomic_inc(&oldmm->mm_count); 1846 atomic_inc(&oldmm->mm_count);
1847 enter_lazy_tlb(oldmm, next); 1847 enter_lazy_tlb(oldmm, next);
1848 } else 1848 } else
1849 switch_mm(oldmm, mm, next); 1849 switch_mm(oldmm, mm, next);
1850 1850
1851 if (unlikely(!prev->mm)) { 1851 if (unlikely(!prev->mm)) {
1852 prev->active_mm = NULL; 1852 prev->active_mm = NULL;
1853 rq->prev_mm = oldmm; 1853 rq->prev_mm = oldmm;
1854 } 1854 }
1855 /* 1855 /*
1856 * The runqueue lock will be released by the next 1856 * The runqueue lock will be released by the next
1857 * task (which is an invalid locking op but in the case 1857 * task (which is an invalid locking op but in the case
1858 * of the scheduler it's an obvious special-case), so we 1858 * of the scheduler it's an obvious special-case), so we
1859 * do an early lockdep release here: 1859 * do an early lockdep release here:
1860 */ 1860 */
1861 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 1861 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1862 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1862 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1863 #endif 1863 #endif
1864 1864
1865 /* Here we just switch the register state and the stack. */ 1865 /* Here we just switch the register state and the stack. */
1866 switch_to(prev, next, prev); 1866 switch_to(prev, next, prev);
1867 1867
1868 barrier(); 1868 barrier();
1869 /* 1869 /*
1870 * this_rq must be evaluated again because prev may have moved 1870 * this_rq must be evaluated again because prev may have moved
1871 * CPUs since it called schedule(), thus the 'rq' on its stack 1871 * CPUs since it called schedule(), thus the 'rq' on its stack
1872 * frame will be invalid. 1872 * frame will be invalid.
1873 */ 1873 */
1874 finish_task_switch(this_rq(), prev); 1874 finish_task_switch(this_rq(), prev);
1875 } 1875 }
1876 1876
1877 /* 1877 /*
1878 * nr_running, nr_uninterruptible and nr_context_switches: 1878 * nr_running, nr_uninterruptible and nr_context_switches:
1879 * 1879 *
1880 * externally visible scheduler statistics: current number of runnable 1880 * externally visible scheduler statistics: current number of runnable
1881 * threads, current number of uninterruptible-sleeping threads, total 1881 * threads, current number of uninterruptible-sleeping threads, total
1882 * number of context switches performed since bootup. 1882 * number of context switches performed since bootup.
1883 */ 1883 */
1884 unsigned long nr_running(void) 1884 unsigned long nr_running(void)
1885 { 1885 {
1886 unsigned long i, sum = 0; 1886 unsigned long i, sum = 0;
1887 1887
1888 for_each_online_cpu(i) 1888 for_each_online_cpu(i)
1889 sum += cpu_rq(i)->nr_running; 1889 sum += cpu_rq(i)->nr_running;
1890 1890
1891 return sum; 1891 return sum;
1892 } 1892 }
1893 1893
1894 unsigned long nr_uninterruptible(void) 1894 unsigned long nr_uninterruptible(void)
1895 { 1895 {
1896 unsigned long i, sum = 0; 1896 unsigned long i, sum = 0;
1897 1897
1898 for_each_possible_cpu(i) 1898 for_each_possible_cpu(i)
1899 sum += cpu_rq(i)->nr_uninterruptible; 1899 sum += cpu_rq(i)->nr_uninterruptible;
1900 1900
1901 /* 1901 /*
1902 * Since we read the counters lockless, it might be slightly 1902 * Since we read the counters lockless, it might be slightly
1903 * inaccurate. Do not allow it to go below zero though: 1903 * inaccurate. Do not allow it to go below zero though:
1904 */ 1904 */
1905 if (unlikely((long)sum < 0)) 1905 if (unlikely((long)sum < 0))
1906 sum = 0; 1906 sum = 0;
1907 1907
1908 return sum; 1908 return sum;
1909 } 1909 }
1910 1910
1911 unsigned long long nr_context_switches(void) 1911 unsigned long long nr_context_switches(void)
1912 { 1912 {
1913 int i; 1913 int i;
1914 unsigned long long sum = 0; 1914 unsigned long long sum = 0;
1915 1915
1916 for_each_possible_cpu(i) 1916 for_each_possible_cpu(i)
1917 sum += cpu_rq(i)->nr_switches; 1917 sum += cpu_rq(i)->nr_switches;
1918 1918
1919 return sum; 1919 return sum;
1920 } 1920 }
1921 1921
1922 unsigned long nr_iowait(void) 1922 unsigned long nr_iowait(void)
1923 { 1923 {
1924 unsigned long i, sum = 0; 1924 unsigned long i, sum = 0;
1925 1925
1926 for_each_possible_cpu(i) 1926 for_each_possible_cpu(i)
1927 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1927 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1928 1928
1929 return sum; 1929 return sum;
1930 } 1930 }
1931 1931
1932 unsigned long nr_active(void) 1932 unsigned long nr_active(void)
1933 { 1933 {
1934 unsigned long i, running = 0, uninterruptible = 0; 1934 unsigned long i, running = 0, uninterruptible = 0;
1935 1935
1936 for_each_online_cpu(i) { 1936 for_each_online_cpu(i) {
1937 running += cpu_rq(i)->nr_running; 1937 running += cpu_rq(i)->nr_running;
1938 uninterruptible += cpu_rq(i)->nr_uninterruptible; 1938 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1939 } 1939 }
1940 1940
1941 if (unlikely((long)uninterruptible < 0)) 1941 if (unlikely((long)uninterruptible < 0))
1942 uninterruptible = 0; 1942 uninterruptible = 0;
1943 1943
1944 return running + uninterruptible; 1944 return running + uninterruptible;
1945 } 1945 }
1946 1946
1947 /* 1947 /*
1948 * Update rq->cpu_load[] statistics. This function is usually called every 1948 * Update rq->cpu_load[] statistics. This function is usually called every
1949 * scheduler tick (TICK_NSEC). 1949 * scheduler tick (TICK_NSEC).
1950 */ 1950 */
1951 static void update_cpu_load(struct rq *this_rq) 1951 static void update_cpu_load(struct rq *this_rq)
1952 { 1952 {
1953 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; 1953 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1954 unsigned long total_load = this_rq->ls.load.weight; 1954 unsigned long total_load = this_rq->ls.load.weight;
1955 unsigned long this_load = total_load; 1955 unsigned long this_load = total_load;
1956 struct load_stat *ls = &this_rq->ls; 1956 struct load_stat *ls = &this_rq->ls;
1957 u64 now = __rq_clock(this_rq); 1957 u64 now = __rq_clock(this_rq);
1958 int i, scale; 1958 int i, scale;
1959 1959
1960 this_rq->nr_load_updates++; 1960 this_rq->nr_load_updates++;
1961 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) 1961 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1962 goto do_avg; 1962 goto do_avg;
1963 1963
1964 /* Update delta_fair/delta_exec fields first */ 1964 /* Update delta_fair/delta_exec fields first */
1965 update_curr_load(this_rq, now); 1965 update_curr_load(this_rq, now);
1966 1966
1967 fair_delta64 = ls->delta_fair + 1; 1967 fair_delta64 = ls->delta_fair + 1;
1968 ls->delta_fair = 0; 1968 ls->delta_fair = 0;
1969 1969
1970 exec_delta64 = ls->delta_exec + 1; 1970 exec_delta64 = ls->delta_exec + 1;
1971 ls->delta_exec = 0; 1971 ls->delta_exec = 0;
1972 1972
1973 sample_interval64 = now - ls->load_update_last; 1973 sample_interval64 = now - ls->load_update_last;
1974 ls->load_update_last = now; 1974 ls->load_update_last = now;
1975 1975
1976 if ((s64)sample_interval64 < (s64)TICK_NSEC) 1976 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1977 sample_interval64 = TICK_NSEC; 1977 sample_interval64 = TICK_NSEC;
1978 1978
1979 if (exec_delta64 > sample_interval64) 1979 if (exec_delta64 > sample_interval64)
1980 exec_delta64 = sample_interval64; 1980 exec_delta64 = sample_interval64;
1981 1981
1982 idle_delta64 = sample_interval64 - exec_delta64; 1982 idle_delta64 = sample_interval64 - exec_delta64;
1983 1983
1984 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); 1984 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1985 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); 1985 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1986 1986
1987 this_load = (unsigned long)tmp64; 1987 this_load = (unsigned long)tmp64;
1988 1988
1989 do_avg: 1989 do_avg:
1990 1990
1991 /* Update our load: */ 1991 /* Update our load: */
1992 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 1992 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1993 unsigned long old_load, new_load; 1993 unsigned long old_load, new_load;
1994 1994
1995 /* scale is effectively 1 << i now, and >> i divides by scale */ 1995 /* scale is effectively 1 << i now, and >> i divides by scale */
1996 1996
1997 old_load = this_rq->cpu_load[i]; 1997 old_load = this_rq->cpu_load[i];
1998 new_load = this_load; 1998 new_load = this_load;
1999 1999
2000 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2000 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2001 } 2001 }
2002 } 2002 }
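
The loop above is an exponential moving average: cpu_load[i] weights the new sample by 1/2^i, so higher indices track load over a longer horizon. A minimal userspace sketch of the same arithmetic (illustrative only, not kernel code):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/* Decay an array of load averages the same way the loop in
 * update_cpu_load() does: cpu_load[i] = (old * (2^i - 1) + new) / 2^i,
 * so higher indices react more slowly to the new sample. */
static void decay_cpu_load(unsigned long cpu_load[], unsigned long new_load)
{
	unsigned long scale;
	int i;

	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];

		/* scale is effectively 1 << i, and >> i divides by scale */
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long load[CPU_LOAD_IDX_MAX] = { 0 };
	int tick, i;

	for (tick = 0; tick < 4; tick++)
		decay_cpu_load(load, 1024);	/* constant load sample */

	for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, load[i]);

	return 0;
}
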
2003 2003
2004 #ifdef CONFIG_SMP 2004 #ifdef CONFIG_SMP
2005 2005
2006 /* 2006 /*
2007 * double_rq_lock - safely lock two runqueues 2007 * double_rq_lock - safely lock two runqueues
2008 * 2008 *
2009 * Note this does not disable interrupts like task_rq_lock; 2009 * Note this does not disable interrupts like task_rq_lock;
2010 * you need to do so manually before calling. 2010 * you need to do so manually before calling.
2011 */ 2011 */
2012 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 2012 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2013 __acquires(rq1->lock) 2013 __acquires(rq1->lock)
2014 __acquires(rq2->lock) 2014 __acquires(rq2->lock)
2015 { 2015 {
2016 BUG_ON(!irqs_disabled()); 2016 BUG_ON(!irqs_disabled());
2017 if (rq1 == rq2) { 2017 if (rq1 == rq2) {
2018 spin_lock(&rq1->lock); 2018 spin_lock(&rq1->lock);
2019 __acquire(rq2->lock); /* Fake it out ;) */ 2019 __acquire(rq2->lock); /* Fake it out ;) */
2020 } else { 2020 } else {
2021 if (rq1 < rq2) { 2021 if (rq1 < rq2) {
2022 spin_lock(&rq1->lock); 2022 spin_lock(&rq1->lock);
2023 spin_lock(&rq2->lock); 2023 spin_lock(&rq2->lock);
2024 } else { 2024 } else {
2025 spin_lock(&rq2->lock); 2025 spin_lock(&rq2->lock);
2026 spin_lock(&rq1->lock); 2026 spin_lock(&rq1->lock);
2027 } 2027 }
2028 } 2028 }
2029 } 2029 }
2030 2030
2031 /* 2031 /*
2032 * double_rq_unlock - safely unlock two runqueues 2032 * double_rq_unlock - safely unlock two runqueues
2033 * 2033 *
2034 * Note this does not restore interrupts like task_rq_unlock; 2034 * Note this does not restore interrupts like task_rq_unlock;
2035 * you need to do so manually after calling. 2035 * you need to do so manually after calling.
2036 */ 2036 */
2037 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 2037 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2038 __releases(rq1->lock) 2038 __releases(rq1->lock)
2039 __releases(rq2->lock) 2039 __releases(rq2->lock)
2040 { 2040 {
2041 spin_unlock(&rq1->lock); 2041 spin_unlock(&rq1->lock);
2042 if (rq1 != rq2) 2042 if (rq1 != rq2)
2043 spin_unlock(&rq2->lock); 2043 spin_unlock(&rq2->lock);
2044 else 2044 else
2045 __release(rq2->lock); 2045 __release(rq2->lock);
2046 } 2046 }
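
The address ordering above is what prevents ABBA deadlock when two CPUs each try to take the other's runqueue lock. A standalone sketch of the same trick with pthread mutexes (illustration only; the kernel uses raw spinlocks and compares the rq pointers directly):

#include <pthread.h>
#include <stdio.h>

/* Lock two mutexes in a stable (address) order, as double_rq_lock() does
 * for rq->lock, so two threads locking the same pair can never deadlock. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	double_lock(&m1, &m2);	/* same order no matter how the pair is passed */
	printf("both locks held\n");
	double_unlock(&m1, &m2);

	return 0;
}
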
2047 2047
2048 /* 2048 /*
2049 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2049 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2050 */ 2050 */
2051 static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2051 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2052 __releases(this_rq->lock) 2052 __releases(this_rq->lock)
2053 __acquires(busiest->lock) 2053 __acquires(busiest->lock)
2054 __acquires(this_rq->lock) 2054 __acquires(this_rq->lock)
2055 { 2055 {
2056 if (unlikely(!irqs_disabled())) { 2056 if (unlikely(!irqs_disabled())) {
2057 /* printk() doesn't work well under rq->lock */ 2057 /* printk() doesn't work well under rq->lock */
2058 spin_unlock(&this_rq->lock); 2058 spin_unlock(&this_rq->lock);
2059 BUG_ON(1); 2059 BUG_ON(1);
2060 } 2060 }
2061 if (unlikely(!spin_trylock(&busiest->lock))) { 2061 if (unlikely(!spin_trylock(&busiest->lock))) {
2062 if (busiest < this_rq) { 2062 if (busiest < this_rq) {
2063 spin_unlock(&this_rq->lock); 2063 spin_unlock(&this_rq->lock);
2064 spin_lock(&busiest->lock); 2064 spin_lock(&busiest->lock);
2065 spin_lock(&this_rq->lock); 2065 spin_lock(&this_rq->lock);
2066 } else 2066 } else
2067 spin_lock(&busiest->lock); 2067 spin_lock(&busiest->lock);
2068 } 2068 }
2069 } 2069 }
2070 2070
2071 /* 2071 /*
2072 * If dest_cpu is allowed for this process, migrate the task to it. 2072 * If dest_cpu is allowed for this process, migrate the task to it.
2073 * This is accomplished by forcing the cpu_allowed mask to only 2073 * This is accomplished by forcing the cpu_allowed mask to only
2074 * allow dest_cpu, which will force the task onto dest_cpu. Then 2074 * allow dest_cpu, which will force the task onto dest_cpu. Then
2075 * the cpu_allowed mask is restored. 2075 * the cpu_allowed mask is restored.
2076 */ 2076 */
2077 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 2077 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2078 { 2078 {
2079 struct migration_req req; 2079 struct migration_req req;
2080 unsigned long flags; 2080 unsigned long flags;
2081 struct rq *rq; 2081 struct rq *rq;
2082 2082
2083 rq = task_rq_lock(p, &flags); 2083 rq = task_rq_lock(p, &flags);
2084 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2084 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2085 || unlikely(cpu_is_offline(dest_cpu))) 2085 || unlikely(cpu_is_offline(dest_cpu)))
2086 goto out; 2086 goto out;
2087 2087
2088 /* force the process onto the specified CPU */ 2088 /* force the process onto the specified CPU */
2089 if (migrate_task(p, dest_cpu, &req)) { 2089 if (migrate_task(p, dest_cpu, &req)) {
2090 /* Need to wait for migration thread (might exit: take ref). */ 2090 /* Need to wait for migration thread (might exit: take ref). */
2091 struct task_struct *mt = rq->migration_thread; 2091 struct task_struct *mt = rq->migration_thread;
2092 2092
2093 get_task_struct(mt); 2093 get_task_struct(mt);
2094 task_rq_unlock(rq, &flags); 2094 task_rq_unlock(rq, &flags);
2095 wake_up_process(mt); 2095 wake_up_process(mt);
2096 put_task_struct(mt); 2096 put_task_struct(mt);
2097 wait_for_completion(&req.done); 2097 wait_for_completion(&req.done);
2098 2098
2099 return; 2099 return;
2100 } 2100 }
2101 out: 2101 out:
2102 task_rq_unlock(rq, &flags); 2102 task_rq_unlock(rq, &flags);
2103 } 2103 }
2104 2104
2105 /* 2105 /*
2106 * sched_exec - execve() is a valuable balancing opportunity, because at 2106 * sched_exec - execve() is a valuable balancing opportunity, because at
2107 * this point the task has the smallest effective memory and cache footprint. 2107 * this point the task has the smallest effective memory and cache footprint.
2108 */ 2108 */
2109 void sched_exec(void) 2109 void sched_exec(void)
2110 { 2110 {
2111 int new_cpu, this_cpu = get_cpu(); 2111 int new_cpu, this_cpu = get_cpu();
2112 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 2112 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2113 put_cpu(); 2113 put_cpu();
2114 if (new_cpu != this_cpu) 2114 if (new_cpu != this_cpu)
2115 sched_migrate_task(current, new_cpu); 2115 sched_migrate_task(current, new_cpu);
2116 } 2116 }
2117 2117
2118 /* 2118 /*
2119 * pull_task - move a task from a remote runqueue to the local runqueue. 2119 * pull_task - move a task from a remote runqueue to the local runqueue.
2120 * Both runqueues must be locked. 2120 * Both runqueues must be locked.
2121 */ 2121 */
2122 static void pull_task(struct rq *src_rq, struct task_struct *p, 2122 static void pull_task(struct rq *src_rq, struct task_struct *p,
2123 struct rq *this_rq, int this_cpu) 2123 struct rq *this_rq, int this_cpu)
2124 { 2124 {
2125 deactivate_task(src_rq, p, 0); 2125 deactivate_task(src_rq, p, 0);
2126 set_task_cpu(p, this_cpu); 2126 set_task_cpu(p, this_cpu);
2127 activate_task(this_rq, p, 0); 2127 activate_task(this_rq, p, 0);
2128 /* 2128 /*
2129 * Note that idle threads have a prio of MAX_PRIO, so this test 2129 * Note that idle threads have a prio of MAX_PRIO, so this test
2130 * will always be true for them. 2130 * will always be true for them.
2131 */ 2131 */
2132 check_preempt_curr(this_rq, p); 2132 check_preempt_curr(this_rq, p);
2133 } 2133 }
2134 2134
2135 /* 2135 /*
2136 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2136 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2137 */ 2137 */
2138 static 2138 static
2139 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2139 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2140 struct sched_domain *sd, enum cpu_idle_type idle, 2140 struct sched_domain *sd, enum cpu_idle_type idle,
2141 int *all_pinned) 2141 int *all_pinned)
2142 { 2142 {
2143 /* 2143 /*
2144 * We do not migrate tasks that are: 2144 * We do not migrate tasks that are:
2145 * 1) running (obviously), or 2145 * 1) running (obviously), or
2146 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2146 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2147 * 3) are cache-hot on their current CPU. 2147 * 3) are cache-hot on their current CPU.
2148 */ 2148 */
2149 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2149 if (!cpu_isset(this_cpu, p->cpus_allowed))
2150 return 0; 2150 return 0;
2151 *all_pinned = 0; 2151 *all_pinned = 0;
2152 2152
2153 if (task_running(rq, p)) 2153 if (task_running(rq, p))
2154 return 0; 2154 return 0;
2155 2155
2156 /* 2156 /*
2157 * Aggressive migration if too many balance attempts have failed: 2157 * Aggressive migration if too many balance attempts have failed:
2158 */ 2158 */
2159 if (sd->nr_balance_failed > sd->cache_nice_tries) 2159 if (sd->nr_balance_failed > sd->cache_nice_tries)
2160 return 1; 2160 return 1;
2161 2161
2162 return 1; 2162 return 1;
2163 } 2163 }
2164 2164
2165 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2165 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2166 unsigned long max_nr_move, unsigned long max_load_move, 2166 unsigned long max_nr_move, unsigned long max_load_move,
2167 struct sched_domain *sd, enum cpu_idle_type idle, 2167 struct sched_domain *sd, enum cpu_idle_type idle,
2168 int *all_pinned, unsigned long *load_moved, 2168 int *all_pinned, unsigned long *load_moved,
2169 int this_best_prio, int best_prio, int best_prio_seen, 2169 int this_best_prio, int best_prio, int best_prio_seen,
2170 struct rq_iterator *iterator) 2170 struct rq_iterator *iterator)
2171 { 2171 {
2172 int pulled = 0, pinned = 0, skip_for_load; 2172 int pulled = 0, pinned = 0, skip_for_load;
2173 struct task_struct *p; 2173 struct task_struct *p;
2174 long rem_load_move = max_load_move; 2174 long rem_load_move = max_load_move;
2175 2175
2176 if (max_nr_move == 0 || max_load_move == 0) 2176 if (max_nr_move == 0 || max_load_move == 0)
2177 goto out; 2177 goto out;
2178 2178
2179 pinned = 1; 2179 pinned = 1;
2180 2180
2181 /* 2181 /*
2182 * Start the load-balancing iterator: 2182 * Start the load-balancing iterator:
2183 */ 2183 */
2184 p = iterator->start(iterator->arg); 2184 p = iterator->start(iterator->arg);
2185 next: 2185 next:
2186 if (!p) 2186 if (!p)
2187 goto out; 2187 goto out;
2188 /* 2188 /*
2189 * To help distribute high priority tasks across CPUs, we don't 2189 * To help distribute high priority tasks across CPUs, we don't
2190 * skip a task if it will be the highest priority task (i.e. smallest 2190 * skip a task if it will be the highest priority task (i.e. smallest
2191 * prio value) on its new queue regardless of its load weight. 2191 * prio value) on its new queue regardless of its load weight.
2192 */ 2192 */
2193 skip_for_load = (p->se.load.weight >> 1) > rem_load_move + 2193 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2194 SCHED_LOAD_SCALE_FUZZ; 2194 SCHED_LOAD_SCALE_FUZZ;
2195 if (skip_for_load && p->prio < this_best_prio) 2195 if (skip_for_load && p->prio < this_best_prio)
2196 skip_for_load = !best_prio_seen && p->prio == best_prio; 2196 skip_for_load = !best_prio_seen && p->prio == best_prio;
2197 if (skip_for_load || 2197 if (skip_for_load ||
2198 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2198 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2199 2199
2200 best_prio_seen |= p->prio == best_prio; 2200 best_prio_seen |= p->prio == best_prio;
2201 p = iterator->next(iterator->arg); 2201 p = iterator->next(iterator->arg);
2202 goto next; 2202 goto next;
2203 } 2203 }
2204 2204
2205 pull_task(busiest, p, this_rq, this_cpu); 2205 pull_task(busiest, p, this_rq, this_cpu);
2206 pulled++; 2206 pulled++;
2207 rem_load_move -= p->se.load.weight; 2207 rem_load_move -= p->se.load.weight;
2208 2208
2209 /* 2209 /*
2210 * We only want to steal up to the prescribed number of tasks 2210 * We only want to steal up to the prescribed number of tasks
2211 * and the prescribed amount of weighted load. 2211 * and the prescribed amount of weighted load.
2212 */ 2212 */
2213 if (pulled < max_nr_move && rem_load_move > 0) { 2213 if (pulled < max_nr_move && rem_load_move > 0) {
2214 if (p->prio < this_best_prio) 2214 if (p->prio < this_best_prio)
2215 this_best_prio = p->prio; 2215 this_best_prio = p->prio;
2216 p = iterator->next(iterator->arg); 2216 p = iterator->next(iterator->arg);
2217 goto next; 2217 goto next;
2218 } 2218 }
2219 out: 2219 out:
2220 /* 2220 /*
2221 * Right now, this is the only place pull_task() is called, 2221 * Right now, this is the only place pull_task() is called,
2222 * so we can safely collect pull_task() stats here rather than 2222 * so we can safely collect pull_task() stats here rather than
2223 * inside pull_task(). 2223 * inside pull_task().
2224 */ 2224 */
2225 schedstat_add(sd, lb_gained[idle], pulled); 2225 schedstat_add(sd, lb_gained[idle], pulled);
2226 2226
2227 if (all_pinned) 2227 if (all_pinned)
2228 *all_pinned = pinned; 2228 *all_pinned = pinned;
2229 *load_moved = max_load_move - rem_load_move; 2229 *load_moved = max_load_move - rem_load_move;
2230 return pulled; 2230 return pulled;
2231 } 2231 }
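
balance_tasks() never walks a class's queue directly; it only drives the start/next callbacks of the rq_iterator it is handed. A hypothetical caller inside a scheduling class's load_balance() hook might wire it up roughly as below (my_queue, my_first_task and my_next_task are made-up names, not kernel symbols):

	/*
	 * Hypothetical wiring only: a scheduling class's ->load_balance()
	 * implementation hands its own task iterator to balance_tasks().
	 */
	struct rq_iterator my_iter;
	unsigned long load_moved;
	int nr_moved;

	my_iter.arg   = my_queue;
	my_iter.start = my_first_task;
	my_iter.next  = my_next_task;

	nr_moved = balance_tasks(this_rq, this_cpu, busiest,
				 max_nr_move, max_load_move, sd, idle,
				 all_pinned, &load_moved,
				 this_best_prio, best_prio, best_prio_seen,
				 &my_iter);
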
2232 2232
2233 /* 2233 /*
2234 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted 2234 * move_tasks tries to move up to max_load_move weighted load from busiest to
2235 * load from busiest to this_rq, as part of a balancing operation within 2235 * this_rq, as part of a balancing operation within domain "sd".
2236 * "domain". Returns the number of tasks moved. 2236 * Returns 1 if successful and 0 otherwise.
2237 * 2237 *
2238 * Called with both runqueues locked. 2238 * Called with both runqueues locked.
2239 */ 2239 */
2240 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2240 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2241 unsigned long max_nr_move, unsigned long max_load_move, 2241 unsigned long max_load_move,
2242 struct sched_domain *sd, enum cpu_idle_type idle, 2242 struct sched_domain *sd, enum cpu_idle_type idle,
2243 int *all_pinned) 2243 int *all_pinned)
2244 { 2244 {
2245 struct sched_class *class = sched_class_highest; 2245 struct sched_class *class = sched_class_highest;
2246 unsigned long load_moved, total_nr_moved = 0, nr_moved; 2246 unsigned long total_load_moved = 0;
2247 long rem_load_move = max_load_move;
2248 2247
2249 do { 2248 do {
2250 nr_moved = class->load_balance(this_rq, this_cpu, busiest, 2249 total_load_moved +=
2251 max_nr_move, (unsigned long)rem_load_move, 2250 class->load_balance(this_rq, this_cpu, busiest,
2252 sd, idle, all_pinned, &load_moved); 2251 ULONG_MAX, max_load_move - total_load_moved,
2253 total_nr_moved += nr_moved; 2252 sd, idle, all_pinned);
2254 max_nr_move -= nr_moved;
2255 rem_load_move -= load_moved;
2256 class = class->next; 2253 class = class->next;
2257 } while (class && max_nr_move && rem_load_move > 0); 2254 } while (class && max_load_move > total_load_moved);
2258 2255
2259 return total_nr_moved; 2256 return total_load_moved > 0;
2260 } 2257 }
2261 2258
2262 /* 2259 /*
2260 * move_one_task tries to move exactly one task from busiest to this_rq, as
2261 * part of active balancing operations within "domain".
2262 * Returns 1 if successful and 0 otherwise.
2263 *
2264 * Called with both runqueues locked.
2265 */
2266 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2267 struct sched_domain *sd, enum cpu_idle_type idle)
2268 {
2269 struct sched_class *class;
2270
2271 for (class = sched_class_highest; class; class = class->next)
2272 if (class->load_balance(this_rq, this_cpu, busiest,
2273 1, ULONG_MAX, sd, idle, NULL))
2274 return 1;
2275
2276 return 0;
2277 }
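
move_one_task() preserves the semantics active balancing needs: at most one task, reported only as success or failure. The active_load_balance() hunk is not part of this excerpt, so the following call-site sketch is an assumption about how it is used (target_rq, target_cpu and busiest_rq stand in for whatever the real caller has in scope):

	/*
	 * Sketch of the expected call site in active_load_balance();
	 * target_rq, target_cpu and busiest_rq are stand-ins for the
	 * caller's real variables.
	 */
	if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE))
		schedstat_inc(sd, alb_pushed);
	else
		schedstat_inc(sd, alb_failed);
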
2278
2279 /*
2263 * find_busiest_group finds and returns the busiest CPU group within the 2280 * find_busiest_group finds and returns the busiest CPU group within the
2264 * domain. It calculates and returns the amount of weighted load which 2281 * domain. It calculates and returns the amount of weighted load which
2265 * should be moved to restore balance via the imbalance parameter. 2282 * should be moved to restore balance via the imbalance parameter.
2266 */ 2283 */
2267 static struct sched_group * 2284 static struct sched_group *
2268 find_busiest_group(struct sched_domain *sd, int this_cpu, 2285 find_busiest_group(struct sched_domain *sd, int this_cpu,
2269 unsigned long *imbalance, enum cpu_idle_type idle, 2286 unsigned long *imbalance, enum cpu_idle_type idle,
2270 int *sd_idle, cpumask_t *cpus, int *balance) 2287 int *sd_idle, cpumask_t *cpus, int *balance)
2271 { 2288 {
2272 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2289 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2273 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2290 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2274 unsigned long max_pull; 2291 unsigned long max_pull;
2275 unsigned long busiest_load_per_task, busiest_nr_running; 2292 unsigned long busiest_load_per_task, busiest_nr_running;
2276 unsigned long this_load_per_task, this_nr_running; 2293 unsigned long this_load_per_task, this_nr_running;
2277 int load_idx; 2294 int load_idx;
2278 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2295 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2279 int power_savings_balance = 1; 2296 int power_savings_balance = 1;
2280 unsigned long leader_nr_running = 0, min_load_per_task = 0; 2297 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2281 unsigned long min_nr_running = ULONG_MAX; 2298 unsigned long min_nr_running = ULONG_MAX;
2282 struct sched_group *group_min = NULL, *group_leader = NULL; 2299 struct sched_group *group_min = NULL, *group_leader = NULL;
2283 #endif 2300 #endif
2284 2301
2285 max_load = this_load = total_load = total_pwr = 0; 2302 max_load = this_load = total_load = total_pwr = 0;
2286 busiest_load_per_task = busiest_nr_running = 0; 2303 busiest_load_per_task = busiest_nr_running = 0;
2287 this_load_per_task = this_nr_running = 0; 2304 this_load_per_task = this_nr_running = 0;
2288 if (idle == CPU_NOT_IDLE) 2305 if (idle == CPU_NOT_IDLE)
2289 load_idx = sd->busy_idx; 2306 load_idx = sd->busy_idx;
2290 else if (idle == CPU_NEWLY_IDLE) 2307 else if (idle == CPU_NEWLY_IDLE)
2291 load_idx = sd->newidle_idx; 2308 load_idx = sd->newidle_idx;
2292 else 2309 else
2293 load_idx = sd->idle_idx; 2310 load_idx = sd->idle_idx;
2294 2311
2295 do { 2312 do {
2296 unsigned long load, group_capacity; 2313 unsigned long load, group_capacity;
2297 int local_group; 2314 int local_group;
2298 int i; 2315 int i;
2299 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2316 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2300 unsigned long sum_nr_running, sum_weighted_load; 2317 unsigned long sum_nr_running, sum_weighted_load;
2301 2318
2302 local_group = cpu_isset(this_cpu, group->cpumask); 2319 local_group = cpu_isset(this_cpu, group->cpumask);
2303 2320
2304 if (local_group) 2321 if (local_group)
2305 balance_cpu = first_cpu(group->cpumask); 2322 balance_cpu = first_cpu(group->cpumask);
2306 2323
2307 /* Tally up the load of all CPUs in the group */ 2324 /* Tally up the load of all CPUs in the group */
2308 sum_weighted_load = sum_nr_running = avg_load = 0; 2325 sum_weighted_load = sum_nr_running = avg_load = 0;
2309 2326
2310 for_each_cpu_mask(i, group->cpumask) { 2327 for_each_cpu_mask(i, group->cpumask) {
2311 struct rq *rq; 2328 struct rq *rq;
2312 2329
2313 if (!cpu_isset(i, *cpus)) 2330 if (!cpu_isset(i, *cpus))
2314 continue; 2331 continue;
2315 2332
2316 rq = cpu_rq(i); 2333 rq = cpu_rq(i);
2317 2334
2318 if (*sd_idle && rq->nr_running) 2335 if (*sd_idle && rq->nr_running)
2319 *sd_idle = 0; 2336 *sd_idle = 0;
2320 2337
2321 /* Bias balancing toward cpus of our domain */ 2338 /* Bias balancing toward cpus of our domain */
2322 if (local_group) { 2339 if (local_group) {
2323 if (idle_cpu(i) && !first_idle_cpu) { 2340 if (idle_cpu(i) && !first_idle_cpu) {
2324 first_idle_cpu = 1; 2341 first_idle_cpu = 1;
2325 balance_cpu = i; 2342 balance_cpu = i;
2326 } 2343 }
2327 2344
2328 load = target_load(i, load_idx); 2345 load = target_load(i, load_idx);
2329 } else 2346 } else
2330 load = source_load(i, load_idx); 2347 load = source_load(i, load_idx);
2331 2348
2332 avg_load += load; 2349 avg_load += load;
2333 sum_nr_running += rq->nr_running; 2350 sum_nr_running += rq->nr_running;
2334 sum_weighted_load += weighted_cpuload(i); 2351 sum_weighted_load += weighted_cpuload(i);
2335 } 2352 }
2336 2353
2337 /* 2354 /*
2338 * First idle cpu or the first cpu (busiest) in this sched group 2355 * First idle cpu or the first cpu (busiest) in this sched group
2339 * is eligible for doing load balancing at this and above 2356 * is eligible for doing load balancing at this and above
2340 * domains. In the newly idle case, we will allow all the cpus 2357 * domains. In the newly idle case, we will allow all the cpus
2341 * to do the newly idle load balance. 2358 * to do the newly idle load balance.
2342 */ 2359 */
2343 if (idle != CPU_NEWLY_IDLE && local_group && 2360 if (idle != CPU_NEWLY_IDLE && local_group &&
2344 balance_cpu != this_cpu && balance) { 2361 balance_cpu != this_cpu && balance) {
2345 *balance = 0; 2362 *balance = 0;
2346 goto ret; 2363 goto ret;
2347 } 2364 }
2348 2365
2349 total_load += avg_load; 2366 total_load += avg_load;
2350 total_pwr += group->__cpu_power; 2367 total_pwr += group->__cpu_power;
2351 2368
2352 /* Adjust by relative CPU power of the group */ 2369 /* Adjust by relative CPU power of the group */
2353 avg_load = sg_div_cpu_power(group, 2370 avg_load = sg_div_cpu_power(group,
2354 avg_load * SCHED_LOAD_SCALE); 2371 avg_load * SCHED_LOAD_SCALE);
2355 2372
2356 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 2373 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2357 2374
2358 if (local_group) { 2375 if (local_group) {
2359 this_load = avg_load; 2376 this_load = avg_load;
2360 this = group; 2377 this = group;
2361 this_nr_running = sum_nr_running; 2378 this_nr_running = sum_nr_running;
2362 this_load_per_task = sum_weighted_load; 2379 this_load_per_task = sum_weighted_load;
2363 } else if (avg_load > max_load && 2380 } else if (avg_load > max_load &&
2364 sum_nr_running > group_capacity) { 2381 sum_nr_running > group_capacity) {
2365 max_load = avg_load; 2382 max_load = avg_load;
2366 busiest = group; 2383 busiest = group;
2367 busiest_nr_running = sum_nr_running; 2384 busiest_nr_running = sum_nr_running;
2368 busiest_load_per_task = sum_weighted_load; 2385 busiest_load_per_task = sum_weighted_load;
2369 } 2386 }
2370 2387
2371 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2388 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2372 /* 2389 /*
2373 * Busy processors will not participate in power savings 2390 * Busy processors will not participate in power savings
2374 * balance. 2391 * balance.
2375 */ 2392 */
2376 if (idle == CPU_NOT_IDLE || 2393 if (idle == CPU_NOT_IDLE ||
2377 !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2394 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2378 goto group_next; 2395 goto group_next;
2379 2396
2380 /* 2397 /*
2381 * If the local group is idle or completely loaded 2398 * If the local group is idle or completely loaded
2382 * no need to do power savings balance at this domain 2399 * no need to do power savings balance at this domain
2383 */ 2400 */
2384 if (local_group && (this_nr_running >= group_capacity || 2401 if (local_group && (this_nr_running >= group_capacity ||
2385 !this_nr_running)) 2402 !this_nr_running))
2386 power_savings_balance = 0; 2403 power_savings_balance = 0;
2387 2404
2388 /* 2405 /*
2389 * If a group is already running at full capacity or idle, 2406 * If a group is already running at full capacity or idle,
2390 * don't include that group in power savings calculations 2407 * don't include that group in power savings calculations
2391 */ 2408 */
2392 if (!power_savings_balance || sum_nr_running >= group_capacity 2409 if (!power_savings_balance || sum_nr_running >= group_capacity
2393 || !sum_nr_running) 2410 || !sum_nr_running)
2394 goto group_next; 2411 goto group_next;
2395 2412
2396 /* 2413 /*
2397 * Calculate the group which has the least non-idle load. 2414 * Calculate the group which has the least non-idle load.
2398 * This is the group from where we need to pick up the load 2415 * This is the group from where we need to pick up the load
2399 * for saving power 2416 * for saving power
2400 */ 2417 */
2401 if ((sum_nr_running < min_nr_running) || 2418 if ((sum_nr_running < min_nr_running) ||
2402 (sum_nr_running == min_nr_running && 2419 (sum_nr_running == min_nr_running &&
2403 first_cpu(group->cpumask) < 2420 first_cpu(group->cpumask) <
2404 first_cpu(group_min->cpumask))) { 2421 first_cpu(group_min->cpumask))) {
2405 group_min = group; 2422 group_min = group;
2406 min_nr_running = sum_nr_running; 2423 min_nr_running = sum_nr_running;
2407 min_load_per_task = sum_weighted_load / 2424 min_load_per_task = sum_weighted_load /
2408 sum_nr_running; 2425 sum_nr_running;
2409 } 2426 }
2410 2427
2411 /* 2428 /*
2412 * Calculate the group which is near its 2429 * Calculate the group which is near its
2413 * capacity but still has some space to pick up some load 2430 * capacity but still has some space to pick up some load
2414 * from another group and save more power 2431 * from another group and save more power
2415 */ 2432 */
2416 if (sum_nr_running <= group_capacity - 1) { 2433 if (sum_nr_running <= group_capacity - 1) {
2417 if (sum_nr_running > leader_nr_running || 2434 if (sum_nr_running > leader_nr_running ||
2418 (sum_nr_running == leader_nr_running && 2435 (sum_nr_running == leader_nr_running &&
2419 first_cpu(group->cpumask) > 2436 first_cpu(group->cpumask) >
2420 first_cpu(group_leader->cpumask))) { 2437 first_cpu(group_leader->cpumask))) {
2421 group_leader = group; 2438 group_leader = group;
2422 leader_nr_running = sum_nr_running; 2439 leader_nr_running = sum_nr_running;
2423 } 2440 }
2424 } 2441 }
2425 group_next: 2442 group_next:
2426 #endif 2443 #endif
2427 group = group->next; 2444 group = group->next;
2428 } while (group != sd->groups); 2445 } while (group != sd->groups);
2429 2446
2430 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 2447 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2431 goto out_balanced; 2448 goto out_balanced;
2432 2449
2433 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2450 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2434 2451
2435 if (this_load >= avg_load || 2452 if (this_load >= avg_load ||
2436 100*max_load <= sd->imbalance_pct*this_load) 2453 100*max_load <= sd->imbalance_pct*this_load)
2437 goto out_balanced; 2454 goto out_balanced;
2438 2455
2439 busiest_load_per_task /= busiest_nr_running; 2456 busiest_load_per_task /= busiest_nr_running;
2440 /* 2457 /*
2441 * We're trying to get all the cpus to the average_load, so we don't 2458 * We're trying to get all the cpus to the average_load, so we don't
2442 * want to push ourselves above the average load, nor do we wish to 2459 * want to push ourselves above the average load, nor do we wish to
2443 * reduce the max loaded cpu below the average load, as either of these 2460 * reduce the max loaded cpu below the average load, as either of these
2444 * actions would just result in more rebalancing later, and ping-pong 2461 * actions would just result in more rebalancing later, and ping-pong
2445 * tasks around. Thus we look for the minimum possible imbalance. 2462 * tasks around. Thus we look for the minimum possible imbalance.
2446 * Negative imbalances (*we* are more loaded than anyone else) will 2463 * Negative imbalances (*we* are more loaded than anyone else) will
2447 * be counted as no imbalance for these purposes -- we can't fix that 2464 * be counted as no imbalance for these purposes -- we can't fix that
2448 * by pulling tasks to us. Be careful of negative numbers as they'll 2465 * by pulling tasks to us. Be careful of negative numbers as they'll
2449 * appear as very large values with unsigned longs. 2466 * appear as very large values with unsigned longs.
2450 */ 2467 */
2451 if (max_load <= busiest_load_per_task) 2468 if (max_load <= busiest_load_per_task)
2452 goto out_balanced; 2469 goto out_balanced;
2453 2470
2454 /* 2471 /*
2455 * In the presence of smp nice balancing, certain scenarios can have 2472 * In the presence of smp nice balancing, certain scenarios can have
2456 * max load less than avg load(as we skip the groups at or below 2473 * max load less than avg load(as we skip the groups at or below
2457 * its cpu_power, while calculating max_load..) 2474 * its cpu_power, while calculating max_load..)
2458 */ 2475 */
2459 if (max_load < avg_load) { 2476 if (max_load < avg_load) {
2460 *imbalance = 0; 2477 *imbalance = 0;
2461 goto small_imbalance; 2478 goto small_imbalance;
2462 } 2479 }
2463 2480
2464 /* Don't want to pull so many tasks that a group would go idle */ 2481 /* Don't want to pull so many tasks that a group would go idle */
2465 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2482 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2466 2483
2467 /* How much load to actually move to equalise the imbalance */ 2484 /* How much load to actually move to equalise the imbalance */
2468 *imbalance = min(max_pull * busiest->__cpu_power, 2485 *imbalance = min(max_pull * busiest->__cpu_power,
2469 (avg_load - this_load) * this->__cpu_power) 2486 (avg_load - this_load) * this->__cpu_power)
2470 / SCHED_LOAD_SCALE; 2487 / SCHED_LOAD_SCALE;
2471 2488
2472 /* 2489 /*
2473 * if *imbalance is less than the average load per runnable task 2490 * if *imbalance is less than the average load per runnable task
2474 * there is no guarantee that any tasks will be moved, so we'll have 2491 * there is no guarantee that any tasks will be moved, so we'll have
2475 * a think about bumping its value to force at least one task to be 2492 * a think about bumping its value to force at least one task to be
2476 * moved 2493 * moved
2477 */ 2494 */
2478 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { 2495 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2479 unsigned long tmp, pwr_now, pwr_move; 2496 unsigned long tmp, pwr_now, pwr_move;
2480 unsigned int imbn; 2497 unsigned int imbn;
2481 2498
2482 small_imbalance: 2499 small_imbalance:
2483 pwr_move = pwr_now = 0; 2500 pwr_move = pwr_now = 0;
2484 imbn = 2; 2501 imbn = 2;
2485 if (this_nr_running) { 2502 if (this_nr_running) {
2486 this_load_per_task /= this_nr_running; 2503 this_load_per_task /= this_nr_running;
2487 if (busiest_load_per_task > this_load_per_task) 2504 if (busiest_load_per_task > this_load_per_task)
2488 imbn = 1; 2505 imbn = 1;
2489 } else 2506 } else
2490 this_load_per_task = SCHED_LOAD_SCALE; 2507 this_load_per_task = SCHED_LOAD_SCALE;
2491 2508
2492 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 2509 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2493 busiest_load_per_task * imbn) { 2510 busiest_load_per_task * imbn) {
2494 *imbalance = busiest_load_per_task; 2511 *imbalance = busiest_load_per_task;
2495 return busiest; 2512 return busiest;
2496 } 2513 }
2497 2514
2498 /* 2515 /*
2499 * OK, we don't have enough imbalance to justify moving tasks, 2516 * OK, we don't have enough imbalance to justify moving tasks,
2500 * however we may be able to increase total CPU power used by 2517 * however we may be able to increase total CPU power used by
2501 * moving them. 2518 * moving them.
2502 */ 2519 */
2503 2520
2504 pwr_now += busiest->__cpu_power * 2521 pwr_now += busiest->__cpu_power *
2505 min(busiest_load_per_task, max_load); 2522 min(busiest_load_per_task, max_load);
2506 pwr_now += this->__cpu_power * 2523 pwr_now += this->__cpu_power *
2507 min(this_load_per_task, this_load); 2524 min(this_load_per_task, this_load);
2508 pwr_now /= SCHED_LOAD_SCALE; 2525 pwr_now /= SCHED_LOAD_SCALE;
2509 2526
2510 /* Amount of load we'd subtract */ 2527 /* Amount of load we'd subtract */
2511 tmp = sg_div_cpu_power(busiest, 2528 tmp = sg_div_cpu_power(busiest,
2512 busiest_load_per_task * SCHED_LOAD_SCALE); 2529 busiest_load_per_task * SCHED_LOAD_SCALE);
2513 if (max_load > tmp) 2530 if (max_load > tmp)
2514 pwr_move += busiest->__cpu_power * 2531 pwr_move += busiest->__cpu_power *
2515 min(busiest_load_per_task, max_load - tmp); 2532 min(busiest_load_per_task, max_load - tmp);
2516 2533
2517 /* Amount of load we'd add */ 2534 /* Amount of load we'd add */
2518 if (max_load * busiest->__cpu_power < 2535 if (max_load * busiest->__cpu_power <
2519 busiest_load_per_task * SCHED_LOAD_SCALE) 2536 busiest_load_per_task * SCHED_LOAD_SCALE)
2520 tmp = sg_div_cpu_power(this, 2537 tmp = sg_div_cpu_power(this,
2521 max_load * busiest->__cpu_power); 2538 max_load * busiest->__cpu_power);
2522 else 2539 else
2523 tmp = sg_div_cpu_power(this, 2540 tmp = sg_div_cpu_power(this,
2524 busiest_load_per_task * SCHED_LOAD_SCALE); 2541 busiest_load_per_task * SCHED_LOAD_SCALE);
2525 pwr_move += this->__cpu_power * 2542 pwr_move += this->__cpu_power *
2526 min(this_load_per_task, this_load + tmp); 2543 min(this_load_per_task, this_load + tmp);
2527 pwr_move /= SCHED_LOAD_SCALE; 2544 pwr_move /= SCHED_LOAD_SCALE;
2528 2545
2529 /* Move if we gain throughput */ 2546 /* Move if we gain throughput */
2530 if (pwr_move <= pwr_now) 2547 if (pwr_move <= pwr_now)
2531 goto out_balanced; 2548 goto out_balanced;
2532 2549
2533 *imbalance = busiest_load_per_task; 2550 *imbalance = busiest_load_per_task;
2534 } 2551 }
2535 2552
2536 return busiest; 2553 return busiest;
2537 2554
2538 out_balanced: 2555 out_balanced:
2539 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2556 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2540 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2557 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2541 goto ret; 2558 goto ret;
2542 2559
2543 if (this == group_leader && group_leader != group_min) { 2560 if (this == group_leader && group_leader != group_min) {
2544 *imbalance = min_load_per_task; 2561 *imbalance = min_load_per_task;
2545 return group_min; 2562 return group_min;
2546 } 2563 }
2547 #endif 2564 #endif
2548 ret: 2565 ret:
2549 *imbalance = 0; 2566 *imbalance = 0;
2550 return NULL; 2567 return NULL;
2551 } 2568 }
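
To get a feel for the imbalance calculation above, here is a tiny standalone example with made-up numbers, assuming SCHED_LOAD_SCALE is 1024 and both groups have __cpu_power equal to 1024:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* assumed value of the kernel constant */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Made-up numbers: one overloaded group, one lightly loaded group. */
	unsigned long max_load = 2048, this_load = 512, avg_load = 1280;
	unsigned long busiest_load_per_task = 1024;
	unsigned long busiest_power = SCHED_LOAD_SCALE;
	unsigned long this_power = SCHED_LOAD_SCALE;

	/* Don't pull so much that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* Same expression as in find_busiest_group() above */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
				  / SCHED_LOAD_SCALE;

	/* Prints: max_pull = 768, imbalance = 768 */
	printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);

	return 0;
}
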
2552 2569
2553 /* 2570 /*
2554 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2571 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2555 */ 2572 */
2556 static struct rq * 2573 static struct rq *
2557 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2574 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2558 unsigned long imbalance, cpumask_t *cpus) 2575 unsigned long imbalance, cpumask_t *cpus)
2559 { 2576 {
2560 struct rq *busiest = NULL, *rq; 2577 struct rq *busiest = NULL, *rq;
2561 unsigned long max_load = 0; 2578 unsigned long max_load = 0;
2562 int i; 2579 int i;
2563 2580
2564 for_each_cpu_mask(i, group->cpumask) { 2581 for_each_cpu_mask(i, group->cpumask) {
2565 unsigned long wl; 2582 unsigned long wl;
2566 2583
2567 if (!cpu_isset(i, *cpus)) 2584 if (!cpu_isset(i, *cpus))
2568 continue; 2585 continue;
2569 2586
2570 rq = cpu_rq(i); 2587 rq = cpu_rq(i);
2571 wl = weighted_cpuload(i); 2588 wl = weighted_cpuload(i);
2572 2589
2573 if (rq->nr_running == 1 && wl > imbalance) 2590 if (rq->nr_running == 1 && wl > imbalance)
2574 continue; 2591 continue;
2575 2592
2576 if (wl > max_load) { 2593 if (wl > max_load) {
2577 max_load = wl; 2594 max_load = wl;
2578 busiest = rq; 2595 busiest = rq;
2579 } 2596 }
2580 } 2597 }
2581 2598
2582 return busiest; 2599 return busiest;
2583 } 2600 }
2584 2601
2585 /* 2602 /*
2586 * Max backoff if we encounter pinned tasks. The value is pretty arbitrary; 2603 * Max backoff if we encounter pinned tasks. The value is pretty arbitrary;
2587 * it just needs to be large enough. 2604 * it just needs to be large enough.
2588 */ 2605 */
2589 #define MAX_PINNED_INTERVAL 512 2606 #define MAX_PINNED_INTERVAL 512
2590 2607
2591 static inline unsigned long minus_1_or_zero(unsigned long n)
2592 {
2593 return n > 0 ? n - 1 : 0;
2594 }
2595
2596 /* 2608 /*
2597 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2609 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2598 * tasks if there is an imbalance. 2610 * tasks if there is an imbalance.
2599 */ 2611 */
2600 static int load_balance(int this_cpu, struct rq *this_rq, 2612 static int load_balance(int this_cpu, struct rq *this_rq,
2601 struct sched_domain *sd, enum cpu_idle_type idle, 2613 struct sched_domain *sd, enum cpu_idle_type idle,
2602 int *balance) 2614 int *balance)
2603 { 2615 {
2604 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2616 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2605 struct sched_group *group; 2617 struct sched_group *group;
2606 unsigned long imbalance; 2618 unsigned long imbalance;
2607 struct rq *busiest; 2619 struct rq *busiest;
2608 cpumask_t cpus = CPU_MASK_ALL; 2620 cpumask_t cpus = CPU_MASK_ALL;
2609 unsigned long flags; 2621 unsigned long flags;
2610 2622
2611 /* 2623 /*
2612 * When power savings policy is enabled for the parent domain, an idle 2624 * When power savings policy is enabled for the parent domain, an idle
2613 * sibling can pick up load irrespective of busy siblings. In this case, 2625 * sibling can pick up load irrespective of busy siblings. In this case,
2614 * let the state of the idle sibling percolate up as CPU_IDLE, instead of 2626 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
2615 * portraying it as CPU_NOT_IDLE. 2627 * portraying it as CPU_NOT_IDLE.
2616 */ 2628 */
2617 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2629 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2618 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2630 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2619 sd_idle = 1; 2631 sd_idle = 1;
2620 2632
2621 schedstat_inc(sd, lb_cnt[idle]); 2633 schedstat_inc(sd, lb_cnt[idle]);
2622 2634
2623 redo: 2635 redo:
2624 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2636 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2625 &cpus, balance); 2637 &cpus, balance);
2626 2638
2627 if (*balance == 0) 2639 if (*balance == 0)
2628 goto out_balanced; 2640 goto out_balanced;
2629 2641
2630 if (!group) { 2642 if (!group) {
2631 schedstat_inc(sd, lb_nobusyg[idle]); 2643 schedstat_inc(sd, lb_nobusyg[idle]);
2632 goto out_balanced; 2644 goto out_balanced;
2633 } 2645 }
2634 2646
2635 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 2647 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2636 if (!busiest) { 2648 if (!busiest) {
2637 schedstat_inc(sd, lb_nobusyq[idle]); 2649 schedstat_inc(sd, lb_nobusyq[idle]);
2638 goto out_balanced; 2650 goto out_balanced;
2639 } 2651 }
2640 2652
2641 BUG_ON(busiest == this_rq); 2653 BUG_ON(busiest == this_rq);
2642 2654
2643 schedstat_add(sd, lb_imbalance[idle], imbalance); 2655 schedstat_add(sd, lb_imbalance[idle], imbalance);
2644 2656
2645 nr_moved = 0; 2657 ld_moved = 0;
2646 if (busiest->nr_running > 1) { 2658 if (busiest->nr_running > 1) {
2647 /* 2659 /*
2648 * Attempt to move tasks. If find_busiest_group has found 2660 * Attempt to move tasks. If find_busiest_group has found
2649 * an imbalance but busiest->nr_running <= 1, the group is 2661 * an imbalance but busiest->nr_running <= 1, the group is
2650 * still unbalanced. nr_moved simply stays zero, so it is 2662 * still unbalanced. ld_moved simply stays zero, so it is
2651 * correctly treated as an imbalance. 2663 * correctly treated as an imbalance.
2652 */ 2664 */
2653 local_irq_save(flags); 2665 local_irq_save(flags);
2654 double_rq_lock(this_rq, busiest); 2666 double_rq_lock(this_rq, busiest);
2655 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2667 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2656 minus_1_or_zero(busiest->nr_running),
2657 imbalance, sd, idle, &all_pinned); 2668 imbalance, sd, idle, &all_pinned);
2658 double_rq_unlock(this_rq, busiest); 2669 double_rq_unlock(this_rq, busiest);
2659 local_irq_restore(flags); 2670 local_irq_restore(flags);
2660 2671
2661 /* 2672 /*
2662 * some other cpu did the load balance for us. 2673 * some other cpu did the load balance for us.
2663 */ 2674 */
2664 if (nr_moved && this_cpu != smp_processor_id()) 2675 if (ld_moved && this_cpu != smp_processor_id())
2665 resched_cpu(this_cpu); 2676 resched_cpu(this_cpu);
2666 2677
2667 /* All tasks on this runqueue were pinned by CPU affinity */ 2678 /* All tasks on this runqueue were pinned by CPU affinity */
2668 if (unlikely(all_pinned)) { 2679 if (unlikely(all_pinned)) {
2669 cpu_clear(cpu_of(busiest), cpus); 2680 cpu_clear(cpu_of(busiest), cpus);
2670 if (!cpus_empty(cpus)) 2681 if (!cpus_empty(cpus))
2671 goto redo; 2682 goto redo;
2672 goto out_balanced; 2683 goto out_balanced;
2673 } 2684 }
2674 } 2685 }
2675 2686
2676 if (!nr_moved) { 2687 if (!ld_moved) {
2677 schedstat_inc(sd, lb_failed[idle]); 2688 schedstat_inc(sd, lb_failed[idle]);
2678 sd->nr_balance_failed++; 2689 sd->nr_balance_failed++;
2679 2690
2680 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2691 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2681 2692
2682 spin_lock_irqsave(&busiest->lock, flags); 2693 spin_lock_irqsave(&busiest->lock, flags);
2683 2694
2684 /* don't kick the migration_thread, if the curr 2695 /* don't kick the migration_thread, if the curr
2685 * task on busiest cpu can't be moved to this_cpu 2696 * task on busiest cpu can't be moved to this_cpu
2686 */ 2697 */
2687 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2698 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2688 spin_unlock_irqrestore(&busiest->lock, flags); 2699 spin_unlock_irqrestore(&busiest->lock, flags);
2689 all_pinned = 1; 2700 all_pinned = 1;
2690 goto out_one_pinned; 2701 goto out_one_pinned;
2691 } 2702 }
2692 2703
2693 if (!busiest->active_balance) { 2704 if (!busiest->active_balance) {
2694 busiest->active_balance = 1; 2705 busiest->active_balance = 1;
2695 busiest->push_cpu = this_cpu; 2706 busiest->push_cpu = this_cpu;
2696 active_balance = 1; 2707 active_balance = 1;
2697 } 2708 }
2698 spin_unlock_irqrestore(&busiest->lock, flags); 2709 spin_unlock_irqrestore(&busiest->lock, flags);
2699 if (active_balance) 2710 if (active_balance)
2700 wake_up_process(busiest->migration_thread); 2711 wake_up_process(busiest->migration_thread);
2701 2712
2702 /* 2713 /*
2703 * We've kicked active balancing, reset the failure 2714 * We've kicked active balancing, reset the failure
2704 * counter. 2715 * counter.
2705 */ 2716 */
2706 sd->nr_balance_failed = sd->cache_nice_tries+1; 2717 sd->nr_balance_failed = sd->cache_nice_tries+1;
2707 } 2718 }
2708 } else 2719 } else
2709 sd->nr_balance_failed = 0; 2720 sd->nr_balance_failed = 0;
2710 2721
2711 if (likely(!active_balance)) { 2722 if (likely(!active_balance)) {
2712 /* We were unbalanced, so reset the balancing interval */ 2723 /* We were unbalanced, so reset the balancing interval */
2713 sd->balance_interval = sd->min_interval; 2724 sd->balance_interval = sd->min_interval;
2714 } else { 2725 } else {
2715 /* 2726 /*
2716 * If we've begun active balancing, start to back off. This 2727 * If we've begun active balancing, start to back off. This
2717 * case may not be covered by the all_pinned logic if there 2728 * case may not be covered by the all_pinned logic if there
2718 * is only 1 task on the busy runqueue (because we don't call 2729 * is only 1 task on the busy runqueue (because we don't call
2719 * move_tasks). 2730 * move_tasks).
2720 */ 2731 */
2721 if (sd->balance_interval < sd->max_interval) 2732 if (sd->balance_interval < sd->max_interval)
2722 sd->balance_interval *= 2; 2733 sd->balance_interval *= 2;
2723 } 2734 }
2724 2735
2725 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2736 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2726 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2737 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2727 return -1; 2738 return -1;
2728 return nr_moved; 2739 return ld_moved;
2729 2740
2730 out_balanced: 2741 out_balanced:
2731 schedstat_inc(sd, lb_balanced[idle]); 2742 schedstat_inc(sd, lb_balanced[idle]);
2732 2743
2733 sd->nr_balance_failed = 0; 2744 sd->nr_balance_failed = 0;
2734 2745
2735 out_one_pinned: 2746 out_one_pinned:
2736 /* tune up the balancing interval */ 2747 /* tune up the balancing interval */
2737 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2748 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2738 (sd->balance_interval < sd->max_interval)) 2749 (sd->balance_interval < sd->max_interval))
2739 sd->balance_interval *= 2; 2750 sd->balance_interval *= 2;
2740 2751
2741 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2752 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2742 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2753 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2743 return -1; 2754 return -1;
2744 return 0; 2755 return 0;
2745 } 2756 }
2746 2757
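The interval handling at the end of load_balance() follows a simple backoff pattern: a pass that balanced without needing active balancing resets balance_interval to the domain's minimum, while a pass that had to kick the migration thread, or that found everything pinned, doubles it up to max_interval (or MAX_PINNED_INTERVAL for the pinned case). A deliberately simplified stand-alone sketch of the shape of that policy, with hypothetical names (the real conditions are a bit more involved, as the code above shows):

/* Simplified sketch of the balance_interval policy: poll quickly again
 * after an uneventful successful pass, back off exponentially (up to a
 * ceiling) when balancing keeps failing or everything is pinned. */
static unsigned long next_balance_interval(unsigned long interval,
                                           int back_off,
                                           unsigned long min_interval,
                                           unsigned long max_interval)
{
        if (!back_off)
                return min_interval;    /* balanced without heroics */
        if (interval < max_interval)
                interval *= 2;          /* pinned tasks / active balance */
        return interval;
}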
2747 /* 2758 /*
2748 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2759 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2749 * tasks if there is an imbalance. 2760 * tasks if there is an imbalance.
2750 * 2761 *
2751 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). 2762 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2752 * this_rq is locked. 2763 * this_rq is locked.
2753 */ 2764 */
2754 static int 2765 static int
2755 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 2766 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2756 { 2767 {
2757 struct sched_group *group; 2768 struct sched_group *group;
2758 struct rq *busiest = NULL; 2769 struct rq *busiest = NULL;
2759 unsigned long imbalance; 2770 unsigned long imbalance;
2760 int nr_moved = 0; 2771 int ld_moved = 0;
2761 int sd_idle = 0; 2772 int sd_idle = 0;
2762 int all_pinned = 0; 2773 int all_pinned = 0;
2763 cpumask_t cpus = CPU_MASK_ALL; 2774 cpumask_t cpus = CPU_MASK_ALL;
2764 2775
2765 /* 2776 /*
2766 * When power savings policy is enabled for the parent domain, idle 2777 * When power savings policy is enabled for the parent domain, idle
2767 * sibling can pick up load irrespective of busy siblings. In this case, 2778 * sibling can pick up load irrespective of busy siblings. In this case,
2768 * let the state of idle sibling percolate up as IDLE, instead of 2779 * let the state of idle sibling percolate up as IDLE, instead of
2769 * portraying it as CPU_NOT_IDLE. 2780 * portraying it as CPU_NOT_IDLE.
2770 */ 2781 */
2771 if (sd->flags & SD_SHARE_CPUPOWER && 2782 if (sd->flags & SD_SHARE_CPUPOWER &&
2772 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2783 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2773 sd_idle = 1; 2784 sd_idle = 1;
2774 2785
2775 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); 2786 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2776 redo: 2787 redo:
2777 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 2788 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2778 &sd_idle, &cpus, NULL); 2789 &sd_idle, &cpus, NULL);
2779 if (!group) { 2790 if (!group) {
2780 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 2791 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2781 goto out_balanced; 2792 goto out_balanced;
2782 } 2793 }
2783 2794
2784 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, 2795 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2785 &cpus); 2796 &cpus);
2786 if (!busiest) { 2797 if (!busiest) {
2787 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 2798 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2788 goto out_balanced; 2799 goto out_balanced;
2789 } 2800 }
2790 2801
2791 BUG_ON(busiest == this_rq); 2802 BUG_ON(busiest == this_rq);
2792 2803
2793 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 2804 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2794 2805
2795 nr_moved = 0; 2806 ld_moved = 0;
2796 if (busiest->nr_running > 1) { 2807 if (busiest->nr_running > 1) {
2797 /* Attempt to move tasks */ 2808 /* Attempt to move tasks */
2798 double_lock_balance(this_rq, busiest); 2809 double_lock_balance(this_rq, busiest);
2799 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2810 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2800 minus_1_or_zero(busiest->nr_running),
2801 imbalance, sd, CPU_NEWLY_IDLE, 2811 imbalance, sd, CPU_NEWLY_IDLE,
2802 &all_pinned); 2812 &all_pinned);
2803 spin_unlock(&busiest->lock); 2813 spin_unlock(&busiest->lock);
2804 2814
2805 if (unlikely(all_pinned)) { 2815 if (unlikely(all_pinned)) {
2806 cpu_clear(cpu_of(busiest), cpus); 2816 cpu_clear(cpu_of(busiest), cpus);
2807 if (!cpus_empty(cpus)) 2817 if (!cpus_empty(cpus))
2808 goto redo; 2818 goto redo;
2809 } 2819 }
2810 } 2820 }
2811 2821
2812 if (!nr_moved) { 2822 if (!ld_moved) {
2813 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 2823 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2814 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2824 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2815 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2825 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2816 return -1; 2826 return -1;
2817 } else 2827 } else
2818 sd->nr_balance_failed = 0; 2828 sd->nr_balance_failed = 0;
2819 2829
2820 return nr_moved; 2830 return ld_moved;
2821 2831
2822 out_balanced: 2832 out_balanced:
2823 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 2833 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2824 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2834 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2825 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2835 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2826 return -1; 2836 return -1;
2827 sd->nr_balance_failed = 0; 2837 sd->nr_balance_failed = 0;
2828 2838
2829 return 0; 2839 return 0;
2830 } 2840 }
2831 2841
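load_balance_newidle() runs with this_rq already locked, so it uses double_lock_balance() rather than double_rq_lock(); both helpers avoid an ABBA deadlock by imposing an order on the two runqueue locks (dropping and re-taking one if needed). The ordering idea on its own, sketched in user space with POSIX mutexes and an address-based rule (names invented here):

#include <pthread.h>
#include <stdint.h>

/* Take two locks in a globally consistent (address) order so that two
 * threads locking the same pair can never deadlock against each other. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}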
2832 /* 2842 /*
2833 * idle_balance is called by schedule() if this_cpu is about to become 2843 * idle_balance is called by schedule() if this_cpu is about to become
2834 * idle. Attempts to pull tasks from other CPUs. 2844 * idle. Attempts to pull tasks from other CPUs.
2835 */ 2845 */
2836 static void idle_balance(int this_cpu, struct rq *this_rq) 2846 static void idle_balance(int this_cpu, struct rq *this_rq)
2837 { 2847 {
2838 struct sched_domain *sd; 2848 struct sched_domain *sd;
2839 int pulled_task = -1; 2849 int pulled_task = -1;
2840 unsigned long next_balance = jiffies + HZ; 2850 unsigned long next_balance = jiffies + HZ;
2841 2851
2842 for_each_domain(this_cpu, sd) { 2852 for_each_domain(this_cpu, sd) {
2843 unsigned long interval; 2853 unsigned long interval;
2844 2854
2845 if (!(sd->flags & SD_LOAD_BALANCE)) 2855 if (!(sd->flags & SD_LOAD_BALANCE))
2846 continue; 2856 continue;
2847 2857
2848 if (sd->flags & SD_BALANCE_NEWIDLE) 2858 if (sd->flags & SD_BALANCE_NEWIDLE)
2849 /* If we've pulled tasks over stop searching: */ 2859 /* If we've pulled tasks over stop searching: */
2850 pulled_task = load_balance_newidle(this_cpu, 2860 pulled_task = load_balance_newidle(this_cpu,
2851 this_rq, sd); 2861 this_rq, sd);
2852 2862
2853 interval = msecs_to_jiffies(sd->balance_interval); 2863 interval = msecs_to_jiffies(sd->balance_interval);
2854 if (time_after(next_balance, sd->last_balance + interval)) 2864 if (time_after(next_balance, sd->last_balance + interval))
2855 next_balance = sd->last_balance + interval; 2865 next_balance = sd->last_balance + interval;
2856 if (pulled_task) 2866 if (pulled_task)
2857 break; 2867 break;
2858 } 2868 }
2859 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 2869 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2860 /* 2870 /*
2861 * We are going idle. next_balance may be set based on 2871 * We are going idle. next_balance may be set based on
2862 * a busy processor. So reset next_balance. 2872 * a busy processor. So reset next_balance.
2863 */ 2873 */
2864 this_rq->next_balance = next_balance; 2874 this_rq->next_balance = next_balance;
2865 } 2875 }
2866 } 2876 }
2867 2877
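idle_balance() also tracks the earliest moment any domain will want balancing again, comparing jiffies values with time_after(). That comparison is wrap-safe because it goes through a signed subtraction rather than a plain '<'. A stand-alone version of the trick for a free-running 32-bit counter (sketch, not the kernel macro):

#include <stdint.h>

/* Wrap-safe "a is later than b" for a free-running counter: the signed
 * difference stays correct as long as the two values are less than half
 * the counter range apart. */
static int tick_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;
}

/* picking the earlier of two future deadlines, as done for next_balance */
static uint32_t earliest(uint32_t x, uint32_t y)
{
        return tick_after(x, y) ? y : x;
}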
2868 /* 2878 /*
2869 * active_load_balance is run by migration threads. It pushes running tasks 2879 * active_load_balance is run by migration threads. It pushes running tasks
2870 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2880 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2871 * running on each physical CPU where possible, and avoids physical / 2881 * running on each physical CPU where possible, and avoids physical /
2872 * logical imbalances. 2882 * logical imbalances.
2873 * 2883 *
2874 * Called with busiest_rq locked. 2884 * Called with busiest_rq locked.
2875 */ 2885 */
2876 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 2886 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2877 { 2887 {
2878 int target_cpu = busiest_rq->push_cpu; 2888 int target_cpu = busiest_rq->push_cpu;
2879 struct sched_domain *sd; 2889 struct sched_domain *sd;
2880 struct rq *target_rq; 2890 struct rq *target_rq;
2881 2891
2882 /* Is there any task to move? */ 2892 /* Is there any task to move? */
2883 if (busiest_rq->nr_running <= 1) 2893 if (busiest_rq->nr_running <= 1)
2884 return; 2894 return;
2885 2895
2886 target_rq = cpu_rq(target_cpu); 2896 target_rq = cpu_rq(target_cpu);
2887 2897
2888 /* 2898 /*
2889 * This condition is "impossible", if it occurs 2899 * This condition is "impossible", if it occurs
2890 * we need to fix it. Originally reported by 2900 * we need to fix it. Originally reported by
2891 * Bjorn Helgaas on a 128-cpu setup. 2901 * Bjorn Helgaas on a 128-cpu setup.
2892 */ 2902 */
2893 BUG_ON(busiest_rq == target_rq); 2903 BUG_ON(busiest_rq == target_rq);
2894 2904
2895 /* move a task from busiest_rq to target_rq */ 2905 /* move a task from busiest_rq to target_rq */
2896 double_lock_balance(busiest_rq, target_rq); 2906 double_lock_balance(busiest_rq, target_rq);
2897 2907
2898 /* Search for an sd spanning us and the target CPU. */ 2908 /* Search for an sd spanning us and the target CPU. */
2899 for_each_domain(target_cpu, sd) { 2909 for_each_domain(target_cpu, sd) {
2900 if ((sd->flags & SD_LOAD_BALANCE) && 2910 if ((sd->flags & SD_LOAD_BALANCE) &&
2901 cpu_isset(busiest_cpu, sd->span)) 2911 cpu_isset(busiest_cpu, sd->span))
2902 break; 2912 break;
2903 } 2913 }
2904 2914
2905 if (likely(sd)) { 2915 if (likely(sd)) {
2906 schedstat_inc(sd, alb_cnt); 2916 schedstat_inc(sd, alb_cnt);
2907 2917
2908 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2918 if (move_one_task(target_rq, target_cpu, busiest_rq,
2909 ULONG_MAX, sd, CPU_IDLE, NULL)) 2919 sd, CPU_IDLE))
2910 schedstat_inc(sd, alb_pushed); 2920 schedstat_inc(sd, alb_pushed);
2911 else 2921 else
2912 schedstat_inc(sd, alb_failed); 2922 schedstat_inc(sd, alb_failed);
2913 } 2923 }
2914 spin_unlock(&target_rq->lock); 2924 spin_unlock(&target_rq->lock);
2915 } 2925 }
2916 2926
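active_load_balance() first has to find a scheduling domain that both allows load balancing and spans the busiest CPU, walking the domain hierarchy from the smallest level outwards. The shape of that search, sketched with a flat array and a plain bitmask standing in for the domain list and cpumask (all names here are invented):

#include <stddef.h>
#include <stdint.h>

#define DOM_LOAD_BALANCE 0x1u   /* stand-in for the SD_LOAD_BALANCE flag */

struct dom {                    /* flattened stand-in for a sched domain */
        unsigned int flags;
        uint64_t span;          /* CPUs covered by this level, as a bitmask */
};

/* Return the first (lowest) level that may balance and contains 'cpu',
 * or NULL if no such level exists. */
static const struct dom *find_spanning_dom(const struct dom *levels, size_t n,
                                           int cpu)
{
        size_t i;

        for (i = 0; i < n; i++)
                if ((levels[i].flags & DOM_LOAD_BALANCE) &&
                    (levels[i].span & (1ULL << cpu)))
                        return &levels[i];
        return NULL;
}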
2917 #ifdef CONFIG_NO_HZ 2927 #ifdef CONFIG_NO_HZ
2918 static struct { 2928 static struct {
2919 atomic_t load_balancer; 2929 atomic_t load_balancer;
2920 cpumask_t cpu_mask; 2930 cpumask_t cpu_mask;
2921 } nohz ____cacheline_aligned = { 2931 } nohz ____cacheline_aligned = {
2922 .load_balancer = ATOMIC_INIT(-1), 2932 .load_balancer = ATOMIC_INIT(-1),
2923 .cpu_mask = CPU_MASK_NONE, 2933 .cpu_mask = CPU_MASK_NONE,
2924 }; 2934 };
2925 2935
2926 /* 2936 /*
2927 * This routine will try to nominate the ilb (idle load balancing) 2937 * This routine will try to nominate the ilb (idle load balancing)
2928 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 2938 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2929 * load balancing on behalf of all those cpus. If all the cpus in the system 2939 * load balancing on behalf of all those cpus. If all the cpus in the system
2930 * go into this tickless mode, then there will be no ilb owner (as there is 2940 * go into this tickless mode, then there will be no ilb owner (as there is
2931 * no need for one) and all the cpus will sleep till the next wakeup event 2941 * no need for one) and all the cpus will sleep till the next wakeup event
2932 * arrives... 2942 * arrives...
2933 * 2943 *
2934 * For the ilb owner, tick is not stopped. And this tick will be used 2944 * For the ilb owner, tick is not stopped. And this tick will be used
2935 * for idle load balancing. ilb owner will still be part of 2945 * for idle load balancing. ilb owner will still be part of
2936 * nohz.cpu_mask. 2946 * nohz.cpu_mask.
2937 * 2947 *
2938 * While stopping the tick, this cpu will become the ilb owner if there 2948 * While stopping the tick, this cpu will become the ilb owner if there
2939 * is no other owner. And will be the owner till that cpu becomes busy 2949 * is no other owner. And will be the owner till that cpu becomes busy
2940 * or if all cpus in the system stop their ticks at which point 2950 * or if all cpus in the system stop their ticks at which point
2941 * there is no need for ilb owner. 2951 * there is no need for ilb owner.
2942 * 2952 *
2943 * When the ilb owner becomes busy, it nominates another owner, during the 2953 * When the ilb owner becomes busy, it nominates another owner, during the
2944 * next busy scheduler_tick() 2954 * next busy scheduler_tick()
2945 */ 2955 */
2946 int select_nohz_load_balancer(int stop_tick) 2956 int select_nohz_load_balancer(int stop_tick)
2947 { 2957 {
2948 int cpu = smp_processor_id(); 2958 int cpu = smp_processor_id();
2949 2959
2950 if (stop_tick) { 2960 if (stop_tick) {
2951 cpu_set(cpu, nohz.cpu_mask); 2961 cpu_set(cpu, nohz.cpu_mask);
2952 cpu_rq(cpu)->in_nohz_recently = 1; 2962 cpu_rq(cpu)->in_nohz_recently = 1;
2953 2963
2954 /* 2964 /*
2955 * If we are going offline and still the leader, give up! 2965 * If we are going offline and still the leader, give up!
2956 */ 2966 */
2957 if (cpu_is_offline(cpu) && 2967 if (cpu_is_offline(cpu) &&
2958 atomic_read(&nohz.load_balancer) == cpu) { 2968 atomic_read(&nohz.load_balancer) == cpu) {
2959 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 2969 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2960 BUG(); 2970 BUG();
2961 return 0; 2971 return 0;
2962 } 2972 }
2963 2973
2964 /* time for ilb owner also to sleep */ 2974 /* time for ilb owner also to sleep */
2965 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 2975 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2966 if (atomic_read(&nohz.load_balancer) == cpu) 2976 if (atomic_read(&nohz.load_balancer) == cpu)
2967 atomic_set(&nohz.load_balancer, -1); 2977 atomic_set(&nohz.load_balancer, -1);
2968 return 0; 2978 return 0;
2969 } 2979 }
2970 2980
2971 if (atomic_read(&nohz.load_balancer) == -1) { 2981 if (atomic_read(&nohz.load_balancer) == -1) {
2972 /* make me the ilb owner */ 2982 /* make me the ilb owner */
2973 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 2983 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2974 return 1; 2984 return 1;
2975 } else if (atomic_read(&nohz.load_balancer) == cpu) 2985 } else if (atomic_read(&nohz.load_balancer) == cpu)
2976 return 1; 2986 return 1;
2977 } else { 2987 } else {
2978 if (!cpu_isset(cpu, nohz.cpu_mask)) 2988 if (!cpu_isset(cpu, nohz.cpu_mask))
2979 return 0; 2989 return 0;
2980 2990
2981 cpu_clear(cpu, nohz.cpu_mask); 2991 cpu_clear(cpu, nohz.cpu_mask);
2982 2992
2983 if (atomic_read(&nohz.load_balancer) == cpu) 2993 if (atomic_read(&nohz.load_balancer) == cpu)
2984 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 2994 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2985 BUG(); 2995 BUG();
2986 } 2996 }
2987 return 0; 2997 return 0;
2988 } 2998 }
2989 #endif 2999 #endif
2990 3000
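The ilb owner bookkeeping above relies on atomic compare-and-exchange so that exactly one CPU can claim or give up the role even when several CPUs race to do so. The core claim/release pattern, sketched with C11 atomics in user space (the names are invented; the code above uses atomic_cmpxchg() on nohz.load_balancer):

#include <stdatomic.h>

static atomic_int ilb_owner = ATOMIC_VAR_INIT(-1);      /* -1: nobody owns it */

/* Claim ownership only if the slot is currently free; returns nonzero
 * on success.  Losing the race leaves the existing owner untouched. */
static int ilb_try_claim(int cpu)
{
        int expected = -1;
        return atomic_compare_exchange_strong(&ilb_owner, &expected, cpu);
}

/* Give the role up only if we actually hold it. */
static void ilb_release(int cpu)
{
        int expected = cpu;
        atomic_compare_exchange_strong(&ilb_owner, &expected, -1);
}

In the kernel code a failed release is treated as a bug (the BUG() calls above), since a CPU should never try to drop a role it does not hold.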
2991 static DEFINE_SPINLOCK(balancing); 3001 static DEFINE_SPINLOCK(balancing);
2992 3002
2993 /* 3003 /*
2994 * It checks each scheduling domain to see if it is due to be balanced, 3004 * It checks each scheduling domain to see if it is due to be balanced,
2995 * and initiates a balancing operation if so. 3005 * and initiates a balancing operation if so.
2996 * 3006 *
2997 * Balancing parameters are set up in arch_init_sched_domains. 3007 * Balancing parameters are set up in arch_init_sched_domains.
2998 */ 3008 */
2999 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) 3009 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3000 { 3010 {
3001 int balance = 1; 3011 int balance = 1;
3002 struct rq *rq = cpu_rq(cpu); 3012 struct rq *rq = cpu_rq(cpu);
3003 unsigned long interval; 3013 unsigned long interval;
3004 struct sched_domain *sd; 3014 struct sched_domain *sd;
3005 /* Earliest time when we have to do rebalance again */ 3015 /* Earliest time when we have to do rebalance again */
3006 unsigned long next_balance = jiffies + 60*HZ; 3016 unsigned long next_balance = jiffies + 60*HZ;
3007 3017
3008 for_each_domain(cpu, sd) { 3018 for_each_domain(cpu, sd) {
3009 if (!(sd->flags & SD_LOAD_BALANCE)) 3019 if (!(sd->flags & SD_LOAD_BALANCE))
3010 continue; 3020 continue;
3011 3021
3012 interval = sd->balance_interval; 3022 interval = sd->balance_interval;
3013 if (idle != CPU_IDLE) 3023 if (idle != CPU_IDLE)
3014 interval *= sd->busy_factor; 3024 interval *= sd->busy_factor;
3015 3025
3016 /* scale ms to jiffies */ 3026 /* scale ms to jiffies */
3017 interval = msecs_to_jiffies(interval); 3027 interval = msecs_to_jiffies(interval);
3018 if (unlikely(!interval)) 3028 if (unlikely(!interval))
3019 interval = 1; 3029 interval = 1;
3020 if (interval > HZ*NR_CPUS/10) 3030 if (interval > HZ*NR_CPUS/10)
3021 interval = HZ*NR_CPUS/10; 3031 interval = HZ*NR_CPUS/10;
3022 3032
3023 3033
3024 if (sd->flags & SD_SERIALIZE) { 3034 if (sd->flags & SD_SERIALIZE) {
3025 if (!spin_trylock(&balancing)) 3035 if (!spin_trylock(&balancing))
3026 goto out; 3036 goto out;
3027 } 3037 }
3028 3038
3029 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3039 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3030 if (load_balance(cpu, rq, sd, idle, &balance)) { 3040 if (load_balance(cpu, rq, sd, idle, &balance)) {
3031 /* 3041 /*
3032 * We've pulled tasks over so either we're no 3042 * We've pulled tasks over so either we're no
3033 * longer idle, or one of our SMT siblings is 3043 * longer idle, or one of our SMT siblings is
3034 * not idle. 3044 * not idle.
3035 */ 3045 */
3036 idle = CPU_NOT_IDLE; 3046 idle = CPU_NOT_IDLE;
3037 } 3047 }
3038 sd->last_balance = jiffies; 3048 sd->last_balance = jiffies;
3039 } 3049 }
3040 if (sd->flags & SD_SERIALIZE) 3050 if (sd->flags & SD_SERIALIZE)
3041 spin_unlock(&balancing); 3051 spin_unlock(&balancing);
3042 out: 3052 out:
3043 if (time_after(next_balance, sd->last_balance + interval)) 3053 if (time_after(next_balance, sd->last_balance + interval))
3044 next_balance = sd->last_balance + interval; 3054 next_balance = sd->last_balance + interval;
3045 3055
3046 /* 3056 /*
3047 * Stop the load balance at this level. There is another 3057 * Stop the load balance at this level. There is another
3048 * CPU in our sched group which is doing load balancing more 3058 * CPU in our sched group which is doing load balancing more
3049 * actively. 3059 * actively.
3050 */ 3060 */
3051 if (!balance) 3061 if (!balance)
3052 break; 3062 break;
3053 } 3063 }
3054 rq->next_balance = next_balance; 3064 rq->next_balance = next_balance;
3055 } 3065 }
3056 3066
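For domains flagged SD_SERIALIZE, rebalance_domains() lets only one CPU balance at a time and makes everyone else skip the pass rather than wait on the lock. The same try-lock-or-skip pattern in a user-space sketch (hypothetical names, a POSIX mutex instead of the 'balancing' spinlock):

#include <pthread.h>

static pthread_mutex_t serialize_lock = PTHREAD_MUTEX_INITIALIZER;

/* Run the callback only if nobody else is currently in it; never block.
 * Returns 1 if the work was done, 0 if it was skipped. */
static int run_serialized(void (*fn)(void *), void *arg)
{
        if (pthread_mutex_trylock(&serialize_lock) != 0)
                return 0;
        fn(arg);
        pthread_mutex_unlock(&serialize_lock);
        return 1;
}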
3057 /* 3067 /*
3058 * run_rebalance_domains is triggered when needed from the scheduler tick. 3068 * run_rebalance_domains is triggered when needed from the scheduler tick.
3059 * In CONFIG_NO_HZ case, the idle load balance owner will do the 3069 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3060 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3070 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3061 */ 3071 */
3062 static void run_rebalance_domains(struct softirq_action *h) 3072 static void run_rebalance_domains(struct softirq_action *h)
3063 { 3073 {
3064 int this_cpu = smp_processor_id(); 3074 int this_cpu = smp_processor_id();
3065 struct rq *this_rq = cpu_rq(this_cpu); 3075 struct rq *this_rq = cpu_rq(this_cpu);
3066 enum cpu_idle_type idle = this_rq->idle_at_tick ? 3076 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3067 CPU_IDLE : CPU_NOT_IDLE; 3077 CPU_IDLE : CPU_NOT_IDLE;
3068 3078
3069 rebalance_domains(this_cpu, idle); 3079 rebalance_domains(this_cpu, idle);
3070 3080
3071 #ifdef CONFIG_NO_HZ 3081 #ifdef CONFIG_NO_HZ
3072 /* 3082 /*
3073 * If this cpu is the owner for idle load balancing, then do the 3083 * If this cpu is the owner for idle load balancing, then do the
3074 * balancing on behalf of the other idle cpus whose ticks are 3084 * balancing on behalf of the other idle cpus whose ticks are
3075 * stopped. 3085 * stopped.
3076 */ 3086 */
3077 if (this_rq->idle_at_tick && 3087 if (this_rq->idle_at_tick &&
3078 atomic_read(&nohz.load_balancer) == this_cpu) { 3088 atomic_read(&nohz.load_balancer) == this_cpu) {
3079 cpumask_t cpus = nohz.cpu_mask; 3089 cpumask_t cpus = nohz.cpu_mask;
3080 struct rq *rq; 3090 struct rq *rq;
3081 int balance_cpu; 3091 int balance_cpu;
3082 3092
3083 cpu_clear(this_cpu, cpus); 3093 cpu_clear(this_cpu, cpus);
3084 for_each_cpu_mask(balance_cpu, cpus) { 3094 for_each_cpu_mask(balance_cpu, cpus) {
3085 /* 3095 /*
3086 * If this cpu gets work to do, stop the load balancing 3096 * If this cpu gets work to do, stop the load balancing
3087 * work being done for other cpus. Next load 3097 * work being done for other cpus. Next load
3088 * balancing owner will pick it up. 3098 * balancing owner will pick it up.
3089 */ 3099 */
3090 if (need_resched()) 3100 if (need_resched())
3091 break; 3101 break;
3092 3102
3093 rebalance_domains(balance_cpu, CPU_IDLE); 3103 rebalance_domains(balance_cpu, CPU_IDLE);
3094 3104
3095 rq = cpu_rq(balance_cpu); 3105 rq = cpu_rq(balance_cpu);
3096 if (time_after(this_rq->next_balance, rq->next_balance)) 3106 if (time_after(this_rq->next_balance, rq->next_balance))
3097 this_rq->next_balance = rq->next_balance; 3107 this_rq->next_balance = rq->next_balance;
3098 } 3108 }
3099 } 3109 }
3100 #endif 3110 #endif
3101 } 3111 }
3102 3112
3103 /* 3113 /*
3104 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3114 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3105 * 3115 *
3106 * In case of CONFIG_NO_HZ, this is the place where we nominate a new 3116 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3107 * idle load balancing owner or decide to stop the periodic load balancing, 3117 * idle load balancing owner or decide to stop the periodic load balancing,
3108 * if the whole system is idle. 3118 * if the whole system is idle.
3109 */ 3119 */
3110 static inline void trigger_load_balance(struct rq *rq, int cpu) 3120 static inline void trigger_load_balance(struct rq *rq, int cpu)
3111 { 3121 {
3112 #ifdef CONFIG_NO_HZ 3122 #ifdef CONFIG_NO_HZ
3113 /* 3123 /*
3114 * If we were in the nohz mode recently and busy at the current 3124 * If we were in the nohz mode recently and busy at the current
3115 * scheduler tick, then check if we need to nominate a new idle 3125 * scheduler tick, then check if we need to nominate a new idle
3116 * load balancer. 3126 * load balancer.
3117 */ 3127 */
3118 if (rq->in_nohz_recently && !rq->idle_at_tick) { 3128 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3119 rq->in_nohz_recently = 0; 3129 rq->in_nohz_recently = 0;
3120 3130
3121 if (atomic_read(&nohz.load_balancer) == cpu) { 3131 if (atomic_read(&nohz.load_balancer) == cpu) {
3122 cpu_clear(cpu, nohz.cpu_mask); 3132 cpu_clear(cpu, nohz.cpu_mask);
3123 atomic_set(&nohz.load_balancer, -1); 3133 atomic_set(&nohz.load_balancer, -1);
3124 } 3134 }
3125 3135
3126 if (atomic_read(&nohz.load_balancer) == -1) { 3136 if (atomic_read(&nohz.load_balancer) == -1) {
3127 /* 3137 /*
3128 * simple selection for now: Nominate the 3138 * simple selection for now: Nominate the
3129 * first cpu in the nohz list to be the next 3139 * first cpu in the nohz list to be the next
3130 * ilb owner. 3140 * ilb owner.
3131 * 3141 *
3132 * TBD: Traverse the sched domains and nominate 3142 * TBD: Traverse the sched domains and nominate
3133 * the nearest cpu in the nohz.cpu_mask. 3143 * the nearest cpu in the nohz.cpu_mask.
3134 */ 3144 */
3135 int ilb = first_cpu(nohz.cpu_mask); 3145 int ilb = first_cpu(nohz.cpu_mask);
3136 3146
3137 if (ilb != NR_CPUS) 3147 if (ilb != NR_CPUS)
3138 resched_cpu(ilb); 3148 resched_cpu(ilb);
3139 } 3149 }
3140 } 3150 }
3141 3151
3142 /* 3152 /*
3143 * If this cpu is idle and doing idle load balancing for all the 3153 * If this cpu is idle and doing idle load balancing for all the
3144 * cpus with ticks stopped, is it time for that to stop? 3154 * cpus with ticks stopped, is it time for that to stop?
3145 */ 3155 */
3146 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 3156 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3147 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3157 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3148 resched_cpu(cpu); 3158 resched_cpu(cpu);
3149 return; 3159 return;
3150 } 3160 }
3151 3161
3152 /* 3162 /*
3153 * If this cpu is idle and the idle load balancing is done by 3163 * If this cpu is idle and the idle load balancing is done by
3154 * someone else, then there is no need to raise the SCHED_SOFTIRQ 3164 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3155 */ 3165 */
3156 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 3166 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3157 cpu_isset(cpu, nohz.cpu_mask)) 3167 cpu_isset(cpu, nohz.cpu_mask))
3158 return; 3168 return;
3159 #endif 3169 #endif
3160 if (time_after_eq(jiffies, rq->next_balance)) 3170 if (time_after_eq(jiffies, rq->next_balance))
3161 raise_softirq(SCHED_SOFTIRQ); 3171 raise_softirq(SCHED_SOFTIRQ);
3162 } 3172 }
3163 3173
3164 #else /* CONFIG_SMP */ 3174 #else /* CONFIG_SMP */
3165 3175
3166 /* 3176 /*
3167 * on UP we do not need to balance between CPUs: 3177 * on UP we do not need to balance between CPUs:
3168 */ 3178 */
3169 static inline void idle_balance(int cpu, struct rq *rq) 3179 static inline void idle_balance(int cpu, struct rq *rq)
3170 { 3180 {
3171 } 3181 }
3172 3182
3173 /* Avoid "used but not defined" warning on UP */ 3183 /* Avoid "used but not defined" warning on UP */
3174 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3184 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3175 unsigned long max_nr_move, unsigned long max_load_move, 3185 unsigned long max_nr_move, unsigned long max_load_move,
3176 struct sched_domain *sd, enum cpu_idle_type idle, 3186 struct sched_domain *sd, enum cpu_idle_type idle,
3177 int *all_pinned, unsigned long *load_moved, 3187 int *all_pinned, unsigned long *load_moved,
3178 int this_best_prio, int best_prio, int best_prio_seen, 3188 int this_best_prio, int best_prio, int best_prio_seen,
3179 struct rq_iterator *iterator) 3189 struct rq_iterator *iterator)
3180 { 3190 {
3181 *load_moved = 0; 3191 *load_moved = 0;
3182 3192
3183 return 0; 3193 return 0;
3184 } 3194 }
3185 3195
3186 #endif 3196 #endif
3187 3197
3188 DEFINE_PER_CPU(struct kernel_stat, kstat); 3198 DEFINE_PER_CPU(struct kernel_stat, kstat);
3189 3199
3190 EXPORT_PER_CPU_SYMBOL(kstat); 3200 EXPORT_PER_CPU_SYMBOL(kstat);
3191 3201
3192 /* 3202 /*
3193 * Return p->sum_exec_runtime plus any more ns on the sched_clock 3203 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3194 * that have not yet been banked in case the task is currently running. 3204 * that have not yet been banked in case the task is currently running.
3195 */ 3205 */
3196 unsigned long long task_sched_runtime(struct task_struct *p) 3206 unsigned long long task_sched_runtime(struct task_struct *p)
3197 { 3207 {
3198 unsigned long flags; 3208 unsigned long flags;
3199 u64 ns, delta_exec; 3209 u64 ns, delta_exec;
3200 struct rq *rq; 3210 struct rq *rq;
3201 3211
3202 rq = task_rq_lock(p, &flags); 3212 rq = task_rq_lock(p, &flags);
3203 ns = p->se.sum_exec_runtime; 3213 ns = p->se.sum_exec_runtime;
3204 if (rq->curr == p) { 3214 if (rq->curr == p) {
3205 delta_exec = rq_clock(rq) - p->se.exec_start; 3215 delta_exec = rq_clock(rq) - p->se.exec_start;
3206 if ((s64)delta_exec > 0) 3216 if ((s64)delta_exec > 0)
3207 ns += delta_exec; 3217 ns += delta_exec;
3208 } 3218 }
3209 task_rq_unlock(rq, &flags); 3219 task_rq_unlock(rq, &flags);
3210 3220
3211 return ns; 3221 return ns;
3212 } 3222 }
3213 3223
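task_sched_runtime() returns the banked execution time plus whatever the task has accumulated since it last banked, but only if it is running right now; a negative clock delta is ignored. The arithmetic in isolation, with invented argument names:

#include <stdint.h>

/* Banked runtime plus the not-yet-banked portion of the current slice. */
static uint64_t total_runtime_ns(uint64_t banked_ns, int is_running,
                                 uint64_t clock_now_ns, uint64_t exec_start_ns)
{
        if (is_running) {
                int64_t delta = (int64_t)(clock_now_ns - exec_start_ns);

                if (delta > 0)          /* guard against clock skew, as above */
                        banked_ns += (uint64_t)delta;
        }
        return banked_ns;
}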
3214 /* 3224 /*
3215 * Account user cpu time to a process. 3225 * Account user cpu time to a process.
3216 * @p: the process that the cpu time gets accounted to 3226 * @p: the process that the cpu time gets accounted to
3217 * @hardirq_offset: the offset to subtract from hardirq_count() 3227 * @hardirq_offset: the offset to subtract from hardirq_count()
3218 * @cputime: the cpu time spent in user space since the last update 3228 * @cputime: the cpu time spent in user space since the last update
3219 */ 3229 */
3220 void account_user_time(struct task_struct *p, cputime_t cputime) 3230 void account_user_time(struct task_struct *p, cputime_t cputime)
3221 { 3231 {
3222 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3232 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3223 cputime64_t tmp; 3233 cputime64_t tmp;
3224 3234
3225 p->utime = cputime_add(p->utime, cputime); 3235 p->utime = cputime_add(p->utime, cputime);
3226 3236
3227 /* Add user time to cpustat. */ 3237 /* Add user time to cpustat. */
3228 tmp = cputime_to_cputime64(cputime); 3238 tmp = cputime_to_cputime64(cputime);
3229 if (TASK_NICE(p) > 0) 3239 if (TASK_NICE(p) > 0)
3230 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3240 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3231 else 3241 else
3232 cpustat->user = cputime64_add(cpustat->user, tmp); 3242 cpustat->user = cputime64_add(cpustat->user, tmp);
3233 } 3243 }
3234 3244
3235 /* 3245 /*
3236 * Account system cpu time to a process. 3246 * Account system cpu time to a process.
3237 * @p: the process that the cpu time gets accounted to 3247 * @p: the process that the cpu time gets accounted to
3238 * @hardirq_offset: the offset to subtract from hardirq_count() 3248 * @hardirq_offset: the offset to subtract from hardirq_count()
3239 * @cputime: the cpu time spent in kernel space since the last update 3249 * @cputime: the cpu time spent in kernel space since the last update
3240 */ 3250 */
3241 void account_system_time(struct task_struct *p, int hardirq_offset, 3251 void account_system_time(struct task_struct *p, int hardirq_offset,
3242 cputime_t cputime) 3252 cputime_t cputime)
3243 { 3253 {
3244 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3245 struct rq *rq = this_rq(); 3255 struct rq *rq = this_rq();
3246 cputime64_t tmp; 3256 cputime64_t tmp;
3247 3257
3248 p->stime = cputime_add(p->stime, cputime); 3258 p->stime = cputime_add(p->stime, cputime);
3249 3259
3250 /* Add system time to cpustat. */ 3260 /* Add system time to cpustat. */
3251 tmp = cputime_to_cputime64(cputime); 3261 tmp = cputime_to_cputime64(cputime);
3252 if (hardirq_count() - hardirq_offset) 3262 if (hardirq_count() - hardirq_offset)
3253 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3263 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3254 else if (softirq_count()) 3264 else if (softirq_count())
3255 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3265 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3256 else if (p != rq->idle) 3266 else if (p != rq->idle)
3257 cpustat->system = cputime64_add(cpustat->system, tmp); 3267 cpustat->system = cputime64_add(cpustat->system, tmp);
3258 else if (atomic_read(&rq->nr_iowait) > 0) 3268 else if (atomic_read(&rq->nr_iowait) > 0)
3259 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3269 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3260 else 3270 else
3261 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3271 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3262 /* Account for system time used */ 3272 /* Account for system time used */
3263 acct_update_integrals(p); 3273 acct_update_integrals(p);
3264 } 3274 }
3265 3275
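account_system_time() charges each tick to exactly one bucket, tested in a fixed priority order: hard IRQ first, then softirq, then ordinary system time, and only for the idle task does it fall through to iowait or idle. The decision tree on its own, as a small sketch with invented names:

/* Which cpustat bucket a tick of system time lands in, mirroring the
 * if/else cascade in account_system_time() above. */
enum tick_bucket {
        TICK_IRQ, TICK_SOFTIRQ, TICK_SYSTEM, TICK_IOWAIT, TICK_IDLE
};

static enum tick_bucket classify_system_tick(int in_hardirq, int in_softirq,
                                             int curr_is_idle_task,
                                             int nr_iowaiting)
{
        if (in_hardirq)
                return TICK_IRQ;
        if (in_softirq)
                return TICK_SOFTIRQ;
        if (!curr_is_idle_task)
                return TICK_SYSTEM;
        if (nr_iowaiting > 0)
                return TICK_IOWAIT;
        return TICK_IDLE;
}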
3266 /* 3276 /*
3267 * Account for involuntary wait time. 3277 * Account for involuntary wait time.
3268 * @p: the process from which the cpu time has been stolen 3278 * @p: the process from which the cpu time has been stolen
3269 * @steal: the cpu time spent in involuntary wait 3279 * @steal: the cpu time spent in involuntary wait
3270 */ 3280 */
3271 void account_steal_time(struct task_struct *p, cputime_t steal) 3281 void account_steal_time(struct task_struct *p, cputime_t steal)
3272 { 3282 {
3273 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3283 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3274 cputime64_t tmp = cputime_to_cputime64(steal); 3284 cputime64_t tmp = cputime_to_cputime64(steal);
3275 struct rq *rq = this_rq(); 3285 struct rq *rq = this_rq();
3276 3286
3277 if (p == rq->idle) { 3287 if (p == rq->idle) {
3278 p->stime = cputime_add(p->stime, steal); 3288 p->stime = cputime_add(p->stime, steal);
3279 if (atomic_read(&rq->nr_iowait) > 0) 3289 if (atomic_read(&rq->nr_iowait) > 0)
3280 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3290 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3281 else 3291 else
3282 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3292 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3283 } else 3293 } else
3284 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3294 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3285 } 3295 }
3286 3296
3287 /* 3297 /*
3288 * This function gets called by the timer code, with HZ frequency. 3298 * This function gets called by the timer code, with HZ frequency.
3289 * We call it with interrupts disabled. 3299 * We call it with interrupts disabled.
3290 * 3300 *
3291 * It also gets called by the fork code, when changing the parent's 3301 * It also gets called by the fork code, when changing the parent's
3292 * timeslices. 3302 * timeslices.
3293 */ 3303 */
3294 void scheduler_tick(void) 3304 void scheduler_tick(void)
3295 { 3305 {
3296 int cpu = smp_processor_id(); 3306 int cpu = smp_processor_id();
3297 struct rq *rq = cpu_rq(cpu); 3307 struct rq *rq = cpu_rq(cpu);
3298 struct task_struct *curr = rq->curr; 3308 struct task_struct *curr = rq->curr;
3299 3309
3300 spin_lock(&rq->lock); 3310 spin_lock(&rq->lock);
3301 update_cpu_load(rq); 3311 update_cpu_load(rq);
3302 if (curr != rq->idle) /* FIXME: needed? */ 3312 if (curr != rq->idle) /* FIXME: needed? */
3303 curr->sched_class->task_tick(rq, curr); 3313 curr->sched_class->task_tick(rq, curr);
3304 spin_unlock(&rq->lock); 3314 spin_unlock(&rq->lock);
3305 3315
3306 #ifdef CONFIG_SMP 3316 #ifdef CONFIG_SMP
3307 rq->idle_at_tick = idle_cpu(cpu); 3317 rq->idle_at_tick = idle_cpu(cpu);
3308 trigger_load_balance(rq, cpu); 3318 trigger_load_balance(rq, cpu);
3309 #endif 3319 #endif
3310 } 3320 }
3311 3321
3312 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3322 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3313 3323
3314 void fastcall add_preempt_count(int val) 3324 void fastcall add_preempt_count(int val)
3315 { 3325 {
3316 /* 3326 /*
3317 * Underflow? 3327 * Underflow?
3318 */ 3328 */
3319 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3329 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3320 return; 3330 return;
3321 preempt_count() += val; 3331 preempt_count() += val;
3322 /* 3332 /*
3323 * Spinlock count overflowing soon? 3333 * Spinlock count overflowing soon?
3324 */ 3334 */
3325 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3335 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3326 PREEMPT_MASK - 10); 3336 PREEMPT_MASK - 10);
3327 } 3337 }
3328 EXPORT_SYMBOL(add_preempt_count); 3338 EXPORT_SYMBOL(add_preempt_count);
3329 3339
3330 void fastcall sub_preempt_count(int val) 3340 void fastcall sub_preempt_count(int val)
3331 { 3341 {
3332 /* 3342 /*
3333 * Underflow? 3343 * Underflow?
3334 */ 3344 */
3335 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3345 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3336 return; 3346 return;
3337 /* 3347 /*
3338 * Is the spinlock portion underflowing? 3348 * Is the spinlock portion underflowing?
3339 */ 3349 */
3340 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3350 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3341 !(preempt_count() & PREEMPT_MASK))) 3351 !(preempt_count() & PREEMPT_MASK)))
3342 return; 3352 return;
3343 3353
3344 preempt_count() -= val; 3354 preempt_count() -= val;
3345 } 3355 }
3346 EXPORT_SYMBOL(sub_preempt_count); 3356 EXPORT_SYMBOL(sub_preempt_count);
3347 3357
3348 #endif 3358 #endif
3349 3359
3350 /* 3360 /*
3351 * Print scheduling while atomic bug: 3361 * Print scheduling while atomic bug:
3352 */ 3362 */
3353 static noinline void __schedule_bug(struct task_struct *prev) 3363 static noinline void __schedule_bug(struct task_struct *prev)
3354 { 3364 {
3355 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", 3365 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3356 prev->comm, preempt_count(), prev->pid); 3366 prev->comm, preempt_count(), prev->pid);
3357 debug_show_held_locks(prev); 3367 debug_show_held_locks(prev);
3358 if (irqs_disabled()) 3368 if (irqs_disabled())
3359 print_irqtrace_events(prev); 3369 print_irqtrace_events(prev);
3360 dump_stack(); 3370 dump_stack();
3361 } 3371 }
3362 3372
3363 /* 3373 /*
3364 * Various schedule()-time debugging checks and statistics: 3374 * Various schedule()-time debugging checks and statistics:
3365 */ 3375 */
3366 static inline void schedule_debug(struct task_struct *prev) 3376 static inline void schedule_debug(struct task_struct *prev)
3367 { 3377 {
3368 /* 3378 /*
3369 * Test if we are atomic. Since do_exit() needs to call into 3379 * Test if we are atomic. Since do_exit() needs to call into
3370 * schedule() atomically, we ignore that path for now. 3380 * schedule() atomically, we ignore that path for now.
3371 * Otherwise, whine if we are scheduling when we should not be. 3381 * Otherwise, whine if we are scheduling when we should not be.
3372 */ 3382 */
3373 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) 3383 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3374 __schedule_bug(prev); 3384 __schedule_bug(prev);
3375 3385
3376 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3386 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3377 3387
3378 schedstat_inc(this_rq(), sched_cnt); 3388 schedstat_inc(this_rq(), sched_cnt);
3379 } 3389 }
3380 3390
3381 /* 3391 /*
3382 * Pick up the highest-prio task: 3392 * Pick up the highest-prio task:
3383 */ 3393 */
3384 static inline struct task_struct * 3394 static inline struct task_struct *
3385 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) 3395 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3386 { 3396 {
3387 struct sched_class *class; 3397 struct sched_class *class;
3388 struct task_struct *p; 3398 struct task_struct *p;
3389 3399
3390 /* 3400 /*
3391 * Optimization: we know that if all tasks are in 3401 * Optimization: we know that if all tasks are in
3392 * the fair class we can call that function directly: 3402 * the fair class we can call that function directly:
3393 */ 3403 */
3394 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3404 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3395 p = fair_sched_class.pick_next_task(rq, now); 3405 p = fair_sched_class.pick_next_task(rq, now);
3396 if (likely(p)) 3406 if (likely(p))
3397 return p; 3407 return p;
3398 } 3408 }
3399 3409
3400 class = sched_class_highest; 3410 class = sched_class_highest;
3401 for ( ; ; ) { 3411 for ( ; ; ) {
3402 p = class->pick_next_task(rq, now); 3412 p = class->pick_next_task(rq, now);
3403 if (p) 3413 if (p)
3404 return p; 3414 return p;
3405 /* 3415 /*
3406 * Will never be NULL as the idle class always 3416 * Will never be NULL as the idle class always
3407 * returns a non-NULL p: 3417 * returns a non-NULL p:
3408 */ 3418 */
3409 class = class->next; 3419 class = class->next;
3410 } 3420 }
3411 } 3421 }
3412 3422
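pick_next_task() walks the scheduling classes from highest to lowest priority and returns the first task any of them offers; the walk always terminates because the idle class at the end of the chain never returns NULL. A stripped-down sketch of that walk (types and names invented for illustration):

#include <stddef.h>

struct task;                                    /* opaque here */

struct class_sketch {
        const struct class_sketch *next;        /* next lower-priority class */
        struct task *(*pick_next)(void *rq);
};

/* First class that has something runnable wins; with an "idle" class at
 * the tail this never returns NULL in practice. */
static struct task *pick_next(const struct class_sketch *highest, void *rq)
{
        const struct class_sketch *class;

        for (class = highest; class != NULL; class = class->next) {
                struct task *p = class->pick_next(rq);

                if (p)
                        return p;
        }
        return NULL;
}

The fast path above, which calls the fair class directly when rq->nr_running == rq->cfs.nr_running, simply skips this walk for the common case.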
3413 /* 3423 /*
3414 * schedule() is the main scheduler function. 3424 * schedule() is the main scheduler function.
3415 */ 3425 */
3416 asmlinkage void __sched schedule(void) 3426 asmlinkage void __sched schedule(void)
3417 { 3427 {
3418 struct task_struct *prev, *next; 3428 struct task_struct *prev, *next;
3419 long *switch_count; 3429 long *switch_count;
3420 struct rq *rq; 3430 struct rq *rq;
3421 u64 now; 3431 u64 now;
3422 int cpu; 3432 int cpu;
3423 3433
3424 need_resched: 3434 need_resched:
3425 preempt_disable(); 3435 preempt_disable();
3426 cpu = smp_processor_id(); 3436 cpu = smp_processor_id();
3427 rq = cpu_rq(cpu); 3437 rq = cpu_rq(cpu);
3428 rcu_qsctr_inc(cpu); 3438 rcu_qsctr_inc(cpu);
3429 prev = rq->curr; 3439 prev = rq->curr;
3430 switch_count = &prev->nivcsw; 3440 switch_count = &prev->nivcsw;
3431 3441
3432 release_kernel_lock(prev); 3442 release_kernel_lock(prev);
3433 need_resched_nonpreemptible: 3443 need_resched_nonpreemptible:
3434 3444
3435 schedule_debug(prev); 3445 schedule_debug(prev);
3436 3446
3437 spin_lock_irq(&rq->lock); 3447 spin_lock_irq(&rq->lock);
3438 clear_tsk_need_resched(prev); 3448 clear_tsk_need_resched(prev);
3439 3449
3440 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3450 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3441 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3451 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3442 unlikely(signal_pending(prev)))) { 3452 unlikely(signal_pending(prev)))) {
3443 prev->state = TASK_RUNNING; 3453 prev->state = TASK_RUNNING;
3444 } else { 3454 } else {
3445 deactivate_task(rq, prev, 1); 3455 deactivate_task(rq, prev, 1);
3446 } 3456 }
3447 switch_count = &prev->nvcsw; 3457 switch_count = &prev->nvcsw;
3448 } 3458 }
3449 3459
3450 if (unlikely(!rq->nr_running)) 3460 if (unlikely(!rq->nr_running))
3451 idle_balance(cpu, rq); 3461 idle_balance(cpu, rq);
3452 3462
3453 now = __rq_clock(rq); 3463 now = __rq_clock(rq);
3454 prev->sched_class->put_prev_task(rq, prev, now); 3464 prev->sched_class->put_prev_task(rq, prev, now);
3455 next = pick_next_task(rq, prev, now); 3465 next = pick_next_task(rq, prev, now);
3456 3466
3457 sched_info_switch(prev, next); 3467 sched_info_switch(prev, next);
3458 3468
3459 if (likely(prev != next)) { 3469 if (likely(prev != next)) {
3460 rq->nr_switches++; 3470 rq->nr_switches++;
3461 rq->curr = next; 3471 rq->curr = next;
3462 ++*switch_count; 3472 ++*switch_count;
3463 3473
3464 context_switch(rq, prev, next); /* unlocks the rq */ 3474 context_switch(rq, prev, next); /* unlocks the rq */
3465 } else 3475 } else
3466 spin_unlock_irq(&rq->lock); 3476 spin_unlock_irq(&rq->lock);
3467 3477
3468 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3478 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3469 cpu = smp_processor_id(); 3479 cpu = smp_processor_id();
3470 rq = cpu_rq(cpu); 3480 rq = cpu_rq(cpu);
3471 goto need_resched_nonpreemptible; 3481 goto need_resched_nonpreemptible;
3472 } 3482 }
3473 preempt_enable_no_resched(); 3483 preempt_enable_no_resched();
3474 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3484 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3475 goto need_resched; 3485 goto need_resched;
3476 } 3486 }
3477 EXPORT_SYMBOL(schedule); 3487 EXPORT_SYMBOL(schedule);
3478 3488
3479 #ifdef CONFIG_PREEMPT 3489 #ifdef CONFIG_PREEMPT
3480 /* 3490 /*
3481 * this is the entry point to schedule() from in-kernel preemption 3491 * this is the entry point to schedule() from in-kernel preemption
3482 * off of preempt_enable. Kernel preemptions off return from interrupt 3492 * off of preempt_enable. Kernel preemptions off return from interrupt
3483 * occur there and call schedule directly. 3493 * occur there and call schedule directly.
3484 */ 3494 */
3485 asmlinkage void __sched preempt_schedule(void) 3495 asmlinkage void __sched preempt_schedule(void)
3486 { 3496 {
3487 struct thread_info *ti = current_thread_info(); 3497 struct thread_info *ti = current_thread_info();
3488 #ifdef CONFIG_PREEMPT_BKL 3498 #ifdef CONFIG_PREEMPT_BKL
3489 struct task_struct *task = current; 3499 struct task_struct *task = current;
3490 int saved_lock_depth; 3500 int saved_lock_depth;
3491 #endif 3501 #endif
3492 /* 3502 /*
3493 * If there is a non-zero preempt_count or interrupts are disabled, 3503 * If there is a non-zero preempt_count or interrupts are disabled,
3494 * we do not want to preempt the current task. Just return.. 3504 * we do not want to preempt the current task. Just return..
3495 */ 3505 */
3496 if (likely(ti->preempt_count || irqs_disabled())) 3506 if (likely(ti->preempt_count || irqs_disabled()))
3497 return; 3507 return;
3498 3508
3499 need_resched: 3509 need_resched:
3500 add_preempt_count(PREEMPT_ACTIVE); 3510 add_preempt_count(PREEMPT_ACTIVE);
3501 /* 3511 /*
3502 * We keep the big kernel semaphore locked, but we 3512 * We keep the big kernel semaphore locked, but we
3503 * clear ->lock_depth so that schedule() doesn't 3513 * clear ->lock_depth so that schedule() doesn't
3504 * auto-release the semaphore: 3514 * auto-release the semaphore:
3505 */ 3515 */
3506 #ifdef CONFIG_PREEMPT_BKL 3516 #ifdef CONFIG_PREEMPT_BKL
3507 saved_lock_depth = task->lock_depth; 3517 saved_lock_depth = task->lock_depth;
3508 task->lock_depth = -1; 3518 task->lock_depth = -1;
3509 #endif 3519 #endif
3510 schedule(); 3520 schedule();
3511 #ifdef CONFIG_PREEMPT_BKL 3521 #ifdef CONFIG_PREEMPT_BKL
3512 task->lock_depth = saved_lock_depth; 3522 task->lock_depth = saved_lock_depth;
3513 #endif 3523 #endif
3514 sub_preempt_count(PREEMPT_ACTIVE); 3524 sub_preempt_count(PREEMPT_ACTIVE);
3515 3525
3516 /* we could miss a preemption opportunity between schedule and now */ 3526 /* we could miss a preemption opportunity between schedule and now */
3517 barrier(); 3527 barrier();
3518 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3528 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3519 goto need_resched; 3529 goto need_resched;
3520 } 3530 }
3521 EXPORT_SYMBOL(preempt_schedule); 3531 EXPORT_SYMBOL(preempt_schedule);
3522 3532
3523 /* 3533 /*
3524 * this is the entry point to schedule() from kernel preemption 3534 * this is the entry point to schedule() from kernel preemption
3525 * off of irq context. 3535 * off of irq context.
3526 * Note that this is called and returns with irqs disabled. This will 3536 * Note that this is called and returns with irqs disabled. This will
3527 * protect us against recursive calling from irq. 3537 * protect us against recursive calling from irq.
3528 */ 3538 */
3529 asmlinkage void __sched preempt_schedule_irq(void) 3539 asmlinkage void __sched preempt_schedule_irq(void)
3530 { 3540 {
3531 struct thread_info *ti = current_thread_info(); 3541 struct thread_info *ti = current_thread_info();
3532 #ifdef CONFIG_PREEMPT_BKL 3542 #ifdef CONFIG_PREEMPT_BKL
3533 struct task_struct *task = current; 3543 struct task_struct *task = current;
3534 int saved_lock_depth; 3544 int saved_lock_depth;
3535 #endif 3545 #endif
3536 /* Catch callers which need to be fixed */ 3546 /* Catch callers which need to be fixed */
3537 BUG_ON(ti->preempt_count || !irqs_disabled()); 3547 BUG_ON(ti->preempt_count || !irqs_disabled());
3538 3548
3539 need_resched: 3549 need_resched:
3540 add_preempt_count(PREEMPT_ACTIVE); 3550 add_preempt_count(PREEMPT_ACTIVE);
3541 /* 3551 /*
3542 * We keep the big kernel semaphore locked, but we 3552 * We keep the big kernel semaphore locked, but we
3543 * clear ->lock_depth so that schedule() doesn't 3553 * clear ->lock_depth so that schedule() doesn't
3544 * auto-release the semaphore: 3554 * auto-release the semaphore:
3545 */ 3555 */
3546 #ifdef CONFIG_PREEMPT_BKL 3556 #ifdef CONFIG_PREEMPT_BKL
3547 saved_lock_depth = task->lock_depth; 3557 saved_lock_depth = task->lock_depth;
3548 task->lock_depth = -1; 3558 task->lock_depth = -1;
3549 #endif 3559 #endif
3550 local_irq_enable(); 3560 local_irq_enable();
3551 schedule(); 3561 schedule();
3552 local_irq_disable(); 3562 local_irq_disable();
3553 #ifdef CONFIG_PREEMPT_BKL 3563 #ifdef CONFIG_PREEMPT_BKL
3554 task->lock_depth = saved_lock_depth; 3564 task->lock_depth = saved_lock_depth;
3555 #endif 3565 #endif
3556 sub_preempt_count(PREEMPT_ACTIVE); 3566 sub_preempt_count(PREEMPT_ACTIVE);
3557 3567
3558 /* we could miss a preemption opportunity between schedule and now */ 3568 /* we could miss a preemption opportunity between schedule and now */
3559 barrier(); 3569 barrier();
3560 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3570 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3561 goto need_resched; 3571 goto need_resched;
3562 } 3572 }
3563 3573
3564 #endif /* CONFIG_PREEMPT */ 3574 #endif /* CONFIG_PREEMPT */
3565 3575
3566 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3576 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3567 void *key) 3577 void *key)
3568 { 3578 {
3569 return try_to_wake_up(curr->private, mode, sync); 3579 return try_to_wake_up(curr->private, mode, sync);
3570 } 3580 }
3571 EXPORT_SYMBOL(default_wake_function); 3581 EXPORT_SYMBOL(default_wake_function);
3572 3582
3573 /* 3583 /*
3574 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3584 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3575 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3585 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3576 * number) then we wake all the non-exclusive tasks and one exclusive task. 3586 * number) then we wake all the non-exclusive tasks and one exclusive task.
3577 * 3587 *
3578 * There are circumstances in which we can try to wake a task which has already 3588 * There are circumstances in which we can try to wake a task which has already
3579 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3589 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3580 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3590 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3581 */ 3591 */
3582 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3592 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3583 int nr_exclusive, int sync, void *key) 3593 int nr_exclusive, int sync, void *key)
3584 { 3594 {
3585 struct list_head *tmp, *next; 3595 struct list_head *tmp, *next;
3586 3596
3587 list_for_each_safe(tmp, next, &q->task_list) { 3597 list_for_each_safe(tmp, next, &q->task_list) {
3588 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); 3598 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3589 unsigned flags = curr->flags; 3599 unsigned flags = curr->flags;
3590 3600
3591 if (curr->func(curr, mode, sync, key) && 3601 if (curr->func(curr, mode, sync, key) &&
3592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3602 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3593 break; 3603 break;
3594 } 3604 }
3595 } 3605 }
3596 3606
3597 /** 3607 /**
3598 * __wake_up - wake up threads blocked on a waitqueue. 3608 * __wake_up - wake up threads blocked on a waitqueue.
3599 * @q: the waitqueue 3609 * @q: the waitqueue
3600 * @mode: which threads 3610 * @mode: which threads
3601 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3611 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3602 * @key: is directly passed to the wakeup function 3612 * @key: is directly passed to the wakeup function
3603 */ 3613 */
3604 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3614 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3605 int nr_exclusive, void *key) 3615 int nr_exclusive, void *key)
3606 { 3616 {
3607 unsigned long flags; 3617 unsigned long flags;
3608 3618
3609 spin_lock_irqsave(&q->lock, flags); 3619 spin_lock_irqsave(&q->lock, flags);
3610 __wake_up_common(q, mode, nr_exclusive, 0, key); 3620 __wake_up_common(q, mode, nr_exclusive, 0, key);
3611 spin_unlock_irqrestore(&q->lock, flags); 3621 spin_unlock_irqrestore(&q->lock, flags);
3612 } 3622 }
3613 EXPORT_SYMBOL(__wake_up); 3623 EXPORT_SYMBOL(__wake_up);
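As a hedged illustration of the exclusive-wakeup rule documented above (everything here is invented for the example and is not part of this patch): a waiter that registers with WQ_FLAG_EXCLUSIVE via prepare_to_wait_exclusive() is woken at most one-per-wake_up(), while non-exclusive waiters are all woken.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

/* Sleep as an *exclusive* waiter: one wake_up() wakes only one of these. */
static void demo_wait(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&demo_wq, &wait, TASK_INTERRUPTIBLE);
	if (!demo_ready)
		schedule();
	finish_wait(&demo_wq, &wait);
}

static void demo_post(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* ends up in __wake_up(q, ..., nr_exclusive = 1, NULL) */
}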
3614 3624
3615 /* 3625 /*
3616 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3626 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3617 */ 3627 */
3618 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3628 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3619 { 3629 {
3620 __wake_up_common(q, mode, 1, 0, NULL); 3630 __wake_up_common(q, mode, 1, 0, NULL);
3621 } 3631 }
3622 3632
3623 /** 3633 /**
3624 * __wake_up_sync - wake up threads blocked on a waitqueue. 3634 * __wake_up_sync - wake up threads blocked on a waitqueue.
3625 * @q: the waitqueue 3635 * @q: the waitqueue
3626 * @mode: which threads 3636 * @mode: which threads
3627 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3637 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3628 * 3638 *
3629 * The sync wakeup differs in that the waker knows that it will schedule 3639 * The sync wakeup differs in that the waker knows that it will schedule
3630 * away soon, so while the target thread will be woken up, it will not 3640 * away soon, so while the target thread will be woken up, it will not
3631 * be migrated to another CPU - ie. the two threads are 'synchronized' 3641 * be migrated to another CPU - ie. the two threads are 'synchronized'
3632 * with each other. This can prevent needless bouncing between CPUs. 3642 * with each other. This can prevent needless bouncing between CPUs.
3633 * 3643 *
3634 * On UP it can prevent extra preemption. 3644 * On UP it can prevent extra preemption.
3635 */ 3645 */
3636 void fastcall 3646 void fastcall
3637 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3647 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3638 { 3648 {
3639 unsigned long flags; 3649 unsigned long flags;
3640 int sync = 1; 3650 int sync = 1;
3641 3651
3642 if (unlikely(!q)) 3652 if (unlikely(!q))
3643 return; 3653 return;
3644 3654
3645 if (unlikely(!nr_exclusive)) 3655 if (unlikely(!nr_exclusive))
3646 sync = 0; 3656 sync = 0;
3647 3657
3648 spin_lock_irqsave(&q->lock, flags); 3658 spin_lock_irqsave(&q->lock, flags);
3649 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 3659 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3650 spin_unlock_irqrestore(&q->lock, flags); 3660 spin_unlock_irqrestore(&q->lock, flags);
3651 } 3661 }
3652 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3662 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3653 3663
3654 void fastcall complete(struct completion *x) 3664 void fastcall complete(struct completion *x)
3655 { 3665 {
3656 unsigned long flags; 3666 unsigned long flags;
3657 3667
3658 spin_lock_irqsave(&x->wait.lock, flags); 3668 spin_lock_irqsave(&x->wait.lock, flags);
3659 x->done++; 3669 x->done++;
3660 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3670 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3661 1, 0, NULL); 3671 1, 0, NULL);
3662 spin_unlock_irqrestore(&x->wait.lock, flags); 3672 spin_unlock_irqrestore(&x->wait.lock, flags);
3663 } 3673 }
3664 EXPORT_SYMBOL(complete); 3674 EXPORT_SYMBOL(complete);
3665 3675
3666 void fastcall complete_all(struct completion *x) 3676 void fastcall complete_all(struct completion *x)
3667 { 3677 {
3668 unsigned long flags; 3678 unsigned long flags;
3669 3679
3670 spin_lock_irqsave(&x->wait.lock, flags); 3680 spin_lock_irqsave(&x->wait.lock, flags);
3671 x->done += UINT_MAX/2; 3681 x->done += UINT_MAX/2;
3672 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3682 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3673 0, 0, NULL); 3683 0, 0, NULL);
3674 spin_unlock_irqrestore(&x->wait.lock, flags); 3684 spin_unlock_irqrestore(&x->wait.lock, flags);
3675 } 3685 }
3676 EXPORT_SYMBOL(complete_all); 3686 EXPORT_SYMBOL(complete_all);
3677 3687
3678 void fastcall __sched wait_for_completion(struct completion *x) 3688 void fastcall __sched wait_for_completion(struct completion *x)
3679 { 3689 {
3680 might_sleep(); 3690 might_sleep();
3681 3691
3682 spin_lock_irq(&x->wait.lock); 3692 spin_lock_irq(&x->wait.lock);
3683 if (!x->done) { 3693 if (!x->done) {
3684 DECLARE_WAITQUEUE(wait, current); 3694 DECLARE_WAITQUEUE(wait, current);
3685 3695
3686 wait.flags |= WQ_FLAG_EXCLUSIVE; 3696 wait.flags |= WQ_FLAG_EXCLUSIVE;
3687 __add_wait_queue_tail(&x->wait, &wait); 3697 __add_wait_queue_tail(&x->wait, &wait);
3688 do { 3698 do {
3689 __set_current_state(TASK_UNINTERRUPTIBLE); 3699 __set_current_state(TASK_UNINTERRUPTIBLE);
3690 spin_unlock_irq(&x->wait.lock); 3700 spin_unlock_irq(&x->wait.lock);
3691 schedule(); 3701 schedule();
3692 spin_lock_irq(&x->wait.lock); 3702 spin_lock_irq(&x->wait.lock);
3693 } while (!x->done); 3703 } while (!x->done);
3694 __remove_wait_queue(&x->wait, &wait); 3704 __remove_wait_queue(&x->wait, &wait);
3695 } 3705 }
3696 x->done--; 3706 x->done--;
3697 spin_unlock_irq(&x->wait.lock); 3707 spin_unlock_irq(&x->wait.lock);
3698 } 3708 }
3699 EXPORT_SYMBOL(wait_for_completion); 3709 EXPORT_SYMBOL(wait_for_completion);
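A minimal completion handshake, as a sketch of how the complete()/wait_for_completion() pair above is normally used (illustrative names only):

#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
	/* ... do the actual work ... */
	complete(&demo_done);		/* bumps ->done and wakes one exclusive waiter */
	return 0;
}

static void demo_wait_for_worker(void)
{
	/* Blocks in TASK_UNINTERRUPTIBLE until ->done becomes non-zero. */
	wait_for_completion(&demo_done);
}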
3700 3710
3701 unsigned long fastcall __sched 3711 unsigned long fastcall __sched
3702 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3712 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3703 { 3713 {
3704 might_sleep(); 3714 might_sleep();
3705 3715
3706 spin_lock_irq(&x->wait.lock); 3716 spin_lock_irq(&x->wait.lock);
3707 if (!x->done) { 3717 if (!x->done) {
3708 DECLARE_WAITQUEUE(wait, current); 3718 DECLARE_WAITQUEUE(wait, current);
3709 3719
3710 wait.flags |= WQ_FLAG_EXCLUSIVE; 3720 wait.flags |= WQ_FLAG_EXCLUSIVE;
3711 __add_wait_queue_tail(&x->wait, &wait); 3721 __add_wait_queue_tail(&x->wait, &wait);
3712 do { 3722 do {
3713 __set_current_state(TASK_UNINTERRUPTIBLE); 3723 __set_current_state(TASK_UNINTERRUPTIBLE);
3714 spin_unlock_irq(&x->wait.lock); 3724 spin_unlock_irq(&x->wait.lock);
3715 timeout = schedule_timeout(timeout); 3725 timeout = schedule_timeout(timeout);
3716 spin_lock_irq(&x->wait.lock); 3726 spin_lock_irq(&x->wait.lock);
3717 if (!timeout) { 3727 if (!timeout) {
3718 __remove_wait_queue(&x->wait, &wait); 3728 __remove_wait_queue(&x->wait, &wait);
3719 goto out; 3729 goto out;
3720 } 3730 }
3721 } while (!x->done); 3731 } while (!x->done);
3722 __remove_wait_queue(&x->wait, &wait); 3732 __remove_wait_queue(&x->wait, &wait);
3723 } 3733 }
3724 x->done--; 3734 x->done--;
3725 out: 3735 out:
3726 spin_unlock_irq(&x->wait.lock); 3736 spin_unlock_irq(&x->wait.lock);
3727 return timeout; 3737 return timeout;
3728 } 3738 }
3729 EXPORT_SYMBOL(wait_for_completion_timeout); 3739 EXPORT_SYMBOL(wait_for_completion_timeout);
3730 3740
3731 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3741 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3732 { 3742 {
3733 int ret = 0; 3743 int ret = 0;
3734 3744
3735 might_sleep(); 3745 might_sleep();
3736 3746
3737 spin_lock_irq(&x->wait.lock); 3747 spin_lock_irq(&x->wait.lock);
3738 if (!x->done) { 3748 if (!x->done) {
3739 DECLARE_WAITQUEUE(wait, current); 3749 DECLARE_WAITQUEUE(wait, current);
3740 3750
3741 wait.flags |= WQ_FLAG_EXCLUSIVE; 3751 wait.flags |= WQ_FLAG_EXCLUSIVE;
3742 __add_wait_queue_tail(&x->wait, &wait); 3752 __add_wait_queue_tail(&x->wait, &wait);
3743 do { 3753 do {
3744 if (signal_pending(current)) { 3754 if (signal_pending(current)) {
3745 ret = -ERESTARTSYS; 3755 ret = -ERESTARTSYS;
3746 __remove_wait_queue(&x->wait, &wait); 3756 __remove_wait_queue(&x->wait, &wait);
3747 goto out; 3757 goto out;
3748 } 3758 }
3749 __set_current_state(TASK_INTERRUPTIBLE); 3759 __set_current_state(TASK_INTERRUPTIBLE);
3750 spin_unlock_irq(&x->wait.lock); 3760 spin_unlock_irq(&x->wait.lock);
3751 schedule(); 3761 schedule();
3752 spin_lock_irq(&x->wait.lock); 3762 spin_lock_irq(&x->wait.lock);
3753 } while (!x->done); 3763 } while (!x->done);
3754 __remove_wait_queue(&x->wait, &wait); 3764 __remove_wait_queue(&x->wait, &wait);
3755 } 3765 }
3756 x->done--; 3766 x->done--;
3757 out: 3767 out:
3758 spin_unlock_irq(&x->wait.lock); 3768 spin_unlock_irq(&x->wait.lock);
3759 3769
3760 return ret; 3770 return ret;
3761 } 3771 }
3762 EXPORT_SYMBOL(wait_for_completion_interruptible); 3772 EXPORT_SYMBOL(wait_for_completion_interruptible);
3763 3773
3764 unsigned long fastcall __sched 3774 unsigned long fastcall __sched
3765 wait_for_completion_interruptible_timeout(struct completion *x, 3775 wait_for_completion_interruptible_timeout(struct completion *x,
3766 unsigned long timeout) 3776 unsigned long timeout)
3767 { 3777 {
3768 might_sleep(); 3778 might_sleep();
3769 3779
3770 spin_lock_irq(&x->wait.lock); 3780 spin_lock_irq(&x->wait.lock);
3771 if (!x->done) { 3781 if (!x->done) {
3772 DECLARE_WAITQUEUE(wait, current); 3782 DECLARE_WAITQUEUE(wait, current);
3773 3783
3774 wait.flags |= WQ_FLAG_EXCLUSIVE; 3784 wait.flags |= WQ_FLAG_EXCLUSIVE;
3775 __add_wait_queue_tail(&x->wait, &wait); 3785 __add_wait_queue_tail(&x->wait, &wait);
3776 do { 3786 do {
3777 if (signal_pending(current)) { 3787 if (signal_pending(current)) {
3778 timeout = -ERESTARTSYS; 3788 timeout = -ERESTARTSYS;
3779 __remove_wait_queue(&x->wait, &wait); 3789 __remove_wait_queue(&x->wait, &wait);
3780 goto out; 3790 goto out;
3781 } 3791 }
3782 __set_current_state(TASK_INTERRUPTIBLE); 3792 __set_current_state(TASK_INTERRUPTIBLE);
3783 spin_unlock_irq(&x->wait.lock); 3793 spin_unlock_irq(&x->wait.lock);
3784 timeout = schedule_timeout(timeout); 3794 timeout = schedule_timeout(timeout);
3785 spin_lock_irq(&x->wait.lock); 3795 spin_lock_irq(&x->wait.lock);
3786 if (!timeout) { 3796 if (!timeout) {
3787 __remove_wait_queue(&x->wait, &wait); 3797 __remove_wait_queue(&x->wait, &wait);
3788 goto out; 3798 goto out;
3789 } 3799 }
3790 } while (!x->done); 3800 } while (!x->done);
3791 __remove_wait_queue(&x->wait, &wait); 3801 __remove_wait_queue(&x->wait, &wait);
3792 } 3802 }
3793 x->done--; 3803 x->done--;
3794 out: 3804 out:
3795 spin_unlock_irq(&x->wait.lock); 3805 spin_unlock_irq(&x->wait.lock);
3796 return timeout; 3806 return timeout;
3797 } 3807 }
3798 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3808 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
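The *_timeout variants take and return jiffies; a sketch (not from this patch) of handling the three possible outcomes of the interruptible flavour, i.e. timeout, signal and normal completion:

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>

static void demo_wait_with_timeout(struct completion *x)
{
	long remaining;

	remaining = wait_for_completion_interruptible_timeout(x,
						msecs_to_jiffies(500));
	if (remaining == 0)
		pr_debug("timed out\n");
	else if (remaining < 0)
		pr_debug("interrupted by a signal (%ld)\n", remaining);
	else
		pr_debug("completed with %ld jiffies to spare\n", remaining);
}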
3799 3809
3800 static inline void 3810 static inline void
3801 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3811 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3802 { 3812 {
3803 spin_lock_irqsave(&q->lock, *flags); 3813 spin_lock_irqsave(&q->lock, *flags);
3804 __add_wait_queue(q, wait); 3814 __add_wait_queue(q, wait);
3805 spin_unlock(&q->lock); 3815 spin_unlock(&q->lock);
3806 } 3816 }
3807 3817
3808 static inline void 3818 static inline void
3809 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3819 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3810 { 3820 {
3811 spin_lock_irq(&q->lock); 3821 spin_lock_irq(&q->lock);
3812 __remove_wait_queue(q, wait); 3822 __remove_wait_queue(q, wait);
3813 spin_unlock_irqrestore(&q->lock, *flags); 3823 spin_unlock_irqrestore(&q->lock, *flags);
3814 } 3824 }
3815 3825
3816 void __sched interruptible_sleep_on(wait_queue_head_t *q) 3826 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3817 { 3827 {
3818 unsigned long flags; 3828 unsigned long flags;
3819 wait_queue_t wait; 3829 wait_queue_t wait;
3820 3830
3821 init_waitqueue_entry(&wait, current); 3831 init_waitqueue_entry(&wait, current);
3822 3832
3823 current->state = TASK_INTERRUPTIBLE; 3833 current->state = TASK_INTERRUPTIBLE;
3824 3834
3825 sleep_on_head(q, &wait, &flags); 3835 sleep_on_head(q, &wait, &flags);
3826 schedule(); 3836 schedule();
3827 sleep_on_tail(q, &wait, &flags); 3837 sleep_on_tail(q, &wait, &flags);
3828 } 3838 }
3829 EXPORT_SYMBOL(interruptible_sleep_on); 3839 EXPORT_SYMBOL(interruptible_sleep_on);
3830 3840
3831 long __sched 3841 long __sched
3832 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3842 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3833 { 3843 {
3834 unsigned long flags; 3844 unsigned long flags;
3835 wait_queue_t wait; 3845 wait_queue_t wait;
3836 3846
3837 init_waitqueue_entry(&wait, current); 3847 init_waitqueue_entry(&wait, current);
3838 3848
3839 current->state = TASK_INTERRUPTIBLE; 3849 current->state = TASK_INTERRUPTIBLE;
3840 3850
3841 sleep_on_head(q, &wait, &flags); 3851 sleep_on_head(q, &wait, &flags);
3842 timeout = schedule_timeout(timeout); 3852 timeout = schedule_timeout(timeout);
3843 sleep_on_tail(q, &wait, &flags); 3853 sleep_on_tail(q, &wait, &flags);
3844 3854
3845 return timeout; 3855 return timeout;
3846 } 3856 }
3847 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3857 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3848 3858
3849 void __sched sleep_on(wait_queue_head_t *q) 3859 void __sched sleep_on(wait_queue_head_t *q)
3850 { 3860 {
3851 unsigned long flags; 3861 unsigned long flags;
3852 wait_queue_t wait; 3862 wait_queue_t wait;
3853 3863
3854 init_waitqueue_entry(&wait, current); 3864 init_waitqueue_entry(&wait, current);
3855 3865
3856 current->state = TASK_UNINTERRUPTIBLE; 3866 current->state = TASK_UNINTERRUPTIBLE;
3857 3867
3858 sleep_on_head(q, &wait, &flags); 3868 sleep_on_head(q, &wait, &flags);
3859 schedule(); 3869 schedule();
3860 sleep_on_tail(q, &wait, &flags); 3870 sleep_on_tail(q, &wait, &flags);
3861 } 3871 }
3862 EXPORT_SYMBOL(sleep_on); 3872 EXPORT_SYMBOL(sleep_on);
3863 3873
3864 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3874 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3865 { 3875 {
3866 unsigned long flags; 3876 unsigned long flags;
3867 wait_queue_t wait; 3877 wait_queue_t wait;
3868 3878
3869 init_waitqueue_entry(&wait, current); 3879 init_waitqueue_entry(&wait, current);
3870 3880
3871 current->state = TASK_UNINTERRUPTIBLE; 3881 current->state = TASK_UNINTERRUPTIBLE;
3872 3882
3873 sleep_on_head(q, &wait, &flags); 3883 sleep_on_head(q, &wait, &flags);
3874 timeout = schedule_timeout(timeout); 3884 timeout = schedule_timeout(timeout);
3875 sleep_on_tail(q, &wait, &flags); 3885 sleep_on_tail(q, &wait, &flags);
3876 3886
3877 return timeout; 3887 return timeout;
3878 } 3888 }
3879 EXPORT_SYMBOL(sleep_on_timeout); 3889 EXPORT_SYMBOL(sleep_on_timeout);
3880 3890
3881 #ifdef CONFIG_RT_MUTEXES 3891 #ifdef CONFIG_RT_MUTEXES
3882 3892
3883 /* 3893 /*
3884 * rt_mutex_setprio - set the current priority of a task 3894 * rt_mutex_setprio - set the current priority of a task
3885 * @p: task 3895 * @p: task
3886 * @prio: prio value (kernel-internal form) 3896 * @prio: prio value (kernel-internal form)
3887 * 3897 *
3888 * This function changes the 'effective' priority of a task. It does 3898 * This function changes the 'effective' priority of a task. It does
3889 * not touch ->normal_prio like __setscheduler(). 3899 * not touch ->normal_prio like __setscheduler().
3890 * 3900 *
3891 * Used by the rt_mutex code to implement priority inheritance logic. 3901 * Used by the rt_mutex code to implement priority inheritance logic.
3892 */ 3902 */
3893 void rt_mutex_setprio(struct task_struct *p, int prio) 3903 void rt_mutex_setprio(struct task_struct *p, int prio)
3894 { 3904 {
3895 unsigned long flags; 3905 unsigned long flags;
3896 int oldprio, on_rq; 3906 int oldprio, on_rq;
3897 struct rq *rq; 3907 struct rq *rq;
3898 u64 now; 3908 u64 now;
3899 3909
3900 BUG_ON(prio < 0 || prio > MAX_PRIO); 3910 BUG_ON(prio < 0 || prio > MAX_PRIO);
3901 3911
3902 rq = task_rq_lock(p, &flags); 3912 rq = task_rq_lock(p, &flags);
3903 now = rq_clock(rq); 3913 now = rq_clock(rq);
3904 3914
3905 oldprio = p->prio; 3915 oldprio = p->prio;
3906 on_rq = p->se.on_rq; 3916 on_rq = p->se.on_rq;
3907 if (on_rq) 3917 if (on_rq)
3908 dequeue_task(rq, p, 0, now); 3918 dequeue_task(rq, p, 0, now);
3909 3919
3910 if (rt_prio(prio)) 3920 if (rt_prio(prio))
3911 p->sched_class = &rt_sched_class; 3921 p->sched_class = &rt_sched_class;
3912 else 3922 else
3913 p->sched_class = &fair_sched_class; 3923 p->sched_class = &fair_sched_class;
3914 3924
3915 p->prio = prio; 3925 p->prio = prio;
3916 3926
3917 if (on_rq) { 3927 if (on_rq) {
3918 enqueue_task(rq, p, 0, now); 3928 enqueue_task(rq, p, 0, now);
3919 /* 3929 /*
3920 * Reschedule if we are currently running on this runqueue and 3930 * Reschedule if we are currently running on this runqueue and
3921 * our priority decreased, or if we are not currently running on 3931 * our priority decreased, or if we are not currently running on
3922 * this runqueue and our priority is higher than the current's 3932 * this runqueue and our priority is higher than the current's
3923 */ 3933 */
3924 if (task_running(rq, p)) { 3934 if (task_running(rq, p)) {
3925 if (p->prio > oldprio) 3935 if (p->prio > oldprio)
3926 resched_task(rq->curr); 3936 resched_task(rq->curr);
3927 } else { 3937 } else {
3928 check_preempt_curr(rq, p); 3938 check_preempt_curr(rq, p);
3929 } 3939 }
3930 } 3940 }
3931 task_rq_unlock(rq, &flags); 3941 task_rq_unlock(rq, &flags);
3932 } 3942 }
3933 3943
3934 #endif 3944 #endif
3935 3945
3936 void set_user_nice(struct task_struct *p, long nice) 3946 void set_user_nice(struct task_struct *p, long nice)
3937 { 3947 {
3938 int old_prio, delta, on_rq; 3948 int old_prio, delta, on_rq;
3939 unsigned long flags; 3949 unsigned long flags;
3940 struct rq *rq; 3950 struct rq *rq;
3941 u64 now; 3951 u64 now;
3942 3952
3943 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3953 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3944 return; 3954 return;
3945 /* 3955 /*
3946 * We have to be careful, if called from sys_setpriority(), 3956 * We have to be careful, if called from sys_setpriority(),
3947 * the task might be in the middle of scheduling on another CPU. 3957 * the task might be in the middle of scheduling on another CPU.
3948 */ 3958 */
3949 rq = task_rq_lock(p, &flags); 3959 rq = task_rq_lock(p, &flags);
3950 now = rq_clock(rq); 3960 now = rq_clock(rq);
3951 /* 3961 /*
3952 * The RT priorities are set via sched_setscheduler(), but we still 3962 * The RT priorities are set via sched_setscheduler(), but we still
3953 * allow the 'normal' nice value to be set - but as expected 3963 * allow the 'normal' nice value to be set - but as expected
3954 * it won't have any effect on scheduling until the task is 3964 * it won't have any effect on scheduling until the task is
3955 * SCHED_FIFO/SCHED_RR: 3965 * SCHED_FIFO/SCHED_RR:
3956 */ 3966 */
3957 if (task_has_rt_policy(p)) { 3967 if (task_has_rt_policy(p)) {
3958 p->static_prio = NICE_TO_PRIO(nice); 3968 p->static_prio = NICE_TO_PRIO(nice);
3959 goto out_unlock; 3969 goto out_unlock;
3960 } 3970 }
3961 on_rq = p->se.on_rq; 3971 on_rq = p->se.on_rq;
3962 if (on_rq) { 3972 if (on_rq) {
3963 dequeue_task(rq, p, 0, now); 3973 dequeue_task(rq, p, 0, now);
3964 dec_load(rq, p, now); 3974 dec_load(rq, p, now);
3965 } 3975 }
3966 3976
3967 p->static_prio = NICE_TO_PRIO(nice); 3977 p->static_prio = NICE_TO_PRIO(nice);
3968 set_load_weight(p); 3978 set_load_weight(p);
3969 old_prio = p->prio; 3979 old_prio = p->prio;
3970 p->prio = effective_prio(p); 3980 p->prio = effective_prio(p);
3971 delta = p->prio - old_prio; 3981 delta = p->prio - old_prio;
3972 3982
3973 if (on_rq) { 3983 if (on_rq) {
3974 enqueue_task(rq, p, 0, now); 3984 enqueue_task(rq, p, 0, now);
3975 inc_load(rq, p, now); 3985 inc_load(rq, p, now);
3976 /* 3986 /*
3977 * If the task increased its priority or is running and 3987 * If the task increased its priority or is running and
3978 * lowered its priority, then reschedule its CPU: 3988 * lowered its priority, then reschedule its CPU:
3979 */ 3989 */
3980 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3990 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3981 resched_task(rq->curr); 3991 resched_task(rq->curr);
3982 } 3992 }
3983 out_unlock: 3993 out_unlock:
3984 task_rq_unlock(rq, &flags); 3994 task_rq_unlock(rq, &flags);
3985 } 3995 }
3986 EXPORT_SYMBOL(set_user_nice); 3996 EXPORT_SYMBOL(set_user_nice);
3987 3997
3988 /* 3998 /*
3989 * can_nice - check if a task can reduce its nice value 3999 * can_nice - check if a task can reduce its nice value
3990 * @p: task 4000 * @p: task
3991 * @nice: nice value 4001 * @nice: nice value
3992 */ 4002 */
3993 int can_nice(const struct task_struct *p, const int nice) 4003 int can_nice(const struct task_struct *p, const int nice)
3994 { 4004 {
3995 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4005 /* convert nice value [19,-20] to rlimit style value [1,40] */
3996 int nice_rlim = 20 - nice; 4006 int nice_rlim = 20 - nice;
3997 4007
3998 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4008 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3999 capable(CAP_SYS_NICE)); 4009 capable(CAP_SYS_NICE));
4000 } 4010 }
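To make the comment's mapping concrete, the 20 - nice conversion in can_nice() works out as follows (worked values only, nothing here is part of the patch):

	nice = -20  ->  nice_rlim = 20 - (-20) = 40	(strongest request)
	nice =   0  ->  nice_rlim = 20 -    0  = 20
	nice = +19  ->  nice_rlim = 20 -   19  =  1	(weakest request)

So a task may set nice to N without CAP_SYS_NICE only if 20 - N <= RLIMIT_NICE.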
4001 4011
4002 #ifdef __ARCH_WANT_SYS_NICE 4012 #ifdef __ARCH_WANT_SYS_NICE
4003 4013
4004 /* 4014 /*
4005 * sys_nice - change the priority of the current process. 4015 * sys_nice - change the priority of the current process.
4006 * @increment: priority increment 4016 * @increment: priority increment
4007 * 4017 *
4008 * sys_setpriority is a more generic, but much slower function that 4018 * sys_setpriority is a more generic, but much slower function that
4009 * does similar things. 4019 * does similar things.
4010 */ 4020 */
4011 asmlinkage long sys_nice(int increment) 4021 asmlinkage long sys_nice(int increment)
4012 { 4022 {
4013 long nice, retval; 4023 long nice, retval;
4014 4024
4015 /* 4025 /*
4016 * Setpriority might change our priority at the same moment. 4026 * Setpriority might change our priority at the same moment.
4017 * We don't have to worry. Conceptually one call occurs first 4027 * We don't have to worry. Conceptually one call occurs first
4018 * and we have a single winner. 4028 * and we have a single winner.
4019 */ 4029 */
4020 if (increment < -40) 4030 if (increment < -40)
4021 increment = -40; 4031 increment = -40;
4022 if (increment > 40) 4032 if (increment > 40)
4023 increment = 40; 4033 increment = 40;
4024 4034
4025 nice = PRIO_TO_NICE(current->static_prio) + increment; 4035 nice = PRIO_TO_NICE(current->static_prio) + increment;
4026 if (nice < -20) 4036 if (nice < -20)
4027 nice = -20; 4037 nice = -20;
4028 if (nice > 19) 4038 if (nice > 19)
4029 nice = 19; 4039 nice = 19;
4030 4040
4031 if (increment < 0 && !can_nice(current, nice)) 4041 if (increment < 0 && !can_nice(current, nice))
4032 return -EPERM; 4042 return -EPERM;
4033 4043
4034 retval = security_task_setnice(current, nice); 4044 retval = security_task_setnice(current, nice);
4035 if (retval) 4045 if (retval)
4036 return retval; 4046 return retval;
4037 4047
4038 set_user_nice(current, nice); 4048 set_user_nice(current, nice);
4039 return 0; 4049 return 0;
4040 } 4050 }
4041 4051
4042 #endif 4052 #endif
4043 4053
4044 /** 4054 /**
4045 * task_prio - return the priority value of a given task. 4055 * task_prio - return the priority value of a given task.
4046 * @p: the task in question. 4056 * @p: the task in question.
4047 * 4057 *
4048 * This is the priority value as seen by users in /proc. 4058 * This is the priority value as seen by users in /proc.
4049 * RT tasks are offset by -200. Normal tasks are centered 4059 * RT tasks are offset by -200. Normal tasks are centered
4050 * around 0, value goes from -16 to +15. 4060 * around 0, value goes from -16 to +15.
4051 */ 4061 */
4052 int task_prio(const struct task_struct *p) 4062 int task_prio(const struct task_struct *p)
4053 { 4063 {
4054 return p->prio - MAX_RT_PRIO; 4064 return p->prio - MAX_RT_PRIO;
4055 } 4065 }
4056 4066
4057 /** 4067 /**
4058 * task_nice - return the nice value of a given task. 4068 * task_nice - return the nice value of a given task.
4059 * @p: the task in question. 4069 * @p: the task in question.
4060 */ 4070 */
4061 int task_nice(const struct task_struct *p) 4071 int task_nice(const struct task_struct *p)
4062 { 4072 {
4063 return TASK_NICE(p); 4073 return TASK_NICE(p);
4064 } 4074 }
4065 EXPORT_SYMBOL_GPL(task_nice); 4075 EXPORT_SYMBOL_GPL(task_nice);
4066 4076
4067 /** 4077 /**
4068 * idle_cpu - is a given cpu idle currently? 4078 * idle_cpu - is a given cpu idle currently?
4069 * @cpu: the processor in question. 4079 * @cpu: the processor in question.
4070 */ 4080 */
4071 int idle_cpu(int cpu) 4081 int idle_cpu(int cpu)
4072 { 4082 {
4073 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4083 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4074 } 4084 }
4075 4085
4076 /** 4086 /**
4077 * idle_task - return the idle task for a given cpu. 4087 * idle_task - return the idle task for a given cpu.
4078 * @cpu: the processor in question. 4088 * @cpu: the processor in question.
4079 */ 4089 */
4080 struct task_struct *idle_task(int cpu) 4090 struct task_struct *idle_task(int cpu)
4081 { 4091 {
4082 return cpu_rq(cpu)->idle; 4092 return cpu_rq(cpu)->idle;
4083 } 4093 }
4084 4094
4085 /** 4095 /**
4086 * find_process_by_pid - find a process with a matching PID value. 4096 * find_process_by_pid - find a process with a matching PID value.
4087 * @pid: the pid in question. 4097 * @pid: the pid in question.
4088 */ 4098 */
4089 static inline struct task_struct *find_process_by_pid(pid_t pid) 4099 static inline struct task_struct *find_process_by_pid(pid_t pid)
4090 { 4100 {
4091 return pid ? find_task_by_pid(pid) : current; 4101 return pid ? find_task_by_pid(pid) : current;
4092 } 4102 }
4093 4103
4094 /* Actually do priority change: must hold rq lock. */ 4104 /* Actually do priority change: must hold rq lock. */
4095 static void 4105 static void
4096 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4106 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4097 { 4107 {
4098 BUG_ON(p->se.on_rq); 4108 BUG_ON(p->se.on_rq);
4099 4109
4100 p->policy = policy; 4110 p->policy = policy;
4101 switch (p->policy) { 4111 switch (p->policy) {
4102 case SCHED_NORMAL: 4112 case SCHED_NORMAL:
4103 case SCHED_BATCH: 4113 case SCHED_BATCH:
4104 case SCHED_IDLE: 4114 case SCHED_IDLE:
4105 p->sched_class = &fair_sched_class; 4115 p->sched_class = &fair_sched_class;
4106 break; 4116 break;
4107 case SCHED_FIFO: 4117 case SCHED_FIFO:
4108 case SCHED_RR: 4118 case SCHED_RR:
4109 p->sched_class = &rt_sched_class; 4119 p->sched_class = &rt_sched_class;
4110 break; 4120 break;
4111 } 4121 }
4112 4122
4113 p->rt_priority = prio; 4123 p->rt_priority = prio;
4114 p->normal_prio = normal_prio(p); 4124 p->normal_prio = normal_prio(p);
4115 /* we are holding p->pi_lock already */ 4125 /* we are holding p->pi_lock already */
4116 p->prio = rt_mutex_getprio(p); 4126 p->prio = rt_mutex_getprio(p);
4117 set_load_weight(p); 4127 set_load_weight(p);
4118 } 4128 }
4119 4129
4120 /** 4130 /**
4121 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4131 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4122 * @p: the task in question. 4132 * @p: the task in question.
4123 * @policy: new policy. 4133 * @policy: new policy.
4124 * @param: structure containing the new RT priority. 4134 * @param: structure containing the new RT priority.
4125 * 4135 *
4126 * NOTE that the task may be already dead. 4136 * NOTE that the task may be already dead.
4127 */ 4137 */
4128 int sched_setscheduler(struct task_struct *p, int policy, 4138 int sched_setscheduler(struct task_struct *p, int policy,
4129 struct sched_param *param) 4139 struct sched_param *param)
4130 { 4140 {
4131 int retval, oldprio, oldpolicy = -1, on_rq; 4141 int retval, oldprio, oldpolicy = -1, on_rq;
4132 unsigned long flags; 4142 unsigned long flags;
4133 struct rq *rq; 4143 struct rq *rq;
4134 4144
4135 /* may grab non-irq protected spin_locks */ 4145 /* may grab non-irq protected spin_locks */
4136 BUG_ON(in_interrupt()); 4146 BUG_ON(in_interrupt());
4137 recheck: 4147 recheck:
4138 /* double check policy once rq lock held */ 4148 /* double check policy once rq lock held */
4139 if (policy < 0) 4149 if (policy < 0)
4140 policy = oldpolicy = p->policy; 4150 policy = oldpolicy = p->policy;
4141 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4151 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4142 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4152 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4143 policy != SCHED_IDLE) 4153 policy != SCHED_IDLE)
4144 return -EINVAL; 4154 return -EINVAL;
4145 /* 4155 /*
4146 * Valid priorities for SCHED_FIFO and SCHED_RR are 4156 * Valid priorities for SCHED_FIFO and SCHED_RR are
4147 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4157 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4148 * SCHED_BATCH and SCHED_IDLE is 0. 4158 * SCHED_BATCH and SCHED_IDLE is 0.
4149 */ 4159 */
4150 if (param->sched_priority < 0 || 4160 if (param->sched_priority < 0 ||
4151 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4161 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4152 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4162 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4153 return -EINVAL; 4163 return -EINVAL;
4154 if (rt_policy(policy) != (param->sched_priority != 0)) 4164 if (rt_policy(policy) != (param->sched_priority != 0))
4155 return -EINVAL; 4165 return -EINVAL;
4156 4166
4157 /* 4167 /*
4158 * Allow unprivileged RT tasks to decrease priority: 4168 * Allow unprivileged RT tasks to decrease priority:
4159 */ 4169 */
4160 if (!capable(CAP_SYS_NICE)) { 4170 if (!capable(CAP_SYS_NICE)) {
4161 if (rt_policy(policy)) { 4171 if (rt_policy(policy)) {
4162 unsigned long rlim_rtprio; 4172 unsigned long rlim_rtprio;
4163 4173
4164 if (!lock_task_sighand(p, &flags)) 4174 if (!lock_task_sighand(p, &flags))
4165 return -ESRCH; 4175 return -ESRCH;
4166 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4176 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4167 unlock_task_sighand(p, &flags); 4177 unlock_task_sighand(p, &flags);
4168 4178
4169 /* can't set/change the rt policy */ 4179 /* can't set/change the rt policy */
4170 if (policy != p->policy && !rlim_rtprio) 4180 if (policy != p->policy && !rlim_rtprio)
4171 return -EPERM; 4181 return -EPERM;
4172 4182
4173 /* can't increase priority */ 4183 /* can't increase priority */
4174 if (param->sched_priority > p->rt_priority && 4184 if (param->sched_priority > p->rt_priority &&
4175 param->sched_priority > rlim_rtprio) 4185 param->sched_priority > rlim_rtprio)
4176 return -EPERM; 4186 return -EPERM;
4177 } 4187 }
4178 /* 4188 /*
4179 * Like positive nice levels, don't allow tasks to 4189 * Like positive nice levels, don't allow tasks to
4180 * move out of SCHED_IDLE either: 4190 * move out of SCHED_IDLE either:
4181 */ 4191 */
4182 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4192 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4183 return -EPERM; 4193 return -EPERM;
4184 4194
4185 /* can't change other user's priorities */ 4195 /* can't change other user's priorities */
4186 if ((current->euid != p->euid) && 4196 if ((current->euid != p->euid) &&
4187 (current->euid != p->uid)) 4197 (current->euid != p->uid))
4188 return -EPERM; 4198 return -EPERM;
4189 } 4199 }
4190 4200
4191 retval = security_task_setscheduler(p, policy, param); 4201 retval = security_task_setscheduler(p, policy, param);
4192 if (retval) 4202 if (retval)
4193 return retval; 4203 return retval;
4194 /* 4204 /*
4195 * make sure no PI-waiters arrive (or leave) while we are 4205 * make sure no PI-waiters arrive (or leave) while we are
4196 * changing the priority of the task: 4206 * changing the priority of the task:
4197 */ 4207 */
4198 spin_lock_irqsave(&p->pi_lock, flags); 4208 spin_lock_irqsave(&p->pi_lock, flags);
4199 /* 4209 /*
4200 * To be able to change p->policy safely, the appropriate 4210 * To be able to change p->policy safely, the appropriate
4201 * runqueue lock must be held. 4211 * runqueue lock must be held.
4202 */ 4212 */
4203 rq = __task_rq_lock(p); 4213 rq = __task_rq_lock(p);
4204 /* recheck policy now with rq lock held */ 4214 /* recheck policy now with rq lock held */
4205 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4215 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4206 policy = oldpolicy = -1; 4216 policy = oldpolicy = -1;
4207 __task_rq_unlock(rq); 4217 __task_rq_unlock(rq);
4208 spin_unlock_irqrestore(&p->pi_lock, flags); 4218 spin_unlock_irqrestore(&p->pi_lock, flags);
4209 goto recheck; 4219 goto recheck;
4210 } 4220 }
4211 on_rq = p->se.on_rq; 4221 on_rq = p->se.on_rq;
4212 if (on_rq) 4222 if (on_rq)
4213 deactivate_task(rq, p, 0); 4223 deactivate_task(rq, p, 0);
4214 oldprio = p->prio; 4224 oldprio = p->prio;
4215 __setscheduler(rq, p, policy, param->sched_priority); 4225 __setscheduler(rq, p, policy, param->sched_priority);
4216 if (on_rq) { 4226 if (on_rq) {
4217 activate_task(rq, p, 0); 4227 activate_task(rq, p, 0);
4218 /* 4228 /*
4219 * Reschedule if we are currently running on this runqueue and 4229 * Reschedule if we are currently running on this runqueue and
4220 * our priority decreased, or if we are not currently running on 4230 * our priority decreased, or if we are not currently running on
4221 * this runqueue and our priority is higher than the current's 4231 * this runqueue and our priority is higher than the current's
4222 */ 4232 */
4223 if (task_running(rq, p)) { 4233 if (task_running(rq, p)) {
4224 if (p->prio > oldprio) 4234 if (p->prio > oldprio)
4225 resched_task(rq->curr); 4235 resched_task(rq->curr);
4226 } else { 4236 } else {
4227 check_preempt_curr(rq, p); 4237 check_preempt_curr(rq, p);
4228 } 4238 }
4229 } 4239 }
4230 __task_rq_unlock(rq); 4240 __task_rq_unlock(rq);
4231 spin_unlock_irqrestore(&p->pi_lock, flags); 4241 spin_unlock_irqrestore(&p->pi_lock, flags);
4232 4242
4233 rt_mutex_adjust_pi(p); 4243 rt_mutex_adjust_pi(p);
4234 4244
4235 return 0; 4245 return 0;
4236 } 4246 }
4237 EXPORT_SYMBOL_GPL(sched_setscheduler); 4247 EXPORT_SYMBOL_GPL(sched_setscheduler);
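An in-kernel caller sketch (names invented for illustration): switching a task to SCHED_FIFO through the exported sched_setscheduler() above.

#include <linux/sched.h>

static int demo_make_fifo(struct task_struct *tsk)
{
	struct sched_param param = { .sched_priority = 50 };

	/* Must not be called from interrupt context (see the BUG_ON above). */
	return sched_setscheduler(tsk, SCHED_FIFO, &param);
}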
4238 4248
4239 static int 4249 static int
4240 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4250 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4241 { 4251 {
4242 struct sched_param lparam; 4252 struct sched_param lparam;
4243 struct task_struct *p; 4253 struct task_struct *p;
4244 int retval; 4254 int retval;
4245 4255
4246 if (!param || pid < 0) 4256 if (!param || pid < 0)
4247 return -EINVAL; 4257 return -EINVAL;
4248 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4258 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4249 return -EFAULT; 4259 return -EFAULT;
4250 4260
4251 rcu_read_lock(); 4261 rcu_read_lock();
4252 retval = -ESRCH; 4262 retval = -ESRCH;
4253 p = find_process_by_pid(pid); 4263 p = find_process_by_pid(pid);
4254 if (p != NULL) 4264 if (p != NULL)
4255 retval = sched_setscheduler(p, policy, &lparam); 4265 retval = sched_setscheduler(p, policy, &lparam);
4256 rcu_read_unlock(); 4266 rcu_read_unlock();
4257 4267
4258 return retval; 4268 return retval;
4259 } 4269 }
4260 4270
4261 /** 4271 /**
4262 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4272 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4263 * @pid: the pid in question. 4273 * @pid: the pid in question.
4264 * @policy: new policy. 4274 * @policy: new policy.
4265 * @param: structure containing the new RT priority. 4275 * @param: structure containing the new RT priority.
4266 */ 4276 */
4267 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 4277 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4268 struct sched_param __user *param) 4278 struct sched_param __user *param)
4269 { 4279 {
4270 /* negative values for policy are not valid */ 4280 /* negative values for policy are not valid */
4271 if (policy < 0) 4281 if (policy < 0)
4272 return -EINVAL; 4282 return -EINVAL;
4273 4283
4274 return do_sched_setscheduler(pid, policy, param); 4284 return do_sched_setscheduler(pid, policy, param);
4275 } 4285 }
4276 4286
4277 /** 4287 /**
4278 * sys_sched_setparam - set/change the RT priority of a thread 4288 * sys_sched_setparam - set/change the RT priority of a thread
4279 * @pid: the pid in question. 4289 * @pid: the pid in question.
4280 * @param: structure containing the new RT priority. 4290 * @param: structure containing the new RT priority.
4281 */ 4291 */
4282 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 4292 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4283 { 4293 {
4284 return do_sched_setscheduler(pid, -1, param); 4294 return do_sched_setscheduler(pid, -1, param);
4285 } 4295 }
4286 4296
4287 /** 4297 /**
4288 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4298 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4289 * @pid: the pid in question. 4299 * @pid: the pid in question.
4290 */ 4300 */
4291 asmlinkage long sys_sched_getscheduler(pid_t pid) 4301 asmlinkage long sys_sched_getscheduler(pid_t pid)
4292 { 4302 {
4293 struct task_struct *p; 4303 struct task_struct *p;
4294 int retval = -EINVAL; 4304 int retval = -EINVAL;
4295 4305
4296 if (pid < 0) 4306 if (pid < 0)
4297 goto out_nounlock; 4307 goto out_nounlock;
4298 4308
4299 retval = -ESRCH; 4309 retval = -ESRCH;
4300 read_lock(&tasklist_lock); 4310 read_lock(&tasklist_lock);
4301 p = find_process_by_pid(pid); 4311 p = find_process_by_pid(pid);
4302 if (p) { 4312 if (p) {
4303 retval = security_task_getscheduler(p); 4313 retval = security_task_getscheduler(p);
4304 if (!retval) 4314 if (!retval)
4305 retval = p->policy; 4315 retval = p->policy;
4306 } 4316 }
4307 read_unlock(&tasklist_lock); 4317 read_unlock(&tasklist_lock);
4308 4318
4309 out_nounlock: 4319 out_nounlock:
4310 return retval; 4320 return retval;
4311 } 4321 }
4312 4322
4313 /** 4323 /**
4314 * sys_sched_getparam - get the RT priority of a thread 4324 * sys_sched_getparam - get the RT priority of a thread
4315 * @pid: the pid in question. 4325 * @pid: the pid in question.
4316 * @param: structure containing the RT priority. 4326 * @param: structure containing the RT priority.
4317 */ 4327 */
4318 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4328 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4319 { 4329 {
4320 struct sched_param lp; 4330 struct sched_param lp;
4321 struct task_struct *p; 4331 struct task_struct *p;
4322 int retval = -EINVAL; 4332 int retval = -EINVAL;
4323 4333
4324 if (!param || pid < 0) 4334 if (!param || pid < 0)
4325 goto out_nounlock; 4335 goto out_nounlock;
4326 4336
4327 read_lock(&tasklist_lock); 4337 read_lock(&tasklist_lock);
4328 p = find_process_by_pid(pid); 4338 p = find_process_by_pid(pid);
4329 retval = -ESRCH; 4339 retval = -ESRCH;
4330 if (!p) 4340 if (!p)
4331 goto out_unlock; 4341 goto out_unlock;
4332 4342
4333 retval = security_task_getscheduler(p); 4343 retval = security_task_getscheduler(p);
4334 if (retval) 4344 if (retval)
4335 goto out_unlock; 4345 goto out_unlock;
4336 4346
4337 lp.sched_priority = p->rt_priority; 4347 lp.sched_priority = p->rt_priority;
4338 read_unlock(&tasklist_lock); 4348 read_unlock(&tasklist_lock);
4339 4349
4340 /* 4350 /*
4341 * This one might sleep, we cannot do it with a spinlock held ... 4351 * This one might sleep, we cannot do it with a spinlock held ...
4342 */ 4352 */
4343 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4353 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4344 4354
4345 out_nounlock: 4355 out_nounlock:
4346 return retval; 4356 return retval;
4347 4357
4348 out_unlock: 4358 out_unlock:
4349 read_unlock(&tasklist_lock); 4359 read_unlock(&tasklist_lock);
4350 return retval; 4360 return retval;
4351 } 4361 }
4352 4362
4353 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4363 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4354 { 4364 {
4355 cpumask_t cpus_allowed; 4365 cpumask_t cpus_allowed;
4356 struct task_struct *p; 4366 struct task_struct *p;
4357 int retval; 4367 int retval;
4358 4368
4359 mutex_lock(&sched_hotcpu_mutex); 4369 mutex_lock(&sched_hotcpu_mutex);
4360 read_lock(&tasklist_lock); 4370 read_lock(&tasklist_lock);
4361 4371
4362 p = find_process_by_pid(pid); 4372 p = find_process_by_pid(pid);
4363 if (!p) { 4373 if (!p) {
4364 read_unlock(&tasklist_lock); 4374 read_unlock(&tasklist_lock);
4365 mutex_unlock(&sched_hotcpu_mutex); 4375 mutex_unlock(&sched_hotcpu_mutex);
4366 return -ESRCH; 4376 return -ESRCH;
4367 } 4377 }
4368 4378
4369 /* 4379 /*
4370 * It is not safe to call set_cpus_allowed with the 4380 * It is not safe to call set_cpus_allowed with the
4371 * tasklist_lock held. We will bump the task_struct's 4381 * tasklist_lock held. We will bump the task_struct's
4372 * usage count and then drop tasklist_lock. 4382 * usage count and then drop tasklist_lock.
4373 */ 4383 */
4374 get_task_struct(p); 4384 get_task_struct(p);
4375 read_unlock(&tasklist_lock); 4385 read_unlock(&tasklist_lock);
4376 4386
4377 retval = -EPERM; 4387 retval = -EPERM;
4378 if ((current->euid != p->euid) && (current->euid != p->uid) && 4388 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4379 !capable(CAP_SYS_NICE)) 4389 !capable(CAP_SYS_NICE))
4380 goto out_unlock; 4390 goto out_unlock;
4381 4391
4382 retval = security_task_setscheduler(p, 0, NULL); 4392 retval = security_task_setscheduler(p, 0, NULL);
4383 if (retval) 4393 if (retval)
4384 goto out_unlock; 4394 goto out_unlock;
4385 4395
4386 cpus_allowed = cpuset_cpus_allowed(p); 4396 cpus_allowed = cpuset_cpus_allowed(p);
4387 cpus_and(new_mask, new_mask, cpus_allowed); 4397 cpus_and(new_mask, new_mask, cpus_allowed);
4388 retval = set_cpus_allowed(p, new_mask); 4398 retval = set_cpus_allowed(p, new_mask);
4389 4399
4390 out_unlock: 4400 out_unlock:
4391 put_task_struct(p); 4401 put_task_struct(p);
4392 mutex_unlock(&sched_hotcpu_mutex); 4402 mutex_unlock(&sched_hotcpu_mutex);
4393 return retval; 4403 return retval;
4394 } 4404 }
4395 4405
4396 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4406 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4397 cpumask_t *new_mask) 4407 cpumask_t *new_mask)
4398 { 4408 {
4399 if (len < sizeof(cpumask_t)) { 4409 if (len < sizeof(cpumask_t)) {
4400 memset(new_mask, 0, sizeof(cpumask_t)); 4410 memset(new_mask, 0, sizeof(cpumask_t));
4401 } else if (len > sizeof(cpumask_t)) { 4411 } else if (len > sizeof(cpumask_t)) {
4402 len = sizeof(cpumask_t); 4412 len = sizeof(cpumask_t);
4403 } 4413 }
4404 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4414 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4405 } 4415 }
4406 4416
4407 /** 4417 /**
4408 * sys_sched_setaffinity - set the cpu affinity of a process 4418 * sys_sched_setaffinity - set the cpu affinity of a process
4409 * @pid: pid of the process 4419 * @pid: pid of the process
4410 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4420 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4411 * @user_mask_ptr: user-space pointer to the new cpu mask 4421 * @user_mask_ptr: user-space pointer to the new cpu mask
4412 */ 4422 */
4413 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 4423 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4414 unsigned long __user *user_mask_ptr) 4424 unsigned long __user *user_mask_ptr)
4415 { 4425 {
4416 cpumask_t new_mask; 4426 cpumask_t new_mask;
4417 int retval; 4427 int retval;
4418 4428
4419 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 4429 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4420 if (retval) 4430 if (retval)
4421 return retval; 4431 return retval;
4422 4432
4423 return sched_setaffinity(pid, new_mask); 4433 return sched_setaffinity(pid, new_mask);
4424 } 4434 }
4425 4435
4426 /* 4436 /*
4427 * Represents all CPUs present in the system 4437 * Represents all CPUs present in the system
4428 * In systems capable of hotplug, this map could dynamically grow 4438 * In systems capable of hotplug, this map could dynamically grow
4429 * as new CPUs are detected in the system via any platform-specific 4439 * as new CPUs are detected in the system via any platform-specific
4430 * method, such as ACPI, for example. 4440 * method, such as ACPI, for example.
4431 */ 4441 */
4432 4442
4433 cpumask_t cpu_present_map __read_mostly; 4443 cpumask_t cpu_present_map __read_mostly;
4434 EXPORT_SYMBOL(cpu_present_map); 4444 EXPORT_SYMBOL(cpu_present_map);
4435 4445
4436 #ifndef CONFIG_SMP 4446 #ifndef CONFIG_SMP
4437 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 4447 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4438 EXPORT_SYMBOL(cpu_online_map); 4448 EXPORT_SYMBOL(cpu_online_map);
4439 4449
4440 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 4450 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4441 EXPORT_SYMBOL(cpu_possible_map); 4451 EXPORT_SYMBOL(cpu_possible_map);
4442 #endif 4452 #endif
4443 4453
4444 long sched_getaffinity(pid_t pid, cpumask_t *mask) 4454 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4445 { 4455 {
4446 struct task_struct *p; 4456 struct task_struct *p;
4447 int retval; 4457 int retval;
4448 4458
4449 mutex_lock(&sched_hotcpu_mutex); 4459 mutex_lock(&sched_hotcpu_mutex);
4450 read_lock(&tasklist_lock); 4460 read_lock(&tasklist_lock);
4451 4461
4452 retval = -ESRCH; 4462 retval = -ESRCH;
4453 p = find_process_by_pid(pid); 4463 p = find_process_by_pid(pid);
4454 if (!p) 4464 if (!p)
4455 goto out_unlock; 4465 goto out_unlock;
4456 4466
4457 retval = security_task_getscheduler(p); 4467 retval = security_task_getscheduler(p);
4458 if (retval) 4468 if (retval)
4459 goto out_unlock; 4469 goto out_unlock;
4460 4470
4461 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4471 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4462 4472
4463 out_unlock: 4473 out_unlock:
4464 read_unlock(&tasklist_lock); 4474 read_unlock(&tasklist_lock);
4465 mutex_unlock(&sched_hotcpu_mutex); 4475 mutex_unlock(&sched_hotcpu_mutex);
4466 if (retval) 4476 if (retval)
4467 return retval; 4477 return retval;
4468 4478
4469 return 0; 4479 return 0;
4470 } 4480 }
4471 4481
4472 /** 4482 /**
4473 * sys_sched_getaffinity - get the cpu affinity of a process 4483 * sys_sched_getaffinity - get the cpu affinity of a process
4474 * @pid: pid of the process 4484 * @pid: pid of the process
4475 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4485 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4476 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4486 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4477 */ 4487 */
4478 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 4488 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4479 unsigned long __user *user_mask_ptr) 4489 unsigned long __user *user_mask_ptr)
4480 { 4490 {
4481 int ret; 4491 int ret;
4482 cpumask_t mask; 4492 cpumask_t mask;
4483 4493
4484 if (len < sizeof(cpumask_t)) 4494 if (len < sizeof(cpumask_t))
4485 return -EINVAL; 4495 return -EINVAL;
4486 4496
4487 ret = sched_getaffinity(pid, &mask); 4497 ret = sched_getaffinity(pid, &mask);
4488 if (ret < 0) 4498 if (ret < 0)
4489 return ret; 4499 return ret;
4490 4500
4491 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 4501 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4492 return -EFAULT; 4502 return -EFAULT;
4493 4503
4494 return sizeof(cpumask_t); 4504 return sizeof(cpumask_t);
4495 } 4505 }
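From user space these two syscalls are reached through the glibc sched_setaffinity()/sched_getaffinity() wrappers; a hedged user-space sketch that pins the calling task to CPU 0 and reads the mask back:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int pin_self_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set))	/* pid 0 == calling task */
		return -1;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set))
		return -1;

	printf("pinned to CPU0: %s\n", CPU_ISSET(0, &set) ? "yes" : "no");
	return 0;
}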
4496 4506
4497 /** 4507 /**
4498 * sys_sched_yield - yield the current processor to other threads. 4508 * sys_sched_yield - yield the current processor to other threads.
4499 * 4509 *
4500 * This function yields the current CPU to other tasks. If there are no 4510 * This function yields the current CPU to other tasks. If there are no
4501 * other threads running on this CPU then this function will return. 4511 * other threads running on this CPU then this function will return.
4502 */ 4512 */
4503 asmlinkage long sys_sched_yield(void) 4513 asmlinkage long sys_sched_yield(void)
4504 { 4514 {
4505 struct rq *rq = this_rq_lock(); 4515 struct rq *rq = this_rq_lock();
4506 4516
4507 schedstat_inc(rq, yld_cnt); 4517 schedstat_inc(rq, yld_cnt);
4508 if (unlikely(rq->nr_running == 1)) 4518 if (unlikely(rq->nr_running == 1))
4509 schedstat_inc(rq, yld_act_empty); 4519 schedstat_inc(rq, yld_act_empty);
4510 else 4520 else
4511 current->sched_class->yield_task(rq, current); 4521 current->sched_class->yield_task(rq, current);
4512 4522
4513 /* 4523 /*
4514 * Since we are going to call schedule() anyway, there's 4524 * Since we are going to call schedule() anyway, there's
4515 * no need to preempt or enable interrupts: 4525 * no need to preempt or enable interrupts:
4516 */ 4526 */
4517 __release(rq->lock); 4527 __release(rq->lock);
4518 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4528 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4519 _raw_spin_unlock(&rq->lock); 4529 _raw_spin_unlock(&rq->lock);
4520 preempt_enable_no_resched(); 4530 preempt_enable_no_resched();
4521 4531
4522 schedule(); 4532 schedule();
4523 4533
4524 return 0; 4534 return 0;
4525 } 4535 }
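User space reaches this through sched_yield(2); a tiny, illustrative polling loop that yields between checks rather than spinning flat out:

#include <sched.h>

void wait_for_flag(volatile int *flag)
{
	while (!*flag)
		sched_yield();	/* let other runnable tasks on this CPU go first */
}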
4526 4536
4527 static void __cond_resched(void) 4537 static void __cond_resched(void)
4528 { 4538 {
4529 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4539 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4530 __might_sleep(__FILE__, __LINE__); 4540 __might_sleep(__FILE__, __LINE__);
4531 #endif 4541 #endif
4532 /* 4542 /*
4533 * The BKS might be reacquired before we have dropped 4543 * The BKS might be reacquired before we have dropped
4534 * PREEMPT_ACTIVE, which could trigger a second 4544 * PREEMPT_ACTIVE, which could trigger a second
4535 * cond_resched() call. 4545 * cond_resched() call.
4536 */ 4546 */
4537 do { 4547 do {
4538 add_preempt_count(PREEMPT_ACTIVE); 4548 add_preempt_count(PREEMPT_ACTIVE);
4539 schedule(); 4549 schedule();
4540 sub_preempt_count(PREEMPT_ACTIVE); 4550 sub_preempt_count(PREEMPT_ACTIVE);
4541 } while (need_resched()); 4551 } while (need_resched());
4542 } 4552 }
4543 4553
4544 int __sched cond_resched(void) 4554 int __sched cond_resched(void)
4545 { 4555 {
4546 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4556 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4547 system_state == SYSTEM_RUNNING) { 4557 system_state == SYSTEM_RUNNING) {
4548 __cond_resched(); 4558 __cond_resched();
4549 return 1; 4559 return 1;
4550 } 4560 }
4551 return 0; 4561 return 0;
4552 } 4562 }
4553 EXPORT_SYMBOL(cond_resched); 4563 EXPORT_SYMBOL(cond_resched);
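Typical in-kernel use of cond_resched(): a sketch of a long loop that voluntarily gives up the CPU on non-preemptible kernels (names invented):

#include <linux/sched.h>
#include <linux/list.h>

static void demo_process_many(struct list_head *items)
{
	struct list_head *pos;

	list_for_each(pos, items) {
		/* ... potentially expensive per-item work ... */
		cond_resched();		/* reschedules here if TIF_NEED_RESCHED is set */
	}
}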
4554 4564
4555 /* 4565 /*
4556 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4566 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4557 * call schedule, and on return reacquire the lock. 4567 * call schedule, and on return reacquire the lock.
4558 * 4568 *
4559 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4569 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4560 * operations here to prevent schedule() from being called twice (once via 4570 * operations here to prevent schedule() from being called twice (once via
4561 * spin_unlock(), once by hand). 4571 * spin_unlock(), once by hand).
4562 */ 4572 */
4563 int cond_resched_lock(spinlock_t *lock) 4573 int cond_resched_lock(spinlock_t *lock)
4564 { 4574 {
4565 int ret = 0; 4575 int ret = 0;
4566 4576
4567 if (need_lockbreak(lock)) { 4577 if (need_lockbreak(lock)) {
4568 spin_unlock(lock); 4578 spin_unlock(lock);
4569 cpu_relax(); 4579 cpu_relax();
4570 ret = 1; 4580 ret = 1;
4571 spin_lock(lock); 4581 spin_lock(lock);
4572 } 4582 }
4573 if (need_resched() && system_state == SYSTEM_RUNNING) { 4583 if (need_resched() && system_state == SYSTEM_RUNNING) {
4574 spin_release(&lock->dep_map, 1, _THIS_IP_); 4584 spin_release(&lock->dep_map, 1, _THIS_IP_);
4575 _raw_spin_unlock(lock); 4585 _raw_spin_unlock(lock);
4576 preempt_enable_no_resched(); 4586 preempt_enable_no_resched();
4577 __cond_resched(); 4587 __cond_resched();
4578 ret = 1; 4588 ret = 1;
4579 spin_lock(lock); 4589 spin_lock(lock);
4580 } 4590 }
4581 return ret; 4591 return ret;
4582 } 4592 }
4583 EXPORT_SYMBOL(cond_resched_lock); 4593 EXPORT_SYMBOL(cond_resched_lock);
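And the lock-holding variant; a sketch of scanning a list under a spinlock while still honouring pending reschedules and lock contention (illustrative only; real code must cope with the list changing whenever the lock was dropped):

#include <linux/spinlock.h>
#include <linux/list.h>

static void demo_scan_locked(spinlock_t *lock, struct list_head *items)
{
	struct list_head *pos;

	spin_lock(lock);
	list_for_each(pos, items) {
		/* ... examine one item ... */
		if (cond_resched_lock(lock)) {
			/* Lock was dropped and retaken: bail out, let the caller retry. */
			break;
		}
	}
	spin_unlock(lock);
}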
4584 4594
4585 int __sched cond_resched_softirq(void) 4595 int __sched cond_resched_softirq(void)
4586 { 4596 {
4587 BUG_ON(!in_softirq()); 4597 BUG_ON(!in_softirq());
4588 4598
4589 if (need_resched() && system_state == SYSTEM_RUNNING) { 4599 if (need_resched() && system_state == SYSTEM_RUNNING) {
4590 local_bh_enable(); 4600 local_bh_enable();
4591 __cond_resched(); 4601 __cond_resched();
4592 local_bh_disable(); 4602 local_bh_disable();
4593 return 1; 4603 return 1;
4594 } 4604 }
4595 return 0; 4605 return 0;
4596 } 4606 }
4597 EXPORT_SYMBOL(cond_resched_softirq); 4607 EXPORT_SYMBOL(cond_resched_softirq);
4598 4608
4599 /** 4609 /**
4600 * yield - yield the current processor to other threads. 4610 * yield - yield the current processor to other threads.
4601 * 4611 *
4602 * This is a shortcut for kernel-space yielding - it marks the 4612 * This is a shortcut for kernel-space yielding - it marks the
4603 * thread runnable and calls sys_sched_yield(). 4613 * thread runnable and calls sys_sched_yield().
4604 */ 4614 */
4605 void __sched yield(void) 4615 void __sched yield(void)
4606 { 4616 {
4607 set_current_state(TASK_RUNNING); 4617 set_current_state(TASK_RUNNING);
4608 sys_sched_yield(); 4618 sys_sched_yield();
4609 } 4619 }
4610 EXPORT_SYMBOL(yield); 4620 EXPORT_SYMBOL(yield);
4611 4621
4612 /* 4622 /*
4613 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4623 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4614 * that process accounting knows that this is a task in IO wait state. 4624 * that process accounting knows that this is a task in IO wait state.
4615 * 4625 *
4616 * But don't do that if it is a deliberate, throttling IO wait (this task 4626 * But don't do that if it is a deliberate, throttling IO wait (this task
4617 * has set its backing_dev_info: the queue against which it should throttle) 4627 * has set its backing_dev_info: the queue against which it should throttle)
4618 */ 4628 */
4619 void __sched io_schedule(void) 4629 void __sched io_schedule(void)
4620 { 4630 {
4621 struct rq *rq = &__raw_get_cpu_var(runqueues); 4631 struct rq *rq = &__raw_get_cpu_var(runqueues);
4622 4632
4623 delayacct_blkio_start(); 4633 delayacct_blkio_start();
4624 atomic_inc(&rq->nr_iowait); 4634 atomic_inc(&rq->nr_iowait);
4625 schedule(); 4635 schedule();
4626 atomic_dec(&rq->nr_iowait); 4636 atomic_dec(&rq->nr_iowait);
4627 delayacct_blkio_end(); 4637 delayacct_blkio_end();
4628 } 4638 }
4629 EXPORT_SYMBOL(io_schedule); 4639 EXPORT_SYMBOL(io_schedule);
4630 4640
4631 long __sched io_schedule_timeout(long timeout) 4641 long __sched io_schedule_timeout(long timeout)
4632 { 4642 {
4633 struct rq *rq = &__raw_get_cpu_var(runqueues); 4643 struct rq *rq = &__raw_get_cpu_var(runqueues);
4634 long ret; 4644 long ret;
4635 4645
4636 delayacct_blkio_start(); 4646 delayacct_blkio_start();
4637 atomic_inc(&rq->nr_iowait); 4647 atomic_inc(&rq->nr_iowait);
4638 ret = schedule_timeout(timeout); 4648 ret = schedule_timeout(timeout);
4639 atomic_dec(&rq->nr_iowait); 4649 atomic_dec(&rq->nr_iowait);
4640 delayacct_blkio_end(); 4650 delayacct_blkio_end();
4641 return ret; 4651 return ret;
4642 } 4652 }
4643 4653
4644 /** 4654 /**
4645 * sys_sched_get_priority_max - return maximum RT priority. 4655 * sys_sched_get_priority_max - return maximum RT priority.
4646 * @policy: scheduling class. 4656 * @policy: scheduling class.
4647 * 4657 *
4648 * this syscall returns the maximum rt_priority that can be used 4658 * this syscall returns the maximum rt_priority that can be used
4649 * by a given scheduling class. 4659 * by a given scheduling class.
4650 */ 4660 */
4651 asmlinkage long sys_sched_get_priority_max(int policy) 4661 asmlinkage long sys_sched_get_priority_max(int policy)
4652 { 4662 {
4653 int ret = -EINVAL; 4663 int ret = -EINVAL;
4654 4664
4655 switch (policy) { 4665 switch (policy) {
4656 case SCHED_FIFO: 4666 case SCHED_FIFO:
4657 case SCHED_RR: 4667 case SCHED_RR:
4658 ret = MAX_USER_RT_PRIO-1; 4668 ret = MAX_USER_RT_PRIO-1;
4659 break; 4669 break;
4660 case SCHED_NORMAL: 4670 case SCHED_NORMAL:
4661 case SCHED_BATCH: 4671 case SCHED_BATCH:
4662 case SCHED_IDLE: 4672 case SCHED_IDLE:
4663 ret = 0; 4673 ret = 0;
4664 break; 4674 break;
4665 } 4675 }
4666 return ret; 4676 return ret;
4667 } 4677 }
4668 4678
4669 /** 4679 /**
4670 * sys_sched_get_priority_min - return minimum RT priority. 4680 * sys_sched_get_priority_min - return minimum RT priority.
4671 * @policy: scheduling class. 4681 * @policy: scheduling class.
4672 * 4682 *
4673 * this syscall returns the minimum rt_priority that can be used 4683 * this syscall returns the minimum rt_priority that can be used
4674 * by a given scheduling class. 4684 * by a given scheduling class.
4675 */ 4685 */
4676 asmlinkage long sys_sched_get_priority_min(int policy) 4686 asmlinkage long sys_sched_get_priority_min(int policy)
4677 { 4687 {
4678 int ret = -EINVAL; 4688 int ret = -EINVAL;
4679 4689
4680 switch (policy) { 4690 switch (policy) {
4681 case SCHED_FIFO: 4691 case SCHED_FIFO:
4682 case SCHED_RR: 4692 case SCHED_RR:
4683 ret = 1; 4693 ret = 1;
4684 break; 4694 break;
4685 case SCHED_NORMAL: 4695 case SCHED_NORMAL:
4686 case SCHED_BATCH: 4696 case SCHED_BATCH:
4687 case SCHED_IDLE: 4697 case SCHED_IDLE:
4688 ret = 0; 4698 ret = 0;
4689 } 4699 }
4690 return ret; 4700 return ret;
4691 } 4701 }
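Together these two syscalls let userspace discover the valid static-priority range for a policy before switching to it. A minimal sketch using the standard POSIX wrappers (illustrative only, not taken from this change):

#include <sched.h>

/* Move the calling process to SCHED_FIFO at a mid-range priority. */
static int go_fifo(void)
{
        struct sched_param sp;
        int max = sched_get_priority_max(SCHED_FIFO);   /* MAX_USER_RT_PRIO-1, i.e. 99 */
        int min = sched_get_priority_min(SCHED_FIFO);   /* 1 */

        if (max < 0 || min < 0)
                return -1;

        sp.sched_priority = min + (max - min) / 2;
        return sched_setscheduler(0, SCHED_FIFO, &sp);  /* pid 0 == self */
}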
4692 4702
4693 /** 4703 /**
4694 * sys_sched_rr_get_interval - return the default timeslice of a process. 4704 * sys_sched_rr_get_interval - return the default timeslice of a process.
4695 * @pid: pid of the process. 4705 * @pid: pid of the process.
4696 * @interval: userspace pointer to the timeslice value. 4706 * @interval: userspace pointer to the timeslice value.
4697 * 4707 *
4698 * this syscall writes the default timeslice value of a given process 4708 * this syscall writes the default timeslice value of a given process
4699 * into the user-space timespec buffer. A value of '0' means infinity. 4709 * into the user-space timespec buffer. A value of '0' means infinity.
4700 */ 4710 */
4701 asmlinkage 4711 asmlinkage
4702 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4712 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4703 { 4713 {
4704 struct task_struct *p; 4714 struct task_struct *p;
4705 int retval = -EINVAL; 4715 int retval = -EINVAL;
4706 struct timespec t; 4716 struct timespec t;
4707 4717
4708 if (pid < 0) 4718 if (pid < 0)
4709 goto out_nounlock; 4719 goto out_nounlock;
4710 4720
4711 retval = -ESRCH; 4721 retval = -ESRCH;
4712 read_lock(&tasklist_lock); 4722 read_lock(&tasklist_lock);
4713 p = find_process_by_pid(pid); 4723 p = find_process_by_pid(pid);
4714 if (!p) 4724 if (!p)
4715 goto out_unlock; 4725 goto out_unlock;
4716 4726
4717 retval = security_task_getscheduler(p); 4727 retval = security_task_getscheduler(p);
4718 if (retval) 4728 if (retval)
4719 goto out_unlock; 4729 goto out_unlock;
4720 4730
4721 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4731 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4722 0 : static_prio_timeslice(p->static_prio), &t); 4732 0 : static_prio_timeslice(p->static_prio), &t);
4723 read_unlock(&tasklist_lock); 4733 read_unlock(&tasklist_lock);
4724 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4734 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4725 out_nounlock: 4735 out_nounlock:
4726 return retval; 4736 return retval;
4727 out_unlock: 4737 out_unlock:
4728 read_unlock(&tasklist_lock); 4738 read_unlock(&tasklist_lock);
4729 return retval; 4739 return retval;
4730 } 4740 }
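From userspace the same value is read through the corresponding POSIX wrapper; a zeroed timespec means an infinite slice (e.g. for SCHED_FIFO). A short illustrative sketch, not taken from this change:

#include <sched.h>
#include <stdio.h>
#include <time.h>
#include <sys/types.h>

static void print_timeslice(pid_t pid)
{
        struct timespec ts;

        if (sched_rr_get_interval(pid, &ts) == 0)
                printf("pid %d timeslice: %ld.%09ld s\n",
                       (int)pid, (long)ts.tv_sec, ts.tv_nsec);
        else
                perror("sched_rr_get_interval");
}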
4731 4741
4732 static const char stat_nam[] = "RSDTtZX"; 4742 static const char stat_nam[] = "RSDTtZX";
4733 4743
4734 static void show_task(struct task_struct *p) 4744 static void show_task(struct task_struct *p)
4735 { 4745 {
4736 unsigned long free = 0; 4746 unsigned long free = 0;
4737 unsigned state; 4747 unsigned state;
4738 4748
4739 state = p->state ? __ffs(p->state) + 1 : 0; 4749 state = p->state ? __ffs(p->state) + 1 : 0;
4740 printk("%-13.13s %c", p->comm, 4750 printk("%-13.13s %c", p->comm,
4741 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4751 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4742 #if BITS_PER_LONG == 32 4752 #if BITS_PER_LONG == 32
4743 if (state == TASK_RUNNING) 4753 if (state == TASK_RUNNING)
4744 printk(" running "); 4754 printk(" running ");
4745 else 4755 else
4746 printk(" %08lx ", thread_saved_pc(p)); 4756 printk(" %08lx ", thread_saved_pc(p));
4747 #else 4757 #else
4748 if (state == TASK_RUNNING) 4758 if (state == TASK_RUNNING)
4749 printk(" running task "); 4759 printk(" running task ");
4750 else 4760 else
4751 printk(" %016lx ", thread_saved_pc(p)); 4761 printk(" %016lx ", thread_saved_pc(p));
4752 #endif 4762 #endif
4753 #ifdef CONFIG_DEBUG_STACK_USAGE 4763 #ifdef CONFIG_DEBUG_STACK_USAGE
4754 { 4764 {
4755 unsigned long *n = end_of_stack(p); 4765 unsigned long *n = end_of_stack(p);
4756 while (!*n) 4766 while (!*n)
4757 n++; 4767 n++;
4758 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4768 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4759 } 4769 }
4760 #endif 4770 #endif
4761 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); 4771 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4762 4772
4763 if (state != TASK_RUNNING) 4773 if (state != TASK_RUNNING)
4764 show_stack(p, NULL); 4774 show_stack(p, NULL);
4765 } 4775 }
4766 4776
4767 void show_state_filter(unsigned long state_filter) 4777 void show_state_filter(unsigned long state_filter)
4768 { 4778 {
4769 struct task_struct *g, *p; 4779 struct task_struct *g, *p;
4770 4780
4771 #if BITS_PER_LONG == 32 4781 #if BITS_PER_LONG == 32
4772 printk(KERN_INFO 4782 printk(KERN_INFO
4773 " task PC stack pid father\n"); 4783 " task PC stack pid father\n");
4774 #else 4784 #else
4775 printk(KERN_INFO 4785 printk(KERN_INFO
4776 " task PC stack pid father\n"); 4786 " task PC stack pid father\n");
4777 #endif 4787 #endif
4778 read_lock(&tasklist_lock); 4788 read_lock(&tasklist_lock);
4779 do_each_thread(g, p) { 4789 do_each_thread(g, p) {
4780 /* 4790 /*
4781 * reset the NMI-timeout, listing all files on a slow 4791 * reset the NMI-timeout, listing all files on a slow
4782 * console might take a lot of time: 4792 * console might take a lot of time:
4783 */ 4793 */
4784 touch_nmi_watchdog(); 4794 touch_nmi_watchdog();
4785 if (!state_filter || (p->state & state_filter)) 4795 if (!state_filter || (p->state & state_filter))
4786 show_task(p); 4796 show_task(p);
4787 } while_each_thread(g, p); 4797 } while_each_thread(g, p);
4788 4798
4789 touch_all_softlockup_watchdogs(); 4799 touch_all_softlockup_watchdogs();
4790 4800
4791 #ifdef CONFIG_SCHED_DEBUG 4801 #ifdef CONFIG_SCHED_DEBUG
4792 sysrq_sched_debug_show(); 4802 sysrq_sched_debug_show();
4793 #endif 4803 #endif
4794 read_unlock(&tasklist_lock); 4804 read_unlock(&tasklist_lock);
4795 /* 4805 /*
4796 * Only show locks if all tasks are dumped: 4806 * Only show locks if all tasks are dumped:
4797 */ 4807 */
4798 if (state_filter == -1) 4808 if (state_filter == -1)
4799 debug_show_all_locks(); 4809 debug_show_all_locks();
4800 } 4810 }
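The state_filter argument is a mask of task-state bits: 0 disables filtering, and -1 (all bits set) also triggers the lock dump above while still skipping TASK_RUNNING tasks, whose state is 0. A tiny illustrative caller (the SysRq handlers are the real users; dump_tasks_example() is hypothetical):

static void dump_tasks_example(void)
{
        show_state_filter(0);                    /* no filter: every task */
        show_state_filter(TASK_UNINTERRUPTIBLE); /* blocked (D-state) tasks only */
        show_state_filter(-1);                   /* every non-running task + lock dump */
}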
4801 4811
4802 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4812 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4803 { 4813 {
4804 idle->sched_class = &idle_sched_class; 4814 idle->sched_class = &idle_sched_class;
4805 } 4815 }
4806 4816
4807 /** 4817 /**
4808 * init_idle - set up an idle thread for a given CPU 4818 * init_idle - set up an idle thread for a given CPU
4809 * @idle: task in question 4819 * @idle: task in question
4810 * @cpu: cpu the idle task belongs to 4820 * @cpu: cpu the idle task belongs to
4811 * 4821 *
4812 * NOTE: this function does not set the idle thread's NEED_RESCHED 4822 * NOTE: this function does not set the idle thread's NEED_RESCHED
4813 * flag, to make booting more robust. 4823 * flag, to make booting more robust.
4814 */ 4824 */
4815 void __cpuinit init_idle(struct task_struct *idle, int cpu) 4825 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4816 { 4826 {
4817 struct rq *rq = cpu_rq(cpu); 4827 struct rq *rq = cpu_rq(cpu);
4818 unsigned long flags; 4828 unsigned long flags;
4819 4829
4820 __sched_fork(idle); 4830 __sched_fork(idle);
4821 idle->se.exec_start = sched_clock(); 4831 idle->se.exec_start = sched_clock();
4822 4832
4823 idle->prio = idle->normal_prio = MAX_PRIO; 4833 idle->prio = idle->normal_prio = MAX_PRIO;
4824 idle->cpus_allowed = cpumask_of_cpu(cpu); 4834 idle->cpus_allowed = cpumask_of_cpu(cpu);
4825 __set_task_cpu(idle, cpu); 4835 __set_task_cpu(idle, cpu);
4826 4836
4827 spin_lock_irqsave(&rq->lock, flags); 4837 spin_lock_irqsave(&rq->lock, flags);
4828 rq->curr = rq->idle = idle; 4838 rq->curr = rq->idle = idle;
4829 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 4839 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4830 idle->oncpu = 1; 4840 idle->oncpu = 1;
4831 #endif 4841 #endif
4832 spin_unlock_irqrestore(&rq->lock, flags); 4842 spin_unlock_irqrestore(&rq->lock, flags);
4833 4843
4834 /* Set the preempt count _outside_ the spinlocks! */ 4844 /* Set the preempt count _outside_ the spinlocks! */
4835 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4845 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4836 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 4846 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4837 #else 4847 #else
4838 task_thread_info(idle)->preempt_count = 0; 4848 task_thread_info(idle)->preempt_count = 0;
4839 #endif 4849 #endif
4840 /* 4850 /*
4841 * The idle tasks have their own, simple scheduling class: 4851 * The idle tasks have their own, simple scheduling class:
4842 */ 4852 */
4843 idle->sched_class = &idle_sched_class; 4853 idle->sched_class = &idle_sched_class;
4844 } 4854 }
4845 4855
4846 /* 4856 /*
4847 * In a system that switches off the HZ timer nohz_cpu_mask 4857 * In a system that switches off the HZ timer nohz_cpu_mask
4848 * indicates which cpus entered this state. This is used 4858 * indicates which cpus entered this state. This is used
4849 * in the rcu update to wait only for active cpus. For systems 4859 * in the rcu update to wait only for active cpus. For systems
4850 * which do not switch off the HZ timer nohz_cpu_mask should 4860 * which do not switch off the HZ timer nohz_cpu_mask should
4851 * always be CPU_MASK_NONE. 4861 * always be CPU_MASK_NONE.
4852 */ 4862 */
4853 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4863 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4854 4864
4855 /* 4865 /*
4856 * Increase the granularity value when there are more CPUs, 4866 * Increase the granularity value when there are more CPUs,
4857 * because with more CPUs the 'effective latency' as visible 4867 * because with more CPUs the 'effective latency' as visible
4858 * to users decreases. But the relationship is not linear, 4868 * to users decreases. But the relationship is not linear,
4859 * so pick a second-best guess by going with the log2 of the 4869 * so pick a second-best guess by going with the log2 of the
4860 * number of CPUs. 4870 * number of CPUs.
4861 * 4871 *
4862 * This idea comes from the SD scheduler of Con Kolivas: 4872 * This idea comes from the SD scheduler of Con Kolivas:
4863 */ 4873 */
4864 static inline void sched_init_granularity(void) 4874 static inline void sched_init_granularity(void)
4865 { 4875 {
4866 unsigned int factor = 1 + ilog2(num_online_cpus()); 4876 unsigned int factor = 1 + ilog2(num_online_cpus());
4867 const unsigned long gran_limit = 100000000; 4877 const unsigned long gran_limit = 100000000;
4868 4878
4869 sysctl_sched_granularity *= factor; 4879 sysctl_sched_granularity *= factor;
4870 if (sysctl_sched_granularity > gran_limit) 4880 if (sysctl_sched_granularity > gran_limit)
4871 sysctl_sched_granularity = gran_limit; 4881 sysctl_sched_granularity = gran_limit;
4872 4882
4873 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; 4883 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4874 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4884 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4875 } 4885 }
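A worked example of the scaling above, assuming a base granularity of 10 ms (the boot-time default may differ): with 8 online CPUs, factor = 1 + ilog2(8) = 4, so the granularity becomes 40 ms (under the 100 ms cap), the runtime limit 4 * 40 = 160 ms and the wakeup granularity 40 / 2 = 20 ms. The factor grows only logarithmically with the CPU count:

        factor(1 CPU)   = 1 + ilog2(1)  = 1
        factor(2 CPUs)  = 1 + ilog2(2)  = 2
        factor(8 CPUs)  = 1 + ilog2(8)  = 4
        factor(64 CPUs) = 1 + ilog2(64) = 7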
4876 4886
4877 #ifdef CONFIG_SMP 4887 #ifdef CONFIG_SMP
4878 /* 4888 /*
4879 * This is how migration works: 4889 * This is how migration works:
4880 * 4890 *
4881 * 1) we queue a struct migration_req structure in the source CPU's 4891 * 1) we queue a struct migration_req structure in the source CPU's
4882 * runqueue and wake up that CPU's migration thread. 4892 * runqueue and wake up that CPU's migration thread.
4883 * 2) we down() the locked semaphore => thread blocks. 4893 * 2) we down() the locked semaphore => thread blocks.
4884 * 3) migration thread wakes up (implicitly it forces the migrated 4894 * 3) migration thread wakes up (implicitly it forces the migrated
4885 * thread off the CPU) 4895 * thread off the CPU)
4886 * 4) it gets the migration request and checks whether the migrated 4896 * 4) it gets the migration request and checks whether the migrated
4887 * task is still in the wrong runqueue. 4897 * task is still in the wrong runqueue.
4888 * 5) if it's in the wrong runqueue then the migration thread removes 4898 * 5) if it's in the wrong runqueue then the migration thread removes
4889 * it and puts it into the right queue. 4899 * it and puts it into the right queue.
4890 * 6) migration thread up()s the semaphore. 4900 * 6) migration thread up()s the semaphore.
4891 * 7) we wake up and the migration is done. 4901 * 7) we wake up and the migration is done.
4892 */ 4902 */
4893 4903
4894 /* 4904 /*
4895 * Change a given task's CPU affinity. Migrate the thread to a 4905 * Change a given task's CPU affinity. Migrate the thread to a
4896 * proper CPU and schedule it away if the CPU it's executing on 4906 * proper CPU and schedule it away if the CPU it's executing on
4897 * is removed from the allowed bitmask. 4907 * is removed from the allowed bitmask.
4898 * 4908 *
4899 * NOTE: the caller must have a valid reference to the task, the 4909 * NOTE: the caller must have a valid reference to the task, the
4900 * task must not exit() & deallocate itself prematurely. The 4910 * task must not exit() & deallocate itself prematurely. The
4901 * call is not atomic; no spinlocks may be held. 4911 * call is not atomic; no spinlocks may be held.
4902 */ 4912 */
4903 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 4913 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4904 { 4914 {
4905 struct migration_req req; 4915 struct migration_req req;
4906 unsigned long flags; 4916 unsigned long flags;
4907 struct rq *rq; 4917 struct rq *rq;
4908 int ret = 0; 4918 int ret = 0;
4909 4919
4910 rq = task_rq_lock(p, &flags); 4920 rq = task_rq_lock(p, &flags);
4911 if (!cpus_intersects(new_mask, cpu_online_map)) { 4921 if (!cpus_intersects(new_mask, cpu_online_map)) {
4912 ret = -EINVAL; 4922 ret = -EINVAL;
4913 goto out; 4923 goto out;
4914 } 4924 }
4915 4925
4916 p->cpus_allowed = new_mask; 4926 p->cpus_allowed = new_mask;
4917 /* Can the task run on the task's current CPU? If so, we're done */ 4927 /* Can the task run on the task's current CPU? If so, we're done */
4918 if (cpu_isset(task_cpu(p), new_mask)) 4928 if (cpu_isset(task_cpu(p), new_mask))
4919 goto out; 4929 goto out;
4920 4930
4921 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4931 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4922 /* Need help from migration thread: drop lock and wait. */ 4932 /* Need help from migration thread: drop lock and wait. */
4923 task_rq_unlock(rq, &flags); 4933 task_rq_unlock(rq, &flags);
4924 wake_up_process(rq->migration_thread); 4934 wake_up_process(rq->migration_thread);
4925 wait_for_completion(&req.done); 4935 wait_for_completion(&req.done);
4926 tlb_migrate_finish(p->mm); 4936 tlb_migrate_finish(p->mm);
4927 return 0; 4937 return 0;
4928 } 4938 }
4929 out: 4939 out:
4930 task_rq_unlock(rq, &flags); 4940 task_rq_unlock(rq, &flags);
4931 4941
4932 return ret; 4942 return ret;
4933 } 4943 }
4934 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4944 EXPORT_SYMBOL_GPL(set_cpus_allowed);
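A minimal kernel-side sketch of the exported interface, pinning a hypothetical worker kthread to one CPU (pin_worker() is illustrative only, not real code):

static int pin_worker(struct task_struct *worker, int cpu)
{
        int err;

        /* Fails with -EINVAL if the new mask contains no online CPU. */
        err = set_cpus_allowed(worker, cpumask_of_cpu(cpu));
        if (err)
                printk(KERN_WARNING "cannot pin worker to cpu%d: %d\n",
                       cpu, err);
        return err;
}

For a kthread that has not started running yet, kthread_bind() (as used by migration_call() further down) is the more common way to achieve the same effect.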
4935 4945
4936 /* 4946 /*
4937 * Move (not current) task off this cpu, onto dest cpu. We're doing 4947 * Move (not current) task off this cpu, onto dest cpu. We're doing
4938 * this because either it can't run here any more (set_cpus_allowed() 4948 * this because either it can't run here any more (set_cpus_allowed()
4939 * away from this CPU, or CPU going down), or because we're 4949 * away from this CPU, or CPU going down), or because we're
4940 * attempting to rebalance this task on exec (sched_exec). 4950 * attempting to rebalance this task on exec (sched_exec).
4941 * 4951 *
4942 * So we race with normal scheduler movements, but that's OK, as long 4952 * So we race with normal scheduler movements, but that's OK, as long
4943 * as the task is no longer on this CPU. 4953 * as the task is no longer on this CPU.
4944 * 4954 *
4945 * Returns non-zero if task was successfully migrated. 4955 * Returns non-zero if task was successfully migrated.
4946 */ 4956 */
4947 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4957 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4948 { 4958 {
4949 struct rq *rq_dest, *rq_src; 4959 struct rq *rq_dest, *rq_src;
4950 int ret = 0, on_rq; 4960 int ret = 0, on_rq;
4951 4961
4952 if (unlikely(cpu_is_offline(dest_cpu))) 4962 if (unlikely(cpu_is_offline(dest_cpu)))
4953 return ret; 4963 return ret;
4954 4964
4955 rq_src = cpu_rq(src_cpu); 4965 rq_src = cpu_rq(src_cpu);
4956 rq_dest = cpu_rq(dest_cpu); 4966 rq_dest = cpu_rq(dest_cpu);
4957 4967
4958 double_rq_lock(rq_src, rq_dest); 4968 double_rq_lock(rq_src, rq_dest);
4959 /* Already moved. */ 4969 /* Already moved. */
4960 if (task_cpu(p) != src_cpu) 4970 if (task_cpu(p) != src_cpu)
4961 goto out; 4971 goto out;
4962 /* Affinity changed (again). */ 4972 /* Affinity changed (again). */
4963 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4973 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4964 goto out; 4974 goto out;
4965 4975
4966 on_rq = p->se.on_rq; 4976 on_rq = p->se.on_rq;
4967 if (on_rq) 4977 if (on_rq)
4968 deactivate_task(rq_src, p, 0); 4978 deactivate_task(rq_src, p, 0);
4969 set_task_cpu(p, dest_cpu); 4979 set_task_cpu(p, dest_cpu);
4970 if (on_rq) { 4980 if (on_rq) {
4971 activate_task(rq_dest, p, 0); 4981 activate_task(rq_dest, p, 0);
4972 check_preempt_curr(rq_dest, p); 4982 check_preempt_curr(rq_dest, p);
4973 } 4983 }
4974 ret = 1; 4984 ret = 1;
4975 out: 4985 out:
4976 double_rq_unlock(rq_src, rq_dest); 4986 double_rq_unlock(rq_src, rq_dest);
4977 return ret; 4987 return ret;
4978 } 4988 }
4979 4989
4980 /* 4990 /*
4981 * migration_thread - this is a highprio system thread that performs 4991 * migration_thread - this is a highprio system thread that performs
4982 * thread migration by bumping thread off CPU then 'pushing' onto 4992 * thread migration by bumping thread off CPU then 'pushing' onto
4983 * another runqueue. 4993 * another runqueue.
4984 */ 4994 */
4985 static int migration_thread(void *data) 4995 static int migration_thread(void *data)
4986 { 4996 {
4987 int cpu = (long)data; 4997 int cpu = (long)data;
4988 struct rq *rq; 4998 struct rq *rq;
4989 4999
4990 rq = cpu_rq(cpu); 5000 rq = cpu_rq(cpu);
4991 BUG_ON(rq->migration_thread != current); 5001 BUG_ON(rq->migration_thread != current);
4992 5002
4993 set_current_state(TASK_INTERRUPTIBLE); 5003 set_current_state(TASK_INTERRUPTIBLE);
4994 while (!kthread_should_stop()) { 5004 while (!kthread_should_stop()) {
4995 struct migration_req *req; 5005 struct migration_req *req;
4996 struct list_head *head; 5006 struct list_head *head;
4997 5007
4998 spin_lock_irq(&rq->lock); 5008 spin_lock_irq(&rq->lock);
4999 5009
5000 if (cpu_is_offline(cpu)) { 5010 if (cpu_is_offline(cpu)) {
5001 spin_unlock_irq(&rq->lock); 5011 spin_unlock_irq(&rq->lock);
5002 goto wait_to_die; 5012 goto wait_to_die;
5003 } 5013 }
5004 5014
5005 if (rq->active_balance) { 5015 if (rq->active_balance) {
5006 active_load_balance(rq, cpu); 5016 active_load_balance(rq, cpu);
5007 rq->active_balance = 0; 5017 rq->active_balance = 0;
5008 } 5018 }
5009 5019
5010 head = &rq->migration_queue; 5020 head = &rq->migration_queue;
5011 5021
5012 if (list_empty(head)) { 5022 if (list_empty(head)) {
5013 spin_unlock_irq(&rq->lock); 5023 spin_unlock_irq(&rq->lock);
5014 schedule(); 5024 schedule();
5015 set_current_state(TASK_INTERRUPTIBLE); 5025 set_current_state(TASK_INTERRUPTIBLE);
5016 continue; 5026 continue;
5017 } 5027 }
5018 req = list_entry(head->next, struct migration_req, list); 5028 req = list_entry(head->next, struct migration_req, list);
5019 list_del_init(head->next); 5029 list_del_init(head->next);
5020 5030
5021 spin_unlock(&rq->lock); 5031 spin_unlock(&rq->lock);
5022 __migrate_task(req->task, cpu, req->dest_cpu); 5032 __migrate_task(req->task, cpu, req->dest_cpu);
5023 local_irq_enable(); 5033 local_irq_enable();
5024 5034
5025 complete(&req->done); 5035 complete(&req->done);
5026 } 5036 }
5027 __set_current_state(TASK_RUNNING); 5037 __set_current_state(TASK_RUNNING);
5028 return 0; 5038 return 0;
5029 5039
5030 wait_to_die: 5040 wait_to_die:
5031 /* Wait for kthread_stop */ 5041 /* Wait for kthread_stop */
5032 set_current_state(TASK_INTERRUPTIBLE); 5042 set_current_state(TASK_INTERRUPTIBLE);
5033 while (!kthread_should_stop()) { 5043 while (!kthread_should_stop()) {
5034 schedule(); 5044 schedule();
5035 set_current_state(TASK_INTERRUPTIBLE); 5045 set_current_state(TASK_INTERRUPTIBLE);
5036 } 5046 }
5037 __set_current_state(TASK_RUNNING); 5047 __set_current_state(TASK_RUNNING);
5038 return 0; 5048 return 0;
5039 } 5049 }
5040 5050
5041 #ifdef CONFIG_HOTPLUG_CPU 5051 #ifdef CONFIG_HOTPLUG_CPU
5042 /* 5052 /*
5043 * Figure out where task on dead CPU should go, use force if necessary. 5053 * Figure out where task on dead CPU should go, use force if necessary.
5044 * NOTE: interrupts should be disabled by the caller 5054 * NOTE: interrupts should be disabled by the caller
5045 */ 5055 */
5046 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5056 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5047 { 5057 {
5048 unsigned long flags; 5058 unsigned long flags;
5049 cpumask_t mask; 5059 cpumask_t mask;
5050 struct rq *rq; 5060 struct rq *rq;
5051 int dest_cpu; 5061 int dest_cpu;
5052 5062
5053 restart: 5063 restart:
5054 /* On same node? */ 5064 /* On same node? */
5055 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5065 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5056 cpus_and(mask, mask, p->cpus_allowed); 5066 cpus_and(mask, mask, p->cpus_allowed);
5057 dest_cpu = any_online_cpu(mask); 5067 dest_cpu = any_online_cpu(mask);
5058 5068
5059 /* On any allowed CPU? */ 5069 /* On any allowed CPU? */
5060 if (dest_cpu == NR_CPUS) 5070 if (dest_cpu == NR_CPUS)
5061 dest_cpu = any_online_cpu(p->cpus_allowed); 5071 dest_cpu = any_online_cpu(p->cpus_allowed);
5062 5072
5063 /* No more Mr. Nice Guy. */ 5073 /* No more Mr. Nice Guy. */
5064 if (dest_cpu == NR_CPUS) { 5074 if (dest_cpu == NR_CPUS) {
5065 rq = task_rq_lock(p, &flags); 5075 rq = task_rq_lock(p, &flags);
5066 cpus_setall(p->cpus_allowed); 5076 cpus_setall(p->cpus_allowed);
5067 dest_cpu = any_online_cpu(p->cpus_allowed); 5077 dest_cpu = any_online_cpu(p->cpus_allowed);
5068 task_rq_unlock(rq, &flags); 5078 task_rq_unlock(rq, &flags);
5069 5079
5070 /* 5080 /*
5071 * Don't tell them about moving exiting tasks or 5081 * Don't tell them about moving exiting tasks or
5072 * kernel threads (both mm NULL), since they never 5082 * kernel threads (both mm NULL), since they never
5073 * leave kernel. 5083 * leave kernel.
5074 */ 5084 */
5075 if (p->mm && printk_ratelimit()) 5085 if (p->mm && printk_ratelimit())
5076 printk(KERN_INFO "process %d (%s) no " 5086 printk(KERN_INFO "process %d (%s) no "
5077 "longer affine to cpu%d\n", 5087 "longer affine to cpu%d\n",
5078 p->pid, p->comm, dead_cpu); 5088 p->pid, p->comm, dead_cpu);
5079 } 5089 }
5080 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5090 if (!__migrate_task(p, dead_cpu, dest_cpu))
5081 goto restart; 5091 goto restart;
5082 } 5092 }
5083 5093
5084 /* 5094 /*
5085 * While a dead CPU has no uninterruptible tasks queued at this point, 5095 * While a dead CPU has no uninterruptible tasks queued at this point,
5086 * it might still have a nonzero ->nr_uninterruptible counter, because 5096 * it might still have a nonzero ->nr_uninterruptible counter, because
5087 * for performance reasons the counter is not strictly tracking tasks to 5097 * for performance reasons the counter is not strictly tracking tasks to
5088 * their home CPUs. So we just add the counter to another CPU's counter, 5098 * their home CPUs. So we just add the counter to another CPU's counter,
5089 * to keep the global sum constant after CPU-down: 5099 * to keep the global sum constant after CPU-down:
5090 */ 5100 */
5091 static void migrate_nr_uninterruptible(struct rq *rq_src) 5101 static void migrate_nr_uninterruptible(struct rq *rq_src)
5092 { 5102 {
5093 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5103 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5094 unsigned long flags; 5104 unsigned long flags;
5095 5105
5096 local_irq_save(flags); 5106 local_irq_save(flags);
5097 double_rq_lock(rq_src, rq_dest); 5107 double_rq_lock(rq_src, rq_dest);
5098 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5108 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5099 rq_src->nr_uninterruptible = 0; 5109 rq_src->nr_uninterruptible = 0;
5100 double_rq_unlock(rq_src, rq_dest); 5110 double_rq_unlock(rq_src, rq_dest);
5101 local_irq_restore(flags); 5111 local_irq_restore(flags);
5102 } 5112 }
5103 5113
5104 /* Run through task list and migrate tasks from the dead cpu. */ 5114 /* Run through task list and migrate tasks from the dead cpu. */
5105 static void migrate_live_tasks(int src_cpu) 5115 static void migrate_live_tasks(int src_cpu)
5106 { 5116 {
5107 struct task_struct *p, *t; 5117 struct task_struct *p, *t;
5108 5118
5109 write_lock_irq(&tasklist_lock); 5119 write_lock_irq(&tasklist_lock);
5110 5120
5111 do_each_thread(t, p) { 5121 do_each_thread(t, p) {
5112 if (p == current) 5122 if (p == current)
5113 continue; 5123 continue;
5114 5124
5115 if (task_cpu(p) == src_cpu) 5125 if (task_cpu(p) == src_cpu)
5116 move_task_off_dead_cpu(src_cpu, p); 5126 move_task_off_dead_cpu(src_cpu, p);
5117 } while_each_thread(t, p); 5127 } while_each_thread(t, p);
5118 5128
5119 write_unlock_irq(&tasklist_lock); 5129 write_unlock_irq(&tasklist_lock);
5120 } 5130 }
5121 5131
5122 /* 5132 /*
5123 * Schedules idle task to be the next runnable task on current CPU. 5133 * Schedules idle task to be the next runnable task on current CPU.
5124 * It does so by boosting its priority to highest possible and adding it to 5134 * It does so by boosting its priority to highest possible and adding it to
5125 * the _front_ of the runqueue. Used by CPU offline code. 5135 * the _front_ of the runqueue. Used by CPU offline code.
5126 */ 5136 */
5127 void sched_idle_next(void) 5137 void sched_idle_next(void)
5128 { 5138 {
5129 int this_cpu = smp_processor_id(); 5139 int this_cpu = smp_processor_id();
5130 struct rq *rq = cpu_rq(this_cpu); 5140 struct rq *rq = cpu_rq(this_cpu);
5131 struct task_struct *p = rq->idle; 5141 struct task_struct *p = rq->idle;
5132 unsigned long flags; 5142 unsigned long flags;
5133 5143
5134 /* cpu has to be offline */ 5144 /* cpu has to be offline */
5135 BUG_ON(cpu_online(this_cpu)); 5145 BUG_ON(cpu_online(this_cpu));
5136 5146
5137 /* 5147 /*
5138 * Strictly not necessary since rest of the CPUs are stopped by now 5148 * Strictly not necessary since rest of the CPUs are stopped by now
5139 * and interrupts disabled on the current cpu. 5149 * and interrupts disabled on the current cpu.
5140 */ 5150 */
5141 spin_lock_irqsave(&rq->lock, flags); 5151 spin_lock_irqsave(&rq->lock, flags);
5142 5152
5143 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5153 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5144 5154
5145 /* Add idle task to the _front_ of its priority queue: */ 5155 /* Add idle task to the _front_ of its priority queue: */
5146 activate_idle_task(p, rq); 5156 activate_idle_task(p, rq);
5147 5157
5148 spin_unlock_irqrestore(&rq->lock, flags); 5158 spin_unlock_irqrestore(&rq->lock, flags);
5149 } 5159 }
5150 5160
5151 /* 5161 /*
5152 * Ensures that the idle task is using init_mm right before its cpu goes 5162 * Ensures that the idle task is using init_mm right before its cpu goes
5153 * offline. 5163 * offline.
5154 */ 5164 */
5155 void idle_task_exit(void) 5165 void idle_task_exit(void)
5156 { 5166 {
5157 struct mm_struct *mm = current->active_mm; 5167 struct mm_struct *mm = current->active_mm;
5158 5168
5159 BUG_ON(cpu_online(smp_processor_id())); 5169 BUG_ON(cpu_online(smp_processor_id()));
5160 5170
5161 if (mm != &init_mm) 5171 if (mm != &init_mm)
5162 switch_mm(mm, &init_mm, current); 5172 switch_mm(mm, &init_mm, current);
5163 mmdrop(mm); 5173 mmdrop(mm);
5164 } 5174 }
5165 5175
5166 /* called under rq->lock with disabled interrupts */ 5176 /* called under rq->lock with disabled interrupts */
5167 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5177 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5168 { 5178 {
5169 struct rq *rq = cpu_rq(dead_cpu); 5179 struct rq *rq = cpu_rq(dead_cpu);
5170 5180
5171 /* Must be exiting, otherwise would be on tasklist. */ 5181 /* Must be exiting, otherwise would be on tasklist. */
5172 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5182 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5173 5183
5174 /* Cannot have done final schedule yet: would have vanished. */ 5184 /* Cannot have done final schedule yet: would have vanished. */
5175 BUG_ON(p->state == TASK_DEAD); 5185 BUG_ON(p->state == TASK_DEAD);
5176 5186
5177 get_task_struct(p); 5187 get_task_struct(p);
5178 5188
5179 /* 5189 /*
5180 * Drop lock around migration; if someone else moves it, 5190 * Drop lock around migration; if someone else moves it,
5181 * that's OK. No task can be added to this CPU, so iteration is 5191 * that's OK. No task can be added to this CPU, so iteration is
5182 * fine. 5192 * fine.
5183 * NOTE: interrupts should be left disabled --dev@ 5193 * NOTE: interrupts should be left disabled --dev@
5184 */ 5194 */
5185 spin_unlock(&rq->lock); 5195 spin_unlock(&rq->lock);
5186 move_task_off_dead_cpu(dead_cpu, p); 5196 move_task_off_dead_cpu(dead_cpu, p);
5187 spin_lock(&rq->lock); 5197 spin_lock(&rq->lock);
5188 5198
5189 put_task_struct(p); 5199 put_task_struct(p);
5190 } 5200 }
5191 5201
5192 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5202 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5193 static void migrate_dead_tasks(unsigned int dead_cpu) 5203 static void migrate_dead_tasks(unsigned int dead_cpu)
5194 { 5204 {
5195 struct rq *rq = cpu_rq(dead_cpu); 5205 struct rq *rq = cpu_rq(dead_cpu);
5196 struct task_struct *next; 5206 struct task_struct *next;
5197 5207
5198 for ( ; ; ) { 5208 for ( ; ; ) {
5199 if (!rq->nr_running) 5209 if (!rq->nr_running)
5200 break; 5210 break;
5201 next = pick_next_task(rq, rq->curr, rq_clock(rq)); 5211 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5202 if (!next) 5212 if (!next)
5203 break; 5213 break;
5204 migrate_dead(dead_cpu, next); 5214 migrate_dead(dead_cpu, next);
5205 5215
5206 } 5216 }
5207 } 5217 }
5208 #endif /* CONFIG_HOTPLUG_CPU */ 5218 #endif /* CONFIG_HOTPLUG_CPU */
5209 5219
5210 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5220 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5211 5221
5212 static struct ctl_table sd_ctl_dir[] = { 5222 static struct ctl_table sd_ctl_dir[] = {
5213 {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, 5223 {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, },
5214 {0,}, 5224 {0,},
5215 }; 5225 };
5216 5226
5217 static struct ctl_table sd_ctl_root[] = { 5227 static struct ctl_table sd_ctl_root[] = {
5218 {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, 5228 {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, },
5219 {0,}, 5229 {0,},
5220 }; 5230 };
5221 5231
5222 static struct ctl_table *sd_alloc_ctl_entry(int n) 5232 static struct ctl_table *sd_alloc_ctl_entry(int n)
5223 { 5233 {
5224 struct ctl_table *entry = 5234 struct ctl_table *entry =
5225 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); 5235 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
5226 5236
5227 BUG_ON(!entry); 5237 BUG_ON(!entry);
5228 memset(entry, 0, n * sizeof(struct ctl_table)); 5238 memset(entry, 0, n * sizeof(struct ctl_table));
5229 5239
5230 return entry; 5240 return entry;
5231 } 5241 }
5232 5242
5233 static void 5243 static void
5234 set_table_entry(struct ctl_table *entry, int ctl_name, 5244 set_table_entry(struct ctl_table *entry, int ctl_name,
5235 const char *procname, void *data, int maxlen, 5245 const char *procname, void *data, int maxlen,
5236 mode_t mode, proc_handler *proc_handler) 5246 mode_t mode, proc_handler *proc_handler)
5237 { 5247 {
5238 entry->ctl_name = ctl_name; 5248 entry->ctl_name = ctl_name;
5239 entry->procname = procname; 5249 entry->procname = procname;
5240 entry->data = data; 5250 entry->data = data;
5241 entry->maxlen = maxlen; 5251 entry->maxlen = maxlen;
5242 entry->mode = mode; 5252 entry->mode = mode;
5243 entry->proc_handler = proc_handler; 5253 entry->proc_handler = proc_handler;
5244 } 5254 }
5245 5255
5246 static struct ctl_table * 5256 static struct ctl_table *
5247 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5257 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5248 { 5258 {
5249 struct ctl_table *table = sd_alloc_ctl_entry(14); 5259 struct ctl_table *table = sd_alloc_ctl_entry(14);
5250 5260
5251 set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, 5261 set_table_entry(&table[0], 1, "min_interval", &sd->min_interval,
5252 sizeof(long), 0644, proc_doulongvec_minmax); 5262 sizeof(long), 0644, proc_doulongvec_minmax);
5253 set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, 5263 set_table_entry(&table[1], 2, "max_interval", &sd->max_interval,
5254 sizeof(long), 0644, proc_doulongvec_minmax); 5264 sizeof(long), 0644, proc_doulongvec_minmax);
5255 set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, 5265 set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx,
5256 sizeof(int), 0644, proc_dointvec_minmax); 5266 sizeof(int), 0644, proc_dointvec_minmax);
5257 set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, 5267 set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx,
5258 sizeof(int), 0644, proc_dointvec_minmax); 5268 sizeof(int), 0644, proc_dointvec_minmax);
5259 set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, 5269 set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx,
5260 sizeof(int), 0644, proc_dointvec_minmax); 5270 sizeof(int), 0644, proc_dointvec_minmax);
5261 set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, 5271 set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx,
5262 sizeof(int), 0644, proc_dointvec_minmax); 5272 sizeof(int), 0644, proc_dointvec_minmax);
5263 set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, 5273 set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx,
5264 sizeof(int), 0644, proc_dointvec_minmax); 5274 sizeof(int), 0644, proc_dointvec_minmax);
5265 set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, 5275 set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor,
5266 sizeof(int), 0644, proc_dointvec_minmax); 5276 sizeof(int), 0644, proc_dointvec_minmax);
5267 set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, 5277 set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
5268 sizeof(int), 0644, proc_dointvec_minmax); 5278 sizeof(int), 0644, proc_dointvec_minmax);
5269 set_table_entry(&table[10], 11, "cache_nice_tries", 5279 set_table_entry(&table[10], 11, "cache_nice_tries",
5270 &sd->cache_nice_tries, 5280 &sd->cache_nice_tries,
5271 sizeof(int), 0644, proc_dointvec_minmax); 5281 sizeof(int), 0644, proc_dointvec_minmax);
5272 set_table_entry(&table[12], 13, "flags", &sd->flags, 5282 set_table_entry(&table[12], 13, "flags", &sd->flags,
5273 sizeof(int), 0644, proc_dointvec_minmax); 5283 sizeof(int), 0644, proc_dointvec_minmax);
5274 5284
5275 return table; 5285 return table;
5276 } 5286 }
5277 5287
5278 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5288 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5279 { 5289 {
5280 struct ctl_table *entry, *table; 5290 struct ctl_table *entry, *table;
5281 struct sched_domain *sd; 5291 struct sched_domain *sd;
5282 int domain_num = 0, i; 5292 int domain_num = 0, i;
5283 char buf[32]; 5293 char buf[32];
5284 5294
5285 for_each_domain(cpu, sd) 5295 for_each_domain(cpu, sd)
5286 domain_num++; 5296 domain_num++;
5287 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5297 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5288 5298
5289 i = 0; 5299 i = 0;
5290 for_each_domain(cpu, sd) { 5300 for_each_domain(cpu, sd) {
5291 snprintf(buf, 32, "domain%d", i); 5301 snprintf(buf, 32, "domain%d", i);
5292 entry->ctl_name = i + 1; 5302 entry->ctl_name = i + 1;
5293 entry->procname = kstrdup(buf, GFP_KERNEL); 5303 entry->procname = kstrdup(buf, GFP_KERNEL);
5294 entry->mode = 0755; 5304 entry->mode = 0755;
5295 entry->child = sd_alloc_ctl_domain_table(sd); 5305 entry->child = sd_alloc_ctl_domain_table(sd);
5296 entry++; 5306 entry++;
5297 i++; 5307 i++;
5298 } 5308 }
5299 return table; 5309 return table;
5300 } 5310 }
5301 5311
5302 static struct ctl_table_header *sd_sysctl_header; 5312 static struct ctl_table_header *sd_sysctl_header;
5303 static void init_sched_domain_sysctl(void) 5313 static void init_sched_domain_sysctl(void)
5304 { 5314 {
5305 int i, cpu_num = num_online_cpus(); 5315 int i, cpu_num = num_online_cpus();
5306 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5316 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5307 char buf[32]; 5317 char buf[32];
5308 5318
5309 sd_ctl_dir[0].child = entry; 5319 sd_ctl_dir[0].child = entry;
5310 5320
5311 for (i = 0; i < cpu_num; i++, entry++) { 5321 for (i = 0; i < cpu_num; i++, entry++) {
5312 snprintf(buf, 32, "cpu%d", i); 5322 snprintf(buf, 32, "cpu%d", i);
5313 entry->ctl_name = i + 1; 5323 entry->ctl_name = i + 1;
5314 entry->procname = kstrdup(buf, GFP_KERNEL); 5324 entry->procname = kstrdup(buf, GFP_KERNEL);
5315 entry->mode = 0755; 5325 entry->mode = 0755;
5316 entry->child = sd_alloc_ctl_cpu_table(i); 5326 entry->child = sd_alloc_ctl_cpu_table(i);
5317 } 5327 }
5318 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5328 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5319 } 5329 }
5320 #else 5330 #else
5321 static void init_sched_domain_sysctl(void) 5331 static void init_sched_domain_sysctl(void)
5322 { 5332 {
5323 } 5333 }
5324 #endif 5334 #endif
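The net effect of the table construction above is a per-CPU, per-domain sysctl tree; on a running system the entries appear roughly as below (the exact set of domains is machine-dependent, and the file names are those wired up in sd_alloc_ctl_domain_table()):

        /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
        /proc/sys/kernel/sched_domain/cpu0/domain0/max_interval
        /proc/sys/kernel/sched_domain/cpu0/domain0/busy_idx
        /proc/sys/kernel/sched_domain/cpu0/domain0/cache_nice_tries
        /proc/sys/kernel/sched_domain/cpu0/domain0/flags
        /proc/sys/kernel/sched_domain/cpu1/domain0/min_interval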
5325 5335
5326 /* 5336 /*
5327 * migration_call - callback that gets triggered when a CPU is added. 5337 * migration_call - callback that gets triggered when a CPU is added.
5328 * Here we can start up the necessary migration thread for the new CPU. 5338 * Here we can start up the necessary migration thread for the new CPU.
5329 */ 5339 */
5330 static int __cpuinit 5340 static int __cpuinit
5331 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5341 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5332 { 5342 {
5333 struct task_struct *p; 5343 struct task_struct *p;
5334 int cpu = (long)hcpu; 5344 int cpu = (long)hcpu;
5335 unsigned long flags; 5345 unsigned long flags;
5336 struct rq *rq; 5346 struct rq *rq;
5337 5347
5338 switch (action) { 5348 switch (action) {
5339 case CPU_LOCK_ACQUIRE: 5349 case CPU_LOCK_ACQUIRE:
5340 mutex_lock(&sched_hotcpu_mutex); 5350 mutex_lock(&sched_hotcpu_mutex);
5341 break; 5351 break;
5342 5352
5343 case CPU_UP_PREPARE: 5353 case CPU_UP_PREPARE:
5344 case CPU_UP_PREPARE_FROZEN: 5354 case CPU_UP_PREPARE_FROZEN:
5345 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5355 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5346 if (IS_ERR(p)) 5356 if (IS_ERR(p))
5347 return NOTIFY_BAD; 5357 return NOTIFY_BAD;
5348 kthread_bind(p, cpu); 5358 kthread_bind(p, cpu);
5349 /* Must be high prio: stop_machine expects to yield to it. */ 5359 /* Must be high prio: stop_machine expects to yield to it. */
5350 rq = task_rq_lock(p, &flags); 5360 rq = task_rq_lock(p, &flags);
5351 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5361 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5352 task_rq_unlock(rq, &flags); 5362 task_rq_unlock(rq, &flags);
5353 cpu_rq(cpu)->migration_thread = p; 5363 cpu_rq(cpu)->migration_thread = p;
5354 break; 5364 break;
5355 5365
5356 case CPU_ONLINE: 5366 case CPU_ONLINE:
5357 case CPU_ONLINE_FROZEN: 5367 case CPU_ONLINE_FROZEN:
5358 /* Strictly unnecessary, as first user will wake it. */ 5368 /* Strictly unnecessary, as first user will wake it. */
5359 wake_up_process(cpu_rq(cpu)->migration_thread); 5369 wake_up_process(cpu_rq(cpu)->migration_thread);
5360 break; 5370 break;
5361 5371
5362 #ifdef CONFIG_HOTPLUG_CPU 5372 #ifdef CONFIG_HOTPLUG_CPU
5363 case CPU_UP_CANCELED: 5373 case CPU_UP_CANCELED:
5364 case CPU_UP_CANCELED_FROZEN: 5374 case CPU_UP_CANCELED_FROZEN:
5365 if (!cpu_rq(cpu)->migration_thread) 5375 if (!cpu_rq(cpu)->migration_thread)
5366 break; 5376 break;
5367 /* Unbind it from offline cpu so it can run. Fall thru. */ 5377 /* Unbind it from offline cpu so it can run. Fall thru. */
5368 kthread_bind(cpu_rq(cpu)->migration_thread, 5378 kthread_bind(cpu_rq(cpu)->migration_thread,
5369 any_online_cpu(cpu_online_map)); 5379 any_online_cpu(cpu_online_map));
5370 kthread_stop(cpu_rq(cpu)->migration_thread); 5380 kthread_stop(cpu_rq(cpu)->migration_thread);
5371 cpu_rq(cpu)->migration_thread = NULL; 5381 cpu_rq(cpu)->migration_thread = NULL;
5372 break; 5382 break;
5373 5383
5374 case CPU_DEAD: 5384 case CPU_DEAD:
5375 case CPU_DEAD_FROZEN: 5385 case CPU_DEAD_FROZEN:
5376 migrate_live_tasks(cpu); 5386 migrate_live_tasks(cpu);
5377 rq = cpu_rq(cpu); 5387 rq = cpu_rq(cpu);
5378 kthread_stop(rq->migration_thread); 5388 kthread_stop(rq->migration_thread);
5379 rq->migration_thread = NULL; 5389 rq->migration_thread = NULL;
5380 /* Idle task back to normal (off runqueue, low prio) */ 5390 /* Idle task back to normal (off runqueue, low prio) */
5381 rq = task_rq_lock(rq->idle, &flags); 5391 rq = task_rq_lock(rq->idle, &flags);
5382 deactivate_task(rq, rq->idle, 0); 5392 deactivate_task(rq, rq->idle, 0);
5383 rq->idle->static_prio = MAX_PRIO; 5393 rq->idle->static_prio = MAX_PRIO;
5384 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5394 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5385 rq->idle->sched_class = &idle_sched_class; 5395 rq->idle->sched_class = &idle_sched_class;
5386 migrate_dead_tasks(cpu); 5396 migrate_dead_tasks(cpu);
5387 task_rq_unlock(rq, &flags); 5397 task_rq_unlock(rq, &flags);
5388 migrate_nr_uninterruptible(rq); 5398 migrate_nr_uninterruptible(rq);
5389 BUG_ON(rq->nr_running != 0); 5399 BUG_ON(rq->nr_running != 0);
5390 5400
5391 /* No need to migrate the tasks: it was best-effort if 5401 /* No need to migrate the tasks: it was best-effort if
5392 * they didn't take sched_hotcpu_mutex. Just wake up 5402 * they didn't take sched_hotcpu_mutex. Just wake up
5393 * the requestors. */ 5403 * the requestors. */
5394 spin_lock_irq(&rq->lock); 5404 spin_lock_irq(&rq->lock);
5395 while (!list_empty(&rq->migration_queue)) { 5405 while (!list_empty(&rq->migration_queue)) {
5396 struct migration_req *req; 5406 struct migration_req *req;
5397 5407
5398 req = list_entry(rq->migration_queue.next, 5408 req = list_entry(rq->migration_queue.next,
5399 struct migration_req, list); 5409 struct migration_req, list);
5400 list_del_init(&req->list); 5410 list_del_init(&req->list);
5401 complete(&req->done); 5411 complete(&req->done);
5402 } 5412 }
5403 spin_unlock_irq(&rq->lock); 5413 spin_unlock_irq(&rq->lock);
5404 break; 5414 break;
5405 #endif 5415 #endif
5406 case CPU_LOCK_RELEASE: 5416 case CPU_LOCK_RELEASE:
5407 mutex_unlock(&sched_hotcpu_mutex); 5417 mutex_unlock(&sched_hotcpu_mutex);
5408 break; 5418 break;
5409 } 5419 }
5410 return NOTIFY_OK; 5420 return NOTIFY_OK;
5411 } 5421 }
5412 5422
5413 /* Register at highest priority so that task migration (migrate_all_tasks) 5423 /* Register at highest priority so that task migration (migrate_all_tasks)
5414 * happens before everything else. 5424 * happens before everything else.
5415 */ 5425 */
5416 static struct notifier_block __cpuinitdata migration_notifier = { 5426 static struct notifier_block __cpuinitdata migration_notifier = {
5417 .notifier_call = migration_call, 5427 .notifier_call = migration_call,
5418 .priority = 10 5428 .priority = 10
5419 }; 5429 };
5420 5430
5421 int __init migration_init(void) 5431 int __init migration_init(void)
5422 { 5432 {
5423 void *cpu = (void *)(long)smp_processor_id(); 5433 void *cpu = (void *)(long)smp_processor_id();
5424 int err; 5434 int err;
5425 5435
5426 /* Start one for the boot CPU: */ 5436 /* Start one for the boot CPU: */
5427 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5437 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5428 BUG_ON(err == NOTIFY_BAD); 5438 BUG_ON(err == NOTIFY_BAD);
5429 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5439 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5430 register_cpu_notifier(&migration_notifier); 5440 register_cpu_notifier(&migration_notifier);
5431 5441
5432 return 0; 5442 return 0;
5433 } 5443 }
5434 #endif 5444 #endif
5435 5445
5436 #ifdef CONFIG_SMP 5446 #ifdef CONFIG_SMP
5437 5447
5438 /* Number of possible processor ids */ 5448 /* Number of possible processor ids */
5439 int nr_cpu_ids __read_mostly = NR_CPUS; 5449 int nr_cpu_ids __read_mostly = NR_CPUS;
5440 EXPORT_SYMBOL(nr_cpu_ids); 5450 EXPORT_SYMBOL(nr_cpu_ids);
5441 5451
5442 #undef SCHED_DOMAIN_DEBUG 5452 #undef SCHED_DOMAIN_DEBUG
5443 #ifdef SCHED_DOMAIN_DEBUG 5453 #ifdef SCHED_DOMAIN_DEBUG
5444 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5454 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5445 { 5455 {
5446 int level = 0; 5456 int level = 0;
5447 5457
5448 if (!sd) { 5458 if (!sd) {
5449 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5459 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5450 return; 5460 return;
5451 } 5461 }
5452 5462
5453 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5463 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5454 5464
5455 do { 5465 do {
5456 int i; 5466 int i;
5457 char str[NR_CPUS]; 5467 char str[NR_CPUS];
5458 struct sched_group *group = sd->groups; 5468 struct sched_group *group = sd->groups;
5459 cpumask_t groupmask; 5469 cpumask_t groupmask;
5460 5470
5461 cpumask_scnprintf(str, NR_CPUS, sd->span); 5471 cpumask_scnprintf(str, NR_CPUS, sd->span);
5462 cpus_clear(groupmask); 5472 cpus_clear(groupmask);
5463 5473
5464 printk(KERN_DEBUG); 5474 printk(KERN_DEBUG);
5465 for (i = 0; i < level + 1; i++) 5475 for (i = 0; i < level + 1; i++)
5466 printk(" "); 5476 printk(" ");
5467 printk("domain %d: ", level); 5477 printk("domain %d: ", level);
5468 5478
5469 if (!(sd->flags & SD_LOAD_BALANCE)) { 5479 if (!(sd->flags & SD_LOAD_BALANCE)) {
5470 printk("does not load-balance\n"); 5480 printk("does not load-balance\n");
5471 if (sd->parent) 5481 if (sd->parent)
5472 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5482 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5473 " has parent"); 5483 " has parent");
5474 break; 5484 break;
5475 } 5485 }
5476 5486
5477 printk("span %s\n", str); 5487 printk("span %s\n", str);
5478 5488
5479 if (!cpu_isset(cpu, sd->span)) 5489 if (!cpu_isset(cpu, sd->span))
5480 printk(KERN_ERR "ERROR: domain->span does not contain " 5490 printk(KERN_ERR "ERROR: domain->span does not contain "
5481 "CPU%d\n", cpu); 5491 "CPU%d\n", cpu);
5482 if (!cpu_isset(cpu, group->cpumask)) 5492 if (!cpu_isset(cpu, group->cpumask))
5483 printk(KERN_ERR "ERROR: domain->groups does not contain" 5493 printk(KERN_ERR "ERROR: domain->groups does not contain"
5484 " CPU%d\n", cpu); 5494 " CPU%d\n", cpu);
5485 5495
5486 printk(KERN_DEBUG); 5496 printk(KERN_DEBUG);
5487 for (i = 0; i < level + 2; i++) 5497 for (i = 0; i < level + 2; i++)
5488 printk(" "); 5498 printk(" ");
5489 printk("groups:"); 5499 printk("groups:");
5490 do { 5500 do {
5491 if (!group) { 5501 if (!group) {
5492 printk("\n"); 5502 printk("\n");
5493 printk(KERN_ERR "ERROR: group is NULL\n"); 5503 printk(KERN_ERR "ERROR: group is NULL\n");
5494 break; 5504 break;
5495 } 5505 }
5496 5506
5497 if (!group->__cpu_power) { 5507 if (!group->__cpu_power) {
5498 printk("\n"); 5508 printk("\n");
5499 printk(KERN_ERR "ERROR: domain->cpu_power not " 5509 printk(KERN_ERR "ERROR: domain->cpu_power not "
5500 "set\n"); 5510 "set\n");
5501 } 5511 }
5502 5512
5503 if (!cpus_weight(group->cpumask)) { 5513 if (!cpus_weight(group->cpumask)) {
5504 printk("\n"); 5514 printk("\n");
5505 printk(KERN_ERR "ERROR: empty group\n"); 5515 printk(KERN_ERR "ERROR: empty group\n");
5506 } 5516 }
5507 5517
5508 if (cpus_intersects(groupmask, group->cpumask)) { 5518 if (cpus_intersects(groupmask, group->cpumask)) {
5509 printk("\n"); 5519 printk("\n");
5510 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5520 printk(KERN_ERR "ERROR: repeated CPUs\n");
5511 } 5521 }
5512 5522
5513 cpus_or(groupmask, groupmask, group->cpumask); 5523 cpus_or(groupmask, groupmask, group->cpumask);
5514 5524
5515 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5525 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5516 printk(" %s", str); 5526 printk(" %s", str);
5517 5527
5518 group = group->next; 5528 group = group->next;
5519 } while (group != sd->groups); 5529 } while (group != sd->groups);
5520 printk("\n"); 5530 printk("\n");
5521 5531
5522 if (!cpus_equal(sd->span, groupmask)) 5532 if (!cpus_equal(sd->span, groupmask))
5523 printk(KERN_ERR "ERROR: groups don't span " 5533 printk(KERN_ERR "ERROR: groups don't span "
5524 "domain->span\n"); 5534 "domain->span\n");
5525 5535
5526 level++; 5536 level++;
5527 sd = sd->parent; 5537 sd = sd->parent;
5528 if (!sd) 5538 if (!sd)
5529 continue; 5539 continue;
5530 5540
5531 if (!cpus_subset(groupmask, sd->span)) 5541 if (!cpus_subset(groupmask, sd->span))
5532 printk(KERN_ERR "ERROR: parent span is not a superset " 5542 printk(KERN_ERR "ERROR: parent span is not a superset "
5533 "of domain->span\n"); 5543 "of domain->span\n");
5534 5544
5535 } while (sd); 5545 } while (sd);
5536 } 5546 }
5537 #else 5547 #else
5538 # define sched_domain_debug(sd, cpu) do { } while (0) 5548 # define sched_domain_debug(sd, cpu) do { } while (0)
5539 #endif 5549 #endif
5540 5550
5541 static int sd_degenerate(struct sched_domain *sd) 5551 static int sd_degenerate(struct sched_domain *sd)
5542 { 5552 {
5543 if (cpus_weight(sd->span) == 1) 5553 if (cpus_weight(sd->span) == 1)
5544 return 1; 5554 return 1;
5545 5555
5546 /* Following flags need at least 2 groups */ 5556 /* Following flags need at least 2 groups */
5547 if (sd->flags & (SD_LOAD_BALANCE | 5557 if (sd->flags & (SD_LOAD_BALANCE |
5548 SD_BALANCE_NEWIDLE | 5558 SD_BALANCE_NEWIDLE |
5549 SD_BALANCE_FORK | 5559 SD_BALANCE_FORK |
5550 SD_BALANCE_EXEC | 5560 SD_BALANCE_EXEC |
5551 SD_SHARE_CPUPOWER | 5561 SD_SHARE_CPUPOWER |
5552 SD_SHARE_PKG_RESOURCES)) { 5562 SD_SHARE_PKG_RESOURCES)) {
5553 if (sd->groups != sd->groups->next) 5563 if (sd->groups != sd->groups->next)
5554 return 0; 5564 return 0;
5555 } 5565 }
5556 5566
5557 /* Following flags don't use groups */ 5567 /* Following flags don't use groups */
5558 if (sd->flags & (SD_WAKE_IDLE | 5568 if (sd->flags & (SD_WAKE_IDLE |
5559 SD_WAKE_AFFINE | 5569 SD_WAKE_AFFINE |
5560 SD_WAKE_BALANCE)) 5570 SD_WAKE_BALANCE))
5561 return 0; 5571 return 0;
5562 5572
5563 return 1; 5573 return 1;
5564 } 5574 }
5565 5575
5566 static int 5576 static int
5567 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5577 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5568 { 5578 {
5569 unsigned long cflags = sd->flags, pflags = parent->flags; 5579 unsigned long cflags = sd->flags, pflags = parent->flags;
5570 5580
5571 if (sd_degenerate(parent)) 5581 if (sd_degenerate(parent))
5572 return 1; 5582 return 1;
5573 5583
5574 if (!cpus_equal(sd->span, parent->span)) 5584 if (!cpus_equal(sd->span, parent->span))
5575 return 0; 5585 return 0;
5576 5586
5577 /* Does parent contain flags not in child? */ 5587 /* Does parent contain flags not in child? */
5578 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 5588 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5579 if (cflags & SD_WAKE_AFFINE) 5589 if (cflags & SD_WAKE_AFFINE)
5580 pflags &= ~SD_WAKE_BALANCE; 5590 pflags &= ~SD_WAKE_BALANCE;
5581 /* Flags needing groups don't count if only 1 group in parent */ 5591 /* Flags needing groups don't count if only 1 group in parent */
5582 if (parent->groups == parent->groups->next) { 5592 if (parent->groups == parent->groups->next) {
5583 pflags &= ~(SD_LOAD_BALANCE | 5593 pflags &= ~(SD_LOAD_BALANCE |
5584 SD_BALANCE_NEWIDLE | 5594 SD_BALANCE_NEWIDLE |
5585 SD_BALANCE_FORK | 5595 SD_BALANCE_FORK |
5586 SD_BALANCE_EXEC | 5596 SD_BALANCE_EXEC |
5587 SD_SHARE_CPUPOWER | 5597 SD_SHARE_CPUPOWER |
5588 SD_SHARE_PKG_RESOURCES); 5598 SD_SHARE_PKG_RESOURCES);
5589 } 5599 }
5590 if (~cflags & pflags) 5600 if (~cflags & pflags)
5591 return 0; 5601 return 0;
5592 5602
5593 return 1; 5603 return 1;
5594 } 5604 }
5595 5605
5596 /* 5606 /*
5597 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5607 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5598 * hold the hotplug lock. 5608 * hold the hotplug lock.
5599 */ 5609 */
5600 static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5610 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5601 { 5611 {
5602 struct rq *rq = cpu_rq(cpu); 5612 struct rq *rq = cpu_rq(cpu);
5603 struct sched_domain *tmp; 5613 struct sched_domain *tmp;
5604 5614
5605 /* Remove the sched domains which do not contribute to scheduling. */ 5615 /* Remove the sched domains which do not contribute to scheduling. */
5606 for (tmp = sd; tmp; tmp = tmp->parent) { 5616 for (tmp = sd; tmp; tmp = tmp->parent) {
5607 struct sched_domain *parent = tmp->parent; 5617 struct sched_domain *parent = tmp->parent;
5608 if (!parent) 5618 if (!parent)
5609 break; 5619 break;
5610 if (sd_parent_degenerate(tmp, parent)) { 5620 if (sd_parent_degenerate(tmp, parent)) {
5611 tmp->parent = parent->parent; 5621 tmp->parent = parent->parent;
5612 if (parent->parent) 5622 if (parent->parent)
5613 parent->parent->child = tmp; 5623 parent->parent->child = tmp;
5614 } 5624 }
5615 } 5625 }
5616 5626
5617 if (sd && sd_degenerate(sd)) { 5627 if (sd && sd_degenerate(sd)) {
5618 sd = sd->parent; 5628 sd = sd->parent;
5619 if (sd) 5629 if (sd)
5620 sd->child = NULL; 5630 sd->child = NULL;
5621 } 5631 }
5622 5632
5623 sched_domain_debug(sd, cpu); 5633 sched_domain_debug(sd, cpu);
5624 5634
5625 rcu_assign_pointer(rq->sd, sd); 5635 rcu_assign_pointer(rq->sd, sd);
5626 } 5636 }
5627 5637
5628 /* cpus with isolated domains */ 5638 /* cpus with isolated domains */
5629 static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 5639 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5630 5640
5631 /* Setup the mask of cpus configured for isolated domains */ 5641 /* Setup the mask of cpus configured for isolated domains */
5632 static int __init isolated_cpu_setup(char *str) 5642 static int __init isolated_cpu_setup(char *str)
5633 { 5643 {
5634 int ints[NR_CPUS], i; 5644 int ints[NR_CPUS], i;
5635 5645
5636 str = get_options(str, ARRAY_SIZE(ints), ints); 5646 str = get_options(str, ARRAY_SIZE(ints), ints);
5637 cpus_clear(cpu_isolated_map); 5647 cpus_clear(cpu_isolated_map);
5638 for (i = 1; i <= ints[0]; i++) 5648 for (i = 1; i <= ints[0]; i++)
5639 if (ints[i] < NR_CPUS) 5649 if (ints[i] < NR_CPUS)
5640 cpu_set(ints[i], cpu_isolated_map); 5650 cpu_set(ints[i], cpu_isolated_map);
5641 return 1; 5651 return 1;
5642 } 5652 }
5643 5653
5644 __setup ("isolcpus=", isolated_cpu_setup); 5654 __setup ("isolcpus=", isolated_cpu_setup);
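
For illustration only (not part of this patch): the sketch below models, in plain user-space C, what booting with e.g. isolcpus=2,3 ends up doing above — isolated_cpu_setup() parses the comma-separated list and sets the corresponding bits in cpu_isolated_map, which arch_init_sched_domains() later subtracts from the CPUs it builds domains for. The helper name parse_isolcpus() and MAX_CPUS are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_CPUS 8	/* assumption for the example */

/* Rough user-space model of isolated_cpu_setup(): mark each listed CPU. */
static void parse_isolcpus(const char *arg, int isolated[MAX_CPUS])
{
	char buf[64], *tok, *save;

	memset(isolated, 0, MAX_CPUS * sizeof(int));
	strncpy(buf, arg, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	for (tok = strtok_r(buf, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
		int cpu = atoi(tok);

		if (cpu >= 0 && cpu < MAX_CPUS)
			isolated[cpu] = 1;	/* ~ cpu_set(cpu, cpu_isolated_map) */
	}
}

int main(void)
{
	int isolated[MAX_CPUS], i;

	parse_isolcpus("2,3", isolated);	/* as if booted with isolcpus=2,3 */
	for (i = 0; i < MAX_CPUS; i++)
		if (isolated[i])
			printf("cpu %d kept out of the sched domains\n", i);
	return 0;
}
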
5645 5655
5646 /* 5656 /*
5647 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5657 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5648 * to a function which identifies what group (along with sched group) a CPU 5658 * to a function which identifies what group (along with sched group) a CPU
5649 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 5659 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5650 * (due to the fact that we keep track of groups covered with a cpumask_t). 5660 * (due to the fact that we keep track of groups covered with a cpumask_t).
5651 * 5661 *
5652 * init_sched_build_groups will build a circular linked list of the groups 5662 * init_sched_build_groups will build a circular linked list of the groups
5653 * covered by the given span, and will set each group's ->cpumask correctly, 5663 * covered by the given span, and will set each group's ->cpumask correctly,
5654 * and ->cpu_power to 0. 5664 * and ->cpu_power to 0.
5655 */ 5665 */
5656 static void 5666 static void
5657 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 5667 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5658 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 5668 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5659 struct sched_group **sg)) 5669 struct sched_group **sg))
5660 { 5670 {
5661 struct sched_group *first = NULL, *last = NULL; 5671 struct sched_group *first = NULL, *last = NULL;
5662 cpumask_t covered = CPU_MASK_NONE; 5672 cpumask_t covered = CPU_MASK_NONE;
5663 int i; 5673 int i;
5664 5674
5665 for_each_cpu_mask(i, span) { 5675 for_each_cpu_mask(i, span) {
5666 struct sched_group *sg; 5676 struct sched_group *sg;
5667 int group = group_fn(i, cpu_map, &sg); 5677 int group = group_fn(i, cpu_map, &sg);
5668 int j; 5678 int j;
5669 5679
5670 if (cpu_isset(i, covered)) 5680 if (cpu_isset(i, covered))
5671 continue; 5681 continue;
5672 5682
5673 sg->cpumask = CPU_MASK_NONE; 5683 sg->cpumask = CPU_MASK_NONE;
5674 sg->__cpu_power = 0; 5684 sg->__cpu_power = 0;
5675 5685
5676 for_each_cpu_mask(j, span) { 5686 for_each_cpu_mask(j, span) {
5677 if (group_fn(j, cpu_map, NULL) != group) 5687 if (group_fn(j, cpu_map, NULL) != group)
5678 continue; 5688 continue;
5679 5689
5680 cpu_set(j, covered); 5690 cpu_set(j, covered);
5681 cpu_set(j, sg->cpumask); 5691 cpu_set(j, sg->cpumask);
5682 } 5692 }
5683 if (!first) 5693 if (!first)
5684 first = sg; 5694 first = sg;
5685 if (last) 5695 if (last)
5686 last->next = sg; 5696 last->next = sg;
5687 last = sg; 5697 last = sg;
5688 } 5698 }
5689 last->next = first; 5699 last->next = first;
5690 } 5700 }
5691 5701
5692 #define SD_NODES_PER_DOMAIN 16 5702 #define SD_NODES_PER_DOMAIN 16
5693 5703
5694 #ifdef CONFIG_NUMA 5704 #ifdef CONFIG_NUMA
5695 5705
5696 /** 5706 /**
5697 * find_next_best_node - find the next node to include in a sched_domain 5707 * find_next_best_node - find the next node to include in a sched_domain
5698 * @node: node whose sched_domain we're building 5708 * @node: node whose sched_domain we're building
5699 * @used_nodes: nodes already in the sched_domain 5709 * @used_nodes: nodes already in the sched_domain
5700 * 5710 *
5701 * Find the next node to include in a given scheduling domain. Simply 5711 * Find the next node to include in a given scheduling domain. Simply
5702 * finds the closest node not already in the @used_nodes map. 5712 * finds the closest node not already in the @used_nodes map.
5703 * 5713 *
5704 * Should use nodemask_t. 5714 * Should use nodemask_t.
5705 */ 5715 */
5706 static int find_next_best_node(int node, unsigned long *used_nodes) 5716 static int find_next_best_node(int node, unsigned long *used_nodes)
5707 { 5717 {
5708 int i, n, val, min_val, best_node = 0; 5718 int i, n, val, min_val, best_node = 0;
5709 5719
5710 min_val = INT_MAX; 5720 min_val = INT_MAX;
5711 5721
5712 for (i = 0; i < MAX_NUMNODES; i++) { 5722 for (i = 0; i < MAX_NUMNODES; i++) {
5713 /* Start at @node */ 5723 /* Start at @node */
5714 n = (node + i) % MAX_NUMNODES; 5724 n = (node + i) % MAX_NUMNODES;
5715 5725
5716 if (!nr_cpus_node(n)) 5726 if (!nr_cpus_node(n))
5717 continue; 5727 continue;
5718 5728
5719 /* Skip already used nodes */ 5729 /* Skip already used nodes */
5720 if (test_bit(n, used_nodes)) 5730 if (test_bit(n, used_nodes))
5721 continue; 5731 continue;
5722 5732
5723 /* Simple min distance search */ 5733 /* Simple min distance search */
5724 val = node_distance(node, n); 5734 val = node_distance(node, n);
5725 5735
5726 if (val < min_val) { 5736 if (val < min_val) {
5727 min_val = val; 5737 min_val = val;
5728 best_node = n; 5738 best_node = n;
5729 } 5739 }
5730 } 5740 }
5731 5741
5732 set_bit(best_node, used_nodes); 5742 set_bit(best_node, used_nodes);
5733 return best_node; 5743 return best_node;
5734 } 5744 }
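
A toy, user-space illustration of the closest-unused-node selection performed by find_next_best_node() above. Assumptions: a made-up 4-node distance table; the real code also skips CPU-less nodes and scans starting at @node, which this sketch omits.

#include <stdio.h>
#include <limits.h>

#define NODES 4	/* made-up topology for the example */

static const int distance[NODES][NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Simplified analogue of find_next_best_node(): pick the closest node to
 * @node that is not yet in @used, then mark it used. */
static int next_best_node(int node, int used[NODES])
{
	int n, best = -1, min_val = INT_MAX;

	for (n = 0; n < NODES; n++) {
		if (used[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NODES] = { 1, 0, 0, 0 };	/* node 0 already in the span */

	/* prints 1 (distance 20), then 2 (distance 40, lowest index wins) */
	printf("next node: %d\n", next_best_node(0, used));
	printf("next node: %d\n", next_best_node(0, used));
	return 0;
}
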
5735 5745
5736 /** 5746 /**
5737 * sched_domain_node_span - get a cpumask for a node's sched_domain 5747 * sched_domain_node_span - get a cpumask for a node's sched_domain
5738 * @node: node whose cpumask we're constructing 5748 * @node: node whose cpumask we're constructing
5739 * @size: number of nodes to include in this span 5749 * @size: number of nodes to include in this span
5740 * 5750 *
5741 * Given a node, construct a good cpumask for its sched_domain to span. It 5751 * Given a node, construct a good cpumask for its sched_domain to span. It
5742 * should be one that prevents unnecessary balancing, but also spreads tasks 5752 * should be one that prevents unnecessary balancing, but also spreads tasks
5743 * out optimally. 5753 * out optimally.
5744 */ 5754 */
5745 static cpumask_t sched_domain_node_span(int node) 5755 static cpumask_t sched_domain_node_span(int node)
5746 { 5756 {
5747 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 5757 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5748 cpumask_t span, nodemask; 5758 cpumask_t span, nodemask;
5749 int i; 5759 int i;
5750 5760
5751 cpus_clear(span); 5761 cpus_clear(span);
5752 bitmap_zero(used_nodes, MAX_NUMNODES); 5762 bitmap_zero(used_nodes, MAX_NUMNODES);
5753 5763
5754 nodemask = node_to_cpumask(node); 5764 nodemask = node_to_cpumask(node);
5755 cpus_or(span, span, nodemask); 5765 cpus_or(span, span, nodemask);
5756 set_bit(node, used_nodes); 5766 set_bit(node, used_nodes);
5757 5767
5758 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5768 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5759 int next_node = find_next_best_node(node, used_nodes); 5769 int next_node = find_next_best_node(node, used_nodes);
5760 5770
5761 nodemask = node_to_cpumask(next_node); 5771 nodemask = node_to_cpumask(next_node);
5762 cpus_or(span, span, nodemask); 5772 cpus_or(span, span, nodemask);
5763 } 5773 }
5764 5774
5765 return span; 5775 return span;
5766 } 5776 }
5767 #endif 5777 #endif
5768 5778
5769 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 5779 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5770 5780
5771 /* 5781 /*
5772 * SMT sched-domains: 5782 * SMT sched-domains:
5773 */ 5783 */
5774 #ifdef CONFIG_SCHED_SMT 5784 #ifdef CONFIG_SCHED_SMT
5775 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5785 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5776 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 5786 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5777 5787
5778 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, 5788 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5779 struct sched_group **sg) 5789 struct sched_group **sg)
5780 { 5790 {
5781 if (sg) 5791 if (sg)
5782 *sg = &per_cpu(sched_group_cpus, cpu); 5792 *sg = &per_cpu(sched_group_cpus, cpu);
5783 return cpu; 5793 return cpu;
5784 } 5794 }
5785 #endif 5795 #endif
5786 5796
5787 /* 5797 /*
5788 * multi-core sched-domains: 5798 * multi-core sched-domains:
5789 */ 5799 */
5790 #ifdef CONFIG_SCHED_MC 5800 #ifdef CONFIG_SCHED_MC
5791 static DEFINE_PER_CPU(struct sched_domain, core_domains); 5801 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5792 static DEFINE_PER_CPU(struct sched_group, sched_group_core); 5802 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5793 #endif 5803 #endif
5794 5804
5795 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 5805 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5796 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 5806 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5797 struct sched_group **sg) 5807 struct sched_group **sg)
5798 { 5808 {
5799 int group; 5809 int group;
5800 cpumask_t mask = cpu_sibling_map[cpu]; 5810 cpumask_t mask = cpu_sibling_map[cpu];
5801 cpus_and(mask, mask, *cpu_map); 5811 cpus_and(mask, mask, *cpu_map);
5802 group = first_cpu(mask); 5812 group = first_cpu(mask);
5803 if (sg) 5813 if (sg)
5804 *sg = &per_cpu(sched_group_core, group); 5814 *sg = &per_cpu(sched_group_core, group);
5805 return group; 5815 return group;
5806 } 5816 }
5807 #elif defined(CONFIG_SCHED_MC) 5817 #elif defined(CONFIG_SCHED_MC)
5808 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 5818 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5809 struct sched_group **sg) 5819 struct sched_group **sg)
5810 { 5820 {
5811 if (sg) 5821 if (sg)
5812 *sg = &per_cpu(sched_group_core, cpu); 5822 *sg = &per_cpu(sched_group_core, cpu);
5813 return cpu; 5823 return cpu;
5814 } 5824 }
5815 #endif 5825 #endif
5816 5826
5817 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5827 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5818 static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 5828 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5819 5829
5820 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, 5830 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5821 struct sched_group **sg) 5831 struct sched_group **sg)
5822 { 5832 {
5823 int group; 5833 int group;
5824 #ifdef CONFIG_SCHED_MC 5834 #ifdef CONFIG_SCHED_MC
5825 cpumask_t mask = cpu_coregroup_map(cpu); 5835 cpumask_t mask = cpu_coregroup_map(cpu);
5826 cpus_and(mask, mask, *cpu_map); 5836 cpus_and(mask, mask, *cpu_map);
5827 group = first_cpu(mask); 5837 group = first_cpu(mask);
5828 #elif defined(CONFIG_SCHED_SMT) 5838 #elif defined(CONFIG_SCHED_SMT)
5829 cpumask_t mask = cpu_sibling_map[cpu]; 5839 cpumask_t mask = cpu_sibling_map[cpu];
5830 cpus_and(mask, mask, *cpu_map); 5840 cpus_and(mask, mask, *cpu_map);
5831 group = first_cpu(mask); 5841 group = first_cpu(mask);
5832 #else 5842 #else
5833 group = cpu; 5843 group = cpu;
5834 #endif 5844 #endif
5835 if (sg) 5845 if (sg)
5836 *sg = &per_cpu(sched_group_phys, group); 5846 *sg = &per_cpu(sched_group_phys, group);
5837 return group; 5847 return group;
5838 } 5848 }
5839 5849
5840 #ifdef CONFIG_NUMA 5850 #ifdef CONFIG_NUMA
5841 /* 5851 /*
5842 * The init_sched_build_groups can't handle what we want to do with node 5852 * The init_sched_build_groups can't handle what we want to do with node
5843 * groups, so roll our own. Now each node has its own list of groups which 5853 * groups, so roll our own. Now each node has its own list of groups which
5844 * gets dynamically allocated. 5854 * gets dynamically allocated.
5845 */ 5855 */
5846 static DEFINE_PER_CPU(struct sched_domain, node_domains); 5856 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5847 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 5857 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5848 5858
5849 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 5859 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5850 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 5860 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5851 5861
5852 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 5862 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5853 struct sched_group **sg) 5863 struct sched_group **sg)
5854 { 5864 {
5855 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); 5865 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5856 int group; 5866 int group;
5857 5867
5858 cpus_and(nodemask, nodemask, *cpu_map); 5868 cpus_and(nodemask, nodemask, *cpu_map);
5859 group = first_cpu(nodemask); 5869 group = first_cpu(nodemask);
5860 5870
5861 if (sg) 5871 if (sg)
5862 *sg = &per_cpu(sched_group_allnodes, group); 5872 *sg = &per_cpu(sched_group_allnodes, group);
5863 return group; 5873 return group;
5864 } 5874 }
5865 5875
5866 static void init_numa_sched_groups_power(struct sched_group *group_head) 5876 static void init_numa_sched_groups_power(struct sched_group *group_head)
5867 { 5877 {
5868 struct sched_group *sg = group_head; 5878 struct sched_group *sg = group_head;
5869 int j; 5879 int j;
5870 5880
5871 if (!sg) 5881 if (!sg)
5872 return; 5882 return;
5873 next_sg: 5883 next_sg:
5874 for_each_cpu_mask(j, sg->cpumask) { 5884 for_each_cpu_mask(j, sg->cpumask) {
5875 struct sched_domain *sd; 5885 struct sched_domain *sd;
5876 5886
5877 sd = &per_cpu(phys_domains, j); 5887 sd = &per_cpu(phys_domains, j);
5878 if (j != first_cpu(sd->groups->cpumask)) { 5888 if (j != first_cpu(sd->groups->cpumask)) {
5879 /* 5889 /*
5880 * Only add "power" once for each 5890 * Only add "power" once for each
5881 * physical package. 5891 * physical package.
5882 */ 5892 */
5883 continue; 5893 continue;
5884 } 5894 }
5885 5895
5886 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 5896 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5887 } 5897 }
5888 sg = sg->next; 5898 sg = sg->next;
5889 if (sg != group_head) 5899 if (sg != group_head)
5890 goto next_sg; 5900 goto next_sg;
5891 } 5901 }
5892 #endif 5902 #endif
5893 5903
5894 #ifdef CONFIG_NUMA 5904 #ifdef CONFIG_NUMA
5895 /* Free memory allocated for various sched_group structures */ 5905 /* Free memory allocated for various sched_group structures */
5896 static void free_sched_groups(const cpumask_t *cpu_map) 5906 static void free_sched_groups(const cpumask_t *cpu_map)
5897 { 5907 {
5898 int cpu, i; 5908 int cpu, i;
5899 5909
5900 for_each_cpu_mask(cpu, *cpu_map) { 5910 for_each_cpu_mask(cpu, *cpu_map) {
5901 struct sched_group **sched_group_nodes 5911 struct sched_group **sched_group_nodes
5902 = sched_group_nodes_bycpu[cpu]; 5912 = sched_group_nodes_bycpu[cpu];
5903 5913
5904 if (!sched_group_nodes) 5914 if (!sched_group_nodes)
5905 continue; 5915 continue;
5906 5916
5907 for (i = 0; i < MAX_NUMNODES; i++) { 5917 for (i = 0; i < MAX_NUMNODES; i++) {
5908 cpumask_t nodemask = node_to_cpumask(i); 5918 cpumask_t nodemask = node_to_cpumask(i);
5909 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 5919 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5910 5920
5911 cpus_and(nodemask, nodemask, *cpu_map); 5921 cpus_and(nodemask, nodemask, *cpu_map);
5912 if (cpus_empty(nodemask)) 5922 if (cpus_empty(nodemask))
5913 continue; 5923 continue;
5914 5924
5915 if (sg == NULL) 5925 if (sg == NULL)
5916 continue; 5926 continue;
5917 sg = sg->next; 5927 sg = sg->next;
5918 next_sg: 5928 next_sg:
5919 oldsg = sg; 5929 oldsg = sg;
5920 sg = sg->next; 5930 sg = sg->next;
5921 kfree(oldsg); 5931 kfree(oldsg);
5922 if (oldsg != sched_group_nodes[i]) 5932 if (oldsg != sched_group_nodes[i])
5923 goto next_sg; 5933 goto next_sg;
5924 } 5934 }
5925 kfree(sched_group_nodes); 5935 kfree(sched_group_nodes);
5926 sched_group_nodes_bycpu[cpu] = NULL; 5936 sched_group_nodes_bycpu[cpu] = NULL;
5927 } 5937 }
5928 } 5938 }
5929 #else 5939 #else
5930 static void free_sched_groups(const cpumask_t *cpu_map) 5940 static void free_sched_groups(const cpumask_t *cpu_map)
5931 { 5941 {
5932 } 5942 }
5933 #endif 5943 #endif
5934 5944
5935 /* 5945 /*
5936 * Initialize sched groups cpu_power. 5946 * Initialize sched groups cpu_power.
5937 * 5947 *
5938 * cpu_power indicates the capacity of a sched group, which is used while 5948 * cpu_power indicates the capacity of a sched group, which is used while
5939 * distributing the load between different sched groups in a sched domain. 5949 * distributing the load between different sched groups in a sched domain.
5940 * Typically cpu_power for all the groups in a sched domain will be the same 5950 * Typically cpu_power for all the groups in a sched domain will be the same
5941 * unless there are asymmetries in the topology. If there are asymmetries, the 5951 * unless there are asymmetries in the topology. If there are asymmetries, the
5942 * group having more cpu_power will pick up more load than the group having 5952 * group having more cpu_power will pick up more load than the group having
5943 * less cpu_power. 5953 * less cpu_power.
5944 * 5954 *
5945 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents 5955 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5946 * the maximum number of tasks a group can handle in the presence of other idle 5956 * the maximum number of tasks a group can handle in the presence of other idle
5947 * or lightly loaded groups in the same sched domain. 5957 * or lightly loaded groups in the same sched domain.
5948 */ 5958 */
5949 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5959 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5950 { 5960 {
5951 struct sched_domain *child; 5961 struct sched_domain *child;
5952 struct sched_group *group; 5962 struct sched_group *group;
5953 5963
5954 WARN_ON(!sd || !sd->groups); 5964 WARN_ON(!sd || !sd->groups);
5955 5965
5956 if (cpu != first_cpu(sd->groups->cpumask)) 5966 if (cpu != first_cpu(sd->groups->cpumask))
5957 return; 5967 return;
5958 5968
5959 child = sd->child; 5969 child = sd->child;
5960 5970
5961 sd->groups->__cpu_power = 0; 5971 sd->groups->__cpu_power = 0;
5962 5972
5963 /* 5973 /*
5964 * For perf policy, if the groups in the child domain share resources 5974 * For perf policy, if the groups in the child domain share resources
5965 * (for example cores sharing some portions of the cache hierarchy 5975 * (for example cores sharing some portions of the cache hierarchy
5966 * or SMT), then set this domain's groups' cpu_power such that each group 5976 * or SMT), then set this domain's groups' cpu_power such that each group
5967 * can handle only one task, when there are other idle groups in the 5977 * can handle only one task, when there are other idle groups in the
5968 * same sched domain. 5978 * same sched domain.
5969 */ 5979 */
5970 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 5980 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5971 (child->flags & 5981 (child->flags &
5972 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 5982 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5973 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 5983 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5974 return; 5984 return;
5975 } 5985 }
5976 5986
5977 /* 5987 /*
5978 * add the cpu_power of each child group to this group's cpu_power 5988 * add the cpu_power of each child group to this group's cpu_power
5979 */ 5989 */
5980 group = child->groups; 5990 group = child->groups;
5981 do { 5991 do {
5982 sg_inc_cpu_power(sd->groups, group->__cpu_power); 5992 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5983 group = group->next; 5993 group = group->next;
5984 } while (group != child->groups); 5994 } while (group != child->groups);
5985 } 5995 }
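
As a rough worked example of the accumulation branch above (groups that do not share resources): assuming SCHED_LOAD_SCALE is 1024, a parent domain whose child has two groups of power 1024 each ends up with __cpu_power = 2048, i.e. roughly two tasks' worth of capacity. A minimal stand-alone sketch; the SCHED_LOAD_SCALE value and two-group topology are assumptions for the example only.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* assumed value; see sched.h for the real one */

int main(void)
{
	unsigned long child_power[] = { SCHED_LOAD_SCALE, SCHED_LOAD_SCALE };
	unsigned long parent_power = 0;
	unsigned int i;

	/* mirrors the do { sg_inc_cpu_power(...) } while () loop above */
	for (i = 0; i < sizeof(child_power) / sizeof(child_power[0]); i++)
		parent_power += child_power[i];

	printf("parent __cpu_power = %lu (~%lu tasks of capacity)\n",
	       parent_power, parent_power / SCHED_LOAD_SCALE);
	return 0;
}
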
5986 5996
5987 /* 5997 /*
5988 * Build sched domains for a given set of cpus and attach the sched domains 5998 * Build sched domains for a given set of cpus and attach the sched domains
5989 * to the individual cpus 5999 * to the individual cpus
5990 */ 6000 */
5991 static int build_sched_domains(const cpumask_t *cpu_map) 6001 static int build_sched_domains(const cpumask_t *cpu_map)
5992 { 6002 {
5993 int i; 6003 int i;
5994 #ifdef CONFIG_NUMA 6004 #ifdef CONFIG_NUMA
5995 struct sched_group **sched_group_nodes = NULL; 6005 struct sched_group **sched_group_nodes = NULL;
5996 int sd_allnodes = 0; 6006 int sd_allnodes = 0;
5997 6007
5998 /* 6008 /*
5999 * Allocate the per-node list of sched groups 6009 * Allocate the per-node list of sched groups
6000 */ 6010 */
6001 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, 6011 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6002 GFP_KERNEL); 6012 GFP_KERNEL);
6003 if (!sched_group_nodes) { 6013 if (!sched_group_nodes) {
6004 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6014 printk(KERN_WARNING "Can not alloc sched group node list\n");
6005 return -ENOMEM; 6015 return -ENOMEM;
6006 } 6016 }
6007 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6017 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6008 #endif 6018 #endif
6009 6019
6010 /* 6020 /*
6011 * Set up domains for cpus specified by the cpu_map. 6021 * Set up domains for cpus specified by the cpu_map.
6012 */ 6022 */
6013 for_each_cpu_mask(i, *cpu_map) { 6023 for_each_cpu_mask(i, *cpu_map) {
6014 struct sched_domain *sd = NULL, *p; 6024 struct sched_domain *sd = NULL, *p;
6015 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6025 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6016 6026
6017 cpus_and(nodemask, nodemask, *cpu_map); 6027 cpus_and(nodemask, nodemask, *cpu_map);
6018 6028
6019 #ifdef CONFIG_NUMA 6029 #ifdef CONFIG_NUMA
6020 if (cpus_weight(*cpu_map) > 6030 if (cpus_weight(*cpu_map) >
6021 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6031 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6022 sd = &per_cpu(allnodes_domains, i); 6032 sd = &per_cpu(allnodes_domains, i);
6023 *sd = SD_ALLNODES_INIT; 6033 *sd = SD_ALLNODES_INIT;
6024 sd->span = *cpu_map; 6034 sd->span = *cpu_map;
6025 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 6035 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6026 p = sd; 6036 p = sd;
6027 sd_allnodes = 1; 6037 sd_allnodes = 1;
6028 } else 6038 } else
6029 p = NULL; 6039 p = NULL;
6030 6040
6031 sd = &per_cpu(node_domains, i); 6041 sd = &per_cpu(node_domains, i);
6032 *sd = SD_NODE_INIT; 6042 *sd = SD_NODE_INIT;
6033 sd->span = sched_domain_node_span(cpu_to_node(i)); 6043 sd->span = sched_domain_node_span(cpu_to_node(i));
6034 sd->parent = p; 6044 sd->parent = p;
6035 if (p) 6045 if (p)
6036 p->child = sd; 6046 p->child = sd;
6037 cpus_and(sd->span, sd->span, *cpu_map); 6047 cpus_and(sd->span, sd->span, *cpu_map);
6038 #endif 6048 #endif
6039 6049
6040 p = sd; 6050 p = sd;
6041 sd = &per_cpu(phys_domains, i); 6051 sd = &per_cpu(phys_domains, i);
6042 *sd = SD_CPU_INIT; 6052 *sd = SD_CPU_INIT;
6043 sd->span = nodemask; 6053 sd->span = nodemask;
6044 sd->parent = p; 6054 sd->parent = p;
6045 if (p) 6055 if (p)
6046 p->child = sd; 6056 p->child = sd;
6047 cpu_to_phys_group(i, cpu_map, &sd->groups); 6057 cpu_to_phys_group(i, cpu_map, &sd->groups);
6048 6058
6049 #ifdef CONFIG_SCHED_MC 6059 #ifdef CONFIG_SCHED_MC
6050 p = sd; 6060 p = sd;
6051 sd = &per_cpu(core_domains, i); 6061 sd = &per_cpu(core_domains, i);
6052 *sd = SD_MC_INIT; 6062 *sd = SD_MC_INIT;
6053 sd->span = cpu_coregroup_map(i); 6063 sd->span = cpu_coregroup_map(i);
6054 cpus_and(sd->span, sd->span, *cpu_map); 6064 cpus_and(sd->span, sd->span, *cpu_map);
6055 sd->parent = p; 6065 sd->parent = p;
6056 p->child = sd; 6066 p->child = sd;
6057 cpu_to_core_group(i, cpu_map, &sd->groups); 6067 cpu_to_core_group(i, cpu_map, &sd->groups);
6058 #endif 6068 #endif
6059 6069
6060 #ifdef CONFIG_SCHED_SMT 6070 #ifdef CONFIG_SCHED_SMT
6061 p = sd; 6071 p = sd;
6062 sd = &per_cpu(cpu_domains, i); 6072 sd = &per_cpu(cpu_domains, i);
6063 *sd = SD_SIBLING_INIT; 6073 *sd = SD_SIBLING_INIT;
6064 sd->span = cpu_sibling_map[i]; 6074 sd->span = cpu_sibling_map[i];
6065 cpus_and(sd->span, sd->span, *cpu_map); 6075 cpus_and(sd->span, sd->span, *cpu_map);
6066 sd->parent = p; 6076 sd->parent = p;
6067 p->child = sd; 6077 p->child = sd;
6068 cpu_to_cpu_group(i, cpu_map, &sd->groups); 6078 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6069 #endif 6079 #endif
6070 } 6080 }
6071 6081
6072 #ifdef CONFIG_SCHED_SMT 6082 #ifdef CONFIG_SCHED_SMT
6073 /* Set up CPU (sibling) groups */ 6083 /* Set up CPU (sibling) groups */
6074 for_each_cpu_mask(i, *cpu_map) { 6084 for_each_cpu_mask(i, *cpu_map) {
6075 cpumask_t this_sibling_map = cpu_sibling_map[i]; 6085 cpumask_t this_sibling_map = cpu_sibling_map[i];
6076 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 6086 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6077 if (i != first_cpu(this_sibling_map)) 6087 if (i != first_cpu(this_sibling_map))
6078 continue; 6088 continue;
6079 6089
6080 init_sched_build_groups(this_sibling_map, cpu_map, 6090 init_sched_build_groups(this_sibling_map, cpu_map,
6081 &cpu_to_cpu_group); 6091 &cpu_to_cpu_group);
6082 } 6092 }
6083 #endif 6093 #endif
6084 6094
6085 #ifdef CONFIG_SCHED_MC 6095 #ifdef CONFIG_SCHED_MC
6086 /* Set up multi-core groups */ 6096 /* Set up multi-core groups */
6087 for_each_cpu_mask(i, *cpu_map) { 6097 for_each_cpu_mask(i, *cpu_map) {
6088 cpumask_t this_core_map = cpu_coregroup_map(i); 6098 cpumask_t this_core_map = cpu_coregroup_map(i);
6089 cpus_and(this_core_map, this_core_map, *cpu_map); 6099 cpus_and(this_core_map, this_core_map, *cpu_map);
6090 if (i != first_cpu(this_core_map)) 6100 if (i != first_cpu(this_core_map))
6091 continue; 6101 continue;
6092 init_sched_build_groups(this_core_map, cpu_map, 6102 init_sched_build_groups(this_core_map, cpu_map,
6093 &cpu_to_core_group); 6103 &cpu_to_core_group);
6094 } 6104 }
6095 #endif 6105 #endif
6096 6106
6097 /* Set up physical groups */ 6107 /* Set up physical groups */
6098 for (i = 0; i < MAX_NUMNODES; i++) { 6108 for (i = 0; i < MAX_NUMNODES; i++) {
6099 cpumask_t nodemask = node_to_cpumask(i); 6109 cpumask_t nodemask = node_to_cpumask(i);
6100 6110
6101 cpus_and(nodemask, nodemask, *cpu_map); 6111 cpus_and(nodemask, nodemask, *cpu_map);
6102 if (cpus_empty(nodemask)) 6112 if (cpus_empty(nodemask))
6103 continue; 6113 continue;
6104 6114
6105 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 6115 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6106 } 6116 }
6107 6117
6108 #ifdef CONFIG_NUMA 6118 #ifdef CONFIG_NUMA
6109 /* Set up node groups */ 6119 /* Set up node groups */
6110 if (sd_allnodes) 6120 if (sd_allnodes)
6111 init_sched_build_groups(*cpu_map, cpu_map, 6121 init_sched_build_groups(*cpu_map, cpu_map,
6112 &cpu_to_allnodes_group); 6122 &cpu_to_allnodes_group);
6113 6123
6114 for (i = 0; i < MAX_NUMNODES; i++) { 6124 for (i = 0; i < MAX_NUMNODES; i++) {
6115 /* Set up node groups */ 6125 /* Set up node groups */
6116 struct sched_group *sg, *prev; 6126 struct sched_group *sg, *prev;
6117 cpumask_t nodemask = node_to_cpumask(i); 6127 cpumask_t nodemask = node_to_cpumask(i);
6118 cpumask_t domainspan; 6128 cpumask_t domainspan;
6119 cpumask_t covered = CPU_MASK_NONE; 6129 cpumask_t covered = CPU_MASK_NONE;
6120 int j; 6130 int j;
6121 6131
6122 cpus_and(nodemask, nodemask, *cpu_map); 6132 cpus_and(nodemask, nodemask, *cpu_map);
6123 if (cpus_empty(nodemask)) { 6133 if (cpus_empty(nodemask)) {
6124 sched_group_nodes[i] = NULL; 6134 sched_group_nodes[i] = NULL;
6125 continue; 6135 continue;
6126 } 6136 }
6127 6137
6128 domainspan = sched_domain_node_span(i); 6138 domainspan = sched_domain_node_span(i);
6129 cpus_and(domainspan, domainspan, *cpu_map); 6139 cpus_and(domainspan, domainspan, *cpu_map);
6130 6140
6131 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 6141 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6132 if (!sg) { 6142 if (!sg) {
6133 printk(KERN_WARNING "Can not alloc domain group for " 6143 printk(KERN_WARNING "Can not alloc domain group for "
6134 "node %d\n", i); 6144 "node %d\n", i);
6135 goto error; 6145 goto error;
6136 } 6146 }
6137 sched_group_nodes[i] = sg; 6147 sched_group_nodes[i] = sg;
6138 for_each_cpu_mask(j, nodemask) { 6148 for_each_cpu_mask(j, nodemask) {
6139 struct sched_domain *sd; 6149 struct sched_domain *sd;
6140 6150
6141 sd = &per_cpu(node_domains, j); 6151 sd = &per_cpu(node_domains, j);
6142 sd->groups = sg; 6152 sd->groups = sg;
6143 } 6153 }
6144 sg->__cpu_power = 0; 6154 sg->__cpu_power = 0;
6145 sg->cpumask = nodemask; 6155 sg->cpumask = nodemask;
6146 sg->next = sg; 6156 sg->next = sg;
6147 cpus_or(covered, covered, nodemask); 6157 cpus_or(covered, covered, nodemask);
6148 prev = sg; 6158 prev = sg;
6149 6159
6150 for (j = 0; j < MAX_NUMNODES; j++) { 6160 for (j = 0; j < MAX_NUMNODES; j++) {
6151 cpumask_t tmp, notcovered; 6161 cpumask_t tmp, notcovered;
6152 int n = (i + j) % MAX_NUMNODES; 6162 int n = (i + j) % MAX_NUMNODES;
6153 6163
6154 cpus_complement(notcovered, covered); 6164 cpus_complement(notcovered, covered);
6155 cpus_and(tmp, notcovered, *cpu_map); 6165 cpus_and(tmp, notcovered, *cpu_map);
6156 cpus_and(tmp, tmp, domainspan); 6166 cpus_and(tmp, tmp, domainspan);
6157 if (cpus_empty(tmp)) 6167 if (cpus_empty(tmp))
6158 break; 6168 break;
6159 6169
6160 nodemask = node_to_cpumask(n); 6170 nodemask = node_to_cpumask(n);
6161 cpus_and(tmp, tmp, nodemask); 6171 cpus_and(tmp, tmp, nodemask);
6162 if (cpus_empty(tmp)) 6172 if (cpus_empty(tmp))
6163 continue; 6173 continue;
6164 6174
6165 sg = kmalloc_node(sizeof(struct sched_group), 6175 sg = kmalloc_node(sizeof(struct sched_group),
6166 GFP_KERNEL, i); 6176 GFP_KERNEL, i);
6167 if (!sg) { 6177 if (!sg) {
6168 printk(KERN_WARNING 6178 printk(KERN_WARNING
6169 "Can not alloc domain group for node %d\n", j); 6179 "Can not alloc domain group for node %d\n", j);
6170 goto error; 6180 goto error;
6171 } 6181 }
6172 sg->__cpu_power = 0; 6182 sg->__cpu_power = 0;
6173 sg->cpumask = tmp; 6183 sg->cpumask = tmp;
6174 sg->next = prev->next; 6184 sg->next = prev->next;
6175 cpus_or(covered, covered, tmp); 6185 cpus_or(covered, covered, tmp);
6176 prev->next = sg; 6186 prev->next = sg;
6177 prev = sg; 6187 prev = sg;
6178 } 6188 }
6179 } 6189 }
6180 #endif 6190 #endif
6181 6191
6182 /* Calculate CPU power for physical packages and nodes */ 6192 /* Calculate CPU power for physical packages and nodes */
6183 #ifdef CONFIG_SCHED_SMT 6193 #ifdef CONFIG_SCHED_SMT
6184 for_each_cpu_mask(i, *cpu_map) { 6194 for_each_cpu_mask(i, *cpu_map) {
6185 struct sched_domain *sd = &per_cpu(cpu_domains, i); 6195 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6186 6196
6187 init_sched_groups_power(i, sd); 6197 init_sched_groups_power(i, sd);
6188 } 6198 }
6189 #endif 6199 #endif
6190 #ifdef CONFIG_SCHED_MC 6200 #ifdef CONFIG_SCHED_MC
6191 for_each_cpu_mask(i, *cpu_map) { 6201 for_each_cpu_mask(i, *cpu_map) {
6192 struct sched_domain *sd = &per_cpu(core_domains, i); 6202 struct sched_domain *sd = &per_cpu(core_domains, i);
6193 6203
6194 init_sched_groups_power(i, sd); 6204 init_sched_groups_power(i, sd);
6195 } 6205 }
6196 #endif 6206 #endif
6197 6207
6198 for_each_cpu_mask(i, *cpu_map) { 6208 for_each_cpu_mask(i, *cpu_map) {
6199 struct sched_domain *sd = &per_cpu(phys_domains, i); 6209 struct sched_domain *sd = &per_cpu(phys_domains, i);
6200 6210
6201 init_sched_groups_power(i, sd); 6211 init_sched_groups_power(i, sd);
6202 } 6212 }
6203 6213
6204 #ifdef CONFIG_NUMA 6214 #ifdef CONFIG_NUMA
6205 for (i = 0; i < MAX_NUMNODES; i++) 6215 for (i = 0; i < MAX_NUMNODES; i++)
6206 init_numa_sched_groups_power(sched_group_nodes[i]); 6216 init_numa_sched_groups_power(sched_group_nodes[i]);
6207 6217
6208 if (sd_allnodes) { 6218 if (sd_allnodes) {
6209 struct sched_group *sg; 6219 struct sched_group *sg;
6210 6220
6211 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 6221 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6212 init_numa_sched_groups_power(sg); 6222 init_numa_sched_groups_power(sg);
6213 } 6223 }
6214 #endif 6224 #endif
6215 6225
6216 /* Attach the domains */ 6226 /* Attach the domains */
6217 for_each_cpu_mask(i, *cpu_map) { 6227 for_each_cpu_mask(i, *cpu_map) {
6218 struct sched_domain *sd; 6228 struct sched_domain *sd;
6219 #ifdef CONFIG_SCHED_SMT 6229 #ifdef CONFIG_SCHED_SMT
6220 sd = &per_cpu(cpu_domains, i); 6230 sd = &per_cpu(cpu_domains, i);
6221 #elif defined(CONFIG_SCHED_MC) 6231 #elif defined(CONFIG_SCHED_MC)
6222 sd = &per_cpu(core_domains, i); 6232 sd = &per_cpu(core_domains, i);
6223 #else 6233 #else
6224 sd = &per_cpu(phys_domains, i); 6234 sd = &per_cpu(phys_domains, i);
6225 #endif 6235 #endif
6226 cpu_attach_domain(sd, i); 6236 cpu_attach_domain(sd, i);
6227 } 6237 }
6228 6238
6229 return 0; 6239 return 0;
6230 6240
6231 #ifdef CONFIG_NUMA 6241 #ifdef CONFIG_NUMA
6232 error: 6242 error:
6233 free_sched_groups(cpu_map); 6243 free_sched_groups(cpu_map);
6234 return -ENOMEM; 6244 return -ENOMEM;
6235 #endif 6245 #endif
6236 } 6246 }
6237 /* 6247 /*
6238 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6248 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6239 */ 6249 */
6240 static int arch_init_sched_domains(const cpumask_t *cpu_map) 6250 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6241 { 6251 {
6242 cpumask_t cpu_default_map; 6252 cpumask_t cpu_default_map;
6243 int err; 6253 int err;
6244 6254
6245 /* 6255 /*
6246 * Setup mask for cpus without special case scheduling requirements. 6256 * Setup mask for cpus without special case scheduling requirements.
6247 * For now this just excludes isolated cpus, but could be used to 6257 * For now this just excludes isolated cpus, but could be used to
6248 * exclude other special cases in the future. 6258 * exclude other special cases in the future.
6249 */ 6259 */
6250 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6260 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6251 6261
6252 err = build_sched_domains(&cpu_default_map); 6262 err = build_sched_domains(&cpu_default_map);
6253 6263
6254 return err; 6264 return err;
6255 } 6265 }
6256 6266
6257 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6267 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6258 { 6268 {
6259 free_sched_groups(cpu_map); 6269 free_sched_groups(cpu_map);
6260 } 6270 }
6261 6271
6262 /* 6272 /*
6263 * Detach sched domains from a group of cpus specified in cpu_map. 6273 * Detach sched domains from a group of cpus specified in cpu_map.
6264 * These cpus will now be attached to the NULL domain. 6274 * These cpus will now be attached to the NULL domain.
6265 */ 6275 */
6266 static void detach_destroy_domains(const cpumask_t *cpu_map) 6276 static void detach_destroy_domains(const cpumask_t *cpu_map)
6267 { 6277 {
6268 int i; 6278 int i;
6269 6279
6270 for_each_cpu_mask(i, *cpu_map) 6280 for_each_cpu_mask(i, *cpu_map)
6271 cpu_attach_domain(NULL, i); 6281 cpu_attach_domain(NULL, i);
6272 synchronize_sched(); 6282 synchronize_sched();
6273 arch_destroy_sched_domains(cpu_map); 6283 arch_destroy_sched_domains(cpu_map);
6274 } 6284 }
6275 6285
6276 /* 6286 /*
6277 * Partition sched domains as specified by the cpumasks below. 6287 * Partition sched domains as specified by the cpumasks below.
6278 * This attaches all cpus from the cpumasks to the NULL domain, 6288 * This attaches all cpus from the cpumasks to the NULL domain,
6279 * waits for an RCU quiescent period, recalculates sched 6289 * waits for an RCU quiescent period, recalculates sched
6280 * domain information and then attaches them back to the 6290 * domain information and then attaches them back to the
6281 * correct sched domains. 6291 * correct sched domains.
6282 * Call with the hotplug lock held. 6292 * Call with the hotplug lock held.
6283 */ 6293 */
6284 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6294 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6285 { 6295 {
6286 cpumask_t change_map; 6296 cpumask_t change_map;
6287 int err = 0; 6297 int err = 0;
6288 6298
6289 cpus_and(*partition1, *partition1, cpu_online_map); 6299 cpus_and(*partition1, *partition1, cpu_online_map);
6290 cpus_and(*partition2, *partition2, cpu_online_map); 6300 cpus_and(*partition2, *partition2, cpu_online_map);
6291 cpus_or(change_map, *partition1, *partition2); 6301 cpus_or(change_map, *partition1, *partition2);
6292 6302
6293 /* Detach sched domains from all of the affected cpus */ 6303 /* Detach sched domains from all of the affected cpus */
6294 detach_destroy_domains(&change_map); 6304 detach_destroy_domains(&change_map);
6295 if (!cpus_empty(*partition1)) 6305 if (!cpus_empty(*partition1))
6296 err = build_sched_domains(partition1); 6306 err = build_sched_domains(partition1);
6297 if (!err && !cpus_empty(*partition2)) 6307 if (!err && !cpus_empty(*partition2))
6298 err = build_sched_domains(partition2); 6308 err = build_sched_domains(partition2);
6299 6309
6300 return err; 6310 return err;
6301 } 6311 }
6302 6312
6303 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6313 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6304 int arch_reinit_sched_domains(void) 6314 int arch_reinit_sched_domains(void)
6305 { 6315 {
6306 int err; 6316 int err;
6307 6317
6308 mutex_lock(&sched_hotcpu_mutex); 6318 mutex_lock(&sched_hotcpu_mutex);
6309 detach_destroy_domains(&cpu_online_map); 6319 detach_destroy_domains(&cpu_online_map);
6310 err = arch_init_sched_domains(&cpu_online_map); 6320 err = arch_init_sched_domains(&cpu_online_map);
6311 mutex_unlock(&sched_hotcpu_mutex); 6321 mutex_unlock(&sched_hotcpu_mutex);
6312 6322
6313 return err; 6323 return err;
6314 } 6324 }
6315 6325
6316 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6326 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6317 { 6327 {
6318 int ret; 6328 int ret;
6319 6329
6320 if (buf[0] != '0' && buf[0] != '1') 6330 if (buf[0] != '0' && buf[0] != '1')
6321 return -EINVAL; 6331 return -EINVAL;
6322 6332
6323 if (smt) 6333 if (smt)
6324 sched_smt_power_savings = (buf[0] == '1'); 6334 sched_smt_power_savings = (buf[0] == '1');
6325 else 6335 else
6326 sched_mc_power_savings = (buf[0] == '1'); 6336 sched_mc_power_savings = (buf[0] == '1');
6327 6337
6328 ret = arch_reinit_sched_domains(); 6338 ret = arch_reinit_sched_domains();
6329 6339
6330 return ret ? ret : count; 6340 return ret ? ret : count;
6331 } 6341 }
6332 6342
6333 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6343 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6334 { 6344 {
6335 int err = 0; 6345 int err = 0;
6336 6346
6337 #ifdef CONFIG_SCHED_SMT 6347 #ifdef CONFIG_SCHED_SMT
6338 if (smt_capable()) 6348 if (smt_capable())
6339 err = sysfs_create_file(&cls->kset.kobj, 6349 err = sysfs_create_file(&cls->kset.kobj,
6340 &attr_sched_smt_power_savings.attr); 6350 &attr_sched_smt_power_savings.attr);
6341 #endif 6351 #endif
6342 #ifdef CONFIG_SCHED_MC 6352 #ifdef CONFIG_SCHED_MC
6343 if (!err && mc_capable()) 6353 if (!err && mc_capable())
6344 err = sysfs_create_file(&cls->kset.kobj, 6354 err = sysfs_create_file(&cls->kset.kobj,
6345 &attr_sched_mc_power_savings.attr); 6355 &attr_sched_mc_power_savings.attr);
6346 #endif 6356 #endif
6347 return err; 6357 return err;
6348 } 6358 }
6349 #endif 6359 #endif
6350 6360
6351 #ifdef CONFIG_SCHED_MC 6361 #ifdef CONFIG_SCHED_MC
6352 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 6362 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6353 { 6363 {
6354 return sprintf(page, "%u\n", sched_mc_power_savings); 6364 return sprintf(page, "%u\n", sched_mc_power_savings);
6355 } 6365 }
6356 static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 6366 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6357 const char *buf, size_t count) 6367 const char *buf, size_t count)
6358 { 6368 {
6359 return sched_power_savings_store(buf, count, 0); 6369 return sched_power_savings_store(buf, count, 0);
6360 } 6370 }
6361 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 6371 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6362 sched_mc_power_savings_store); 6372 sched_mc_power_savings_store);
6363 #endif 6373 #endif
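
Usage note (a sketch, not part of the patch): the attribute created above is normally visible as /sys/devices/system/cpu/sched_mc_power_savings and, per sched_power_savings_store(), accepts only '0' or '1'; writing to it triggers arch_reinit_sched_domains(). A minimal user-space toggle, assuming that conventional path and root privileges:

#include <stdio.h>

int main(void)
{
	/* Path assumed from the cpu sysdev class; adjust if your tree differs. */
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (!f) {
		perror("sched_mc_power_savings");
		return 1;
	}
	fputc('1', f);	/* sched_power_savings_store() accepts only '0' or '1' */
	return fclose(f) ? 1 : 0;
}
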
6364 6374
6365 #ifdef CONFIG_SCHED_SMT 6375 #ifdef CONFIG_SCHED_SMT
6366 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 6376 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6367 { 6377 {
6368 return sprintf(page, "%u\n", sched_smt_power_savings); 6378 return sprintf(page, "%u\n", sched_smt_power_savings);
6369 } 6379 }
6370 static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 6380 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6371 const char *buf, size_t count) 6381 const char *buf, size_t count)
6372 { 6382 {
6373 return sched_power_savings_store(buf, count, 1); 6383 return sched_power_savings_store(buf, count, 1);
6374 } 6384 }
6375 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 6385 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6376 sched_smt_power_savings_store); 6386 sched_smt_power_savings_store);
6377 #endif 6387 #endif
6378 6388
6379 /* 6389 /*
6380 * Force a reinitialization of the sched domains hierarchy. The domains 6390 * Force a reinitialization of the sched domains hierarchy. The domains
6381 * and groups cannot be updated in place without racing with the balancing 6391 * and groups cannot be updated in place without racing with the balancing
6382 * code, so we temporarily attach all running cpus to the NULL domain 6392 * code, so we temporarily attach all running cpus to the NULL domain
6383 * which will prevent rebalancing while the sched domains are recalculated. 6393 * which will prevent rebalancing while the sched domains are recalculated.
6384 */ 6394 */
6385 static int update_sched_domains(struct notifier_block *nfb, 6395 static int update_sched_domains(struct notifier_block *nfb,
6386 unsigned long action, void *hcpu) 6396 unsigned long action, void *hcpu)
6387 { 6397 {
6388 switch (action) { 6398 switch (action) {
6389 case CPU_UP_PREPARE: 6399 case CPU_UP_PREPARE:
6390 case CPU_UP_PREPARE_FROZEN: 6400 case CPU_UP_PREPARE_FROZEN:
6391 case CPU_DOWN_PREPARE: 6401 case CPU_DOWN_PREPARE:
6392 case CPU_DOWN_PREPARE_FROZEN: 6402 case CPU_DOWN_PREPARE_FROZEN:
6393 detach_destroy_domains(&cpu_online_map); 6403 detach_destroy_domains(&cpu_online_map);
6394 return NOTIFY_OK; 6404 return NOTIFY_OK;
6395 6405
6396 case CPU_UP_CANCELED: 6406 case CPU_UP_CANCELED:
6397 case CPU_UP_CANCELED_FROZEN: 6407 case CPU_UP_CANCELED_FROZEN:
6398 case CPU_DOWN_FAILED: 6408 case CPU_DOWN_FAILED:
6399 case CPU_DOWN_FAILED_FROZEN: 6409 case CPU_DOWN_FAILED_FROZEN:
6400 case CPU_ONLINE: 6410 case CPU_ONLINE:
6401 case CPU_ONLINE_FROZEN: 6411 case CPU_ONLINE_FROZEN:
6402 case CPU_DEAD: 6412 case CPU_DEAD:
6403 case CPU_DEAD_FROZEN: 6413 case CPU_DEAD_FROZEN:
6404 /* 6414 /*
6405 * Fall through and re-initialise the domains. 6415 * Fall through and re-initialise the domains.
6406 */ 6416 */
6407 break; 6417 break;
6408 default: 6418 default:
6409 return NOTIFY_DONE; 6419 return NOTIFY_DONE;
6410 } 6420 }
6411 6421
6412 /* The hotplug lock is already held by cpu_up/cpu_down */ 6422 /* The hotplug lock is already held by cpu_up/cpu_down */
6413 arch_init_sched_domains(&cpu_online_map); 6423 arch_init_sched_domains(&cpu_online_map);
6414 6424
6415 return NOTIFY_OK; 6425 return NOTIFY_OK;
6416 } 6426 }
6417 6427
6418 void __init sched_init_smp(void) 6428 void __init sched_init_smp(void)
6419 { 6429 {
6420 cpumask_t non_isolated_cpus; 6430 cpumask_t non_isolated_cpus;
6421 6431
6422 mutex_lock(&sched_hotcpu_mutex); 6432 mutex_lock(&sched_hotcpu_mutex);
6423 arch_init_sched_domains(&cpu_online_map); 6433 arch_init_sched_domains(&cpu_online_map);
6424 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6434 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6425 if (cpus_empty(non_isolated_cpus)) 6435 if (cpus_empty(non_isolated_cpus))
6426 cpu_set(smp_processor_id(), non_isolated_cpus); 6436 cpu_set(smp_processor_id(), non_isolated_cpus);
6427 mutex_unlock(&sched_hotcpu_mutex); 6437 mutex_unlock(&sched_hotcpu_mutex);
6428 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6438 /* XXX: Theoretical race here - CPU may be hotplugged now */
6429 hotcpu_notifier(update_sched_domains, 0); 6439 hotcpu_notifier(update_sched_domains, 0);
6430 6440
6431 init_sched_domain_sysctl(); 6441 init_sched_domain_sysctl();
6432 6442
6433 /* Move init over to a non-isolated CPU */ 6443 /* Move init over to a non-isolated CPU */
6434 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6444 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6435 BUG(); 6445 BUG();
6436 sched_init_granularity(); 6446 sched_init_granularity();
6437 } 6447 }
6438 #else 6448 #else
6439 void __init sched_init_smp(void) 6449 void __init sched_init_smp(void)
6440 { 6450 {
6441 sched_init_granularity(); 6451 sched_init_granularity();
6442 } 6452 }
6443 #endif /* CONFIG_SMP */ 6453 #endif /* CONFIG_SMP */
6444 6454
6445 int in_sched_functions(unsigned long addr) 6455 int in_sched_functions(unsigned long addr)
6446 { 6456 {
6447 /* Linker adds these: start and end of __sched functions */ 6457 /* Linker adds these: start and end of __sched functions */
6448 extern char __sched_text_start[], __sched_text_end[]; 6458 extern char __sched_text_start[], __sched_text_end[];
6449 6459
6450 return in_lock_functions(addr) || 6460 return in_lock_functions(addr) ||
6451 (addr >= (unsigned long)__sched_text_start 6461 (addr >= (unsigned long)__sched_text_start
6452 && addr < (unsigned long)__sched_text_end); 6462 && addr < (unsigned long)__sched_text_end);
6453 } 6463 }
6454 6464
6455 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 6465 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6456 { 6466 {
6457 cfs_rq->tasks_timeline = RB_ROOT; 6467 cfs_rq->tasks_timeline = RB_ROOT;
6458 cfs_rq->fair_clock = 1; 6468 cfs_rq->fair_clock = 1;
6459 #ifdef CONFIG_FAIR_GROUP_SCHED 6469 #ifdef CONFIG_FAIR_GROUP_SCHED
6460 cfs_rq->rq = rq; 6470 cfs_rq->rq = rq;
6461 #endif 6471 #endif
6462 } 6472 }
6463 6473
6464 void __init sched_init(void) 6474 void __init sched_init(void)
6465 { 6475 {
6466 u64 now = sched_clock(); 6476 u64 now = sched_clock();
6467 int highest_cpu = 0; 6477 int highest_cpu = 0;
6468 int i, j; 6478 int i, j;
6469 6479
6470 /* 6480 /*
6471 * Link up the scheduling class hierarchy: 6481 * Link up the scheduling class hierarchy:
6472 */ 6482 */
6473 rt_sched_class.next = &fair_sched_class; 6483 rt_sched_class.next = &fair_sched_class;
6474 fair_sched_class.next = &idle_sched_class; 6484 fair_sched_class.next = &idle_sched_class;
6475 idle_sched_class.next = NULL; 6485 idle_sched_class.next = NULL;
6476 6486
6477 for_each_possible_cpu(i) { 6487 for_each_possible_cpu(i) {
6478 struct rt_prio_array *array; 6488 struct rt_prio_array *array;
6479 struct rq *rq; 6489 struct rq *rq;
6480 6490
6481 rq = cpu_rq(i); 6491 rq = cpu_rq(i);
6482 spin_lock_init(&rq->lock); 6492 spin_lock_init(&rq->lock);
6483 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6493 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6484 rq->nr_running = 0; 6494 rq->nr_running = 0;
6485 rq->clock = 1; 6495 rq->clock = 1;
6486 init_cfs_rq(&rq->cfs, rq); 6496 init_cfs_rq(&rq->cfs, rq);
6487 #ifdef CONFIG_FAIR_GROUP_SCHED 6497 #ifdef CONFIG_FAIR_GROUP_SCHED
6488 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6498 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6489 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 6499 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6490 #endif 6500 #endif
6491 rq->ls.load_update_last = now; 6501 rq->ls.load_update_last = now;
6492 rq->ls.load_update_start = now; 6502 rq->ls.load_update_start = now;
6493 6503
6494 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6504 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6495 rq->cpu_load[j] = 0; 6505 rq->cpu_load[j] = 0;
6496 #ifdef CONFIG_SMP 6506 #ifdef CONFIG_SMP
6497 rq->sd = NULL; 6507 rq->sd = NULL;
6498 rq->active_balance = 0; 6508 rq->active_balance = 0;
6499 rq->next_balance = jiffies; 6509 rq->next_balance = jiffies;
6500 rq->push_cpu = 0; 6510 rq->push_cpu = 0;
6501 rq->cpu = i; 6511 rq->cpu = i;
6502 rq->migration_thread = NULL; 6512 rq->migration_thread = NULL;
6503 INIT_LIST_HEAD(&rq->migration_queue); 6513 INIT_LIST_HEAD(&rq->migration_queue);
6504 #endif 6514 #endif
6505 atomic_set(&rq->nr_iowait, 0); 6515 atomic_set(&rq->nr_iowait, 0);
6506 6516
6507 array = &rq->rt.active; 6517 array = &rq->rt.active;
6508 for (j = 0; j < MAX_RT_PRIO; j++) { 6518 for (j = 0; j < MAX_RT_PRIO; j++) {
6509 INIT_LIST_HEAD(array->queue + j); 6519 INIT_LIST_HEAD(array->queue + j);
6510 __clear_bit(j, array->bitmap); 6520 __clear_bit(j, array->bitmap);
6511 } 6521 }
6512 highest_cpu = i; 6522 highest_cpu = i;
6513 /* delimiter for bitsearch: */ 6523 /* delimiter for bitsearch: */
6514 __set_bit(MAX_RT_PRIO, array->bitmap); 6524 __set_bit(MAX_RT_PRIO, array->bitmap);
6515 } 6525 }
6516 6526
6517 set_load_weight(&init_task); 6527 set_load_weight(&init_task);
6518 6528
6519 #ifdef CONFIG_PREEMPT_NOTIFIERS 6529 #ifdef CONFIG_PREEMPT_NOTIFIERS
6520 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6530 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6521 #endif 6531 #endif
6522 6532
6523 #ifdef CONFIG_SMP 6533 #ifdef CONFIG_SMP
6524 nr_cpu_ids = highest_cpu + 1; 6534 nr_cpu_ids = highest_cpu + 1;
6525 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 6535 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6526 #endif 6536 #endif
6527 6537
6528 #ifdef CONFIG_RT_MUTEXES 6538 #ifdef CONFIG_RT_MUTEXES
6529 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6539 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6530 #endif 6540 #endif
6531 6541
6532 /* 6542 /*
6533 * The boot idle thread does lazy MMU switching as well: 6543 * The boot idle thread does lazy MMU switching as well:
6534 */ 6544 */
6535 atomic_inc(&init_mm.mm_count); 6545 atomic_inc(&init_mm.mm_count);
6536 enter_lazy_tlb(&init_mm, current); 6546 enter_lazy_tlb(&init_mm, current);
6537 6547
6538 /* 6548 /*
6539 * Make us the idle thread. Technically, schedule() should not be 6549 * Make us the idle thread. Technically, schedule() should not be
6540 * called from this thread, however somewhere below it might be, 6550 * called from this thread, however somewhere below it might be,
6541 * but because we are the idle thread, we just pick up running again 6551 * but because we are the idle thread, we just pick up running again
6542 * when this runqueue becomes "idle". 6552 * when this runqueue becomes "idle".
6543 */ 6553 */
6544 init_idle(current, smp_processor_id()); 6554 init_idle(current, smp_processor_id());
6545 /* 6555 /*
6546 * During early bootup we pretend to be a normal task: 6556 * During early bootup we pretend to be a normal task:
6547 */ 6557 */
6548 current->sched_class = &fair_sched_class; 6558 current->sched_class = &fair_sched_class;
6549 } 6559 }
6550 6560
6551 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6561 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6552 void __might_sleep(char *file, int line) 6562 void __might_sleep(char *file, int line)
6553 { 6563 {
6554 #ifdef in_atomic 6564 #ifdef in_atomic
6555 static unsigned long prev_jiffy; /* ratelimiting */ 6565 static unsigned long prev_jiffy; /* ratelimiting */
6556 6566
6557 if ((in_atomic() || irqs_disabled()) && 6567 if ((in_atomic() || irqs_disabled()) &&
6558 system_state == SYSTEM_RUNNING && !oops_in_progress) { 6568 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6559 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6569 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6560 return; 6570 return;
6561 prev_jiffy = jiffies; 6571 prev_jiffy = jiffies;
6562 printk(KERN_ERR "BUG: sleeping function called from invalid" 6572 printk(KERN_ERR "BUG: sleeping function called from invalid"
6563 " context at %s:%d\n", file, line); 6573 " context at %s:%d\n", file, line);
6564 printk("in_atomic():%d, irqs_disabled():%d\n", 6574 printk("in_atomic():%d, irqs_disabled():%d\n",
6565 in_atomic(), irqs_disabled()); 6575 in_atomic(), irqs_disabled());
6566 debug_show_held_locks(current); 6576 debug_show_held_locks(current);
6567 if (irqs_disabled()) 6577 if (irqs_disabled())
6568 print_irqtrace_events(current); 6578 print_irqtrace_events(current);
6569 dump_stack(); 6579 dump_stack();
6570 } 6580 }
6571 #endif 6581 #endif
6572 } 6582 }
6573 EXPORT_SYMBOL(__might_sleep); 6583 EXPORT_SYMBOL(__might_sleep);
6574 #endif 6584 #endif
6575 6585
6576 #ifdef CONFIG_MAGIC_SYSRQ 6586 #ifdef CONFIG_MAGIC_SYSRQ
6577 void normalize_rt_tasks(void) 6587 void normalize_rt_tasks(void)
6578 { 6588 {
6579 struct task_struct *g, *p; 6589 struct task_struct *g, *p;
6580 unsigned long flags; 6590 unsigned long flags;
6581 struct rq *rq; 6591 struct rq *rq;
6582 int on_rq; 6592 int on_rq;
6583 6593
6584 read_lock_irq(&tasklist_lock); 6594 read_lock_irq(&tasklist_lock);
6585 do_each_thread(g, p) { 6595 do_each_thread(g, p) {
6586 p->se.fair_key = 0; 6596 p->se.fair_key = 0;
6587 p->se.wait_runtime = 0; 6597 p->se.wait_runtime = 0;
6588 p->se.exec_start = 0; 6598 p->se.exec_start = 0;
6589 p->se.wait_start_fair = 0; 6599 p->se.wait_start_fair = 0;
6590 p->se.sleep_start_fair = 0; 6600 p->se.sleep_start_fair = 0;
6591 #ifdef CONFIG_SCHEDSTATS 6601 #ifdef CONFIG_SCHEDSTATS
6592 p->se.wait_start = 0; 6602 p->se.wait_start = 0;
6593 p->se.sleep_start = 0; 6603 p->se.sleep_start = 0;
6594 p->se.block_start = 0; 6604 p->se.block_start = 0;
6595 #endif 6605 #endif
6596 task_rq(p)->cfs.fair_clock = 0; 6606 task_rq(p)->cfs.fair_clock = 0;
6597 task_rq(p)->clock = 0; 6607 task_rq(p)->clock = 0;
6598 6608
6599 if (!rt_task(p)) { 6609 if (!rt_task(p)) {
6600 /* 6610 /*
6601 * Renice negative nice level userspace 6611 * Renice negative nice level userspace
6602 * tasks back to 0: 6612 * tasks back to 0:
6603 */ 6613 */
6604 if (TASK_NICE(p) < 0 && p->mm) 6614 if (TASK_NICE(p) < 0 && p->mm)
6605 set_user_nice(p, 0); 6615 set_user_nice(p, 0);
6606 continue; 6616 continue;
6607 } 6617 }
6608 6618
6609 spin_lock_irqsave(&p->pi_lock, flags); 6619 spin_lock_irqsave(&p->pi_lock, flags);
6610 rq = __task_rq_lock(p); 6620 rq = __task_rq_lock(p);
6611 #ifdef CONFIG_SMP 6621 #ifdef CONFIG_SMP
6612 /* 6622 /*
6613 * Do not touch the migration thread: 6623 * Do not touch the migration thread:
6614 */ 6624 */
6615 if (p == rq->migration_thread) 6625 if (p == rq->migration_thread)
6616 goto out_unlock; 6626 goto out_unlock;
6617 #endif 6627 #endif
6618 6628
6619 on_rq = p->se.on_rq; 6629 on_rq = p->se.on_rq;
6620 if (on_rq) 6630 if (on_rq)
6621 deactivate_task(task_rq(p), p, 0); 6631 deactivate_task(task_rq(p), p, 0);
6622 __setscheduler(rq, p, SCHED_NORMAL, 0); 6632 __setscheduler(rq, p, SCHED_NORMAL, 0);
6623 if (on_rq) { 6633 if (on_rq) {
6624 activate_task(task_rq(p), p, 0); 6634 activate_task(task_rq(p), p, 0);
6625 resched_task(rq->curr); 6635 resched_task(rq->curr);
6626 } 6636 }
6627 #ifdef CONFIG_SMP 6637 #ifdef CONFIG_SMP
6628 out_unlock: 6638 out_unlock:
6629 #endif 6639 #endif
6630 __task_rq_unlock(rq); 6640 __task_rq_unlock(rq);
6631 spin_unlock_irqrestore(&p->pi_lock, flags); 6641 spin_unlock_irqrestore(&p->pi_lock, flags);
6632 } while_each_thread(g, p); 6642 } while_each_thread(g, p);
6633 6643
6634 read_unlock_irq(&tasklist_lock); 6644 read_unlock_irq(&tasklist_lock);
6635 } 6645 }
6636 6646
6637 #endif /* CONFIG_MAGIC_SYSRQ */ 6647 #endif /* CONFIG_MAGIC_SYSRQ */
6638 6648
6639 #ifdef CONFIG_IA64 6649 #ifdef CONFIG_IA64
6640 /* 6650 /*
6641 * These functions are only useful for the IA64 MCA handling. 6651 * These functions are only useful for the IA64 MCA handling.
6642 * 6652 *
6643 * They can only be called when the whole system has been 6653 * They can only be called when the whole system has been
6644 * stopped - every CPU needs to be quiescent, and no scheduling 6654 * stopped - every CPU needs to be quiescent, and no scheduling
6645 * activity can take place. Using them for anything else would 6655 * activity can take place. Using them for anything else would
6646 * be a serious bug, and as a result, they aren't even visible 6656 * be a serious bug, and as a result, they aren't even visible
6647 * under any other configuration. 6657 * under any other configuration.
6648 */ 6658 */
6649 6659
6650 /** 6660 /**
6651 * curr_task - return the current task for a given cpu. 6661 * curr_task - return the current task for a given cpu.
6652 * @cpu: the processor in question. 6662 * @cpu: the processor in question.
6653 * 6663 *
6654 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6664 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6655 */ 6665 */
6656 struct task_struct *curr_task(int cpu) 6666 struct task_struct *curr_task(int cpu)
6657 { 6667 {
6658 return cpu_curr(cpu); 6668 return cpu_curr(cpu);
6659 } 6669 }
6660 6670
6661 /** 6671 /**
6662 * set_curr_task - set the current task for a given cpu. 6672 * set_curr_task - set the current task for a given cpu.
6663 * @cpu: the processor in question. 6673 * @cpu: the processor in question.
6664 * @p: the task pointer to set. 6674 * @p: the task pointer to set.
6665 * 6675 *
6666 * Description: This function must only be used when non-maskable interrupts 6676 * Description: This function must only be used when non-maskable interrupts
6667 * are serviced on a separate stack. It allows the architecture to switch the 6677 * are serviced on a separate stack. It allows the architecture to switch the
6668 * notion of the current task on a cpu in a non-blocking manner. This function 6678 * notion of the current task on a cpu in a non-blocking manner. This function
669 * must be called with all CPU's synchronized, and interrupts disabled, 6679 * must be called with all CPU's synchronized, and interrupts disabled,
670 * and the caller must save the original value of the current task (see 6680 * and the caller must save the original value of the current task (see
6671 * curr_task() above) and restore that value before reenabling interrupts and 6681 * curr_task() above) and restore that value before reenabling interrupts and
6672 * re-starting the system. 6682 * re-starting the system.
kernel/sched_fair.c
1 /* 1 /*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) 2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 * 3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 * 5 *
6 * Interactivity improvements by Mike Galbraith 6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de> 7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 * 8 *
9 * Various enhancements by Dmitry Adamushko. 9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> 10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 * 11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri 12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007 13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> 14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 * 15 *
16 * Scaled math optimizations by Thomas Gleixner 16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */ 18 */
19 19
20 /* 20 /*
21 * Preemption granularity: 21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds) 22 * (default: 2 msec, units: nanoseconds)
23 * 23 *
24 * NOTE: this granularity value is not the same as the concept of 24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat 25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice 26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches 27 * length of your workload, run vmstat and monitor the context-switches
28 * field) 28 * field)
29 * 29 *
30 * On SMP systems the value of this is multiplied by the log2 of the 30 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */ 33 */
34 unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; 34 unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
35 35
36 /* 36 /*
37 * SCHED_BATCH wake-up granularity. 37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds) 38 * (default: 10 msec, units: nanoseconds)
39 * 39 *
40 * This option delays the preemption effects of decoupled workloads 40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still 41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies. 42 * have immediate wakeup/sleep latencies.
43 */ 43 */
44 unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 44 unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ; 45 10000000000ULL/HZ;
46 46
47 /* 47 /*
48 * SCHED_OTHER wake-up granularity. 48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds) 49 * (default: 1 msec, units: nanoseconds)
50 * 50 *
51 * This option delays the preemption effects of decoupled workloads 51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still 52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies. 53 * have immediate wakeup/sleep latencies.
54 */ 54 */
55 unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; 55 unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56 56
57 unsigned int sysctl_sched_stat_granularity __read_mostly; 57 unsigned int sysctl_sched_stat_granularity __read_mostly;
58 58
59 /* 59 /*
60 * Initialized in sched_init_granularity(): 60 * Initialized in sched_init_granularity():
61 */ 61 */
62 unsigned int sysctl_sched_runtime_limit __read_mostly; 62 unsigned int sysctl_sched_runtime_limit __read_mostly;
63 63
64 /* 64 /*
65 * Debugging: various feature bits 65 * Debugging: various feature bits
66 */ 66 */
67 enum { 67 enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1, 68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2, 69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4, 70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8, 71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16, 72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32, 73 SCHED_FEAT_SKIP_INITIAL = 32,
74 }; 74 };
75 75
76 unsigned int sysctl_sched_features __read_mostly = 76 unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 | 77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 | 78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 | 79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 | 80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 | 81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0; 82 SCHED_FEAT_SKIP_INITIAL *0;
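
The "*1" / "*0" multipliers above are simply a readable way of composing a constant bitmask, one feature per line. A minimal standalone sketch of the idiom (the FEAT_* names below are made up for illustration, not scheduler features):

#include <stdio.h>

enum { FEAT_A = 1, FEAT_B = 2, FEAT_C = 4 };

/* one feature per line; disabled features are multiplied by 0 */
static const unsigned int features =
	FEAT_A * 1 |
	FEAT_B * 0 |
	FEAT_C * 1;

int main(void)
{
	printf("FEAT_B enabled: %d\n", !!(features & FEAT_B));	/* 0 */
	printf("FEAT_C enabled: %d\n", !!(features & FEAT_C));	/* 1 */
	return 0;
}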
83 83
84 extern struct sched_class fair_sched_class; 84 extern struct sched_class fair_sched_class;
85 85
86 /************************************************************** 86 /**************************************************************
87 * CFS operations on generic schedulable entities: 87 * CFS operations on generic schedulable entities:
88 */ 88 */
89 89
90 #ifdef CONFIG_FAIR_GROUP_SCHED 90 #ifdef CONFIG_FAIR_GROUP_SCHED
91 91
92 /* cpu runqueue to which this cfs_rq is attached */ 92 /* cpu runqueue to which this cfs_rq is attached */
93 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 93 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94 { 94 {
95 return cfs_rq->rq; 95 return cfs_rq->rq;
96 } 96 }
97 97
98 /* currently running entity (if any) on this cfs_rq */ 98 /* currently running entity (if any) on this cfs_rq */
99 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) 99 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100 { 100 {
101 return cfs_rq->curr; 101 return cfs_rq->curr;
102 } 102 }
103 103
104 /* An entity is a task if it doesn't "own" a runqueue */ 104 /* An entity is a task if it doesn't "own" a runqueue */
105 #define entity_is_task(se) (!se->my_q) 105 #define entity_is_task(se) (!se->my_q)
106 106
107 static inline void 107 static inline void
108 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) 108 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109 { 109 {
110 cfs_rq->curr = se; 110 cfs_rq->curr = se;
111 } 111 }
112 112
113 #else /* CONFIG_FAIR_GROUP_SCHED */ 113 #else /* CONFIG_FAIR_GROUP_SCHED */
114 114
115 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 115 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116 { 116 {
117 return container_of(cfs_rq, struct rq, cfs); 117 return container_of(cfs_rq, struct rq, cfs);
118 } 118 }
119 119
120 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) 120 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121 { 121 {
122 struct rq *rq = rq_of(cfs_rq); 122 struct rq *rq = rq_of(cfs_rq);
123 123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class)) 124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL; 125 return NULL;
126 126
127 return &rq->curr->se; 127 return &rq->curr->se;
128 } 128 }
129 129
130 #define entity_is_task(se) 1 130 #define entity_is_task(se) 1
131 131
132 static inline void 132 static inline void
133 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 133 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134 134
135 #endif /* CONFIG_FAIR_GROUP_SCHED */ 135 #endif /* CONFIG_FAIR_GROUP_SCHED */
136 136
137 static inline struct task_struct *task_of(struct sched_entity *se) 137 static inline struct task_struct *task_of(struct sched_entity *se)
138 { 138 {
139 return container_of(se, struct task_struct, se); 139 return container_of(se, struct task_struct, se);
140 } 140 }
141 141
142 142
143 /************************************************************** 143 /**************************************************************
144 * Scheduling class tree data structure manipulation methods: 144 * Scheduling class tree data structure manipulation methods:
145 */ 145 */
146 146
147 /* 147 /*
148 * Enqueue an entity into the rb-tree: 148 * Enqueue an entity into the rb-tree:
149 */ 149 */
150 static inline void 150 static inline void
151 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 151 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152 { 152 {
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL; 154 struct rb_node *parent = NULL;
155 struct sched_entity *entry; 155 struct sched_entity *entry;
156 s64 key = se->fair_key; 156 s64 key = se->fair_key;
157 int leftmost = 1; 157 int leftmost = 1;
158 158
159 /* 159 /*
160 * Find the right place in the rbtree: 160 * Find the right place in the rbtree:
161 */ 161 */
162 while (*link) { 162 while (*link) {
163 parent = *link; 163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node); 164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /* 165 /*
166 * We dont care about collisions. Nodes with 166 * We dont care about collisions. Nodes with
167 * the same key stay together. 167 * the same key stay together.
168 */ 168 */
169 if (key - entry->fair_key < 0) { 169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left; 170 link = &parent->rb_left;
171 } else { 171 } else {
172 link = &parent->rb_right; 172 link = &parent->rb_right;
173 leftmost = 0; 173 leftmost = 0;
174 } 174 }
175 } 175 }
176 176
177 /* 177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently 178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used): 179 * used):
180 */ 180 */
181 if (leftmost) 181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node; 182 cfs_rq->rb_leftmost = &se->run_node;
183 183
184 rb_link_node(&se->run_node, parent, link); 184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight); 186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++; 187 cfs_rq->nr_running++;
188 se->on_rq = 1; 188 se->on_rq = 1;
189 } 189 }
190 190
191 static inline void 191 static inline void
192 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 192 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193 { 193 {
194 if (cfs_rq->rb_leftmost == &se->run_node) 194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node); 195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight); 197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--; 198 cfs_rq->nr_running--;
199 se->on_rq = 0; 199 se->on_rq = 0;
200 } 200 }
201 201
202 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 202 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203 { 203 {
204 return cfs_rq->rb_leftmost; 204 return cfs_rq->rb_leftmost;
205 } 205 }
206 206
207 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 207 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208 { 208 {
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210 } 210 }
211 211
212 /************************************************************** 212 /**************************************************************
213 * Scheduling class statistics methods: 213 * Scheduling class statistics methods:
214 */ 214 */
215 215
216 /* 216 /*
217 * We rescale the rescheduling granularity of tasks according to their 217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially: 218 * nice level, but only linearly, not exponentially:
219 */ 219 */
220 static long 220 static long
221 niced_granularity(struct sched_entity *curr, unsigned long granularity) 221 niced_granularity(struct sched_entity *curr, unsigned long granularity)
222 { 222 {
223 u64 tmp; 223 u64 tmp;
224 224
225 /* 225 /*
226 * Negative nice levels get the same granularity as nice-0: 226 * Negative nice levels get the same granularity as nice-0:
227 */ 227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD)) 228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity; 229 return granularity;
230 /* 230 /*
231 * Positive nice level tasks get linearly finer 231 * Positive nice level tasks get linearly finer
232 * granularity: 232 * granularity:
233 */ 233 */
234 tmp = curr->load.weight * (u64)granularity; 234 tmp = curr->load.weight * (u64)granularity;
235 235
236 /* 236 /*
237 * It will always fit into 'long': 237 * It will always fit into 'long':
238 */ 238 */
239 return (long) (tmp >> NICE_0_SHIFT); 239 return (long) (tmp >> NICE_0_SHIFT);
240 } 240 }
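
As a rough standalone sketch of the linear rescaling above (it assumes the usual NICE_0_SHIFT of 10, so NICE_0_LOAD is 1024; the weights and granularity values are illustrative only):

#include <stdio.h>

#define NICE_0_SHIFT	10
#define NICE_0_LOAD	(1UL << NICE_0_SHIFT)

/* same linear rescaling as niced_granularity(): entities below the
 * nice-0 weight get a proportionally finer granularity */
static long scaled_granularity(unsigned long weight, unsigned long granularity)
{
	unsigned long long tmp;

	if (weight >= NICE_0_LOAD)
		return granularity;
	tmp = (unsigned long long)weight * granularity;
	return (long)(tmp >> NICE_0_SHIFT);
}

int main(void)
{
	printf("%ld\n", scaled_granularity(1024, 2000000));	/* nice 0:      2000000 */
	printf("%ld\n", scaled_granularity(512, 2000000));	/* half weight: 1000000 */
	return 0;
}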
241 241
242 static inline void 242 static inline void
243 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) 243 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244 { 244 {
245 long limit = sysctl_sched_runtime_limit; 245 long limit = sysctl_sched_runtime_limit;
246 246
247 /* 247 /*
248 * Niced tasks have the same history dynamic range as 248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks: 249 * non-niced tasks:
250 */ 250 */
251 if (unlikely(se->wait_runtime > limit)) { 251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit; 252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns); 253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns); 254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 } 255 }
256 if (unlikely(se->wait_runtime < -limit)) { 256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit; 257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns); 258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns); 259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 } 260 }
261 } 261 }
262 262
263 static inline void 263 static inline void
264 __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) 264 __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265 { 265 {
266 se->wait_runtime += delta; 266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta); 267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se); 268 limit_wait_runtime(cfs_rq, se);
269 } 269 }
270 270
271 static void 271 static void
272 add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) 272 add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273 { 273 {
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta); 275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); 276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277 } 277 }
278 278
279 /* 279 /*
280 * Update the current task's runtime statistics. Skip current tasks that 280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class. 281 * are not in our scheduling class.
282 */ 282 */
283 static inline void 283 static inline void
284 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) 284 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285 { 285 {
286 unsigned long delta, delta_exec, delta_fair; 286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine; 287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load; 288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight; 289 unsigned long load = lw->weight;
290 290
291 if (unlikely(!load)) 291 if (unlikely(!load))
292 return; 292 return;
293 293
294 delta_exec = curr->delta_exec; 294 delta_exec = curr->delta_exec;
295 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 295 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
296 296
297 curr->sum_exec_runtime += delta_exec; 297 curr->sum_exec_runtime += delta_exec;
298 cfs_rq->exec_clock += delta_exec; 298 cfs_rq->exec_clock += delta_exec;
299 299
300 delta_fair = calc_delta_fair(delta_exec, lw); 300 delta_fair = calc_delta_fair(delta_exec, lw);
301 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); 301 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
302 302
303 if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { 303 if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) {
304 delta = calc_delta_mine(cfs_rq->sleeper_bonus, 304 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
305 curr->load.weight, lw); 305 curr->load.weight, lw);
306 if (unlikely(delta > cfs_rq->sleeper_bonus)) 306 if (unlikely(delta > cfs_rq->sleeper_bonus))
307 delta = cfs_rq->sleeper_bonus; 307 delta = cfs_rq->sleeper_bonus;
308 308
309 cfs_rq->sleeper_bonus -= delta; 309 cfs_rq->sleeper_bonus -= delta;
310 delta_mine -= delta; 310 delta_mine -= delta;
311 } 311 }
312 312
313 cfs_rq->fair_clock += delta_fair; 313 cfs_rq->fair_clock += delta_fair;
314 /* 314 /*
315 * We executed delta_exec amount of time on the CPU, 315 * We executed delta_exec amount of time on the CPU,
316 * but we were only entitled to delta_mine amount of 316 * but we were only entitled to delta_mine amount of
317 * time during that period (if nr_running == 1 then 317 * time during that period (if nr_running == 1 then
318 * the two values are equal) 318 * the two values are equal)
319 * [Note: delta_mine - delta_exec is negative]: 319 * [Note: delta_mine - delta_exec is negative]:
320 */ 320 */
321 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); 321 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
322 } 322 }
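
A toy standalone illustration of the entitlement accounting described in the comment above; it uses a plain weight/total-weight ratio in place of the kernel's calc_delta_mine(), and every number is made up:

#include <stdio.h>

int main(void)
{
	unsigned long delta_exec = 4000000;	/* ns this entity actually ran */
	unsigned long weight = 1024;		/* this entity's load weight */
	unsigned long rq_weight = 2048;		/* total weight of queued entities */

	/* share of the elapsed time this entity was "entitled" to */
	unsigned long delta_mine =
		(unsigned long long)delta_exec * weight / rq_weight;

	/* with two equal-weight entities the entitlement is half the runtime,
	 * so wait_runtime is charged a negative delta of -2000000 */
	printf("%ld\n", (long)(delta_mine - delta_exec));
	return 0;
}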
323 323
324 static void update_curr(struct cfs_rq *cfs_rq, u64 now) 324 static void update_curr(struct cfs_rq *cfs_rq, u64 now)
325 { 325 {
326 struct sched_entity *curr = cfs_rq_curr(cfs_rq); 326 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
327 unsigned long delta_exec; 327 unsigned long delta_exec;
328 328
329 if (unlikely(!curr)) 329 if (unlikely(!curr))
330 return; 330 return;
331 331
332 /* 332 /*
333 * Get the amount of time the current task was running 333 * Get the amount of time the current task was running
334 * since the last time we changed load (this cannot 334 * since the last time we changed load (this cannot
335 * overflow on 32 bits): 335 * overflow on 32 bits):
336 */ 336 */
337 delta_exec = (unsigned long)(now - curr->exec_start); 337 delta_exec = (unsigned long)(now - curr->exec_start);
338 338
339 curr->delta_exec += delta_exec; 339 curr->delta_exec += delta_exec;
340 340
341 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { 341 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
342 __update_curr(cfs_rq, curr, now); 342 __update_curr(cfs_rq, curr, now);
343 curr->delta_exec = 0; 343 curr->delta_exec = 0;
344 } 344 }
345 curr->exec_start = now; 345 curr->exec_start = now;
346 } 346 }
347 347
348 static inline void 348 static inline void
349 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 349 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
350 { 350 {
351 se->wait_start_fair = cfs_rq->fair_clock; 351 se->wait_start_fair = cfs_rq->fair_clock;
352 schedstat_set(se->wait_start, now); 352 schedstat_set(se->wait_start, now);
353 } 353 }
354 354
355 /* 355 /*
356 * We calculate fair deltas here, so protect against the random effects 356 * We calculate fair deltas here, so protect against the random effects
357 * of a multiplication overflow by capping it to the runtime limit: 357 * of a multiplication overflow by capping it to the runtime limit:
358 */ 358 */
359 #if BITS_PER_LONG == 32 359 #if BITS_PER_LONG == 32
360 static inline unsigned long 360 static inline unsigned long
361 calc_weighted(unsigned long delta, unsigned long weight, int shift) 361 calc_weighted(unsigned long delta, unsigned long weight, int shift)
362 { 362 {
363 u64 tmp = (u64)delta * weight >> shift; 363 u64 tmp = (u64)delta * weight >> shift;
364 364
365 if (unlikely(tmp > sysctl_sched_runtime_limit*2)) 365 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
366 return sysctl_sched_runtime_limit*2; 366 return sysctl_sched_runtime_limit*2;
367 return tmp; 367 return tmp;
368 } 368 }
369 #else 369 #else
370 static inline unsigned long 370 static inline unsigned long
371 calc_weighted(unsigned long delta, unsigned long weight, int shift) 371 calc_weighted(unsigned long delta, unsigned long weight, int shift)
372 { 372 {
373 return delta * weight >> shift; 373 return delta * weight >> shift;
374 } 374 }
375 #endif 375 #endif
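
For intuition, a standalone sketch of the 32-bit capping above; RUNTIME_LIMIT stands in for sysctl_sched_runtime_limit and the inputs are deliberately extreme:

#include <stdio.h>

#define RUNTIME_LIMIT	100000000UL	/* stand-in for sysctl_sched_runtime_limit */

/* mirrors the BITS_PER_LONG == 32 calc_weighted(): the 64-bit product is
 * capped before being truncated back to unsigned long */
static unsigned long calc_weighted_capped(unsigned long delta,
					  unsigned long weight, int shift)
{
	unsigned long long tmp = (unsigned long long)delta * weight >> shift;

	if (tmp > 2ULL * RUNTIME_LIMIT)
		return 2UL * RUNTIME_LIMIT;
	return (unsigned long)tmp;
}

int main(void)
{
	/* an extreme delta*weight would not fit in 32 bits; the cap bounds it */
	printf("%lu\n", calc_weighted_capped(4000000000UL, 88761, 10));
	return 0;
}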
376 376
377 /* 377 /*
378 * Task is being enqueued - update stats: 378 * Task is being enqueued - update stats:
379 */ 379 */
380 static void 380 static void
381 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 381 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
382 { 382 {
383 s64 key; 383 s64 key;
384 384
385 /* 385 /*
386 * Are we enqueueing a waiting task? (for current tasks 386 * Are we enqueueing a waiting task? (for current tasks
387 * a dequeue/enqueue event is a NOP) 387 * a dequeue/enqueue event is a NOP)
388 */ 388 */
389 if (se != cfs_rq_curr(cfs_rq)) 389 if (se != cfs_rq_curr(cfs_rq))
390 update_stats_wait_start(cfs_rq, se, now); 390 update_stats_wait_start(cfs_rq, se, now);
391 /* 391 /*
392 * Update the key: 392 * Update the key:
393 */ 393 */
394 key = cfs_rq->fair_clock; 394 key = cfs_rq->fair_clock;
395 395
396 /* 396 /*
397 * Optimize the common nice 0 case: 397 * Optimize the common nice 0 case:
398 */ 398 */
399 if (likely(se->load.weight == NICE_0_LOAD)) { 399 if (likely(se->load.weight == NICE_0_LOAD)) {
400 key -= se->wait_runtime; 400 key -= se->wait_runtime;
401 } else { 401 } else {
402 u64 tmp; 402 u64 tmp;
403 403
404 if (se->wait_runtime < 0) { 404 if (se->wait_runtime < 0) {
405 tmp = -se->wait_runtime; 405 tmp = -se->wait_runtime;
406 key += (tmp * se->load.inv_weight) >> 406 key += (tmp * se->load.inv_weight) >>
407 (WMULT_SHIFT - NICE_0_SHIFT); 407 (WMULT_SHIFT - NICE_0_SHIFT);
408 } else { 408 } else {
409 tmp = se->wait_runtime; 409 tmp = se->wait_runtime;
410 key -= (tmp * se->load.weight) >> NICE_0_SHIFT; 410 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
411 } 411 }
412 } 412 }
413 413
414 se->fair_key = key; 414 se->fair_key = key;
415 } 415 }
416 416
417 /* 417 /*
418 * Note: must be called with a freshly updated rq->fair_clock. 418 * Note: must be called with a freshly updated rq->fair_clock.
419 */ 419 */
420 static inline void 420 static inline void
421 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 421 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
422 { 422 {
423 unsigned long delta_fair = se->delta_fair_run; 423 unsigned long delta_fair = se->delta_fair_run;
424 424
425 schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start)); 425 schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start));
426 426
427 if (unlikely(se->load.weight != NICE_0_LOAD)) 427 if (unlikely(se->load.weight != NICE_0_LOAD))
428 delta_fair = calc_weighted(delta_fair, se->load.weight, 428 delta_fair = calc_weighted(delta_fair, se->load.weight,
429 NICE_0_SHIFT); 429 NICE_0_SHIFT);
430 430
431 add_wait_runtime(cfs_rq, se, delta_fair); 431 add_wait_runtime(cfs_rq, se, delta_fair);
432 } 432 }
433 433
434 static void 434 static void
435 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 435 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
436 { 436 {
437 unsigned long delta_fair; 437 unsigned long delta_fair;
438 438
439 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), 439 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
440 (u64)(cfs_rq->fair_clock - se->wait_start_fair)); 440 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
441 441
442 se->delta_fair_run += delta_fair; 442 se->delta_fair_run += delta_fair;
443 if (unlikely(abs(se->delta_fair_run) >= 443 if (unlikely(abs(se->delta_fair_run) >=
444 sysctl_sched_stat_granularity)) { 444 sysctl_sched_stat_granularity)) {
445 __update_stats_wait_end(cfs_rq, se, now); 445 __update_stats_wait_end(cfs_rq, se, now);
446 se->delta_fair_run = 0; 446 se->delta_fair_run = 0;
447 } 447 }
448 448
449 se->wait_start_fair = 0; 449 se->wait_start_fair = 0;
450 schedstat_set(se->wait_start, 0); 450 schedstat_set(se->wait_start, 0);
451 } 451 }
452 452
453 static inline void 453 static inline void
454 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 454 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
455 { 455 {
456 update_curr(cfs_rq, now); 456 update_curr(cfs_rq, now);
457 /* 457 /*
458 * Mark the end of the wait period if dequeueing a 458 * Mark the end of the wait period if dequeueing a
459 * waiting task: 459 * waiting task:
460 */ 460 */
461 if (se != cfs_rq_curr(cfs_rq)) 461 if (se != cfs_rq_curr(cfs_rq))
462 update_stats_wait_end(cfs_rq, se, now); 462 update_stats_wait_end(cfs_rq, se, now);
463 } 463 }
464 464
465 /* 465 /*
466 * We are picking a new current task - update its stats: 466 * We are picking a new current task - update its stats:
467 */ 467 */
468 static inline void 468 static inline void
469 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 469 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
470 { 470 {
471 /* 471 /*
472 * We are starting a new run period: 472 * We are starting a new run period:
473 */ 473 */
474 se->exec_start = now; 474 se->exec_start = now;
475 } 475 }
476 476
477 /* 477 /*
478 * We are descheduling a task - update its stats: 478 * We are descheduling a task - update its stats:
479 */ 479 */
480 static inline void 480 static inline void
481 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 481 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
482 { 482 {
483 se->exec_start = 0; 483 se->exec_start = 0;
484 } 484 }
485 485
486 /************************************************** 486 /**************************************************
487 * Scheduling class queueing methods: 487 * Scheduling class queueing methods:
488 */ 488 */
489 489
490 static void 490 static void
491 __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 491 __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
492 { 492 {
493 unsigned long load = cfs_rq->load.weight, delta_fair; 493 unsigned long load = cfs_rq->load.weight, delta_fair;
494 long prev_runtime; 494 long prev_runtime;
495 495
496 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) 496 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
497 load = rq_of(cfs_rq)->cpu_load[2]; 497 load = rq_of(cfs_rq)->cpu_load[2];
498 498
499 delta_fair = se->delta_fair_sleep; 499 delta_fair = se->delta_fair_sleep;
500 500
501 /* 501 /*
502 * Fix up delta_fair with the effect of us running 502 * Fix up delta_fair with the effect of us running
503 * during the whole sleep period: 503 * during the whole sleep period:
504 */ 504 */
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) 505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
506 delta_fair = div64_likely32((u64)delta_fair * load, 506 delta_fair = div64_likely32((u64)delta_fair * load,
507 load + se->load.weight); 507 load + se->load.weight);
508 508
509 if (unlikely(se->load.weight != NICE_0_LOAD)) 509 if (unlikely(se->load.weight != NICE_0_LOAD))
510 delta_fair = calc_weighted(delta_fair, se->load.weight, 510 delta_fair = calc_weighted(delta_fair, se->load.weight,
511 NICE_0_SHIFT); 511 NICE_0_SHIFT);
512 512
513 prev_runtime = se->wait_runtime; 513 prev_runtime = se->wait_runtime;
514 __add_wait_runtime(cfs_rq, se, delta_fair); 514 __add_wait_runtime(cfs_rq, se, delta_fair);
515 delta_fair = se->wait_runtime - prev_runtime; 515 delta_fair = se->wait_runtime - prev_runtime;
516 516
517 /* 517 /*
518 * Track the amount of bonus we've given to sleepers: 518 * Track the amount of bonus we've given to sleepers:
519 */ 519 */
520 cfs_rq->sleeper_bonus += delta_fair; 520 cfs_rq->sleeper_bonus += delta_fair;
521 521
522 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); 522 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
523 } 523 }
524 524
525 static void 525 static void
526 enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 526 enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
527 { 527 {
528 struct task_struct *tsk = task_of(se); 528 struct task_struct *tsk = task_of(se);
529 unsigned long delta_fair; 529 unsigned long delta_fair;
530 530
531 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || 531 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
532 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) 532 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
533 return; 533 return;
534 534
535 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), 535 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
536 (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); 536 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
537 537
538 se->delta_fair_sleep += delta_fair; 538 se->delta_fair_sleep += delta_fair;
539 if (unlikely(abs(se->delta_fair_sleep) >= 539 if (unlikely(abs(se->delta_fair_sleep) >=
540 sysctl_sched_stat_granularity)) { 540 sysctl_sched_stat_granularity)) {
541 __enqueue_sleeper(cfs_rq, se, now); 541 __enqueue_sleeper(cfs_rq, se, now);
542 se->delta_fair_sleep = 0; 542 se->delta_fair_sleep = 0;
543 } 543 }
544 544
545 se->sleep_start_fair = 0; 545 se->sleep_start_fair = 0;
546 546
547 #ifdef CONFIG_SCHEDSTATS 547 #ifdef CONFIG_SCHEDSTATS
548 if (se->sleep_start) { 548 if (se->sleep_start) {
549 u64 delta = now - se->sleep_start; 549 u64 delta = now - se->sleep_start;
550 550
551 if ((s64)delta < 0) 551 if ((s64)delta < 0)
552 delta = 0; 552 delta = 0;
553 553
554 if (unlikely(delta > se->sleep_max)) 554 if (unlikely(delta > se->sleep_max))
555 se->sleep_max = delta; 555 se->sleep_max = delta;
556 556
557 se->sleep_start = 0; 557 se->sleep_start = 0;
558 se->sum_sleep_runtime += delta; 558 se->sum_sleep_runtime += delta;
559 } 559 }
560 if (se->block_start) { 560 if (se->block_start) {
561 u64 delta = now - se->block_start; 561 u64 delta = now - se->block_start;
562 562
563 if ((s64)delta < 0) 563 if ((s64)delta < 0)
564 delta = 0; 564 delta = 0;
565 565
566 if (unlikely(delta > se->block_max)) 566 if (unlikely(delta > se->block_max))
567 se->block_max = delta; 567 se->block_max = delta;
568 568
569 se->block_start = 0; 569 se->block_start = 0;
570 se->sum_sleep_runtime += delta; 570 se->sum_sleep_runtime += delta;
571 } 571 }
572 #endif 572 #endif
573 } 573 }
574 574
575 static void 575 static void
576 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 576 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
577 int wakeup, u64 now) 577 int wakeup, u64 now)
578 { 578 {
579 /* 579 /*
580 * Update the fair clock. 580 * Update the fair clock.
581 */ 581 */
582 update_curr(cfs_rq, now); 582 update_curr(cfs_rq, now);
583 583
584 if (wakeup) 584 if (wakeup)
585 enqueue_sleeper(cfs_rq, se, now); 585 enqueue_sleeper(cfs_rq, se, now);
586 586
587 update_stats_enqueue(cfs_rq, se, now); 587 update_stats_enqueue(cfs_rq, se, now);
588 __enqueue_entity(cfs_rq, se); 588 __enqueue_entity(cfs_rq, se);
589 } 589 }
590 590
591 static void 591 static void
592 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 592 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
593 int sleep, u64 now) 593 int sleep, u64 now)
594 { 594 {
595 update_stats_dequeue(cfs_rq, se, now); 595 update_stats_dequeue(cfs_rq, se, now);
596 if (sleep) { 596 if (sleep) {
597 se->sleep_start_fair = cfs_rq->fair_clock; 597 se->sleep_start_fair = cfs_rq->fair_clock;
598 #ifdef CONFIG_SCHEDSTATS 598 #ifdef CONFIG_SCHEDSTATS
599 if (entity_is_task(se)) { 599 if (entity_is_task(se)) {
600 struct task_struct *tsk = task_of(se); 600 struct task_struct *tsk = task_of(se);
601 601
602 if (tsk->state & TASK_INTERRUPTIBLE) 602 if (tsk->state & TASK_INTERRUPTIBLE)
603 se->sleep_start = now; 603 se->sleep_start = now;
604 if (tsk->state & TASK_UNINTERRUPTIBLE) 604 if (tsk->state & TASK_UNINTERRUPTIBLE)
605 se->block_start = now; 605 se->block_start = now;
606 } 606 }
607 cfs_rq->wait_runtime -= se->wait_runtime; 607 cfs_rq->wait_runtime -= se->wait_runtime;
608 #endif 608 #endif
609 } 609 }
610 __dequeue_entity(cfs_rq, se); 610 __dequeue_entity(cfs_rq, se);
611 } 611 }
612 612
613 /* 613 /*
614 * Preempt the current task with a newly woken task if needed: 614 * Preempt the current task with a newly woken task if needed:
615 */ 615 */
616 static void 616 static void
617 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, 617 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
618 struct sched_entity *curr, unsigned long granularity) 618 struct sched_entity *curr, unsigned long granularity)
619 { 619 {
620 s64 __delta = curr->fair_key - se->fair_key; 620 s64 __delta = curr->fair_key - se->fair_key;
621 621
622 /* 622 /*
623 * Take scheduling granularity into account - do not 623 * Take scheduling granularity into account - do not
624 * preempt the current task unless the best task has 624 * preempt the current task unless the best task has
625 * a larger than sched_granularity fairness advantage: 625 * a larger than sched_granularity fairness advantage:
626 */ 626 */
627 if (__delta > niced_granularity(curr, granularity)) 627 if (__delta > niced_granularity(curr, granularity))
628 resched_task(rq_of(cfs_rq)->curr); 628 resched_task(rq_of(cfs_rq)->curr);
629 } 629 }
630 630
631 static inline void 631 static inline void
632 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 632 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
633 { 633 {
634 /* 634 /*
635 * Any task has to be enqueued before it get to execute on 635 * Any task has to be enqueued before it get to execute on
636 * a CPU. So account for the time it spent waiting on the 636 * a CPU. So account for the time it spent waiting on the
637 * runqueue. (note, here we rely on pick_next_task() having 637 * runqueue. (note, here we rely on pick_next_task() having
638 * done a put_prev_task_fair() shortly before this, which 638 * done a put_prev_task_fair() shortly before this, which
639 * updated rq->fair_clock - used by update_stats_wait_end()) 639 * updated rq->fair_clock - used by update_stats_wait_end())
640 */ 640 */
641 update_stats_wait_end(cfs_rq, se, now); 641 update_stats_wait_end(cfs_rq, se, now);
642 update_stats_curr_start(cfs_rq, se, now); 642 update_stats_curr_start(cfs_rq, se, now);
643 set_cfs_rq_curr(cfs_rq, se); 643 set_cfs_rq_curr(cfs_rq, se);
644 } 644 }
645 645
646 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) 646 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
647 { 647 {
648 struct sched_entity *se = __pick_next_entity(cfs_rq); 648 struct sched_entity *se = __pick_next_entity(cfs_rq);
649 649
650 set_next_entity(cfs_rq, se, now); 650 set_next_entity(cfs_rq, se, now);
651 651
652 return se; 652 return se;
653 } 653 }
654 654
655 static void 655 static void
656 put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) 656 put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
657 { 657 {
658 /* 658 /*
659 * If still on the runqueue then deactivate_task() 659 * If still on the runqueue then deactivate_task()
660 * was not called and update_curr() has to be done: 660 * was not called and update_curr() has to be done:
661 */ 661 */
662 if (prev->on_rq) 662 if (prev->on_rq)
663 update_curr(cfs_rq, now); 663 update_curr(cfs_rq, now);
664 664
665 update_stats_curr_end(cfs_rq, prev, now); 665 update_stats_curr_end(cfs_rq, prev, now);
666 666
667 if (prev->on_rq) 667 if (prev->on_rq)
668 update_stats_wait_start(cfs_rq, prev, now); 668 update_stats_wait_start(cfs_rq, prev, now);
669 set_cfs_rq_curr(cfs_rq, NULL); 669 set_cfs_rq_curr(cfs_rq, NULL);
670 } 670 }
671 671
672 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 672 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
673 { 673 {
674 struct rq *rq = rq_of(cfs_rq); 674 struct rq *rq = rq_of(cfs_rq);
675 struct sched_entity *next; 675 struct sched_entity *next;
676 u64 now = __rq_clock(rq); 676 u64 now = __rq_clock(rq);
677 677
678 /* 678 /*
679 * Dequeue and enqueue the task to update its 679 * Dequeue and enqueue the task to update its
680 * position within the tree: 680 * position within the tree:
681 */ 681 */
682 dequeue_entity(cfs_rq, curr, 0, now); 682 dequeue_entity(cfs_rq, curr, 0, now);
683 enqueue_entity(cfs_rq, curr, 0, now); 683 enqueue_entity(cfs_rq, curr, 0, now);
684 684
685 /* 685 /*
686 * Reschedule if another task tops the current one. 686 * Reschedule if another task tops the current one.
687 */ 687 */
688 next = __pick_next_entity(cfs_rq); 688 next = __pick_next_entity(cfs_rq);
689 if (next == curr) 689 if (next == curr)
690 return; 690 return;
691 691
692 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); 692 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
693 } 693 }
694 694
695 /************************************************** 695 /**************************************************
696 * CFS operations on tasks: 696 * CFS operations on tasks:
697 */ 697 */
698 698
699 #ifdef CONFIG_FAIR_GROUP_SCHED 699 #ifdef CONFIG_FAIR_GROUP_SCHED
700 700
701 /* Walk up scheduling entities hierarchy */ 701 /* Walk up scheduling entities hierarchy */
702 #define for_each_sched_entity(se) \ 702 #define for_each_sched_entity(se) \
703 for (; se; se = se->parent) 703 for (; se; se = se->parent)
704 704
705 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) 705 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
706 { 706 {
707 return p->se.cfs_rq; 707 return p->se.cfs_rq;
708 } 708 }
709 709
710 /* runqueue on which this entity is (to be) queued */ 710 /* runqueue on which this entity is (to be) queued */
711 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) 711 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
712 { 712 {
713 return se->cfs_rq; 713 return se->cfs_rq;
714 } 714 }
715 715
716 /* runqueue "owned" by this group */ 716 /* runqueue "owned" by this group */
717 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 717 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
718 { 718 {
719 return grp->my_q; 719 return grp->my_q;
720 } 720 }
721 721
722 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on 722 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
723 * another cpu ('this_cpu') 723 * another cpu ('this_cpu')
724 */ 724 */
725 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 725 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
726 { 726 {
727 /* A later patch will take group into account */ 727 /* A later patch will take group into account */
728 return &cpu_rq(this_cpu)->cfs; 728 return &cpu_rq(this_cpu)->cfs;
729 } 729 }
730 730
731 /* Iterate thr' all leaf cfs_rq's on a runqueue */ 731 /* Iterate thr' all leaf cfs_rq's on a runqueue */
732 #define for_each_leaf_cfs_rq(rq, cfs_rq) \ 732 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
733 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 733 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
734 734
735 /* Do the two (enqueued) tasks belong to the same group ? */ 735 /* Do the two (enqueued) tasks belong to the same group ? */
736 static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 736 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
737 { 737 {
738 if (curr->se.cfs_rq == p->se.cfs_rq) 738 if (curr->se.cfs_rq == p->se.cfs_rq)
739 return 1; 739 return 1;
740 740
741 return 0; 741 return 0;
742 } 742 }
743 743
744 #else /* CONFIG_FAIR_GROUP_SCHED */ 744 #else /* CONFIG_FAIR_GROUP_SCHED */
745 745
746 #define for_each_sched_entity(se) \ 746 #define for_each_sched_entity(se) \
747 for (; se; se = NULL) 747 for (; se; se = NULL)
748 748
749 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) 749 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
750 { 750 {
751 return &task_rq(p)->cfs; 751 return &task_rq(p)->cfs;
752 } 752 }
753 753
754 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) 754 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
755 { 755 {
756 struct task_struct *p = task_of(se); 756 struct task_struct *p = task_of(se);
757 struct rq *rq = task_rq(p); 757 struct rq *rq = task_rq(p);
758 758
759 return &rq->cfs; 759 return &rq->cfs;
760 } 760 }
761 761
762 /* runqueue "owned" by this group */ 762 /* runqueue "owned" by this group */
763 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 763 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
764 { 764 {
765 return NULL; 765 return NULL;
766 } 766 }
767 767
768 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 768 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
769 { 769 {
770 return &cpu_rq(this_cpu)->cfs; 770 return &cpu_rq(this_cpu)->cfs;
771 } 771 }
772 772
773 #define for_each_leaf_cfs_rq(rq, cfs_rq) \ 773 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
774 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 774 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
775 775
776 static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 776 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
777 { 777 {
778 return 1; 778 return 1;
779 } 779 }
780 780
781 #endif /* CONFIG_FAIR_GROUP_SCHED */ 781 #endif /* CONFIG_FAIR_GROUP_SCHED */
782 782
783 /* 783 /*
784 * The enqueue_task method is called before nr_running is 784 * The enqueue_task method is called before nr_running is
785 * increased. Here we update the fair scheduling stats and 785 * increased. Here we update the fair scheduling stats and
786 * then put the task into the rbtree: 786 * then put the task into the rbtree:
787 */ 787 */
788 static void 788 static void
789 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) 789 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
790 { 790 {
791 struct cfs_rq *cfs_rq; 791 struct cfs_rq *cfs_rq;
792 struct sched_entity *se = &p->se; 792 struct sched_entity *se = &p->se;
793 793
794 for_each_sched_entity(se) { 794 for_each_sched_entity(se) {
795 if (se->on_rq) 795 if (se->on_rq)
796 break; 796 break;
797 cfs_rq = cfs_rq_of(se); 797 cfs_rq = cfs_rq_of(se);
798 enqueue_entity(cfs_rq, se, wakeup, now); 798 enqueue_entity(cfs_rq, se, wakeup, now);
799 } 799 }
800 } 800 }
801 801
802 /* 802 /*
803 * The dequeue_task method is called before nr_running is 803 * The dequeue_task method is called before nr_running is
804 * decreased. We remove the task from the rbtree and 804 * decreased. We remove the task from the rbtree and
805 * update the fair scheduling stats: 805 * update the fair scheduling stats:
806 */ 806 */
807 static void 807 static void
808 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) 808 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
809 { 809 {
810 struct cfs_rq *cfs_rq; 810 struct cfs_rq *cfs_rq;
811 struct sched_entity *se = &p->se; 811 struct sched_entity *se = &p->se;
812 812
813 for_each_sched_entity(se) { 813 for_each_sched_entity(se) {
814 cfs_rq = cfs_rq_of(se); 814 cfs_rq = cfs_rq_of(se);
815 dequeue_entity(cfs_rq, se, sleep, now); 815 dequeue_entity(cfs_rq, se, sleep, now);
816 /* Don't dequeue parent if it has other entities besides us */ 816 /* Don't dequeue parent if it has other entities besides us */
817 if (cfs_rq->load.weight) 817 if (cfs_rq->load.weight)
818 break; 818 break;
819 } 819 }
820 } 820 }
821 821
822 /* 822 /*
823 * sched_yield() support is very simple - we dequeue and enqueue 823 * sched_yield() support is very simple - we dequeue and enqueue
824 */ 824 */
825 static void yield_task_fair(struct rq *rq, struct task_struct *p) 825 static void yield_task_fair(struct rq *rq, struct task_struct *p)
826 { 826 {
827 struct cfs_rq *cfs_rq = task_cfs_rq(p); 827 struct cfs_rq *cfs_rq = task_cfs_rq(p);
828 u64 now = __rq_clock(rq); 828 u64 now = __rq_clock(rq);
829 829
830 /* 830 /*
831 * Dequeue and enqueue the task to update its 831 * Dequeue and enqueue the task to update its
832 * position within the tree: 832 * position within the tree:
833 */ 833 */
834 dequeue_entity(cfs_rq, &p->se, 0, now); 834 dequeue_entity(cfs_rq, &p->se, 0, now);
835 enqueue_entity(cfs_rq, &p->se, 0, now); 835 enqueue_entity(cfs_rq, &p->se, 0, now);
836 } 836 }
837 837
838 /* 838 /*
839 * Preempt the current task with a newly woken task if needed: 839 * Preempt the current task with a newly woken task if needed:
840 */ 840 */
841 static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) 841 static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
842 { 842 {
843 struct task_struct *curr = rq->curr; 843 struct task_struct *curr = rq->curr;
844 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 844 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
845 unsigned long gran; 845 unsigned long gran;
846 846
847 if (unlikely(rt_prio(p->prio))) { 847 if (unlikely(rt_prio(p->prio))) {
848 update_curr(cfs_rq, rq_clock(rq)); 848 update_curr(cfs_rq, rq_clock(rq));
849 resched_task(curr); 849 resched_task(curr);
850 return; 850 return;
851 } 851 }
852 852
853 gran = sysctl_sched_wakeup_granularity; 853 gran = sysctl_sched_wakeup_granularity;
854 /* 854 /*
855 * Batch tasks prefer throughput over latency: 855 * Batch tasks prefer throughput over latency:
856 */ 856 */
857 if (unlikely(p->policy == SCHED_BATCH)) 857 if (unlikely(p->policy == SCHED_BATCH))
858 gran = sysctl_sched_batch_wakeup_granularity; 858 gran = sysctl_sched_batch_wakeup_granularity;
859 859
860 if (is_same_group(curr, p)) 860 if (is_same_group(curr, p))
861 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); 861 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
862 } 862 }
863 863
864 static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) 864 static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
865 { 865 {
866 struct cfs_rq *cfs_rq = &rq->cfs; 866 struct cfs_rq *cfs_rq = &rq->cfs;
867 struct sched_entity *se; 867 struct sched_entity *se;
868 868
869 if (unlikely(!cfs_rq->nr_running)) 869 if (unlikely(!cfs_rq->nr_running))
870 return NULL; 870 return NULL;
871 871
872 do { 872 do {
873 se = pick_next_entity(cfs_rq, now); 873 se = pick_next_entity(cfs_rq, now);
874 cfs_rq = group_cfs_rq(se); 874 cfs_rq = group_cfs_rq(se);
875 } while (cfs_rq); 875 } while (cfs_rq);
876 876
877 return task_of(se); 877 return task_of(se);
878 } 878 }
879 879
880 /* 880 /*
881 * Account for a descheduled task: 881 * Account for a descheduled task:
882 */ 882 */
883 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) 883 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
884 { 884 {
885 struct sched_entity *se = &prev->se; 885 struct sched_entity *se = &prev->se;
886 struct cfs_rq *cfs_rq; 886 struct cfs_rq *cfs_rq;
887 887
888 for_each_sched_entity(se) { 888 for_each_sched_entity(se) {
889 cfs_rq = cfs_rq_of(se); 889 cfs_rq = cfs_rq_of(se);
890 put_prev_entity(cfs_rq, se, now); 890 put_prev_entity(cfs_rq, se, now);
891 } 891 }
892 } 892 }
893 893
894 /************************************************** 894 /**************************************************
895 * Fair scheduling class load-balancing methods: 895 * Fair scheduling class load-balancing methods:
896 */ 896 */
897 897
898 /* 898 /*
899 * Load-balancing iterator. Note: while the runqueue stays locked 899 * Load-balancing iterator. Note: while the runqueue stays locked
900 * during the whole iteration, the current task might be 900 * during the whole iteration, the current task might be
901 * dequeued so the iterator has to be dequeue-safe. Here we 901 * dequeued so the iterator has to be dequeue-safe. Here we
902 * achieve that by always pre-iterating before returning 902 * achieve that by always pre-iterating before returning
903 * the current task: 903 * the current task:
904 */ 904 */
905 static inline struct task_struct * 905 static inline struct task_struct *
906 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 906 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
907 { 907 {
908 struct task_struct *p; 908 struct task_struct *p;
909 909
910 if (!curr) 910 if (!curr)
911 return NULL; 911 return NULL;
912 912
913 p = rb_entry(curr, struct task_struct, se.run_node); 913 p = rb_entry(curr, struct task_struct, se.run_node);
914 cfs_rq->rb_load_balance_curr = rb_next(curr); 914 cfs_rq->rb_load_balance_curr = rb_next(curr);
915 915
916 return p; 916 return p;
917 } 917 }
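
The dequeue-safe pre-iteration trick above can be shown with a trivial standalone list walker (none of this is kernel code; it only demonstrates why removing the just-returned element cannot break the iteration):

#include <stdio.h>
#include <stddef.h>

struct node { int val; struct node *next; };

static struct node *cursor;

/* advance the cursor first, then hand out the previous element, so the
 * caller may "dequeue" what it got without invalidating the iterator */
static struct node *iter_next(void)
{
	struct node *p = cursor;

	if (p)
		cursor = p->next;
	return p;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *p;

	cursor = &a;
	for (p = iter_next(); p; p = iter_next()) {
		p->next = NULL;		/* remove the element we were handed */
		printf("%d\n", p->val);	/* still prints 1, 2, 3 */
	}
	return 0;
}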
918 918
919 static struct task_struct *load_balance_start_fair(void *arg) 919 static struct task_struct *load_balance_start_fair(void *arg)
920 { 920 {
921 struct cfs_rq *cfs_rq = arg; 921 struct cfs_rq *cfs_rq = arg;
922 922
923 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); 923 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
924 } 924 }
925 925
926 static struct task_struct *load_balance_next_fair(void *arg) 926 static struct task_struct *load_balance_next_fair(void *arg)
927 { 927 {
928 struct cfs_rq *cfs_rq = arg; 928 struct cfs_rq *cfs_rq = arg;
929 929
930 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 930 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
931 } 931 }
932 932
933 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 933 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
934 { 934 {
935 struct sched_entity *curr; 935 struct sched_entity *curr;
936 struct task_struct *p; 936 struct task_struct *p;
937 937
938 if (!cfs_rq->nr_running) 938 if (!cfs_rq->nr_running)
939 return MAX_PRIO; 939 return MAX_PRIO;
940 940
941 curr = __pick_next_entity(cfs_rq); 941 curr = __pick_next_entity(cfs_rq);
942 p = task_of(curr); 942 p = task_of(curr);
943 943
944 return p->prio; 944 return p->prio;
945 } 945 }
946 946
947 static int 947 static unsigned long
948 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 948 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
949 unsigned long max_nr_move, unsigned long max_load_move, 949 unsigned long max_nr_move, unsigned long max_load_move,
950 struct sched_domain *sd, enum cpu_idle_type idle, 950 struct sched_domain *sd, enum cpu_idle_type idle,
951 int *all_pinned, unsigned long *total_load_moved) 951 int *all_pinned)
952 { 952 {
953 struct cfs_rq *busy_cfs_rq; 953 struct cfs_rq *busy_cfs_rq;
954 unsigned long load_moved, total_nr_moved = 0, nr_moved; 954 unsigned long load_moved, total_nr_moved = 0, nr_moved;
955 long rem_load_move = max_load_move; 955 long rem_load_move = max_load_move;
956 struct rq_iterator cfs_rq_iterator; 956 struct rq_iterator cfs_rq_iterator;
957 957
958 cfs_rq_iterator.start = load_balance_start_fair; 958 cfs_rq_iterator.start = load_balance_start_fair;
959 cfs_rq_iterator.next = load_balance_next_fair; 959 cfs_rq_iterator.next = load_balance_next_fair;
960 960
961 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 961 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
962 struct cfs_rq *this_cfs_rq; 962 struct cfs_rq *this_cfs_rq;
963 long imbalance; 963 long imbalance;
964 unsigned long maxload; 964 unsigned long maxload;
965 int this_best_prio, best_prio, best_prio_seen = 0; 965 int this_best_prio, best_prio, best_prio_seen = 0;
966 966
967 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 967 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
968 968
969 imbalance = busy_cfs_rq->load.weight - 969 imbalance = busy_cfs_rq->load.weight -
970 this_cfs_rq->load.weight; 970 this_cfs_rq->load.weight;
971 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 971 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
972 if (imbalance <= 0) 972 if (imbalance <= 0)
973 continue; 973 continue;
974 974
975 /* Don't pull more than imbalance/2 */ 975 /* Don't pull more than imbalance/2 */
976 imbalance /= 2; 976 imbalance /= 2;
977 maxload = min(rem_load_move, imbalance); 977 maxload = min(rem_load_move, imbalance);
978 978
979 this_best_prio = cfs_rq_best_prio(this_cfs_rq); 979 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
980 best_prio = cfs_rq_best_prio(busy_cfs_rq); 980 best_prio = cfs_rq_best_prio(busy_cfs_rq);
981 981
982 /* 982 /*
983 * Enable handling of the case where there is more than one task 983 * Enable handling of the case where there is more than one task
984 * with the best priority. If the current running task is one 984 * with the best priority. If the current running task is one
985 * of those with prio==best_prio we know it won't be moved 985 * of those with prio==best_prio we know it won't be moved
986 * and therefore it's safe to override the skip (based on load) 986 * and therefore it's safe to override the skip (based on load)
987 * of any task we find with that prio. 987 * of any task we find with that prio.
988 */ 988 */
989 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) 989 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
990 best_prio_seen = 1; 990 best_prio_seen = 1;
991 991
992 /* pass busy_cfs_rq argument into 992 /* pass busy_cfs_rq argument into
993 * load_balance_[start|next]_fair iterators 993 * load_balance_[start|next]_fair iterators
994 */ 994 */
995 cfs_rq_iterator.arg = busy_cfs_rq; 995 cfs_rq_iterator.arg = busy_cfs_rq;
996 nr_moved = balance_tasks(this_rq, this_cpu, busiest, 996 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
997 max_nr_move, maxload, sd, idle, all_pinned, 997 max_nr_move, maxload, sd, idle, all_pinned,
998 &load_moved, this_best_prio, best_prio, 998 &load_moved, this_best_prio, best_prio,
999 best_prio_seen, &cfs_rq_iterator); 999 best_prio_seen, &cfs_rq_iterator);
1000 1000
1001 total_nr_moved += nr_moved; 1001 total_nr_moved += nr_moved;
1002 max_nr_move -= nr_moved; 1002 max_nr_move -= nr_moved;
1003 rem_load_move -= load_moved; 1003 rem_load_move -= load_moved;
1004 1004
1005 if (max_nr_move <= 0 || rem_load_move <= 0) 1005 if (max_nr_move <= 0 || rem_load_move <= 0)
1006 break; 1006 break;
1007 } 1007 }
1008 1008
1009 *total_load_moved = max_load_move - rem_load_move; 1009 return max_load_move - rem_load_move;
1010
1011 return total_nr_moved;
1012 } 1010 }
1013 1011
1014 /* 1012 /*
1015 * scheduler tick hitting a task of our scheduling class: 1013 * scheduler tick hitting a task of our scheduling class:
1016 */ 1014 */
1017 static void task_tick_fair(struct rq *rq, struct task_struct *curr) 1015 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1018 { 1016 {
1019 struct cfs_rq *cfs_rq; 1017 struct cfs_rq *cfs_rq;
1020 struct sched_entity *se = &curr->se; 1018 struct sched_entity *se = &curr->se;
1021 1019
1022 for_each_sched_entity(se) { 1020 for_each_sched_entity(se) {
1023 cfs_rq = cfs_rq_of(se); 1021 cfs_rq = cfs_rq_of(se);
1024 entity_tick(cfs_rq, se); 1022 entity_tick(cfs_rq, se);
1025 } 1023 }
1026 } 1024 }
1027 1025
1028 /* 1026 /*
1029 * Share the fairness runtime between parent and child, thus the 1027 * Share the fairness runtime between parent and child, thus the
1030 * total amount of pressure for CPU stays equal - new tasks 1028 * total amount of pressure for CPU stays equal - new tasks
1031 * get a chance to run but frequent forkers are not allowed to 1029 * get a chance to run but frequent forkers are not allowed to
1032 * monopolize the CPU. Note: the parent runqueue is locked, 1030 * monopolize the CPU. Note: the parent runqueue is locked,
1033 * the child is not running yet. 1031 * the child is not running yet.
1034 */ 1032 */
1035 static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) 1033 static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
1036 { 1034 {
1037 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1035 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1038 struct sched_entity *se = &p->se; 1036 struct sched_entity *se = &p->se;
1039 1037
1040 sched_info_queued(p); 1038 sched_info_queued(p);
1041 1039
1042 update_stats_enqueue(cfs_rq, se, now); 1040 update_stats_enqueue(cfs_rq, se, now);
1043 /* 1041 /*
1044 * Child runs first: we let it run before the parent 1042 * Child runs first: we let it run before the parent
1045 * until it reschedules once. We set up the key so that 1043 * until it reschedules once. We set up the key so that
1046 * it will preempt the parent: 1044 * it will preempt the parent:
1047 */ 1045 */
1048 p->se.fair_key = current->se.fair_key - 1046 p->se.fair_key = current->se.fair_key -
1049 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; 1047 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1050 /* 1048 /*
1051 * The first wait is dominated by the child-runs-first logic, 1049 * The first wait is dominated by the child-runs-first logic,
1052 * so do not credit it with that waiting time yet: 1050 * so do not credit it with that waiting time yet:
1053 */ 1051 */
1054 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) 1052 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1055 p->se.wait_start_fair = 0; 1053 p->se.wait_start_fair = 0;
1056 1054
1057 /* 1055 /*
1058 * The statistical average of wait_runtime is about 1056 * The statistical average of wait_runtime is about
1059 * -granularity/2, so initialize the task with that: 1057 * -granularity/2, so initialize the task with that:
1060 */ 1058 */
1061 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1059 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1062 p->se.wait_runtime = -(sysctl_sched_granularity / 2); 1060 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1063 1061
1064 __enqueue_entity(cfs_rq, se); 1062 __enqueue_entity(cfs_rq, se);
1065 } 1063 }
1066 1064
1067 #ifdef CONFIG_FAIR_GROUP_SCHED 1065 #ifdef CONFIG_FAIR_GROUP_SCHED
1068 /* Account for a task changing its policy or group. 1066 /* Account for a task changing its policy or group.
1069 * 1067 *
1070 * This routine is mostly called to set cfs_rq->curr field when a task 1068 * This routine is mostly called to set cfs_rq->curr field when a task
1071 * migrates between groups/classes. 1069 * migrates between groups/classes.
1072 */ 1070 */
1073 static void set_curr_task_fair(struct rq *rq) 1071 static void set_curr_task_fair(struct rq *rq)
1074 { 1072 {
1075 struct task_struct *curr = rq->curr; 1073 struct task_struct *curr = rq->curr;
1076 struct sched_entity *se = &curr->se; 1074 struct sched_entity *se = &curr->se;
1077 u64 now = rq_clock(rq); 1075 u64 now = rq_clock(rq);
1078 struct cfs_rq *cfs_rq; 1076 struct cfs_rq *cfs_rq;
1079 1077
1080 for_each_sched_entity(se) { 1078 for_each_sched_entity(se) {
1081 cfs_rq = cfs_rq_of(se); 1079 cfs_rq = cfs_rq_of(se);
1082 set_next_entity(cfs_rq, se, now); 1080 set_next_entity(cfs_rq, se, now);
1083 } 1081 }
1084 } 1082 }
1085 #else 1083 #else
1086 static void set_curr_task_fair(struct rq *rq) 1084 static void set_curr_task_fair(struct rq *rq)
1087 { 1085 {
1088 } 1086 }
1089 #endif 1087 #endif
1090 1088
1091 /* 1089 /*
1092 * All the scheduling class methods: 1090 * All the scheduling class methods:
1093 */ 1091 */
1094 struct sched_class fair_sched_class __read_mostly = { 1092 struct sched_class fair_sched_class __read_mostly = {
1095 .enqueue_task = enqueue_task_fair, 1093 .enqueue_task = enqueue_task_fair,
1096 .dequeue_task = dequeue_task_fair, 1094 .dequeue_task = dequeue_task_fair,
1097 .yield_task = yield_task_fair, 1095 .yield_task = yield_task_fair,
1098 1096
1099 .check_preempt_curr = check_preempt_curr_fair, 1097 .check_preempt_curr = check_preempt_curr_fair,
1100 1098
1101 .pick_next_task = pick_next_task_fair, 1099 .pick_next_task = pick_next_task_fair,
1102 .put_prev_task = put_prev_task_fair, 1100 .put_prev_task = put_prev_task_fair,
1103 1101
1104 .load_balance = load_balance_fair, 1102 .load_balance = load_balance_fair,
1105 1103
1106 .set_curr_task = set_curr_task_fair, 1104 .set_curr_task = set_curr_task_fair,
1107 .task_tick = task_tick_fair, 1105 .task_tick = task_tick_fair,
1108 .task_new = task_new_fair, 1106 .task_new = task_new_fair,
1109 }; 1107 };
1110 1108
1111 #ifdef CONFIG_SCHED_DEBUG 1109 #ifdef CONFIG_SCHED_DEBUG
1112 void print_cfs_stats(struct seq_file *m, int cpu, u64 now) 1110 void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1113 { 1111 {
1114 struct rq *rq = cpu_rq(cpu); 1112 struct rq *rq = cpu_rq(cpu);
1115 struct cfs_rq *cfs_rq; 1113 struct cfs_rq *cfs_rq;
1116 1114
1117 for_each_leaf_cfs_rq(rq, cfs_rq) 1115 for_each_leaf_cfs_rq(rq, cfs_rq)
1118 print_cfs_rq(m, cpu, cfs_rq, now); 1116 print_cfs_rq(m, cpu, cfs_rq, now);
1119 } 1117 }
1120 #endif 1118 #endif
1121 1119
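Illustrative sketch (not part of this diff, and not kernel code): the load_balance hook now reports the amount of weighted load it moved, and its callers only need to know whether that total is non-zero. The standalone toy below models that contract with plain function pointers; all names in it (balance_half, try_to_move, and so on) are invented for illustration only.

    /*
     * Standalone illustration of the simplified contract: each per-class
     * balance hook returns how much weighted load it moved, and the caller
     * only checks whether the accumulated total is non-zero.
     */
    #include <stdio.h>

    /* stand-in for a load_balance-style hook: returns load moved */
    typedef unsigned long (*balance_fn)(unsigned long max_load_move);

    static unsigned long balance_half(unsigned long max_load_move)
    {
    	return max_load_move / 2;	/* pretend we moved half of what was asked */
    }

    static unsigned long balance_none(unsigned long max_load_move)
    {
    	(void)max_load_move;
    	return 0;			/* like the idle class: nothing to move */
    }

    /* caller: success simply means "some load was moved" */
    static int try_to_move(balance_fn *classes, int nr, unsigned long max_load_move)
    {
    	unsigned long moved = 0;
    	int i;

    	for (i = 0; i < nr && moved < max_load_move; i++)
    		moved += classes[i](max_load_move - moved);

    	return moved > 0;
    }

    int main(void)
    {
    	balance_fn classes[] = { balance_none, balance_half };

    	printf("moved anything? %d\n", try_to_move(classes, 2, 128));
    	return 0;
    }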
kernel/sched_idletask.c
1 /* 1 /*
2 * idle-task scheduling class. 2 * idle-task scheduling class.
3 * 3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are 4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8 /* 8 /*
9 * Idle tasks are unconditionally rescheduled: 9 * Idle tasks are unconditionally rescheduled:
10 */ 10 */
11 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 11 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12 { 12 {
13 resched_task(rq->idle); 13 resched_task(rq->idle);
14 } 14 }
15 15
16 static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) 16 static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17 { 17 {
18 schedstat_inc(rq, sched_goidle); 18 schedstat_inc(rq, sched_goidle);
19 19
20 return rq->idle; 20 return rq->idle;
21 } 21 }
22 22
23 /* 23 /*
24 * It is not legal to sleep in the idle task - print a warning 24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it: 25 * message if some code attempts to do it:
26 */ 26 */
27 static void 27 static void
28 dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) 28 dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29 { 29 {
30 spin_unlock_irq(&rq->lock); 30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack(); 32 dump_stack();
33 spin_lock_irq(&rq->lock); 33 spin_lock_irq(&rq->lock);
34 } 34 }
35 35
36 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) 36 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37 { 37 {
38 } 38 }
39 39
40 static int 40 static unsigned long
41 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, 41 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move, 42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle, 43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved) 44 int *all_pinned)
45 { 45 {
46 return 0; 46 return 0;
47 } 47 }
48 48
49 static void task_tick_idle(struct rq *rq, struct task_struct *curr) 49 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50 { 50 {
51 } 51 }
52 52
53 /* 53 /*
54 * Simple, special scheduling class for the per-CPU idle tasks: 54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */ 55 */
56 static struct sched_class idle_sched_class __read_mostly = { 56 static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */ 57 /* no enqueue/yield_task for idle tasks */
58 58
59 /* dequeue is not valid, we print a debug message there: */ 59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle, 60 .dequeue_task = dequeue_task_idle,
61 61
62 .check_preempt_curr = check_preempt_curr_idle, 62 .check_preempt_curr = check_preempt_curr_idle,
63 63
64 .pick_next_task = pick_next_task_idle, 64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle, 65 .put_prev_task = put_prev_task_idle,
66 66
67 .load_balance = load_balance_idle, 67 .load_balance = load_balance_idle,
68 68
69 .task_tick = task_tick_idle, 69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */ 70 /* no .task_new for idle tasks */
71 }; 71 };
72 72
kernel/sched_rt.c
1 /* 1 /*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies) 3 * policies)
4 */ 4 */
5 5
6 /* 6 /*
7 * Update the current task's runtime statistics. Skip current tasks that 7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 8 * are not in our scheduling class.
9 */ 9 */
10 static inline void update_curr_rt(struct rq *rq, u64 now) 10 static inline void update_curr_rt(struct rq *rq, u64 now)
11 { 11 {
12 struct task_struct *curr = rq->curr; 12 struct task_struct *curr = rq->curr;
13 u64 delta_exec; 13 u64 delta_exec;
14 14
15 if (!task_has_rt_policy(curr)) 15 if (!task_has_rt_policy(curr))
16 return; 16 return;
17 17
18 delta_exec = now - curr->se.exec_start; 18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0)) 19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0; 20 delta_exec = 0;
21 21
22 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 22 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
23 23
24 curr->se.sum_exec_runtime += delta_exec; 24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now; 25 curr->se.exec_start = now;
26 } 26 }
27 27
28 static void 28 static void
29 enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) 29 enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30 { 30 {
31 struct rt_prio_array *array = &rq->rt.active; 31 struct rt_prio_array *array = &rq->rt.active;
32 32
33 list_add_tail(&p->run_list, array->queue + p->prio); 33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap); 34 __set_bit(p->prio, array->bitmap);
35 } 35 }
36 36
37 /* 37 /*
38 * Adding/removing a task to/from a priority array: 38 * Adding/removing a task to/from a priority array:
39 */ 39 */
40 static void 40 static void
41 dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) 41 dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42 { 42 {
43 struct rt_prio_array *array = &rq->rt.active; 43 struct rt_prio_array *array = &rq->rt.active;
44 44
45 update_curr_rt(rq, now); 45 update_curr_rt(rq, now);
46 46
47 list_del(&p->run_list); 47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio)) 48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap); 49 __clear_bit(p->prio, array->bitmap);
50 } 50 }
51 51
52 /* 52 /*
53 * Put task to the end of the run list without the overhead of dequeue 53 * Put task to the end of the run list without the overhead of dequeue
54 * followed by enqueue. 54 * followed by enqueue.
55 */ 55 */
56 static void requeue_task_rt(struct rq *rq, struct task_struct *p) 56 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57 { 57 {
58 struct rt_prio_array *array = &rq->rt.active; 58 struct rt_prio_array *array = &rq->rt.active;
59 59
60 list_move_tail(&p->run_list, array->queue + p->prio); 60 list_move_tail(&p->run_list, array->queue + p->prio);
61 } 61 }
62 62
63 static void 63 static void
64 yield_task_rt(struct rq *rq, struct task_struct *p) 64 yield_task_rt(struct rq *rq, struct task_struct *p)
65 { 65 {
66 requeue_task_rt(rq, p); 66 requeue_task_rt(rq, p);
67 } 67 }
68 68
69 /* 69 /*
70 * Preempt the current task with a newly woken task if needed: 70 * Preempt the current task with a newly woken task if needed:
71 */ 71 */
72 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 72 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73 { 73 {
74 if (p->prio < rq->curr->prio) 74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr); 75 resched_task(rq->curr);
76 } 76 }
77 77
78 static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) 78 static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79 { 79 {
80 struct rt_prio_array *array = &rq->rt.active; 80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next; 81 struct task_struct *next;
82 struct list_head *queue; 82 struct list_head *queue;
83 int idx; 83 int idx;
84 84
85 idx = sched_find_first_bit(array->bitmap); 85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO) 86 if (idx >= MAX_RT_PRIO)
87 return NULL; 87 return NULL;
88 88
89 queue = array->queue + idx; 89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list); 90 next = list_entry(queue->next, struct task_struct, run_list);
91 91
92 next->se.exec_start = now; 92 next->se.exec_start = now;
93 93
94 return next; 94 return next;
95 } 95 }
96 96
97 static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) 97 static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98 { 98 {
99 update_curr_rt(rq, now); 99 update_curr_rt(rq, now);
100 p->se.exec_start = 0; 100 p->se.exec_start = 0;
101 } 101 }
102 102
103 /* 103 /*
104 * Load-balancing iterator. Note: while the runqueue stays locked 104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be 105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we 106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning 107 * achieve that by always pre-iterating before returning
108 * the current task: 108 * the current task:
109 */ 109 */
110 static struct task_struct *load_balance_start_rt(void *arg) 110 static struct task_struct *load_balance_start_rt(void *arg)
111 { 111 {
112 struct rq *rq = arg; 112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active; 113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr; 114 struct list_head *head, *curr;
115 struct task_struct *p; 115 struct task_struct *p;
116 int idx; 116 int idx;
117 117
118 idx = sched_find_first_bit(array->bitmap); 118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO) 119 if (idx >= MAX_RT_PRIO)
120 return NULL; 120 return NULL;
121 121
122 head = array->queue + idx; 122 head = array->queue + idx;
123 curr = head->prev; 123 curr = head->prev;
124 124
125 p = list_entry(curr, struct task_struct, run_list); 125 p = list_entry(curr, struct task_struct, run_list);
126 126
127 curr = curr->prev; 127 curr = curr->prev;
128 128
129 rq->rt.rt_load_balance_idx = idx; 129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head; 130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr; 131 rq->rt.rt_load_balance_curr = curr;
132 132
133 return p; 133 return p;
134 } 134 }
135 135
136 static struct task_struct *load_balance_next_rt(void *arg) 136 static struct task_struct *load_balance_next_rt(void *arg)
137 { 137 {
138 struct rq *rq = arg; 138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active; 139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr; 140 struct list_head *head, *curr;
141 struct task_struct *p; 141 struct task_struct *p;
142 int idx; 142 int idx;
143 143
144 idx = rq->rt.rt_load_balance_idx; 144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head; 145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr; 146 curr = rq->rt.rt_load_balance_curr;
147 147
148 /* 148 /*
149 * If we arrived back to the head again then 149 * If we arrived back to the head again then
150 * iterate to the next queue (if any): 150 * iterate to the next queue (if any):
151 */ 151 */
152 if (unlikely(head == curr)) { 152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154 154
155 if (next_idx >= MAX_RT_PRIO) 155 if (next_idx >= MAX_RT_PRIO)
156 return NULL; 156 return NULL;
157 157
158 idx = next_idx; 158 idx = next_idx;
159 head = array->queue + idx; 159 head = array->queue + idx;
160 curr = head->prev; 160 curr = head->prev;
161 161
162 rq->rt.rt_load_balance_idx = idx; 162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head; 163 rq->rt.rt_load_balance_head = head;
164 } 164 }
165 165
166 p = list_entry(curr, struct task_struct, run_list); 166 p = list_entry(curr, struct task_struct, run_list);
167 167
168 curr = curr->prev; 168 curr = curr->prev;
169 169
170 rq->rt.rt_load_balance_curr = curr; 170 rq->rt.rt_load_balance_curr = curr;
171 171
172 return p; 172 return p;
173 } 173 }
174 174
175 static int 175 static unsigned long
176 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 176 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move, 177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle, 178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved) 179 int *all_pinned)
180 { 180 {
181 int this_best_prio, best_prio, best_prio_seen = 0; 181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved; 182 int nr_moved;
183 struct rq_iterator rt_rq_iterator; 183 struct rq_iterator rt_rq_iterator;
184 unsigned long load_moved;
184 185
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap); 186 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); 187 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187 188
188 /* 189 /*
189 * Enable handling of the case where there is more than one task 190 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one 191 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved 192 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load) 193 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio. 194 * of any task we find with that prio.
194 */ 195 */
195 if (busiest->curr->prio == best_prio) 196 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1; 197 best_prio_seen = 1;
197 198
198 rt_rq_iterator.start = load_balance_start_rt; 199 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt; 200 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into 201 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators 202 * load_balance_[start|next]_rt iterators
202 */ 203 */
203 rt_rq_iterator.arg = busiest; 204 rt_rq_iterator.arg = busiest;
204 205
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, 206 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved, 207 max_load_move, sd, idle, all_pinned, &load_moved,
207 this_best_prio, best_prio, best_prio_seen, 208 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator); 209 &rt_rq_iterator);
209 210
210 return nr_moved; 211 return load_moved;
211 } 212 }
212 213
213 static void task_tick_rt(struct rq *rq, struct task_struct *p) 214 static void task_tick_rt(struct rq *rq, struct task_struct *p)
214 { 215 {
215 /* 216 /*
216 * RR tasks need a special form of timeslice management. 217 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices. 218 * FIFO tasks have no timeslices.
218 */ 219 */
219 if (p->policy != SCHED_RR) 220 if (p->policy != SCHED_RR)
220 return; 221 return;
221 222
222 if (--p->time_slice) 223 if (--p->time_slice)
223 return; 224 return;
224 225
225 p->time_slice = static_prio_timeslice(p->static_prio); 226 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p); 227 set_tsk_need_resched(p);
227 228
228 /* put it at the end of the queue: */ 229 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p); 230 requeue_task_rt(rq, p);
230 } 231 }
231 232
232 static struct sched_class rt_sched_class __read_mostly = { 233 static struct sched_class rt_sched_class __read_mostly = {
233 .enqueue_task = enqueue_task_rt, 234 .enqueue_task = enqueue_task_rt,
234 .dequeue_task = dequeue_task_rt, 235 .dequeue_task = dequeue_task_rt,
235 .yield_task = yield_task_rt, 236 .yield_task = yield_task_rt,
236 237
237 .check_preempt_curr = check_preempt_curr_rt, 238 .check_preempt_curr = check_preempt_curr_rt,
238 239
239 .pick_next_task = pick_next_task_rt, 240 .pick_next_task = pick_next_task_rt,
240 .put_prev_task = put_prev_task_rt, 241 .put_prev_task = put_prev_task_rt,
241 242
242 .load_balance = load_balance_rt, 243 .load_balance = load_balance_rt,
243 244
244 .task_tick = task_tick_rt, 245 .task_tick = task_tick_rt,
245 }; 246 };
246 247
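Illustrative, standalone sketch (not kernel code) of the dequeue-safe iterator pattern used by the load_balance_[start|next]_fair and load_balance_[start|next]_rt functions above: the cursor is advanced before the current element is handed back, so the caller may remove the element it was just given without breaking the iteration. The list and iterator types below are invented for the demonstration.

    /*
     * The iterator pre-advances its cursor before returning an element,
     * so removing the returned element from the list is always safe.
     */
    #include <stdio.h>

    struct node {
    	int val;
    	struct node *next;
    };

    struct iter {
    	struct node *curr;	/* already points one element ahead */
    };

    static struct node *iter_start(struct iter *it, struct node *head)
    {
    	struct node *p = head;

    	it->curr = p ? p->next : NULL;	/* pre-iterate before returning */
    	return p;
    }

    static struct node *iter_next(struct iter *it)
    {
    	struct node *p = it->curr;

    	it->curr = p ? p->next : NULL;	/* advance first, then hand back */
    	return p;
    }

    int main(void)
    {
    	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    	struct node *p;
    	struct iter it;

    	for (p = iter_start(&it, &a); p; p = iter_next(&it)) {
    		printf("visiting %d\n", p->val);
    		if (p->val == 2)	/* "dequeue" the element we were just given */
    			a.next = &c;	/* safe: the iterator has already moved past it */
    	}
    	return 0;
    }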