Commit c6fd91f0bdcd294a0ae0ba2b2a7f7456ef4b7144

Authored by bibo mao
Committed by Linus Torvalds
1 parent c9becf58d9

[PATCH] kretprobe instance recycled by parent process

When a kretprobe is registered on the schedule() function and a probed
process exits, schedule() never returns for that process, so the
kretprobe instances held for it are never recycled.

With this patch the parent process recycles the kretprobe instances
left behind by the exited task, so kretprobe instances no longer leak.

Signed-off-by: bibo mao <bibo.mao@intel.com>
Cc: Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

6 changed files with 14 additions and 32 deletions
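
For context, the leak described in the commit message arises when a kretprobe
is attached to schedule(). A minimal, illustrative module that sets up such a
probe is sketched below (it is not part of this patch; the names sched_rp and
sched_ret_handler and the maxactive value are arbitrary). Each time a probed
task enters schedule(), one kretprobe instance is taken from the probe's free
list and is only returned when schedule() returns to that task -- which never
happens if the task exits inside schedule().

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

/*
 * Runs when schedule() returns to the probed task.  If the task exits
 * inside schedule() instead, this handler never runs and the instance
 * grabbed on entry is never recycled -- the leak this patch closes by
 * letting the parent recycle the instances of an exited child.
 */
static int sched_ret_handler(struct kretprobe_instance *ri,
                             struct pt_regs *regs)
{
        return 0;
}

static struct kretprobe sched_rp = {
        .handler   = sched_ret_handler,
        .maxactive = 20,        /* size of the per-probe instance free list */
};

static int __init kret_example_init(void)
{
        sched_rp.kp.addr = (kprobe_opcode_t *) schedule;
        return register_kretprobe(&sched_rp);
}

static void __exit kret_example_exit(void)
{
        unregister_kretprobe(&sched_rp);
}

module_init(kret_example_init);
module_exit(kret_example_exit);
MODULE_LICENSE("GPL");
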

arch/i386/kernel/process.c
1 /* 1 /*
2 * linux/arch/i386/kernel/process.c 2 * linux/arch/i386/kernel/process.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * 5 *
6 * Pentium III FXSR, SSE support 6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */ 8 */
9 9
10 /* 10 /*
11 * This file handles the architecture-dependent parts of process handling.. 11 * This file handles the architecture-dependent parts of process handling..
12 */ 12 */
13 13
14 #include <stdarg.h> 14 #include <stdarg.h>
15 15
16 #include <linux/cpu.h> 16 #include <linux/cpu.h>
17 #include <linux/errno.h> 17 #include <linux/errno.h>
18 #include <linux/sched.h> 18 #include <linux/sched.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/kernel.h> 20 #include <linux/kernel.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/elfcore.h> 22 #include <linux/elfcore.h>
23 #include <linux/smp.h> 23 #include <linux/smp.h>
24 #include <linux/smp_lock.h> 24 #include <linux/smp_lock.h>
25 #include <linux/stddef.h> 25 #include <linux/stddef.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/vmalloc.h> 27 #include <linux/vmalloc.h>
28 #include <linux/user.h> 28 #include <linux/user.h>
29 #include <linux/a.out.h> 29 #include <linux/a.out.h>
30 #include <linux/interrupt.h> 30 #include <linux/interrupt.h>
31 #include <linux/config.h> 31 #include <linux/config.h>
32 #include <linux/utsname.h> 32 #include <linux/utsname.h>
33 #include <linux/delay.h> 33 #include <linux/delay.h>
34 #include <linux/reboot.h> 34 #include <linux/reboot.h>
35 #include <linux/init.h> 35 #include <linux/init.h>
36 #include <linux/mc146818rtc.h> 36 #include <linux/mc146818rtc.h>
37 #include <linux/module.h> 37 #include <linux/module.h>
38 #include <linux/kallsyms.h> 38 #include <linux/kallsyms.h>
39 #include <linux/ptrace.h> 39 #include <linux/ptrace.h>
40 #include <linux/random.h> 40 #include <linux/random.h>
41 #include <linux/kprobes.h>
42 41
43 #include <asm/uaccess.h> 42 #include <asm/uaccess.h>
44 #include <asm/pgtable.h> 43 #include <asm/pgtable.h>
45 #include <asm/system.h> 44 #include <asm/system.h>
46 #include <asm/io.h> 45 #include <asm/io.h>
47 #include <asm/ldt.h> 46 #include <asm/ldt.h>
48 #include <asm/processor.h> 47 #include <asm/processor.h>
49 #include <asm/i387.h> 48 #include <asm/i387.h>
50 #include <asm/desc.h> 49 #include <asm/desc.h>
51 #include <asm/vm86.h> 50 #include <asm/vm86.h>
52 #ifdef CONFIG_MATH_EMULATION 51 #ifdef CONFIG_MATH_EMULATION
53 #include <asm/math_emu.h> 52 #include <asm/math_emu.h>
54 #endif 53 #endif
55 54
56 #include <linux/err.h> 55 #include <linux/err.h>
57 56
58 #include <asm/tlbflush.h> 57 #include <asm/tlbflush.h>
59 #include <asm/cpu.h> 58 #include <asm/cpu.h>
60 59
61 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
62 61
63 static int hlt_counter; 62 static int hlt_counter;
64 63
65 unsigned long boot_option_idle_override = 0; 64 unsigned long boot_option_idle_override = 0;
66 EXPORT_SYMBOL(boot_option_idle_override); 65 EXPORT_SYMBOL(boot_option_idle_override);
67 66
68 /* 67 /*
69 * Return saved PC of a blocked thread. 68 * Return saved PC of a blocked thread.
70 */ 69 */
71 unsigned long thread_saved_pc(struct task_struct *tsk) 70 unsigned long thread_saved_pc(struct task_struct *tsk)
72 { 71 {
73 return ((unsigned long *)tsk->thread.esp)[3]; 72 return ((unsigned long *)tsk->thread.esp)[3];
74 } 73 }
75 74
76 /* 75 /*
77 * Powermanagement idle function, if any.. 76 * Powermanagement idle function, if any..
78 */ 77 */
79 void (*pm_idle)(void); 78 void (*pm_idle)(void);
80 EXPORT_SYMBOL(pm_idle); 79 EXPORT_SYMBOL(pm_idle);
81 static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 80 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
82 81
83 void disable_hlt(void) 82 void disable_hlt(void)
84 { 83 {
85 hlt_counter++; 84 hlt_counter++;
86 } 85 }
87 86
88 EXPORT_SYMBOL(disable_hlt); 87 EXPORT_SYMBOL(disable_hlt);
89 88
90 void enable_hlt(void) 89 void enable_hlt(void)
91 { 90 {
92 hlt_counter--; 91 hlt_counter--;
93 } 92 }
94 93
95 EXPORT_SYMBOL(enable_hlt); 94 EXPORT_SYMBOL(enable_hlt);
96 95
97 /* 96 /*
98 * We use this if we don't have any better 97 * We use this if we don't have any better
99 * idle routine.. 98 * idle routine..
100 */ 99 */
101 void default_idle(void) 100 void default_idle(void)
102 { 101 {
103 local_irq_enable(); 102 local_irq_enable();
104 103
105 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
106 clear_thread_flag(TIF_POLLING_NRFLAG); 105 clear_thread_flag(TIF_POLLING_NRFLAG);
107 smp_mb__after_clear_bit(); 106 smp_mb__after_clear_bit();
108 while (!need_resched()) { 107 while (!need_resched()) {
109 local_irq_disable(); 108 local_irq_disable();
110 if (!need_resched()) 109 if (!need_resched())
111 safe_halt(); 110 safe_halt();
112 else 111 else
113 local_irq_enable(); 112 local_irq_enable();
114 } 113 }
115 set_thread_flag(TIF_POLLING_NRFLAG); 114 set_thread_flag(TIF_POLLING_NRFLAG);
116 } else { 115 } else {
117 while (!need_resched()) 116 while (!need_resched())
118 cpu_relax(); 117 cpu_relax();
119 } 118 }
120 } 119 }
121 #ifdef CONFIG_APM_MODULE 120 #ifdef CONFIG_APM_MODULE
122 EXPORT_SYMBOL(default_idle); 121 EXPORT_SYMBOL(default_idle);
123 #endif 122 #endif
124 123
125 /* 124 /*
126 * On SMP it's slightly faster (but much more power-consuming!) 125 * On SMP it's slightly faster (but much more power-consuming!)
127 * to poll the ->work.need_resched flag instead of waiting for the 126 * to poll the ->work.need_resched flag instead of waiting for the
128 * cross-CPU IPI to arrive. Use this option with caution. 127 * cross-CPU IPI to arrive. Use this option with caution.
129 */ 128 */
130 static void poll_idle (void) 129 static void poll_idle (void)
131 { 130 {
132 local_irq_enable(); 131 local_irq_enable();
133 132
134 asm volatile( 133 asm volatile(
135 "2:" 134 "2:"
136 "testl %0, %1;" 135 "testl %0, %1;"
137 "rep; nop;" 136 "rep; nop;"
138 "je 2b;" 137 "je 2b;"
139 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); 138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
140 } 139 }
141 140
142 #ifdef CONFIG_HOTPLUG_CPU 141 #ifdef CONFIG_HOTPLUG_CPU
143 #include <asm/nmi.h> 142 #include <asm/nmi.h>
144 /* We don't actually take CPU down, just spin without interrupts. */ 143 /* We don't actually take CPU down, just spin without interrupts. */
145 static inline void play_dead(void) 144 static inline void play_dead(void)
146 { 145 {
147 /* This must be done before dead CPU ack */ 146 /* This must be done before dead CPU ack */
148 cpu_exit_clear(); 147 cpu_exit_clear();
149 wbinvd(); 148 wbinvd();
150 mb(); 149 mb();
151 /* Ack it */ 150 /* Ack it */
152 __get_cpu_var(cpu_state) = CPU_DEAD; 151 __get_cpu_var(cpu_state) = CPU_DEAD;
153 152
154 /* 153 /*
155 * With physical CPU hotplug, we should halt the cpu 154 * With physical CPU hotplug, we should halt the cpu
156 */ 155 */
157 local_irq_disable(); 156 local_irq_disable();
158 while (1) 157 while (1)
159 halt(); 158 halt();
160 } 159 }
161 #else 160 #else
162 static inline void play_dead(void) 161 static inline void play_dead(void)
163 { 162 {
164 BUG(); 163 BUG();
165 } 164 }
166 #endif /* CONFIG_HOTPLUG_CPU */ 165 #endif /* CONFIG_HOTPLUG_CPU */
167 166
168 /* 167 /*
169 * The idle thread. There's no useful work to be 168 * The idle thread. There's no useful work to be
170 * done, so just try to conserve power and have a 169 * done, so just try to conserve power and have a
171 * low exit latency (ie sit in a loop waiting for 170 * low exit latency (ie sit in a loop waiting for
172 * somebody to say that they'd like to reschedule) 171 * somebody to say that they'd like to reschedule)
173 */ 172 */
174 void cpu_idle(void) 173 void cpu_idle(void)
175 { 174 {
176 int cpu = smp_processor_id(); 175 int cpu = smp_processor_id();
177 176
178 set_thread_flag(TIF_POLLING_NRFLAG); 177 set_thread_flag(TIF_POLLING_NRFLAG);
179 178
180 /* endless idle loop with no priority at all */ 179 /* endless idle loop with no priority at all */
181 while (1) { 180 while (1) {
182 while (!need_resched()) { 181 while (!need_resched()) {
183 void (*idle)(void); 182 void (*idle)(void);
184 183
185 if (__get_cpu_var(cpu_idle_state)) 184 if (__get_cpu_var(cpu_idle_state))
186 __get_cpu_var(cpu_idle_state) = 0; 185 __get_cpu_var(cpu_idle_state) = 0;
187 186
188 rmb(); 187 rmb();
189 idle = pm_idle; 188 idle = pm_idle;
190 189
191 if (!idle) 190 if (!idle)
192 idle = default_idle; 191 idle = default_idle;
193 192
194 if (cpu_is_offline(cpu)) 193 if (cpu_is_offline(cpu))
195 play_dead(); 194 play_dead();
196 195
197 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 196 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
198 idle(); 197 idle();
199 } 198 }
200 preempt_enable_no_resched(); 199 preempt_enable_no_resched();
201 schedule(); 200 schedule();
202 preempt_disable(); 201 preempt_disable();
203 } 202 }
204 } 203 }
205 204
206 void cpu_idle_wait(void) 205 void cpu_idle_wait(void)
207 { 206 {
208 unsigned int cpu, this_cpu = get_cpu(); 207 unsigned int cpu, this_cpu = get_cpu();
209 cpumask_t map; 208 cpumask_t map;
210 209
211 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 210 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
212 put_cpu(); 211 put_cpu();
213 212
214 cpus_clear(map); 213 cpus_clear(map);
215 for_each_online_cpu(cpu) { 214 for_each_online_cpu(cpu) {
216 per_cpu(cpu_idle_state, cpu) = 1; 215 per_cpu(cpu_idle_state, cpu) = 1;
217 cpu_set(cpu, map); 216 cpu_set(cpu, map);
218 } 217 }
219 218
220 __get_cpu_var(cpu_idle_state) = 0; 219 __get_cpu_var(cpu_idle_state) = 0;
221 220
222 wmb(); 221 wmb();
223 do { 222 do {
224 ssleep(1); 223 ssleep(1);
225 for_each_online_cpu(cpu) { 224 for_each_online_cpu(cpu) {
226 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) 225 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
227 cpu_clear(cpu, map); 226 cpu_clear(cpu, map);
228 } 227 }
229 cpus_and(map, map, cpu_online_map); 228 cpus_and(map, map, cpu_online_map);
230 } while (!cpus_empty(map)); 229 } while (!cpus_empty(map));
231 } 230 }
232 EXPORT_SYMBOL_GPL(cpu_idle_wait); 231 EXPORT_SYMBOL_GPL(cpu_idle_wait);
233 232
234 /* 233 /*
235 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 234 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
236 * which can obviate IPI to trigger checking of need_resched. 235 * which can obviate IPI to trigger checking of need_resched.
237 * We execute MONITOR against need_resched and enter optimized wait state 236 * We execute MONITOR against need_resched and enter optimized wait state
238 * through MWAIT. Whenever someone changes need_resched, we would be woken 237 * through MWAIT. Whenever someone changes need_resched, we would be woken
239 * up from MWAIT (without an IPI). 238 * up from MWAIT (without an IPI).
240 */ 239 */
241 static void mwait_idle(void) 240 static void mwait_idle(void)
242 { 241 {
243 local_irq_enable(); 242 local_irq_enable();
244 243
245 while (!need_resched()) { 244 while (!need_resched()) {
246 __monitor((void *)&current_thread_info()->flags, 0, 0); 245 __monitor((void *)&current_thread_info()->flags, 0, 0);
247 smp_mb(); 246 smp_mb();
248 if (need_resched()) 247 if (need_resched())
249 break; 248 break;
250 __mwait(0, 0); 249 __mwait(0, 0);
251 } 250 }
252 } 251 }
253 252
254 void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 253 void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
255 { 254 {
256 if (cpu_has(c, X86_FEATURE_MWAIT)) { 255 if (cpu_has(c, X86_FEATURE_MWAIT)) {
257 printk("monitor/mwait feature present.\n"); 256 printk("monitor/mwait feature present.\n");
258 /* 257 /*
259 * Skip, if setup has overridden idle. 258 * Skip, if setup has overridden idle.
260 * One CPU supports mwait => All CPUs supports mwait 259 * One CPU supports mwait => All CPUs supports mwait
261 */ 260 */
262 if (!pm_idle) { 261 if (!pm_idle) {
263 printk("using mwait in idle threads.\n"); 262 printk("using mwait in idle threads.\n");
264 pm_idle = mwait_idle; 263 pm_idle = mwait_idle;
265 } 264 }
266 } 265 }
267 } 266 }
268 267
269 static int __init idle_setup (char *str) 268 static int __init idle_setup (char *str)
270 { 269 {
271 if (!strncmp(str, "poll", 4)) { 270 if (!strncmp(str, "poll", 4)) {
272 printk("using polling idle threads.\n"); 271 printk("using polling idle threads.\n");
273 pm_idle = poll_idle; 272 pm_idle = poll_idle;
274 #ifdef CONFIG_X86_SMP 273 #ifdef CONFIG_X86_SMP
275 if (smp_num_siblings > 1) 274 if (smp_num_siblings > 1)
276 printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); 275 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
277 #endif 276 #endif
278 } else if (!strncmp(str, "halt", 4)) { 277 } else if (!strncmp(str, "halt", 4)) {
279 printk("using halt in idle threads.\n"); 278 printk("using halt in idle threads.\n");
280 pm_idle = default_idle; 279 pm_idle = default_idle;
281 } 280 }
282 281
283 boot_option_idle_override = 1; 282 boot_option_idle_override = 1;
284 return 1; 283 return 1;
285 } 284 }
286 285
287 __setup("idle=", idle_setup); 286 __setup("idle=", idle_setup);
288 287
289 void show_regs(struct pt_regs * regs) 288 void show_regs(struct pt_regs * regs)
290 { 289 {
291 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 290 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
292 291
293 printk("\n"); 292 printk("\n");
294 printk("Pid: %d, comm: %20s\n", current->pid, current->comm); 293 printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
295 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); 294 printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
296 print_symbol("EIP is at %s\n", regs->eip); 295 print_symbol("EIP is at %s\n", regs->eip);
297 296
298 if (user_mode_vm(regs)) 297 if (user_mode_vm(regs))
299 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); 298 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
300 printk(" EFLAGS: %08lx %s (%s %.*s)\n", 299 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
301 regs->eflags, print_tainted(), system_utsname.release, 300 regs->eflags, print_tainted(), system_utsname.release,
302 (int)strcspn(system_utsname.version, " "), 301 (int)strcspn(system_utsname.version, " "),
303 system_utsname.version); 302 system_utsname.version);
304 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 303 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
305 regs->eax,regs->ebx,regs->ecx,regs->edx); 304 regs->eax,regs->ebx,regs->ecx,regs->edx);
306 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 305 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
307 regs->esi, regs->edi, regs->ebp); 306 regs->esi, regs->edi, regs->ebp);
308 printk(" DS: %04x ES: %04x\n", 307 printk(" DS: %04x ES: %04x\n",
309 0xffff & regs->xds,0xffff & regs->xes); 308 0xffff & regs->xds,0xffff & regs->xes);
310 309
311 cr0 = read_cr0(); 310 cr0 = read_cr0();
312 cr2 = read_cr2(); 311 cr2 = read_cr2();
313 cr3 = read_cr3(); 312 cr3 = read_cr3();
314 cr4 = read_cr4_safe(); 313 cr4 = read_cr4_safe();
315 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); 314 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
316 show_trace(NULL, &regs->esp); 315 show_trace(NULL, &regs->esp);
317 } 316 }
318 317
319 /* 318 /*
320 * This gets run with %ebx containing the 319 * This gets run with %ebx containing the
321 * function to call, and %edx containing 320 * function to call, and %edx containing
322 * the "args". 321 * the "args".
323 */ 322 */
324 extern void kernel_thread_helper(void); 323 extern void kernel_thread_helper(void);
325 __asm__(".section .text\n" 324 __asm__(".section .text\n"
326 ".align 4\n" 325 ".align 4\n"
327 "kernel_thread_helper:\n\t" 326 "kernel_thread_helper:\n\t"
328 "movl %edx,%eax\n\t" 327 "movl %edx,%eax\n\t"
329 "pushl %edx\n\t" 328 "pushl %edx\n\t"
330 "call *%ebx\n\t" 329 "call *%ebx\n\t"
331 "pushl %eax\n\t" 330 "pushl %eax\n\t"
332 "call do_exit\n" 331 "call do_exit\n"
333 ".previous"); 332 ".previous");
334 333
335 /* 334 /*
336 * Create a kernel thread 335 * Create a kernel thread
337 */ 336 */
338 int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 337 int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
339 { 338 {
340 struct pt_regs regs; 339 struct pt_regs regs;
341 340
342 memset(&regs, 0, sizeof(regs)); 341 memset(&regs, 0, sizeof(regs));
343 342
344 regs.ebx = (unsigned long) fn; 343 regs.ebx = (unsigned long) fn;
345 regs.edx = (unsigned long) arg; 344 regs.edx = (unsigned long) arg;
346 345
347 regs.xds = __USER_DS; 346 regs.xds = __USER_DS;
348 regs.xes = __USER_DS; 347 regs.xes = __USER_DS;
349 regs.orig_eax = -1; 348 regs.orig_eax = -1;
350 regs.eip = (unsigned long) kernel_thread_helper; 349 regs.eip = (unsigned long) kernel_thread_helper;
351 regs.xcs = __KERNEL_CS; 350 regs.xcs = __KERNEL_CS;
352 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 351 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
353 352
354 /* Ok, create the new process.. */ 353 /* Ok, create the new process.. */
355 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 354 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
356 } 355 }
357 EXPORT_SYMBOL(kernel_thread); 356 EXPORT_SYMBOL(kernel_thread);
358 357
359 /* 358 /*
360 * Free current thread data structures etc.. 359 * Free current thread data structures etc..
361 */ 360 */
362 void exit_thread(void) 361 void exit_thread(void)
363 { 362 {
364 struct task_struct *tsk = current; 363 struct task_struct *tsk = current;
365 struct thread_struct *t = &tsk->thread; 364 struct thread_struct *t = &tsk->thread;
366
367 /*
368 * Remove function-return probe instances associated with this task
369 * and put them back on the free list. Do not insert an exit probe for
370 * this function, it will be disabled by kprobe_flush_task if you do.
371 */
372 kprobe_flush_task(tsk);
373 365
374 /* The process may have allocated an io port bitmap... nuke it. */ 366 /* The process may have allocated an io port bitmap... nuke it. */
375 if (unlikely(NULL != t->io_bitmap_ptr)) { 367 if (unlikely(NULL != t->io_bitmap_ptr)) {
376 int cpu = get_cpu(); 368 int cpu = get_cpu();
377 struct tss_struct *tss = &per_cpu(init_tss, cpu); 369 struct tss_struct *tss = &per_cpu(init_tss, cpu);
378 370
379 kfree(t->io_bitmap_ptr); 371 kfree(t->io_bitmap_ptr);
380 t->io_bitmap_ptr = NULL; 372 t->io_bitmap_ptr = NULL;
381 /* 373 /*
382 * Careful, clear this in the TSS too: 374 * Careful, clear this in the TSS too:
383 */ 375 */
384 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); 376 memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
385 t->io_bitmap_max = 0; 377 t->io_bitmap_max = 0;
386 tss->io_bitmap_owner = NULL; 378 tss->io_bitmap_owner = NULL;
387 tss->io_bitmap_max = 0; 379 tss->io_bitmap_max = 0;
388 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 380 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
389 put_cpu(); 381 put_cpu();
390 } 382 }
391 } 383 }
392 384
393 void flush_thread(void) 385 void flush_thread(void)
394 { 386 {
395 struct task_struct *tsk = current; 387 struct task_struct *tsk = current;
396 388
397 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); 389 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
398 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 390 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
399 /* 391 /*
400 * Forget coprocessor state.. 392 * Forget coprocessor state..
401 */ 393 */
402 clear_fpu(tsk); 394 clear_fpu(tsk);
403 clear_used_math(); 395 clear_used_math();
404 } 396 }
405 397
406 void release_thread(struct task_struct *dead_task) 398 void release_thread(struct task_struct *dead_task)
407 { 399 {
408 BUG_ON(dead_task->mm); 400 BUG_ON(dead_task->mm);
409 release_vm86_irqs(dead_task); 401 release_vm86_irqs(dead_task);
410 } 402 }
411 403
412 /* 404 /*
413 * This gets called before we allocate a new thread and copy 405 * This gets called before we allocate a new thread and copy
414 * the current task into it. 406 * the current task into it.
415 */ 407 */
416 void prepare_to_copy(struct task_struct *tsk) 408 void prepare_to_copy(struct task_struct *tsk)
417 { 409 {
418 unlazy_fpu(tsk); 410 unlazy_fpu(tsk);
419 } 411 }
420 412
421 int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, 413 int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
422 unsigned long unused, 414 unsigned long unused,
423 struct task_struct * p, struct pt_regs * regs) 415 struct task_struct * p, struct pt_regs * regs)
424 { 416 {
425 struct pt_regs * childregs; 417 struct pt_regs * childregs;
426 struct task_struct *tsk; 418 struct task_struct *tsk;
427 int err; 419 int err;
428 420
429 childregs = task_pt_regs(p); 421 childregs = task_pt_regs(p);
430 *childregs = *regs; 422 *childregs = *regs;
431 childregs->eax = 0; 423 childregs->eax = 0;
432 childregs->esp = esp; 424 childregs->esp = esp;
433 425
434 p->thread.esp = (unsigned long) childregs; 426 p->thread.esp = (unsigned long) childregs;
435 p->thread.esp0 = (unsigned long) (childregs+1); 427 p->thread.esp0 = (unsigned long) (childregs+1);
436 428
437 p->thread.eip = (unsigned long) ret_from_fork; 429 p->thread.eip = (unsigned long) ret_from_fork;
438 430
439 savesegment(fs,p->thread.fs); 431 savesegment(fs,p->thread.fs);
440 savesegment(gs,p->thread.gs); 432 savesegment(gs,p->thread.gs);
441 433
442 tsk = current; 434 tsk = current;
443 if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { 435 if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
444 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 436 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
445 if (!p->thread.io_bitmap_ptr) { 437 if (!p->thread.io_bitmap_ptr) {
446 p->thread.io_bitmap_max = 0; 438 p->thread.io_bitmap_max = 0;
447 return -ENOMEM; 439 return -ENOMEM;
448 } 440 }
449 memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, 441 memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
450 IO_BITMAP_BYTES); 442 IO_BITMAP_BYTES);
451 } 443 }
452 444
453 /* 445 /*
454 * Set a new TLS for the child thread? 446 * Set a new TLS for the child thread?
455 */ 447 */
456 if (clone_flags & CLONE_SETTLS) { 448 if (clone_flags & CLONE_SETTLS) {
457 struct desc_struct *desc; 449 struct desc_struct *desc;
458 struct user_desc info; 450 struct user_desc info;
459 int idx; 451 int idx;
460 452
461 err = -EFAULT; 453 err = -EFAULT;
462 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) 454 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
463 goto out; 455 goto out;
464 err = -EINVAL; 456 err = -EINVAL;
465 if (LDT_empty(&info)) 457 if (LDT_empty(&info))
466 goto out; 458 goto out;
467 459
468 idx = info.entry_number; 460 idx = info.entry_number;
469 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 461 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
470 goto out; 462 goto out;
471 463
472 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; 464 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
473 desc->a = LDT_entry_a(&info); 465 desc->a = LDT_entry_a(&info);
474 desc->b = LDT_entry_b(&info); 466 desc->b = LDT_entry_b(&info);
475 } 467 }
476 468
477 err = 0; 469 err = 0;
478 out: 470 out:
479 if (err && p->thread.io_bitmap_ptr) { 471 if (err && p->thread.io_bitmap_ptr) {
480 kfree(p->thread.io_bitmap_ptr); 472 kfree(p->thread.io_bitmap_ptr);
481 p->thread.io_bitmap_max = 0; 473 p->thread.io_bitmap_max = 0;
482 } 474 }
483 return err; 475 return err;
484 } 476 }
485 477
486 /* 478 /*
487 * fill in the user structure for a core dump.. 479 * fill in the user structure for a core dump..
488 */ 480 */
489 void dump_thread(struct pt_regs * regs, struct user * dump) 481 void dump_thread(struct pt_regs * regs, struct user * dump)
490 { 482 {
491 int i; 483 int i;
492 484
493 /* changed the size calculations - should hopefully work better. lbt */ 485 /* changed the size calculations - should hopefully work better. lbt */
494 dump->magic = CMAGIC; 486 dump->magic = CMAGIC;
495 dump->start_code = 0; 487 dump->start_code = 0;
496 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); 488 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
497 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 489 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
498 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 490 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
499 dump->u_dsize -= dump->u_tsize; 491 dump->u_dsize -= dump->u_tsize;
500 dump->u_ssize = 0; 492 dump->u_ssize = 0;
501 for (i = 0; i < 8; i++) 493 for (i = 0; i < 8; i++)
502 dump->u_debugreg[i] = current->thread.debugreg[i]; 494 dump->u_debugreg[i] = current->thread.debugreg[i];
503 495
504 if (dump->start_stack < TASK_SIZE) 496 if (dump->start_stack < TASK_SIZE)
505 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; 497 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
506 498
507 dump->regs.ebx = regs->ebx; 499 dump->regs.ebx = regs->ebx;
508 dump->regs.ecx = regs->ecx; 500 dump->regs.ecx = regs->ecx;
509 dump->regs.edx = regs->edx; 501 dump->regs.edx = regs->edx;
510 dump->regs.esi = regs->esi; 502 dump->regs.esi = regs->esi;
511 dump->regs.edi = regs->edi; 503 dump->regs.edi = regs->edi;
512 dump->regs.ebp = regs->ebp; 504 dump->regs.ebp = regs->ebp;
513 dump->regs.eax = regs->eax; 505 dump->regs.eax = regs->eax;
514 dump->regs.ds = regs->xds; 506 dump->regs.ds = regs->xds;
515 dump->regs.es = regs->xes; 507 dump->regs.es = regs->xes;
516 savesegment(fs,dump->regs.fs); 508 savesegment(fs,dump->regs.fs);
517 savesegment(gs,dump->regs.gs); 509 savesegment(gs,dump->regs.gs);
518 dump->regs.orig_eax = regs->orig_eax; 510 dump->regs.orig_eax = regs->orig_eax;
519 dump->regs.eip = regs->eip; 511 dump->regs.eip = regs->eip;
520 dump->regs.cs = regs->xcs; 512 dump->regs.cs = regs->xcs;
521 dump->regs.eflags = regs->eflags; 513 dump->regs.eflags = regs->eflags;
522 dump->regs.esp = regs->esp; 514 dump->regs.esp = regs->esp;
523 dump->regs.ss = regs->xss; 515 dump->regs.ss = regs->xss;
524 516
525 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 517 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
526 } 518 }
527 EXPORT_SYMBOL(dump_thread); 519 EXPORT_SYMBOL(dump_thread);
528 520
529 /* 521 /*
530 * Capture the user space registers if the task is not running (in user space) 522 * Capture the user space registers if the task is not running (in user space)
531 */ 523 */
532 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) 524 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
533 { 525 {
534 struct pt_regs ptregs = *task_pt_regs(tsk); 526 struct pt_regs ptregs = *task_pt_regs(tsk);
535 ptregs.xcs &= 0xffff; 527 ptregs.xcs &= 0xffff;
536 ptregs.xds &= 0xffff; 528 ptregs.xds &= 0xffff;
537 ptregs.xes &= 0xffff; 529 ptregs.xes &= 0xffff;
538 ptregs.xss &= 0xffff; 530 ptregs.xss &= 0xffff;
539 531
540 elf_core_copy_regs(regs, &ptregs); 532 elf_core_copy_regs(regs, &ptregs);
541 533
542 return 1; 534 return 1;
543 } 535 }
544 536
545 static inline void 537 static inline void
546 handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) 538 handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
547 { 539 {
548 if (!next->io_bitmap_ptr) { 540 if (!next->io_bitmap_ptr) {
549 /* 541 /*
550 * Disable the bitmap via an invalid offset. We still cache 542 * Disable the bitmap via an invalid offset. We still cache
551 * the previous bitmap owner and the IO bitmap contents: 543 * the previous bitmap owner and the IO bitmap contents:
552 */ 544 */
553 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 545 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
554 return; 546 return;
555 } 547 }
556 if (likely(next == tss->io_bitmap_owner)) { 548 if (likely(next == tss->io_bitmap_owner)) {
557 /* 549 /*
558 * Previous owner of the bitmap (hence the bitmap content) 550 * Previous owner of the bitmap (hence the bitmap content)
559 * matches the next task, we dont have to do anything but 551 * matches the next task, we dont have to do anything but
560 * to set a valid offset in the TSS: 552 * to set a valid offset in the TSS:
561 */ 553 */
562 tss->io_bitmap_base = IO_BITMAP_OFFSET; 554 tss->io_bitmap_base = IO_BITMAP_OFFSET;
563 return; 555 return;
564 } 556 }
565 /* 557 /*
566 * Lazy TSS's I/O bitmap copy. We set an invalid offset here 558 * Lazy TSS's I/O bitmap copy. We set an invalid offset here
567 * and we let the task to get a GPF in case an I/O instruction 559 * and we let the task to get a GPF in case an I/O instruction
568 * is performed. The handler of the GPF will verify that the 560 * is performed. The handler of the GPF will verify that the
569 * faulting task has a valid I/O bitmap and, it true, does the 561 * faulting task has a valid I/O bitmap and, it true, does the
570 * real copy and restart the instruction. This will save us 562 * real copy and restart the instruction. This will save us
571 * redundant copies when the currently switched task does not 563 * redundant copies when the currently switched task does not
572 * perform any I/O during its timeslice. 564 * perform any I/O during its timeslice.
573 */ 565 */
574 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 566 tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
575 } 567 }
576 568
577 /* 569 /*
578 * This function selects if the context switch from prev to next 570 * This function selects if the context switch from prev to next
579 * has to tweak the TSC disable bit in the cr4. 571 * has to tweak the TSC disable bit in the cr4.
580 */ 572 */
581 static inline void disable_tsc(struct task_struct *prev_p, 573 static inline void disable_tsc(struct task_struct *prev_p,
582 struct task_struct *next_p) 574 struct task_struct *next_p)
583 { 575 {
584 struct thread_info *prev, *next; 576 struct thread_info *prev, *next;
585 577
586 /* 578 /*
587 * gcc should eliminate the ->thread_info dereference if 579 * gcc should eliminate the ->thread_info dereference if
588 * has_secure_computing returns 0 at compile time (SECCOMP=n). 580 * has_secure_computing returns 0 at compile time (SECCOMP=n).
589 */ 581 */
590 prev = task_thread_info(prev_p); 582 prev = task_thread_info(prev_p);
591 next = task_thread_info(next_p); 583 next = task_thread_info(next_p);
592 584
593 if (has_secure_computing(prev) || has_secure_computing(next)) { 585 if (has_secure_computing(prev) || has_secure_computing(next)) {
594 /* slow path here */ 586 /* slow path here */
595 if (has_secure_computing(prev) && 587 if (has_secure_computing(prev) &&
596 !has_secure_computing(next)) { 588 !has_secure_computing(next)) {
597 write_cr4(read_cr4() & ~X86_CR4_TSD); 589 write_cr4(read_cr4() & ~X86_CR4_TSD);
598 } else if (!has_secure_computing(prev) && 590 } else if (!has_secure_computing(prev) &&
599 has_secure_computing(next)) 591 has_secure_computing(next))
600 write_cr4(read_cr4() | X86_CR4_TSD); 592 write_cr4(read_cr4() | X86_CR4_TSD);
601 } 593 }
602 } 594 }
603 595
604 /* 596 /*
605 * switch_to(x,yn) should switch tasks from x to y. 597 * switch_to(x,yn) should switch tasks from x to y.
606 * 598 *
607 * We fsave/fwait so that an exception goes off at the right time 599 * We fsave/fwait so that an exception goes off at the right time
608 * (as a call from the fsave or fwait in effect) rather than to 600 * (as a call from the fsave or fwait in effect) rather than to
609 * the wrong process. Lazy FP saving no longer makes any sense 601 * the wrong process. Lazy FP saving no longer makes any sense
610 * with modern CPU's, and this simplifies a lot of things (SMP 602 * with modern CPU's, and this simplifies a lot of things (SMP
611 * and UP become the same). 603 * and UP become the same).
612 * 604 *
613 * NOTE! We used to use the x86 hardware context switching. The 605 * NOTE! We used to use the x86 hardware context switching. The
614 * reason for not using it any more becomes apparent when you 606 * reason for not using it any more becomes apparent when you
615 * try to recover gracefully from saved state that is no longer 607 * try to recover gracefully from saved state that is no longer
616 * valid (stale segment register values in particular). With the 608 * valid (stale segment register values in particular). With the
617 * hardware task-switch, there is no way to fix up bad state in 609 * hardware task-switch, there is no way to fix up bad state in
618 * a reasonable manner. 610 * a reasonable manner.
619 * 611 *
620 * The fact that Intel documents the hardware task-switching to 612 * The fact that Intel documents the hardware task-switching to
621 * be slow is a fairly red herring - this code is not noticeably 613 * be slow is a fairly red herring - this code is not noticeably
622 * faster. However, there _is_ some room for improvement here, 614 * faster. However, there _is_ some room for improvement here,
623 * so the performance issues may eventually be a valid point. 615 * so the performance issues may eventually be a valid point.
624 * More important, however, is the fact that this allows us much 616 * More important, however, is the fact that this allows us much
625 * more flexibility. 617 * more flexibility.
626 * 618 *
627 * The return value (in %eax) will be the "prev" task after 619 * The return value (in %eax) will be the "prev" task after
628 * the task-switch, and shows up in ret_from_fork in entry.S, 620 * the task-switch, and shows up in ret_from_fork in entry.S,
629 * for example. 621 * for example.
630 */ 622 */
631 struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 623 struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
632 { 624 {
633 struct thread_struct *prev = &prev_p->thread, 625 struct thread_struct *prev = &prev_p->thread,
634 *next = &next_p->thread; 626 *next = &next_p->thread;
635 int cpu = smp_processor_id(); 627 int cpu = smp_processor_id();
636 struct tss_struct *tss = &per_cpu(init_tss, cpu); 628 struct tss_struct *tss = &per_cpu(init_tss, cpu);
637 629
638 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 630 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
639 631
640 __unlazy_fpu(prev_p); 632 __unlazy_fpu(prev_p);
641 633
642 /* 634 /*
643 * Reload esp0. 635 * Reload esp0.
644 */ 636 */
645 load_esp0(tss, next); 637 load_esp0(tss, next);
646 638
647 /* 639 /*
648 * Save away %fs and %gs. No need to save %es and %ds, as 640 * Save away %fs and %gs. No need to save %es and %ds, as
649 * those are always kernel segments while inside the kernel. 641 * those are always kernel segments while inside the kernel.
650 * Doing this before setting the new TLS descriptors avoids 642 * Doing this before setting the new TLS descriptors avoids
651 * the situation where we temporarily have non-reloadable 643 * the situation where we temporarily have non-reloadable
652 * segments in %fs and %gs. This could be an issue if the 644 * segments in %fs and %gs. This could be an issue if the
653 * NMI handler ever used %fs or %gs (it does not today), or 645 * NMI handler ever used %fs or %gs (it does not today), or
654 * if the kernel is running inside of a hypervisor layer. 646 * if the kernel is running inside of a hypervisor layer.
655 */ 647 */
656 savesegment(fs, prev->fs); 648 savesegment(fs, prev->fs);
657 savesegment(gs, prev->gs); 649 savesegment(gs, prev->gs);
658 650
659 /* 651 /*
660 * Load the per-thread Thread-Local Storage descriptor. 652 * Load the per-thread Thread-Local Storage descriptor.
661 */ 653 */
662 load_TLS(next, cpu); 654 load_TLS(next, cpu);
663 655
664 /* 656 /*
665 * Restore %fs and %gs if needed. 657 * Restore %fs and %gs if needed.
666 * 658 *
667 * Glibc normally makes %fs be zero, and %gs is one of 659 * Glibc normally makes %fs be zero, and %gs is one of
668 * the TLS segments. 660 * the TLS segments.
669 */ 661 */
670 if (unlikely(prev->fs | next->fs)) 662 if (unlikely(prev->fs | next->fs))
671 loadsegment(fs, next->fs); 663 loadsegment(fs, next->fs);
672 664
673 if (prev->gs | next->gs) 665 if (prev->gs | next->gs)
674 loadsegment(gs, next->gs); 666 loadsegment(gs, next->gs);
675 667
676 /* 668 /*
677 * Restore IOPL if needed. 669 * Restore IOPL if needed.
678 */ 670 */
679 if (unlikely(prev->iopl != next->iopl)) 671 if (unlikely(prev->iopl != next->iopl))
680 set_iopl_mask(next->iopl); 672 set_iopl_mask(next->iopl);
681 673
682 /* 674 /*
683 * Now maybe reload the debug registers 675 * Now maybe reload the debug registers
684 */ 676 */
685 if (unlikely(next->debugreg[7])) { 677 if (unlikely(next->debugreg[7])) {
686 set_debugreg(next->debugreg[0], 0); 678 set_debugreg(next->debugreg[0], 0);
687 set_debugreg(next->debugreg[1], 1); 679 set_debugreg(next->debugreg[1], 1);
688 set_debugreg(next->debugreg[2], 2); 680 set_debugreg(next->debugreg[2], 2);
689 set_debugreg(next->debugreg[3], 3); 681 set_debugreg(next->debugreg[3], 3);
690 /* no 4 and 5 */ 682 /* no 4 and 5 */
691 set_debugreg(next->debugreg[6], 6); 683 set_debugreg(next->debugreg[6], 6);
692 set_debugreg(next->debugreg[7], 7); 684 set_debugreg(next->debugreg[7], 7);
693 } 685 }
694 686
695 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) 687 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
696 handle_io_bitmap(next, tss); 688 handle_io_bitmap(next, tss);
697 689
698 disable_tsc(prev_p, next_p); 690 disable_tsc(prev_p, next_p);
699 691
700 return prev_p; 692 return prev_p;
701 } 693 }
702 694
703 asmlinkage int sys_fork(struct pt_regs regs) 695 asmlinkage int sys_fork(struct pt_regs regs)
704 { 696 {
705 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 697 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
706 } 698 }
707 699
708 asmlinkage int sys_clone(struct pt_regs regs) 700 asmlinkage int sys_clone(struct pt_regs regs)
709 { 701 {
710 unsigned long clone_flags; 702 unsigned long clone_flags;
711 unsigned long newsp; 703 unsigned long newsp;
712 int __user *parent_tidptr, *child_tidptr; 704 int __user *parent_tidptr, *child_tidptr;
713 705
714 clone_flags = regs.ebx; 706 clone_flags = regs.ebx;
715 newsp = regs.ecx; 707 newsp = regs.ecx;
716 parent_tidptr = (int __user *)regs.edx; 708 parent_tidptr = (int __user *)regs.edx;
717 child_tidptr = (int __user *)regs.edi; 709 child_tidptr = (int __user *)regs.edi;
718 if (!newsp) 710 if (!newsp)
719 newsp = regs.esp; 711 newsp = regs.esp;
720 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 712 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
721 } 713 }
722 714
723 /* 715 /*
724 * This is trivial, and on the face of it looks like it 716 * This is trivial, and on the face of it looks like it
725 * could equally well be done in user mode. 717 * could equally well be done in user mode.
726 * 718 *
727 * Not so, for quite unobvious reasons - register pressure. 719 * Not so, for quite unobvious reasons - register pressure.
728 * In user mode vfork() cannot have a stack frame, and if 720 * In user mode vfork() cannot have a stack frame, and if
729 * done by calling the "clone()" system call directly, you 721 * done by calling the "clone()" system call directly, you
730 * do not have enough call-clobbered registers to hold all 722 * do not have enough call-clobbered registers to hold all
731 * the information you need. 723 * the information you need.
732 */ 724 */
733 asmlinkage int sys_vfork(struct pt_regs regs) 725 asmlinkage int sys_vfork(struct pt_regs regs)
734 { 726 {
735 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 727 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
736 } 728 }
737 729
738 /* 730 /*
739 * sys_execve() executes a new program. 731 * sys_execve() executes a new program.
740 */ 732 */
741 asmlinkage int sys_execve(struct pt_regs regs) 733 asmlinkage int sys_execve(struct pt_regs regs)
742 { 734 {
743 int error; 735 int error;
744 char * filename; 736 char * filename;
745 737
746 filename = getname((char __user *) regs.ebx); 738 filename = getname((char __user *) regs.ebx);
747 error = PTR_ERR(filename); 739 error = PTR_ERR(filename);
748 if (IS_ERR(filename)) 740 if (IS_ERR(filename))
749 goto out; 741 goto out;
750 error = do_execve(filename, 742 error = do_execve(filename,
751 (char __user * __user *) regs.ecx, 743 (char __user * __user *) regs.ecx,
752 (char __user * __user *) regs.edx, 744 (char __user * __user *) regs.edx,
753 &regs); 745 &regs);
754 if (error == 0) { 746 if (error == 0) {
755 task_lock(current); 747 task_lock(current);
756 current->ptrace &= ~PT_DTRACE; 748 current->ptrace &= ~PT_DTRACE;
757 task_unlock(current); 749 task_unlock(current);
758 /* Make sure we don't return using sysenter.. */ 750 /* Make sure we don't return using sysenter.. */
759 set_thread_flag(TIF_IRET); 751 set_thread_flag(TIF_IRET);
760 } 752 }
761 putname(filename); 753 putname(filename);
762 out: 754 out:
763 return error; 755 return error;
764 } 756 }
765 757
766 #define top_esp (THREAD_SIZE - sizeof(unsigned long)) 758 #define top_esp (THREAD_SIZE - sizeof(unsigned long))
767 #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 759 #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
768 760
769 unsigned long get_wchan(struct task_struct *p) 761 unsigned long get_wchan(struct task_struct *p)
770 { 762 {
771 unsigned long ebp, esp, eip; 763 unsigned long ebp, esp, eip;
772 unsigned long stack_page; 764 unsigned long stack_page;
773 int count = 0; 765 int count = 0;
774 if (!p || p == current || p->state == TASK_RUNNING) 766 if (!p || p == current || p->state == TASK_RUNNING)
775 return 0; 767 return 0;
776 stack_page = (unsigned long)task_stack_page(p); 768 stack_page = (unsigned long)task_stack_page(p);
777 esp = p->thread.esp; 769 esp = p->thread.esp;
778 if (!stack_page || esp < stack_page || esp > top_esp+stack_page) 770 if (!stack_page || esp < stack_page || esp > top_esp+stack_page)
779 return 0; 771 return 0;
780 /* include/asm-i386/system.h:switch_to() pushes ebp last. */ 772 /* include/asm-i386/system.h:switch_to() pushes ebp last. */
781 ebp = *(unsigned long *) esp; 773 ebp = *(unsigned long *) esp;
782 do { 774 do {
783 if (ebp < stack_page || ebp > top_ebp+stack_page) 775 if (ebp < stack_page || ebp > top_ebp+stack_page)
784 return 0; 776 return 0;
785 eip = *(unsigned long *) (ebp+4); 777 eip = *(unsigned long *) (ebp+4);
786 if (!in_sched_functions(eip)) 778 if (!in_sched_functions(eip))
787 return eip; 779 return eip;
788 ebp = *(unsigned long *) ebp; 780 ebp = *(unsigned long *) ebp;
789 } while (count++ < 16); 781 } while (count++ < 16);
790 return 0; 782 return 0;
791 } 783 }
792 EXPORT_SYMBOL(get_wchan); 784 EXPORT_SYMBOL(get_wchan);
793 785
794 /* 786 /*
795 * sys_alloc_thread_area: get a yet unused TLS descriptor index. 787 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
796 */ 788 */
797 static int get_free_idx(void) 789 static int get_free_idx(void)
798 { 790 {
799 struct thread_struct *t = &current->thread; 791 struct thread_struct *t = &current->thread;
800 int idx; 792 int idx;
801 793
802 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) 794 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
803 if (desc_empty(t->tls_array + idx)) 795 if (desc_empty(t->tls_array + idx))
804 return idx + GDT_ENTRY_TLS_MIN; 796 return idx + GDT_ENTRY_TLS_MIN;
805 return -ESRCH; 797 return -ESRCH;
806 } 798 }
807 799
808 /* 800 /*
809 * Set a given TLS descriptor: 801 * Set a given TLS descriptor:
810 */ 802 */
811 asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) 803 asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
812 { 804 {
813 struct thread_struct *t = &current->thread; 805 struct thread_struct *t = &current->thread;
814 struct user_desc info; 806 struct user_desc info;
815 struct desc_struct *desc; 807 struct desc_struct *desc;
816 int cpu, idx; 808 int cpu, idx;
817 809
818 if (copy_from_user(&info, u_info, sizeof(info))) 810 if (copy_from_user(&info, u_info, sizeof(info)))
819 return -EFAULT; 811 return -EFAULT;
820 idx = info.entry_number; 812 idx = info.entry_number;
821 813
822 /* 814 /*
823 * index -1 means the kernel should try to find and 815 * index -1 means the kernel should try to find and
824 * allocate an empty descriptor: 816 * allocate an empty descriptor:
825 */ 817 */
826 if (idx == -1) { 818 if (idx == -1) {
827 idx = get_free_idx(); 819 idx = get_free_idx();
828 if (idx < 0) 820 if (idx < 0)
829 return idx; 821 return idx;
830 if (put_user(idx, &u_info->entry_number)) 822 if (put_user(idx, &u_info->entry_number))
831 return -EFAULT; 823 return -EFAULT;
832 } 824 }
833 825
834 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 826 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
835 return -EINVAL; 827 return -EINVAL;
836 828
837 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; 829 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
838 830
839 /* 831 /*
840 * We must not get preempted while modifying the TLS. 832 * We must not get preempted while modifying the TLS.
841 */ 833 */
842 cpu = get_cpu(); 834 cpu = get_cpu();
843 835
844 if (LDT_empty(&info)) { 836 if (LDT_empty(&info)) {
845 desc->a = 0; 837 desc->a = 0;
846 desc->b = 0; 838 desc->b = 0;
847 } else { 839 } else {
848 desc->a = LDT_entry_a(&info); 840 desc->a = LDT_entry_a(&info);
849 desc->b = LDT_entry_b(&info); 841 desc->b = LDT_entry_b(&info);
850 } 842 }
851 load_TLS(t, cpu); 843 load_TLS(t, cpu);
852 844
853 put_cpu(); 845 put_cpu();
854 846
855 return 0; 847 return 0;
856 } 848 }
857 849
858 /* 850 /*
859 * Get the current Thread-Local Storage area: 851 * Get the current Thread-Local Storage area:
860 */ 852 */
861 853
862 #define GET_BASE(desc) ( \ 854 #define GET_BASE(desc) ( \
863 (((desc)->a >> 16) & 0x0000ffff) | \ 855 (((desc)->a >> 16) & 0x0000ffff) | \
864 (((desc)->b << 16) & 0x00ff0000) | \ 856 (((desc)->b << 16) & 0x00ff0000) | \
865 ( (desc)->b & 0xff000000) ) 857 ( (desc)->b & 0xff000000) )
866 858
867 #define GET_LIMIT(desc) ( \ 859 #define GET_LIMIT(desc) ( \
868 ((desc)->a & 0x0ffff) | \ 860 ((desc)->a & 0x0ffff) | \
869 ((desc)->b & 0xf0000) ) 861 ((desc)->b & 0xf0000) )
870 862
871 #define GET_32BIT(desc) (((desc)->b >> 22) & 1) 863 #define GET_32BIT(desc) (((desc)->b >> 22) & 1)
872 #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) 864 #define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
873 #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) 865 #define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
874 #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) 866 #define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
875 #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) 867 #define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
876 #define GET_USEABLE(desc) (((desc)->b >> 20) & 1) 868 #define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
877 869
878 asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) 870 asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
879 { 871 {
880 struct user_desc info; 872 struct user_desc info;
881 struct desc_struct *desc; 873 struct desc_struct *desc;
882 int idx; 874 int idx;
883 875
884 if (get_user(idx, &u_info->entry_number)) 876 if (get_user(idx, &u_info->entry_number))
885 return -EFAULT; 877 return -EFAULT;
886 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 878 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
887 return -EINVAL; 879 return -EINVAL;
888 880
889 memset(&info, 0, sizeof(info)); 881 memset(&info, 0, sizeof(info));
890 882
891 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; 883 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
892 884
893 info.entry_number = idx; 885 info.entry_number = idx;
894 info.base_addr = GET_BASE(desc); 886 info.base_addr = GET_BASE(desc);
895 info.limit = GET_LIMIT(desc); 887 info.limit = GET_LIMIT(desc);
896 info.seg_32bit = GET_32BIT(desc); 888 info.seg_32bit = GET_32BIT(desc);
897 info.contents = GET_CONTENTS(desc); 889 info.contents = GET_CONTENTS(desc);
898 info.read_exec_only = !GET_WRITABLE(desc); 890 info.read_exec_only = !GET_WRITABLE(desc);
899 info.limit_in_pages = GET_LIMIT_PAGES(desc); 891 info.limit_in_pages = GET_LIMIT_PAGES(desc);
900 info.seg_not_present = !GET_PRESENT(desc); 892 info.seg_not_present = !GET_PRESENT(desc);
901 info.useable = GET_USEABLE(desc); 893 info.useable = GET_USEABLE(desc);
902 894
903 if (copy_to_user(u_info, &info, sizeof(info))) 895 if (copy_to_user(u_info, &info, sizeof(info)))
904 return -EFAULT; 896 return -EFAULT;
905 return 0; 897 return 0;
906 } 898 }
907 899
908 unsigned long arch_align_stack(unsigned long sp) 900 unsigned long arch_align_stack(unsigned long sp)
909 { 901 {
910 if (randomize_va_space) 902 if (randomize_va_space)
911 sp -= get_random_int() % 8192; 903 sp -= get_random_int() % 8192;
912 return sp & ~0xf; 904 return sp & ~0xf;
913 } 905 }
914 906
arch/ia64/kernel/process.c
1 /* 1 /*
2 * Architecture-specific setup. 2 * Architecture-specific setup.
3 * 3 *
4 * Copyright (C) 1998-2003 Hewlett-Packard Co 4 * Copyright (C) 1998-2003 Hewlett-Packard Co
5 * David Mosberger-Tang <davidm@hpl.hp.com> 5 * David Mosberger-Tang <davidm@hpl.hp.com>
6 * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support 6 * 04/11/17 Ashok Raj <ashok.raj@intel.com> Added CPU Hotplug Support
7 * 7 *
8 * 2005-10-07 Keith Owens <kaos@sgi.com> 8 * 2005-10-07 Keith Owens <kaos@sgi.com>
9 * Add notify_die() hooks. 9 * Add notify_die() hooks.
10 */ 10 */
11 #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ 11 #define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */
12 #include <linux/config.h> 12 #include <linux/config.h>
13 13
14 #include <linux/cpu.h> 14 #include <linux/cpu.h>
15 #include <linux/pm.h> 15 #include <linux/pm.h>
16 #include <linux/elf.h> 16 #include <linux/elf.h>
17 #include <linux/errno.h> 17 #include <linux/errno.h>
18 #include <linux/kallsyms.h> 18 #include <linux/kallsyms.h>
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/module.h> 21 #include <linux/module.h>
22 #include <linux/notifier.h> 22 #include <linux/notifier.h>
23 #include <linux/personality.h> 23 #include <linux/personality.h>
24 #include <linux/sched.h> 24 #include <linux/sched.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/smp_lock.h> 26 #include <linux/smp_lock.h>
27 #include <linux/stddef.h> 27 #include <linux/stddef.h>
28 #include <linux/thread_info.h> 28 #include <linux/thread_info.h>
29 #include <linux/unistd.h> 29 #include <linux/unistd.h>
30 #include <linux/efi.h> 30 #include <linux/efi.h>
31 #include <linux/interrupt.h> 31 #include <linux/interrupt.h>
32 #include <linux/delay.h> 32 #include <linux/delay.h>
33 #include <linux/kprobes.h>
34 33
35 #include <asm/cpu.h> 34 #include <asm/cpu.h>
36 #include <asm/delay.h> 35 #include <asm/delay.h>
37 #include <asm/elf.h> 36 #include <asm/elf.h>
38 #include <asm/ia32.h> 37 #include <asm/ia32.h>
39 #include <asm/irq.h> 38 #include <asm/irq.h>
40 #include <asm/kdebug.h> 39 #include <asm/kdebug.h>
41 #include <asm/pgalloc.h> 40 #include <asm/pgalloc.h>
42 #include <asm/processor.h> 41 #include <asm/processor.h>
43 #include <asm/sal.h> 42 #include <asm/sal.h>
44 #include <asm/tlbflush.h> 43 #include <asm/tlbflush.h>
45 #include <asm/uaccess.h> 44 #include <asm/uaccess.h>
46 #include <asm/unwind.h> 45 #include <asm/unwind.h>
47 #include <asm/user.h> 46 #include <asm/user.h>
48 47
49 #include "entry.h" 48 #include "entry.h"
50 49
51 #ifdef CONFIG_PERFMON 50 #ifdef CONFIG_PERFMON
52 # include <asm/perfmon.h> 51 # include <asm/perfmon.h>
53 #endif 52 #endif
54 53
55 #include "sigframe.h" 54 #include "sigframe.h"
56 55
57 void (*ia64_mark_idle)(int); 56 void (*ia64_mark_idle)(int);
58 static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 57 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
59 58
60 unsigned long boot_option_idle_override = 0; 59 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override); 60 EXPORT_SYMBOL(boot_option_idle_override);
62 61
63 void 62 void
64 ia64_do_show_stack (struct unw_frame_info *info, void *arg) 63 ia64_do_show_stack (struct unw_frame_info *info, void *arg)
65 { 64 {
66 unsigned long ip, sp, bsp; 65 unsigned long ip, sp, bsp;
67 char buf[128]; /* don't make it so big that it overflows the stack! */ 66 char buf[128]; /* don't make it so big that it overflows the stack! */
68 67
69 printk("\nCall Trace:\n"); 68 printk("\nCall Trace:\n");
70 do { 69 do {
71 unw_get_ip(info, &ip); 70 unw_get_ip(info, &ip);
72 if (ip == 0) 71 if (ip == 0)
73 break; 72 break;
74 73
75 unw_get_sp(info, &sp); 74 unw_get_sp(info, &sp);
76 unw_get_bsp(info, &bsp); 75 unw_get_bsp(info, &bsp);
77 snprintf(buf, sizeof(buf), 76 snprintf(buf, sizeof(buf),
78 " [<%016lx>] %%s\n" 77 " [<%016lx>] %%s\n"
79 " sp=%016lx bsp=%016lx\n", 78 " sp=%016lx bsp=%016lx\n",
80 ip, sp, bsp); 79 ip, sp, bsp);
81 print_symbol(buf, ip); 80 print_symbol(buf, ip);
82 } while (unw_unwind(info) >= 0); 81 } while (unw_unwind(info) >= 0);
83 } 82 }
84 83
85 void 84 void
86 show_stack (struct task_struct *task, unsigned long *sp) 85 show_stack (struct task_struct *task, unsigned long *sp)
87 { 86 {
88 if (!task) 87 if (!task)
89 unw_init_running(ia64_do_show_stack, NULL); 88 unw_init_running(ia64_do_show_stack, NULL);
90 else { 89 else {
91 struct unw_frame_info info; 90 struct unw_frame_info info;
92 91
93 unw_init_from_blocked_task(&info, task); 92 unw_init_from_blocked_task(&info, task);
94 ia64_do_show_stack(&info, NULL); 93 ia64_do_show_stack(&info, NULL);
95 } 94 }
96 } 95 }
97 96
98 void 97 void
99 dump_stack (void) 98 dump_stack (void)
100 { 99 {
101 show_stack(NULL, NULL); 100 show_stack(NULL, NULL);
102 } 101 }
103 102
104 EXPORT_SYMBOL(dump_stack); 103 EXPORT_SYMBOL(dump_stack);
105 104
106 void 105 void
107 show_regs (struct pt_regs *regs) 106 show_regs (struct pt_regs *regs)
108 { 107 {
109 unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; 108 unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
110 109
111 print_modules(); 110 print_modules();
112 printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); 111 printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm);
113 printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", 112 printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n",
114 regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); 113 regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
115 print_symbol("ip is at %s\n", ip); 114 print_symbol("ip is at %s\n", ip);
116 printk("unat: %016lx pfs : %016lx rsc : %016lx\n", 115 printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
117 regs->ar_unat, regs->ar_pfs, regs->ar_rsc); 116 regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
118 printk("rnat: %016lx bsps: %016lx pr : %016lx\n", 117 printk("rnat: %016lx bsps: %016lx pr : %016lx\n",
119 regs->ar_rnat, regs->ar_bspstore, regs->pr); 118 regs->ar_rnat, regs->ar_bspstore, regs->pr);
120 printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", 119 printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
121 regs->loadrs, regs->ar_ccv, regs->ar_fpsr); 120 regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
122 printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); 121 printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
123 printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); 122 printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7);
124 printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", 123 printk("f6 : %05lx%016lx f7 : %05lx%016lx\n",
125 regs->f6.u.bits[1], regs->f6.u.bits[0], 124 regs->f6.u.bits[1], regs->f6.u.bits[0],
126 regs->f7.u.bits[1], regs->f7.u.bits[0]); 125 regs->f7.u.bits[1], regs->f7.u.bits[0]);
127 printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", 126 printk("f8 : %05lx%016lx f9 : %05lx%016lx\n",
128 regs->f8.u.bits[1], regs->f8.u.bits[0], 127 regs->f8.u.bits[1], regs->f8.u.bits[0],
129 regs->f9.u.bits[1], regs->f9.u.bits[0]); 128 regs->f9.u.bits[1], regs->f9.u.bits[0]);
130 printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", 129 printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
131 regs->f10.u.bits[1], regs->f10.u.bits[0], 130 regs->f10.u.bits[1], regs->f10.u.bits[0],
132 regs->f11.u.bits[1], regs->f11.u.bits[0]); 131 regs->f11.u.bits[1], regs->f11.u.bits[0]);
133 132
134 printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); 133 printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3);
135 printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); 134 printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10);
136 printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); 135 printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13);
137 printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); 136 printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16);
138 printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); 137 printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19);
139 printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); 138 printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22);
140 printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); 139 printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25);
141 printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); 140 printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28);
142 printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); 141 printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31);
143 142
144 if (user_mode(regs)) { 143 if (user_mode(regs)) {
145 /* print the stacked registers */ 144 /* print the stacked registers */
146 unsigned long val, *bsp, ndirty; 145 unsigned long val, *bsp, ndirty;
147 int i, sof, is_nat = 0; 146 int i, sof, is_nat = 0;
148 147
149 sof = regs->cr_ifs & 0x7f; /* size of frame */ 148 sof = regs->cr_ifs & 0x7f; /* size of frame */
150 ndirty = (regs->loadrs >> 19); 149 ndirty = (regs->loadrs >> 19);
151 bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); 150 bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty);
152 for (i = 0; i < sof; ++i) { 151 for (i = 0; i < sof; ++i) {
153 get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); 152 get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i));
154 printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, 153 printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val,
155 ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); 154 ((i == sof - 1) || (i % 3) == 2) ? "\n" : " ");
156 } 155 }
157 } else 156 } else
158 show_stack(NULL, NULL); 157 show_stack(NULL, NULL);
159 } 158 }
160 159
161 void 160 void
162 do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) 161 do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
163 { 162 {
164 if (fsys_mode(current, &scr->pt)) { 163 if (fsys_mode(current, &scr->pt)) {
165 /* defer signal-handling etc. until we return to privilege-level 0. */ 164 /* defer signal-handling etc. until we return to privilege-level 0. */
166 if (!ia64_psr(&scr->pt)->lp) 165 if (!ia64_psr(&scr->pt)->lp)
167 ia64_psr(&scr->pt)->lp = 1; 166 ia64_psr(&scr->pt)->lp = 1;
168 return; 167 return;
169 } 168 }
170 169
171 #ifdef CONFIG_PERFMON 170 #ifdef CONFIG_PERFMON
172 if (current->thread.pfm_needs_checking) 171 if (current->thread.pfm_needs_checking)
173 pfm_handle_work(); 172 pfm_handle_work();
174 #endif 173 #endif
175 174
176 /* deal with pending signal delivery */ 175 /* deal with pending signal delivery */
177 if (test_thread_flag(TIF_SIGPENDING)) 176 if (test_thread_flag(TIF_SIGPENDING))
178 ia64_do_signal(oldset, scr, in_syscall); 177 ia64_do_signal(oldset, scr, in_syscall);
179 } 178 }
180 179
181 static int pal_halt = 1; 180 static int pal_halt = 1;
182 static int can_do_pal_halt = 1; 181 static int can_do_pal_halt = 1;
183 182
184 static int __init nohalt_setup(char * str) 183 static int __init nohalt_setup(char * str)
185 { 184 {
186 pal_halt = can_do_pal_halt = 0; 185 pal_halt = can_do_pal_halt = 0;
187 return 1; 186 return 1;
188 } 187 }
189 __setup("nohalt", nohalt_setup); 188 __setup("nohalt", nohalt_setup);
190 189
191 void 190 void
192 update_pal_halt_status(int status) 191 update_pal_halt_status(int status)
193 { 192 {
194 can_do_pal_halt = pal_halt && status; 193 can_do_pal_halt = pal_halt && status;
195 } 194 }
196 195
197 /* 196 /*
198 * We use this if we don't have any better idle routine. 197 * We use this if we don't have any better idle routine.
199 */ 198 */
200 void 199 void
201 default_idle (void) 200 default_idle (void)
202 { 201 {
203 local_irq_enable(); 202 local_irq_enable();
204 while (!need_resched()) { 203 while (!need_resched()) {
205 if (can_do_pal_halt) 204 if (can_do_pal_halt)
206 safe_halt(); 205 safe_halt();
207 else 206 else
208 cpu_relax(); 207 cpu_relax();
209 } 208 }
210 } 209 }
211 210
212 #ifdef CONFIG_HOTPLUG_CPU 211 #ifdef CONFIG_HOTPLUG_CPU
213 /* We don't actually take the CPU down, just spin without interrupts. */ 212 /* We don't actually take the CPU down, just spin without interrupts. */
214 static inline void play_dead(void) 213 static inline void play_dead(void)
215 { 214 {
216 extern void ia64_cpu_local_tick (void); 215 extern void ia64_cpu_local_tick (void);
217 unsigned int this_cpu = smp_processor_id(); 216 unsigned int this_cpu = smp_processor_id();
218 217
219 /* Ack it */ 218 /* Ack it */
220 __get_cpu_var(cpu_state) = CPU_DEAD; 219 __get_cpu_var(cpu_state) = CPU_DEAD;
221 220
222 max_xtp(); 221 max_xtp();
223 local_irq_disable(); 222 local_irq_disable();
224 idle_task_exit(); 223 idle_task_exit();
225 ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]); 224 ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]);
226 /* 225 /*
227 * The above is a point of no return; the processor is 226 * The above is a point of no return; the processor is
228 * expected to be in the SAL loop now. 227 * expected to be in the SAL loop now.
229 */ 228 */
230 BUG(); 229 BUG();
231 } 230 }
232 #else 231 #else
233 static inline void play_dead(void) 232 static inline void play_dead(void)
234 { 233 {
235 BUG(); 234 BUG();
236 } 235 }
237 #endif /* CONFIG_HOTPLUG_CPU */ 236 #endif /* CONFIG_HOTPLUG_CPU */
238 237
239 void cpu_idle_wait(void) 238 void cpu_idle_wait(void)
240 { 239 {
241 unsigned int cpu, this_cpu = get_cpu(); 240 unsigned int cpu, this_cpu = get_cpu();
242 cpumask_t map; 241 cpumask_t map;
243 242
244 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 243 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
245 put_cpu(); 244 put_cpu();
246 245
247 cpus_clear(map); 246 cpus_clear(map);
248 for_each_online_cpu(cpu) { 247 for_each_online_cpu(cpu) {
249 per_cpu(cpu_idle_state, cpu) = 1; 248 per_cpu(cpu_idle_state, cpu) = 1;
250 cpu_set(cpu, map); 249 cpu_set(cpu, map);
251 } 250 }
252 251
253 __get_cpu_var(cpu_idle_state) = 0; 252 __get_cpu_var(cpu_idle_state) = 0;
254 253
255 wmb(); 254 wmb();
256 do { 255 do {
257 ssleep(1); 256 ssleep(1);
258 for_each_online_cpu(cpu) { 257 for_each_online_cpu(cpu) {
259 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) 258 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
260 cpu_clear(cpu, map); 259 cpu_clear(cpu, map);
261 } 260 }
262 cpus_and(map, map, cpu_online_map); 261 cpus_and(map, map, cpu_online_map);
263 } while (!cpus_empty(map)); 262 } while (!cpus_empty(map));
264 } 263 }
265 EXPORT_SYMBOL_GPL(cpu_idle_wait); 264 EXPORT_SYMBOL_GPL(cpu_idle_wait);
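
cpu_idle_wait() above lets a caller that swaps out the pm_idle routine wait until every online CPU has taken at least one pass through the idle loop and therefore can no longer be executing the old routine. A minimal caller sketch, assuming a hypothetical replacement routine my_idle() that is not part of this file:

	static void my_idle(void)		/* hypothetical replacement idle routine */
	{
		while (!need_resched())
			cpu_relax();
	}

	static void install_my_idle(void)	/* illustrative only */
	{
		pm_idle = my_idle;		/* publish the new routine ... */
		cpu_idle_wait();		/* ... then wait until no CPU still runs the old one */
	}
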
266 265
267 void __attribute__((noreturn)) 266 void __attribute__((noreturn))
268 cpu_idle (void) 267 cpu_idle (void)
269 { 268 {
270 void (*mark_idle)(int) = ia64_mark_idle; 269 void (*mark_idle)(int) = ia64_mark_idle;
271 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
272 271
273 /* endless idle loop with no priority at all */ 272 /* endless idle loop with no priority at all */
274 while (1) { 273 while (1) {
275 if (can_do_pal_halt) 274 if (can_do_pal_halt)
276 clear_thread_flag(TIF_POLLING_NRFLAG); 275 clear_thread_flag(TIF_POLLING_NRFLAG);
277 else 276 else
278 set_thread_flag(TIF_POLLING_NRFLAG); 277 set_thread_flag(TIF_POLLING_NRFLAG);
279 278
280 if (!need_resched()) { 279 if (!need_resched()) {
281 void (*idle)(void); 280 void (*idle)(void);
282 #ifdef CONFIG_SMP 281 #ifdef CONFIG_SMP
283 min_xtp(); 282 min_xtp();
284 #endif 283 #endif
285 if (__get_cpu_var(cpu_idle_state)) 284 if (__get_cpu_var(cpu_idle_state))
286 __get_cpu_var(cpu_idle_state) = 0; 285 __get_cpu_var(cpu_idle_state) = 0;
287 286
288 rmb(); 287 rmb();
289 if (mark_idle) 288 if (mark_idle)
290 (*mark_idle)(1); 289 (*mark_idle)(1);
291 290
292 idle = pm_idle; 291 idle = pm_idle;
293 if (!idle) 292 if (!idle)
294 idle = default_idle; 293 idle = default_idle;
295 (*idle)(); 294 (*idle)();
296 if (mark_idle) 295 if (mark_idle)
297 (*mark_idle)(0); 296 (*mark_idle)(0);
298 #ifdef CONFIG_SMP 297 #ifdef CONFIG_SMP
299 normal_xtp(); 298 normal_xtp();
300 #endif 299 #endif
301 } 300 }
302 preempt_enable_no_resched(); 301 preempt_enable_no_resched();
303 schedule(); 302 schedule();
304 preempt_disable(); 303 preempt_disable();
305 check_pgt_cache(); 304 check_pgt_cache();
306 if (cpu_is_offline(cpu)) 305 if (cpu_is_offline(cpu))
307 play_dead(); 306 play_dead();
308 } 307 }
309 } 308 }
310 309
311 void 310 void
312 ia64_save_extra (struct task_struct *task) 311 ia64_save_extra (struct task_struct *task)
313 { 312 {
314 #ifdef CONFIG_PERFMON 313 #ifdef CONFIG_PERFMON
315 unsigned long info; 314 unsigned long info;
316 #endif 315 #endif
317 316
318 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) 317 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
319 ia64_save_debug_regs(&task->thread.dbr[0]); 318 ia64_save_debug_regs(&task->thread.dbr[0]);
320 319
321 #ifdef CONFIG_PERFMON 320 #ifdef CONFIG_PERFMON
322 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) 321 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
323 pfm_save_regs(task); 322 pfm_save_regs(task);
324 323
325 info = __get_cpu_var(pfm_syst_info); 324 info = __get_cpu_var(pfm_syst_info);
326 if (info & PFM_CPUINFO_SYST_WIDE) 325 if (info & PFM_CPUINFO_SYST_WIDE)
327 pfm_syst_wide_update_task(task, info, 0); 326 pfm_syst_wide_update_task(task, info, 0);
328 #endif 327 #endif
329 328
330 #ifdef CONFIG_IA32_SUPPORT 329 #ifdef CONFIG_IA32_SUPPORT
331 if (IS_IA32_PROCESS(task_pt_regs(task))) 330 if (IS_IA32_PROCESS(task_pt_regs(task)))
332 ia32_save_state(task); 331 ia32_save_state(task);
333 #endif 332 #endif
334 } 333 }
335 334
336 void 335 void
337 ia64_load_extra (struct task_struct *task) 336 ia64_load_extra (struct task_struct *task)
338 { 337 {
339 #ifdef CONFIG_PERFMON 338 #ifdef CONFIG_PERFMON
340 unsigned long info; 339 unsigned long info;
341 #endif 340 #endif
342 341
343 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) 342 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
344 ia64_load_debug_regs(&task->thread.dbr[0]); 343 ia64_load_debug_regs(&task->thread.dbr[0]);
345 344
346 #ifdef CONFIG_PERFMON 345 #ifdef CONFIG_PERFMON
347 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) 346 if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
348 pfm_load_regs(task); 347 pfm_load_regs(task);
349 348
350 info = __get_cpu_var(pfm_syst_info); 349 info = __get_cpu_var(pfm_syst_info);
351 if (info & PFM_CPUINFO_SYST_WIDE) 350 if (info & PFM_CPUINFO_SYST_WIDE)
352 pfm_syst_wide_update_task(task, info, 1); 351 pfm_syst_wide_update_task(task, info, 1);
353 #endif 352 #endif
354 353
355 #ifdef CONFIG_IA32_SUPPORT 354 #ifdef CONFIG_IA32_SUPPORT
356 if (IS_IA32_PROCESS(task_pt_regs(task))) 355 if (IS_IA32_PROCESS(task_pt_regs(task)))
357 ia32_load_state(task); 356 ia32_load_state(task);
358 #endif 357 #endif
359 } 358 }
360 359
361 /* 360 /*
362 * Copy the state of an ia-64 thread. 361 * Copy the state of an ia-64 thread.
363 * 362 *
364 * We get here through the following call chain: 363 * We get here through the following call chain:
365 * 364 *
366 * from user-level: from kernel: 365 * from user-level: from kernel:
367 * 366 *
368 * <clone syscall> <some kernel call frames> 367 * <clone syscall> <some kernel call frames>
369 * sys_clone : 368 * sys_clone :
370 * do_fork do_fork 369 * do_fork do_fork
371 * copy_thread copy_thread 370 * copy_thread copy_thread
372 * 371 *
373 * This means that the stack layout is as follows: 372 * This means that the stack layout is as follows:
374 * 373 *
375 * +---------------------+ (highest addr) 374 * +---------------------+ (highest addr)
376 * | struct pt_regs | 375 * | struct pt_regs |
377 * +---------------------+ 376 * +---------------------+
378 * | struct switch_stack | 377 * | struct switch_stack |
379 * +---------------------+ 378 * +---------------------+
380 * | | 379 * | |
381 * | memory stack | 380 * | memory stack |
382 * | | <-- sp (lowest addr) 381 * | | <-- sp (lowest addr)
383 * +---------------------+ 382 * +---------------------+
384 * 383 *
385 * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an 384 * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an
386 * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, 385 * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register,
387 * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the 386 * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the
388 * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since 387 * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since
389 * the stack is page aligned and the page size is at least 4KB, this is always the case, 388 * the stack is page aligned and the page size is at least 4KB, this is always the case,
390 * so there is nothing to worry about. 389 * so there is nothing to worry about.
391 */ 390 */
392 int 391 int
393 copy_thread (int nr, unsigned long clone_flags, 392 copy_thread (int nr, unsigned long clone_flags,
394 unsigned long user_stack_base, unsigned long user_stack_size, 393 unsigned long user_stack_base, unsigned long user_stack_size,
395 struct task_struct *p, struct pt_regs *regs) 394 struct task_struct *p, struct pt_regs *regs)
396 { 395 {
397 extern char ia64_ret_from_clone, ia32_ret_from_clone; 396 extern char ia64_ret_from_clone, ia32_ret_from_clone;
398 struct switch_stack *child_stack, *stack; 397 struct switch_stack *child_stack, *stack;
399 unsigned long rbs, child_rbs, rbs_size; 398 unsigned long rbs, child_rbs, rbs_size;
400 struct pt_regs *child_ptregs; 399 struct pt_regs *child_ptregs;
401 int retval = 0; 400 int retval = 0;
402 401
403 #ifdef CONFIG_SMP 402 #ifdef CONFIG_SMP
404 /* 403 /*
405 * For SMP idle threads, fork_by_hand() calls do_fork with 404 * For SMP idle threads, fork_by_hand() calls do_fork with
406 * NULL regs. 405 * NULL regs.
407 */ 406 */
408 if (!regs) 407 if (!regs)
409 return 0; 408 return 0;
410 #endif 409 #endif
411 410
412 stack = ((struct switch_stack *) regs) - 1; 411 stack = ((struct switch_stack *) regs) - 1;
413 412
414 child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; 413 child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
415 child_stack = (struct switch_stack *) child_ptregs - 1; 414 child_stack = (struct switch_stack *) child_ptregs - 1;
416 415
417 /* copy parent's switch_stack & pt_regs to child: */ 416 /* copy parent's switch_stack & pt_regs to child: */
418 memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); 417 memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
419 418
420 rbs = (unsigned long) current + IA64_RBS_OFFSET; 419 rbs = (unsigned long) current + IA64_RBS_OFFSET;
421 child_rbs = (unsigned long) p + IA64_RBS_OFFSET; 420 child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
422 rbs_size = stack->ar_bspstore - rbs; 421 rbs_size = stack->ar_bspstore - rbs;
423 422
424 /* copy the parent's register backing store to the child: */ 423 /* copy the parent's register backing store to the child: */
425 memcpy((void *) child_rbs, (void *) rbs, rbs_size); 424 memcpy((void *) child_rbs, (void *) rbs, rbs_size);
426 425
427 if (likely(user_mode(child_ptregs))) { 426 if (likely(user_mode(child_ptregs))) {
428 if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) 427 if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs))
429 child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ 428 child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */
430 if (user_stack_base) { 429 if (user_stack_base) {
431 child_ptregs->r12 = user_stack_base + user_stack_size - 16; 430 child_ptregs->r12 = user_stack_base + user_stack_size - 16;
432 child_ptregs->ar_bspstore = user_stack_base; 431 child_ptregs->ar_bspstore = user_stack_base;
433 child_ptregs->ar_rnat = 0; 432 child_ptregs->ar_rnat = 0;
434 child_ptregs->loadrs = 0; 433 child_ptregs->loadrs = 0;
435 } 434 }
436 } else { 435 } else {
437 /* 436 /*
438 * Note: we simply preserve the relative position of 437 * Note: we simply preserve the relative position of
439 * the stack pointer here. There is no need to 438 * the stack pointer here. There is no need to
440 * allocate a scratch area here, since that will have 439 * allocate a scratch area here, since that will have
441 * been taken care of by the caller of sys_clone() 440 * been taken care of by the caller of sys_clone()
442 * already. 441 * already.
443 */ 442 */
444 child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ 443 child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */
445 child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ 444 child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */
446 } 445 }
447 child_stack->ar_bspstore = child_rbs + rbs_size; 446 child_stack->ar_bspstore = child_rbs + rbs_size;
448 if (IS_IA32_PROCESS(regs)) 447 if (IS_IA32_PROCESS(regs))
449 child_stack->b0 = (unsigned long) &ia32_ret_from_clone; 448 child_stack->b0 = (unsigned long) &ia32_ret_from_clone;
450 else 449 else
451 child_stack->b0 = (unsigned long) &ia64_ret_from_clone; 450 child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
452 451
453 /* copy parts of thread_struct: */ 452 /* copy parts of thread_struct: */
454 p->thread.ksp = (unsigned long) child_stack - 16; 453 p->thread.ksp = (unsigned long) child_stack - 16;
455 454
456 /* stop some PSR bits from being inherited. 455 /* stop some PSR bits from being inherited.
457 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() 456 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
458 * therefore we must specify them explicitly here and not include them in 457 * therefore we must specify them explicitly here and not include them in
459 * IA64_PSR_BITS_TO_CLEAR. 458 * IA64_PSR_BITS_TO_CLEAR.
460 */ 459 */
461 child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) 460 child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
462 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); 461 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
463 462
464 /* 463 /*
465 * NOTE: The calling convention considers all floating point 464 * NOTE: The calling convention considers all floating point
466 * registers in the high partition (fph) to be scratch. Since 465 * registers in the high partition (fph) to be scratch. Since
467 * the only way to get to this point is through a system call, 466 * the only way to get to this point is through a system call,
468 * we know that the values in fph are all dead. Hence, there 467 * we know that the values in fph are all dead. Hence, there
469 * is no need to inherit the fph state from the parent to the 468 * is no need to inherit the fph state from the parent to the
470 * child and all we have to do is to make sure that 469 * child and all we have to do is to make sure that
471 * IA64_THREAD_FPH_VALID is cleared in the child. 470 * IA64_THREAD_FPH_VALID is cleared in the child.
472 * 471 *
473 * XXX We could push this optimization a bit further by 472 * XXX We could push this optimization a bit further by
474 * clearing IA64_THREAD_FPH_VALID on ANY system call. 473 * clearing IA64_THREAD_FPH_VALID on ANY system call.
475 * However, it's not clear this is worth doing. Also, it 474 * However, it's not clear this is worth doing. Also, it
476 * would be a slight deviation from the normal Linux system 475 * would be a slight deviation from the normal Linux system
477 * call behavior where scratch registers are preserved across 476 * call behavior where scratch registers are preserved across
478 * system calls (unless used by the system call itself). 477 * system calls (unless used by the system call itself).
479 */ 478 */
480 # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ 479 # define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \
481 | IA64_THREAD_PM_VALID) 480 | IA64_THREAD_PM_VALID)
482 # define THREAD_FLAGS_TO_SET 0 481 # define THREAD_FLAGS_TO_SET 0
483 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) 482 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
484 | THREAD_FLAGS_TO_SET); 483 | THREAD_FLAGS_TO_SET);
485 ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ 484 ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */
486 #ifdef CONFIG_IA32_SUPPORT 485 #ifdef CONFIG_IA32_SUPPORT
487 /* 486 /*
488 * If we're cloning an IA32 task then save the IA32 extra 487 * If we're cloning an IA32 task then save the IA32 extra
489 * state from the current task to the new task 488 * state from the current task to the new task
490 */ 489 */
491 if (IS_IA32_PROCESS(task_pt_regs(current))) { 490 if (IS_IA32_PROCESS(task_pt_regs(current))) {
492 ia32_save_state(p); 491 ia32_save_state(p);
493 if (clone_flags & CLONE_SETTLS) 492 if (clone_flags & CLONE_SETTLS)
494 retval = ia32_clone_tls(p, child_ptregs); 493 retval = ia32_clone_tls(p, child_ptregs);
495 494
496 /* Copy partially mapped page list */ 495 /* Copy partially mapped page list */
497 if (!retval) 496 if (!retval)
498 retval = ia32_copy_partial_page_list(p, clone_flags); 497 retval = ia32_copy_partial_page_list(p, clone_flags);
499 } 498 }
500 #endif 499 #endif
501 500
502 #ifdef CONFIG_PERFMON 501 #ifdef CONFIG_PERFMON
503 if (current->thread.pfm_context) 502 if (current->thread.pfm_context)
504 pfm_inherit(p, child_ptregs); 503 pfm_inherit(p, child_ptregs);
505 #endif 504 #endif
506 return retval; 505 return retval;
507 } 506 }
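
As a quick check of the NaT-bit bookkeeping described in the comment above copy_thread(): the ar.unat bit for a spill to address X is (X & 0x1ff)/8, so any two addresses that are congruent modulo 512 land on the same bit. The addresses below are made up purely for illustration:

	/* illustrative only: the child's pt_regs is a whole number of pages away
	 * from the parent's, and a page is a multiple of 512 bytes, so every
	 * spill address maps to the same ar.unat bit in parent and child. */
	static void nat_bit_example(void)
	{
		unsigned long parent_spill = 0xa000000000010140UL;	/* made-up address */
		unsigned long child_spill  = parent_spill + 4096;	/* one page away */

		unsigned long parent_bit = (parent_spill & 0x1ff) / 8;	/* (0x140 & 0x1ff)/8 == 40 */
		unsigned long child_bit  = (child_spill  & 0x1ff) / 8;	/* 4096 % 512 == 0, so also 40 */

		BUG_ON(parent_bit != child_bit);
	}
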
508 507
509 static void 508 static void
510 do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) 509 do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
511 { 510 {
512 unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; 511 unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm;
513 elf_greg_t *dst = arg; 512 elf_greg_t *dst = arg;
514 struct pt_regs *pt; 513 struct pt_regs *pt;
515 char nat; 514 char nat;
516 int i; 515 int i;
517 516
518 memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ 517 memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */
519 518
520 if (unw_unwind_to_user(info) < 0) 519 if (unw_unwind_to_user(info) < 0)
521 return; 520 return;
522 521
523 unw_get_sp(info, &sp); 522 unw_get_sp(info, &sp);
524 pt = (struct pt_regs *) (sp + 16); 523 pt = (struct pt_regs *) (sp + 16);
525 524
526 urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); 525 urbs_end = ia64_get_user_rbs_end(task, pt, &cfm);
527 526
528 if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) 527 if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0)
529 return; 528 return;
530 529
531 ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), 530 ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end),
532 &ar_rnat); 531 &ar_rnat);
533 532
534 /* 533 /*
535 * coredump format: 534 * coredump format:
536 * r0-r31 535 * r0-r31
537 * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) 536 * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
538 * predicate registers (p0-p63) 537 * predicate registers (p0-p63)
539 * b0-b7 538 * b0-b7
540 * ip cfm user-mask 539 * ip cfm user-mask
541 * ar.rsc ar.bsp ar.bspstore ar.rnat 540 * ar.rsc ar.bsp ar.bspstore ar.rnat
542 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec 541 * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
543 */ 542 */
544 543
545 /* r0 is zero */ 544 /* r0 is zero */
546 for (i = 1, mask = (1UL << i); i < 32; ++i) { 545 for (i = 1, mask = (1UL << i); i < 32; ++i) {
547 unw_get_gr(info, i, &dst[i], &nat); 546 unw_get_gr(info, i, &dst[i], &nat);
548 if (nat) 547 if (nat)
549 nat_bits |= mask; 548 nat_bits |= mask;
550 mask <<= 1; 549 mask <<= 1;
551 } 550 }
552 dst[32] = nat_bits; 551 dst[32] = nat_bits;
553 unw_get_pr(info, &dst[33]); 552 unw_get_pr(info, &dst[33]);
554 553
555 for (i = 0; i < 8; ++i) 554 for (i = 0; i < 8; ++i)
556 unw_get_br(info, i, &dst[34 + i]); 555 unw_get_br(info, i, &dst[34 + i]);
557 556
558 unw_get_rp(info, &ip); 557 unw_get_rp(info, &ip);
559 dst[42] = ip + ia64_psr(pt)->ri; 558 dst[42] = ip + ia64_psr(pt)->ri;
560 dst[43] = cfm; 559 dst[43] = cfm;
561 dst[44] = pt->cr_ipsr & IA64_PSR_UM; 560 dst[44] = pt->cr_ipsr & IA64_PSR_UM;
562 561
563 unw_get_ar(info, UNW_AR_RSC, &dst[45]); 562 unw_get_ar(info, UNW_AR_RSC, &dst[45]);
564 /* 563 /*
565 * For bsp and bspstore, unw_get_ar() would return the kernel 564 * For bsp and bspstore, unw_get_ar() would return the kernel
566 * addresses, but we need the user-level addresses instead: 565 * addresses, but we need the user-level addresses instead:
567 */ 566 */
568 dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */ 567 dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! */
569 dst[47] = pt->ar_bspstore; 568 dst[47] = pt->ar_bspstore;
570 dst[48] = ar_rnat; 569 dst[48] = ar_rnat;
571 unw_get_ar(info, UNW_AR_CCV, &dst[49]); 570 unw_get_ar(info, UNW_AR_CCV, &dst[49]);
572 unw_get_ar(info, UNW_AR_UNAT, &dst[50]); 571 unw_get_ar(info, UNW_AR_UNAT, &dst[50]);
573 unw_get_ar(info, UNW_AR_FPSR, &dst[51]); 572 unw_get_ar(info, UNW_AR_FPSR, &dst[51]);
574 dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ 573 dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
575 unw_get_ar(info, UNW_AR_LC, &dst[53]); 574 unw_get_ar(info, UNW_AR_LC, &dst[53]);
576 unw_get_ar(info, UNW_AR_EC, &dst[54]); 575 unw_get_ar(info, UNW_AR_EC, &dst[54]);
577 unw_get_ar(info, UNW_AR_CSD, &dst[55]); 576 unw_get_ar(info, UNW_AR_CSD, &dst[55]);
578 unw_get_ar(info, UNW_AR_SSD, &dst[56]); 577 unw_get_ar(info, UNW_AR_SSD, &dst[56]);
579 } 578 }
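
The comment inside do_copy_task_regs() pins down the coredump layout; purely as a reading aid, this is the slot-to-register map implied by the assignments above (the enum names are editorial, not taken from the kernel headers):

	/* illustrative index map, derived from the dst[] assignments above */
	enum {
		DUMP_GR1	= 1,	/* ... through DUMP_GR31 = 31 (r0 is hardwired to zero) */
		DUMP_NAT_BITS	= 32,
		DUMP_PR		= 33,
		DUMP_BR0	= 34,	/* ... through DUMP_BR7 = 41 */
		DUMP_IP		= 42,
		DUMP_CFM	= 43,
		DUMP_USER_UM	= 44,
		DUMP_AR_RSC	= 45,
		DUMP_AR_BSP	= 46,	/* user-level end of the RBS (urbs_end) */
		DUMP_AR_BSPSTORE = 47,
		DUMP_AR_RNAT	= 48,
		DUMP_AR_CCV	= 49,
		DUMP_AR_UNAT	= 50,
		DUMP_AR_FPSR	= 51,
		DUMP_AR_PFS	= 52,
		DUMP_AR_LC	= 53,
		DUMP_AR_EC	= 54,
		DUMP_AR_CSD	= 55,
		DUMP_AR_SSD	= 56,
	};
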
580 579
581 void 580 void
582 do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) 581 do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg)
583 { 582 {
584 elf_fpreg_t *dst = arg; 583 elf_fpreg_t *dst = arg;
585 int i; 584 int i;
586 585
587 memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ 586 memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */
588 587
589 if (unw_unwind_to_user(info) < 0) 588 if (unw_unwind_to_user(info) < 0)
590 return; 589 return;
591 590
592 /* f0 is 0.0, f1 is 1.0 */ 591 /* f0 is 0.0, f1 is 1.0 */
593 592
594 for (i = 2; i < 32; ++i) 593 for (i = 2; i < 32; ++i)
595 unw_get_fr(info, i, dst + i); 594 unw_get_fr(info, i, dst + i);
596 595
597 ia64_flush_fph(task); 596 ia64_flush_fph(task);
598 if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) 597 if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0)
599 memcpy(dst + 32, task->thread.fph, 96*16); 598 memcpy(dst + 32, task->thread.fph, 96*16);
600 } 599 }
601 600
602 void 601 void
603 do_copy_regs (struct unw_frame_info *info, void *arg) 602 do_copy_regs (struct unw_frame_info *info, void *arg)
604 { 603 {
605 do_copy_task_regs(current, info, arg); 604 do_copy_task_regs(current, info, arg);
606 } 605 }
607 606
608 void 607 void
609 do_dump_fpu (struct unw_frame_info *info, void *arg) 608 do_dump_fpu (struct unw_frame_info *info, void *arg)
610 { 609 {
611 do_dump_task_fpu(current, info, arg); 610 do_dump_task_fpu(current, info, arg);
612 } 611 }
613 612
614 int 613 int
615 dump_task_regs(struct task_struct *task, elf_gregset_t *regs) 614 dump_task_regs(struct task_struct *task, elf_gregset_t *regs)
616 { 615 {
617 struct unw_frame_info tcore_info; 616 struct unw_frame_info tcore_info;
618 617
619 if (current == task) { 618 if (current == task) {
620 unw_init_running(do_copy_regs, regs); 619 unw_init_running(do_copy_regs, regs);
621 } else { 620 } else {
622 memset(&tcore_info, 0, sizeof(tcore_info)); 621 memset(&tcore_info, 0, sizeof(tcore_info));
623 unw_init_from_blocked_task(&tcore_info, task); 622 unw_init_from_blocked_task(&tcore_info, task);
624 do_copy_task_regs(task, &tcore_info, regs); 623 do_copy_task_regs(task, &tcore_info, regs);
625 } 624 }
626 return 1; 625 return 1;
627 } 626 }
628 627
629 void 628 void
630 ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) 629 ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
631 { 630 {
632 unw_init_running(do_copy_regs, dst); 631 unw_init_running(do_copy_regs, dst);
633 } 632 }
634 633
635 int 634 int
636 dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) 635 dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst)
637 { 636 {
638 struct unw_frame_info tcore_info; 637 struct unw_frame_info tcore_info;
639 638
640 if (current == task) { 639 if (current == task) {
641 unw_init_running(do_dump_fpu, dst); 640 unw_init_running(do_dump_fpu, dst);
642 } else { 641 } else {
643 memset(&tcore_info, 0, sizeof(tcore_info)); 642 memset(&tcore_info, 0, sizeof(tcore_info));
644 unw_init_from_blocked_task(&tcore_info, task); 643 unw_init_from_blocked_task(&tcore_info, task);
645 do_dump_task_fpu(task, &tcore_info, dst); 644 do_dump_task_fpu(task, &tcore_info, dst);
646 } 645 }
647 return 1; 646 return 1;
648 } 647 }
649 648
650 int 649 int
651 dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) 650 dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
652 { 651 {
653 unw_init_running(do_dump_fpu, dst); 652 unw_init_running(do_dump_fpu, dst);
654 return 1; /* f0-f31 are always valid so we always return 1 */ 653 return 1; /* f0-f31 are always valid so we always return 1 */
655 } 654 }
656 655
657 long 656 long
658 sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, 657 sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
659 struct pt_regs *regs) 658 struct pt_regs *regs)
660 { 659 {
661 char *fname; 660 char *fname;
662 int error; 661 int error;
663 662
664 fname = getname(filename); 663 fname = getname(filename);
665 error = PTR_ERR(fname); 664 error = PTR_ERR(fname);
666 if (IS_ERR(fname)) 665 if (IS_ERR(fname))
667 goto out; 666 goto out;
668 error = do_execve(fname, argv, envp, regs); 667 error = do_execve(fname, argv, envp, regs);
669 putname(fname); 668 putname(fname);
670 out: 669 out:
671 return error; 670 return error;
672 } 671 }
673 672
674 pid_t 673 pid_t
675 kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) 674 kernel_thread (int (*fn)(void *), void *arg, unsigned long flags)
676 { 675 {
677 extern void start_kernel_thread (void); 676 extern void start_kernel_thread (void);
678 unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; 677 unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
679 struct { 678 struct {
680 struct switch_stack sw; 679 struct switch_stack sw;
681 struct pt_regs pt; 680 struct pt_regs pt;
682 } regs; 681 } regs;
683 682
684 memset(&regs, 0, sizeof(regs)); 683 memset(&regs, 0, sizeof(regs));
685 regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ 684 regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */
686 regs.pt.r1 = helper_fptr[1]; /* set GP */ 685 regs.pt.r1 = helper_fptr[1]; /* set GP */
687 regs.pt.r9 = (unsigned long) fn; /* 1st argument */ 686 regs.pt.r9 = (unsigned long) fn; /* 1st argument */
688 regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ 687 regs.pt.r11 = (unsigned long) arg; /* 2nd argument */
689 /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ 688 /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */
690 regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; 689 regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
691 regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ 690 regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */
692 regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); 691 regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
693 regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; 692 regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
694 regs.sw.pr = (1 << PRED_KERNEL_STACK); 693 regs.sw.pr = (1 << PRED_KERNEL_STACK);
695 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL); 694 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL);
696 } 695 }
697 EXPORT_SYMBOL(kernel_thread); 696 EXPORT_SYMBOL(kernel_thread);
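
kernel_thread() above packages its fn/arg/flags into a fake register frame and hands it to do_fork(); a minimal usage sketch, with the worker function and flag choice invented for illustration:

	static int my_worker(void *arg)		/* hypothetical thread body */
	{
		/* ... do some in-kernel work, then exit with a status ... */
		return 0;
	}

	static void start_worker(void)		/* illustrative caller */
	{
		pid_t pid = kernel_thread(my_worker, NULL, CLONE_FS | CLONE_FILES);

		if (pid < 0)
			printk(KERN_ERR "failed to start worker thread: %d\n", (int) pid);
	}
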
698 697
699 /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */ 698 /* This gets called from kernel_thread() via ia64_invoke_thread_helper(). */
700 int 699 int
701 kernel_thread_helper (int (*fn)(void *), void *arg) 700 kernel_thread_helper (int (*fn)(void *), void *arg)
702 { 701 {
703 #ifdef CONFIG_IA32_SUPPORT 702 #ifdef CONFIG_IA32_SUPPORT
704 if (IS_IA32_PROCESS(task_pt_regs(current))) { 703 if (IS_IA32_PROCESS(task_pt_regs(current))) {
705 /* A kernel thread is always a 64-bit process. */ 704 /* A kernel thread is always a 64-bit process. */
706 current->thread.map_base = DEFAULT_MAP_BASE; 705 current->thread.map_base = DEFAULT_MAP_BASE;
707 current->thread.task_size = DEFAULT_TASK_SIZE; 706 current->thread.task_size = DEFAULT_TASK_SIZE;
708 ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); 707 ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob);
709 ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); 708 ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1);
710 } 709 }
711 #endif 710 #endif
712 return (*fn)(arg); 711 return (*fn)(arg);
713 } 712 }
714 713
715 /* 714 /*
716 * Flush thread state. This is called when a thread does an execve(). 715 * Flush thread state. This is called when a thread does an execve().
717 */ 716 */
718 void 717 void
719 flush_thread (void) 718 flush_thread (void)
720 { 719 {
721 /* drop floating-point and debug-register state if it exists: */ 720 /* drop floating-point and debug-register state if it exists: */
722 current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); 721 current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
723 ia64_drop_fpu(current); 722 ia64_drop_fpu(current);
724 #ifdef CONFIG_IA32_SUPPORT 723 #ifdef CONFIG_IA32_SUPPORT
725 if (IS_IA32_PROCESS(task_pt_regs(current))) { 724 if (IS_IA32_PROCESS(task_pt_regs(current))) {
726 ia32_drop_partial_page_list(current); 725 ia32_drop_partial_page_list(current);
727 current->thread.task_size = IA32_PAGE_OFFSET; 726 current->thread.task_size = IA32_PAGE_OFFSET;
728 set_fs(USER_DS); 727 set_fs(USER_DS);
729 } 728 }
730 #endif 729 #endif
731 } 730 }
732 731
733 /* 732 /*
734 * Clean up state associated with current thread. This is called when 733 * Clean up state associated with current thread. This is called when
735 * the thread calls exit(). 734 * the thread calls exit().
736 */ 735 */
737 void 736 void
738 exit_thread (void) 737 exit_thread (void)
739 { 738 {
740
741 /*
742 * Remove function-return probe instances associated with this task
743 * and put them back on the free list. Do not insert an exit probe for
744 * this function, it will be disabled by kprobe_flush_task if you do.
745 */
746 kprobe_flush_task(current);
747 739
748 ia64_drop_fpu(current); 740 ia64_drop_fpu(current);
749 #ifdef CONFIG_PERFMON 741 #ifdef CONFIG_PERFMON
750 /* if needed, stop monitoring and flush state to perfmon context */ 742 /* if needed, stop monitoring and flush state to perfmon context */
751 if (current->thread.pfm_context) 743 if (current->thread.pfm_context)
752 pfm_exit_thread(current); 744 pfm_exit_thread(current);
753 745
754 /* free debug register resources */ 746 /* free debug register resources */
755 if (current->thread.flags & IA64_THREAD_DBG_VALID) 747 if (current->thread.flags & IA64_THREAD_DBG_VALID)
756 pfm_release_debug_registers(current); 748 pfm_release_debug_registers(current);
757 #endif 749 #endif
758 if (IS_IA32_PROCESS(task_pt_regs(current))) 750 if (IS_IA32_PROCESS(task_pt_regs(current)))
759 ia32_drop_partial_page_list(current); 751 ia32_drop_partial_page_list(current);
760 } 752 }
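
The hunk above is this patch's ia64 piece: exit_thread() stops calling kprobe_flush_task(), because a task that dies while a kretprobe'd schedule() is still on its stack never returns through exit_thread(), so its kretprobe instances used to leak; the recycling now happens after the task has run for the last time (the hunk that adds the new call site is outside this excerpt). Conceptually the flush just moves the task's outstanding return-probe instances back onto their kretprobe's free list; a simplified sketch of that idea, with the lock and hash-lookup names taken as assumptions about kernel/kprobes.c of this era rather than a copy of it:

	#include <linux/kprobes.h>

	/* sketch only: give every kretprobe_instance still owned by @tsk back
	 * to its kretprobe so it can be reused on a later function entry. */
	static void recycle_task_rp_instances(struct task_struct *tsk)
	{
		struct kretprobe_instance *ri;
		struct hlist_node *node, *tmp;
		unsigned long flags;

		spin_lock_irqsave(&kretprobe_lock, flags);		/* assumed lock name */
		hlist_for_each_entry_safe(ri, node, tmp,
					  kretprobe_inst_table_head(tsk), hlist) {	/* assumed helper */
			hlist_del(&ri->hlist);				/* off the per-task table */
			hlist_del(&ri->uflist);				/* off the used list */
			hlist_add_head(&ri->uflist, &ri->rp->free_instances);	/* back on the free list */
		}
		spin_unlock_irqrestore(&kretprobe_lock, flags);
	}
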
761 753
762 unsigned long 754 unsigned long
763 get_wchan (struct task_struct *p) 755 get_wchan (struct task_struct *p)
764 { 756 {
765 struct unw_frame_info info; 757 struct unw_frame_info info;
766 unsigned long ip; 758 unsigned long ip;
767 int count = 0; 759 int count = 0;
768 760
769 /* 761 /*
770 * Note: p may not be a blocked task (it could be current or 762 * Note: p may not be a blocked task (it could be current or
771 * another process running on some other CPU). Rather than 763 * another process running on some other CPU). Rather than
772 * trying to determine if p is really blocked, we just assume 764 * trying to determine if p is really blocked, we just assume
773 * it's blocked and rely on the unwind routines to fail 765 * it's blocked and rely on the unwind routines to fail
774 * gracefully if the process wasn't really blocked after all. 766 * gracefully if the process wasn't really blocked after all.
775 * --davidm 99/12/15 767 * --davidm 99/12/15
776 */ 768 */
777 unw_init_from_blocked_task(&info, p); 769 unw_init_from_blocked_task(&info, p);
778 do { 770 do {
779 if (unw_unwind(&info) < 0) 771 if (unw_unwind(&info) < 0)
780 return 0; 772 return 0;
781 unw_get_ip(&info, &ip); 773 unw_get_ip(&info, &ip);
782 if (!in_sched_functions(ip)) 774 if (!in_sched_functions(ip))
783 return ip; 775 return ip;
784 } while (count++ < 16); 776 } while (count++ < 16);
785 return 0; 777 return 0;
786 } 778 }
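
get_wchan() is what lets /proc-style tooling report which non-scheduler function a sleeping task is blocked in; a tiny illustrative consumer, reusing the print_symbol() helper already used in show_regs() above:

	static void report_wchan(struct task_struct *p)		/* illustrative helper */
	{
		unsigned long wchan = get_wchan(p);

		if (wchan)
			print_symbol("blocked in %s\n", wchan);
		else
			printk("running, or the unwind gave up\n");
	}
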
787 779
788 void 780 void
789 cpu_halt (void) 781 cpu_halt (void)
790 { 782 {
791 pal_power_mgmt_info_u_t power_info[8]; 783 pal_power_mgmt_info_u_t power_info[8];
792 unsigned long min_power; 784 unsigned long min_power;
793 int i, min_power_state; 785 int i, min_power_state;
794 786
795 if (ia64_pal_halt_info(power_info) != 0) 787 if (ia64_pal_halt_info(power_info) != 0)
796 return; 788 return;
797 789
798 min_power_state = 0; 790 min_power_state = 0;
799 min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; 791 min_power = power_info[0].pal_power_mgmt_info_s.power_consumption;
800 for (i = 1; i < 8; ++i) 792 for (i = 1; i < 8; ++i)
801 if (power_info[i].pal_power_mgmt_info_s.im 793 if (power_info[i].pal_power_mgmt_info_s.im
802 && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { 794 && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) {
803 min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; 795 min_power = power_info[i].pal_power_mgmt_info_s.power_consumption;
804 min_power_state = i; 796 min_power_state = i;
805 } 797 }
806 798
807 while (1) 799 while (1)
808 ia64_pal_halt(min_power_state); 800 ia64_pal_halt(min_power_state);
809 } 801 }
810 802
811 void 803 void
812 machine_restart (char *restart_cmd) 804 machine_restart (char *restart_cmd)
813 { 805 {
814 (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); 806 (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0);
815 (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); 807 (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL);
816 } 808 }
817 809
818 void 810 void
819 machine_halt (void) 811 machine_halt (void)
820 { 812 {
821 (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); 813 (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0);
822 cpu_halt(); 814 cpu_halt();
823 } 815 }
824 816
825 void 817 void
826 machine_power_off (void) 818 machine_power_off (void)
827 { 819 {
828 if (pm_power_off) 820 if (pm_power_off)
829 pm_power_off(); 821 pm_power_off();
830 machine_halt(); 822 machine_halt();
831 } 823 }
832 824
833 825
arch/powerpc/kernel/process.c
1 /* 1 /*
2 * Derived from "arch/i386/kernel/process.c" 2 * Derived from "arch/i386/kernel/process.c"
3 * Copyright (C) 1995 Linus Torvalds 3 * Copyright (C) 1995 Linus Torvalds
4 * 4 *
5 * Updated and modified by Cort Dougan (cort@cs.nmt.edu) and 5 * Updated and modified by Cort Dougan (cort@cs.nmt.edu) and
6 * Paul Mackerras (paulus@cs.anu.edu.au) 6 * Paul Mackerras (paulus@cs.anu.edu.au)
7 * 7 *
8 * PowerPC version 8 * PowerPC version
9 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 9 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version 13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 */ 15 */
16 16
17 #include <linux/config.h> 17 #include <linux/config.h>
18 #include <linux/errno.h> 18 #include <linux/errno.h>
19 #include <linux/sched.h> 19 #include <linux/sched.h>
20 #include <linux/kernel.h> 20 #include <linux/kernel.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/smp.h> 22 #include <linux/smp.h>
23 #include <linux/smp_lock.h> 23 #include <linux/smp_lock.h>
24 #include <linux/stddef.h> 24 #include <linux/stddef.h>
25 #include <linux/unistd.h> 25 #include <linux/unistd.h>
26 #include <linux/ptrace.h> 26 #include <linux/ptrace.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/user.h> 28 #include <linux/user.h>
29 #include <linux/elf.h> 29 #include <linux/elf.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/prctl.h> 31 #include <linux/prctl.h>
32 #include <linux/init_task.h> 32 #include <linux/init_task.h>
33 #include <linux/module.h> 33 #include <linux/module.h>
34 #include <linux/kallsyms.h> 34 #include <linux/kallsyms.h>
35 #include <linux/mqueue.h> 35 #include <linux/mqueue.h>
36 #include <linux/hardirq.h> 36 #include <linux/hardirq.h>
37 #include <linux/utsname.h> 37 #include <linux/utsname.h>
38 #include <linux/kprobes.h>
39 38
40 #include <asm/pgtable.h> 39 #include <asm/pgtable.h>
41 #include <asm/uaccess.h> 40 #include <asm/uaccess.h>
42 #include <asm/system.h> 41 #include <asm/system.h>
43 #include <asm/io.h> 42 #include <asm/io.h>
44 #include <asm/processor.h> 43 #include <asm/processor.h>
45 #include <asm/mmu.h> 44 #include <asm/mmu.h>
46 #include <asm/prom.h> 45 #include <asm/prom.h>
47 #include <asm/machdep.h> 46 #include <asm/machdep.h>
48 #include <asm/time.h> 47 #include <asm/time.h>
49 #ifdef CONFIG_PPC64 48 #ifdef CONFIG_PPC64
50 #include <asm/firmware.h> 49 #include <asm/firmware.h>
51 #endif 50 #endif
52 51
53 extern unsigned long _get_SP(void); 52 extern unsigned long _get_SP(void);
54 53
55 #ifndef CONFIG_SMP 54 #ifndef CONFIG_SMP
56 struct task_struct *last_task_used_math = NULL; 55 struct task_struct *last_task_used_math = NULL;
57 struct task_struct *last_task_used_altivec = NULL; 56 struct task_struct *last_task_used_altivec = NULL;
58 struct task_struct *last_task_used_spe = NULL; 57 struct task_struct *last_task_used_spe = NULL;
59 #endif 58 #endif
60 59
61 /* 60 /*
62 * Make sure the floating-point register state in the 61 * Make sure the floating-point register state in the
63 * thread_struct is up to date for task tsk. 62 * thread_struct is up to date for task tsk.
64 */ 63 */
65 void flush_fp_to_thread(struct task_struct *tsk) 64 void flush_fp_to_thread(struct task_struct *tsk)
66 { 65 {
67 if (tsk->thread.regs) { 66 if (tsk->thread.regs) {
68 /* 67 /*
69 * We need to disable preemption here because if we didn't, 68 * We need to disable preemption here because if we didn't,
70 * another process could get scheduled after the regs->msr 69 * another process could get scheduled after the regs->msr
71 * test but before we have finished saving the FP registers 70 * test but before we have finished saving the FP registers
72 * to the thread_struct. That process could take over the 71 * to the thread_struct. That process could take over the
73 * FPU, and then when we get scheduled again we would store 72 * FPU, and then when we get scheduled again we would store
74 * bogus values for the remaining FP registers. 73 * bogus values for the remaining FP registers.
75 */ 74 */
76 preempt_disable(); 75 preempt_disable();
77 if (tsk->thread.regs->msr & MSR_FP) { 76 if (tsk->thread.regs->msr & MSR_FP) {
78 #ifdef CONFIG_SMP 77 #ifdef CONFIG_SMP
79 /* 78 /*
80 * This should only ever be called for current or 79 * This should only ever be called for current or
81 * for a stopped child process. Since we save away 80 * for a stopped child process. Since we save away
82 * the FP register state on context switch on SMP, 81 * the FP register state on context switch on SMP,
83 * there is something wrong if a stopped child appears 82 * there is something wrong if a stopped child appears
84 * to still have its FP state in the CPU registers. 83 * to still have its FP state in the CPU registers.
85 */ 84 */
86 BUG_ON(tsk != current); 85 BUG_ON(tsk != current);
87 #endif 86 #endif
88 giveup_fpu(current); 87 giveup_fpu(current);
89 } 88 }
90 preempt_enable(); 89 preempt_enable();
91 } 90 }
92 } 91 }
93 92
94 void enable_kernel_fp(void) 93 void enable_kernel_fp(void)
95 { 94 {
96 WARN_ON(preemptible()); 95 WARN_ON(preemptible());
97 96
98 #ifdef CONFIG_SMP 97 #ifdef CONFIG_SMP
99 if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) 98 if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
100 giveup_fpu(current); 99 giveup_fpu(current);
101 else 100 else
102 giveup_fpu(NULL); /* just enables FP for kernel */ 101 giveup_fpu(NULL); /* just enables FP for kernel */
103 #else 102 #else
104 giveup_fpu(last_task_used_math); 103 giveup_fpu(last_task_used_math);
105 #endif /* CONFIG_SMP */ 104 #endif /* CONFIG_SMP */
106 } 105 }
107 EXPORT_SYMBOL(enable_kernel_fp); 106 EXPORT_SYMBOL(enable_kernel_fp);
108 107
109 int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) 108 int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs)
110 { 109 {
111 if (!tsk->thread.regs) 110 if (!tsk->thread.regs)
112 return 0; 111 return 0;
113 flush_fp_to_thread(current); 112 flush_fp_to_thread(current);
114 113
115 memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs)); 114 memcpy(fpregs, &tsk->thread.fpr[0], sizeof(*fpregs));
116 115
117 return 1; 116 return 1;
118 } 117 }
119 118
120 #ifdef CONFIG_ALTIVEC 119 #ifdef CONFIG_ALTIVEC
121 void enable_kernel_altivec(void) 120 void enable_kernel_altivec(void)
122 { 121 {
123 WARN_ON(preemptible()); 122 WARN_ON(preemptible());
124 123
125 #ifdef CONFIG_SMP 124 #ifdef CONFIG_SMP
126 if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) 125 if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
127 giveup_altivec(current); 126 giveup_altivec(current);
128 else 127 else
129 giveup_altivec(NULL); /* just enable AltiVec for kernel - force */ 128 giveup_altivec(NULL); /* just enable AltiVec for kernel - force */
130 #else 129 #else
131 giveup_altivec(last_task_used_altivec); 130 giveup_altivec(last_task_used_altivec);
132 #endif /* CONFIG_SMP */ 131 #endif /* CONFIG_SMP */
133 } 132 }
134 EXPORT_SYMBOL(enable_kernel_altivec); 133 EXPORT_SYMBOL(enable_kernel_altivec);
135 134
136 /* 135 /*
137 * Make sure the VMX/Altivec register state in the 136 * Make sure the VMX/Altivec register state in the
138 * thread_struct is up to date for task tsk. 137 * thread_struct is up to date for task tsk.
139 */ 138 */
140 void flush_altivec_to_thread(struct task_struct *tsk) 139 void flush_altivec_to_thread(struct task_struct *tsk)
141 { 140 {
142 if (tsk->thread.regs) { 141 if (tsk->thread.regs) {
143 preempt_disable(); 142 preempt_disable();
144 if (tsk->thread.regs->msr & MSR_VEC) { 143 if (tsk->thread.regs->msr & MSR_VEC) {
145 #ifdef CONFIG_SMP 144 #ifdef CONFIG_SMP
146 BUG_ON(tsk != current); 145 BUG_ON(tsk != current);
147 #endif 146 #endif
148 giveup_altivec(current); 147 giveup_altivec(current);
149 } 148 }
150 preempt_enable(); 149 preempt_enable();
151 } 150 }
152 } 151 }
153 152
154 int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs) 153 int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs)
155 { 154 {
156 flush_altivec_to_thread(current); 155 flush_altivec_to_thread(current);
157 memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs)); 156 memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs));
158 return 1; 157 return 1;
159 } 158 }
160 #endif /* CONFIG_ALTIVEC */ 159 #endif /* CONFIG_ALTIVEC */
161 160
162 #ifdef CONFIG_SPE 161 #ifdef CONFIG_SPE
163 162
164 void enable_kernel_spe(void) 163 void enable_kernel_spe(void)
165 { 164 {
166 WARN_ON(preemptible()); 165 WARN_ON(preemptible());
167 166
168 #ifdef CONFIG_SMP 167 #ifdef CONFIG_SMP
169 if (current->thread.regs && (current->thread.regs->msr & MSR_SPE)) 168 if (current->thread.regs && (current->thread.regs->msr & MSR_SPE))
170 giveup_spe(current); 169 giveup_spe(current);
171 else 170 else
172 giveup_spe(NULL); /* just enable SPE for kernel - force */ 171 giveup_spe(NULL); /* just enable SPE for kernel - force */
173 #else 172 #else
174 giveup_spe(last_task_used_spe); 173 giveup_spe(last_task_used_spe);
175 #endif /* CONFIG_SMP */ 174 #endif /* CONFIG_SMP */
176 } 175 }
177 EXPORT_SYMBOL(enable_kernel_spe); 176 EXPORT_SYMBOL(enable_kernel_spe);
178 177
179 void flush_spe_to_thread(struct task_struct *tsk) 178 void flush_spe_to_thread(struct task_struct *tsk)
180 { 179 {
181 if (tsk->thread.regs) { 180 if (tsk->thread.regs) {
182 preempt_disable(); 181 preempt_disable();
183 if (tsk->thread.regs->msr & MSR_SPE) { 182 if (tsk->thread.regs->msr & MSR_SPE) {
184 #ifdef CONFIG_SMP 183 #ifdef CONFIG_SMP
185 BUG_ON(tsk != current); 184 BUG_ON(tsk != current);
186 #endif 185 #endif
187 giveup_spe(current); 186 giveup_spe(current);
188 } 187 }
189 preempt_enable(); 188 preempt_enable();
190 } 189 }
191 } 190 }
192 191
193 int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs) 192 int dump_spe(struct pt_regs *regs, elf_vrregset_t *evrregs)
194 { 193 {
195 flush_spe_to_thread(current); 194 flush_spe_to_thread(current);
196 /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */ 195 /* We copy u32 evr[32] + u64 acc + u32 spefscr -> 35 */
197 memcpy(evrregs, &current->thread.evr[0], sizeof(u32) * 35); 196 memcpy(evrregs, &current->thread.evr[0], sizeof(u32) * 35);
198 return 1; 197 return 1;
199 } 198 }
200 #endif /* CONFIG_SPE */ 199 #endif /* CONFIG_SPE */
201 200
202 #ifndef CONFIG_SMP 201 #ifndef CONFIG_SMP
203 /* 202 /*
204 * If we are doing lazy switching of CPU state (FP, altivec or SPE), 203 * If we are doing lazy switching of CPU state (FP, altivec or SPE),
205 * and the current task has some state, discard it. 204 * and the current task has some state, discard it.
206 */ 205 */
207 void discard_lazy_cpu_state(void) 206 void discard_lazy_cpu_state(void)
208 { 207 {
209 preempt_disable(); 208 preempt_disable();
210 if (last_task_used_math == current) 209 if (last_task_used_math == current)
211 last_task_used_math = NULL; 210 last_task_used_math = NULL;
212 #ifdef CONFIG_ALTIVEC 211 #ifdef CONFIG_ALTIVEC
213 if (last_task_used_altivec == current) 212 if (last_task_used_altivec == current)
214 last_task_used_altivec = NULL; 213 last_task_used_altivec = NULL;
215 #endif /* CONFIG_ALTIVEC */ 214 #endif /* CONFIG_ALTIVEC */
216 #ifdef CONFIG_SPE 215 #ifdef CONFIG_SPE
217 if (last_task_used_spe == current) 216 if (last_task_used_spe == current)
218 last_task_used_spe = NULL; 217 last_task_used_spe = NULL;
219 #endif 218 #endif
220 preempt_enable(); 219 preempt_enable();
221 } 220 }
222 #endif /* CONFIG_SMP */ 221 #endif /* CONFIG_SMP */
223 222
224 #ifdef CONFIG_PPC_MERGE /* XXX for now */ 223 #ifdef CONFIG_PPC_MERGE /* XXX for now */
225 int set_dabr(unsigned long dabr) 224 int set_dabr(unsigned long dabr)
226 { 225 {
227 if (ppc_md.set_dabr) 226 if (ppc_md.set_dabr)
228 return ppc_md.set_dabr(dabr); 227 return ppc_md.set_dabr(dabr);
229 228
230 mtspr(SPRN_DABR, dabr); 229 mtspr(SPRN_DABR, dabr);
231 return 0; 230 return 0;
232 } 231 }
233 #endif 232 #endif
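
set_dabr(), together with the per-CPU current_dabr cache consulted in __switch_to() below, is how a hardware data breakpoint stays programmed across context switches; the low-order bits of the DABR value select what kind of access matches. A hedged sketch of arming a watchpoint, where the value passed in is assumed to already carry those control bits:

	/* sketch only: arm a data breakpoint for the current task. */
	static int watch_address(unsigned long dabr_value)	/* illustrative helper */
	{
		current->thread.dabr = dabr_value;	/* so __switch_to() keeps it programmed later */
		return set_dabr(dabr_value);		/* program the register on this CPU right away */
	}
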
234 233
235 #ifdef CONFIG_PPC64 234 #ifdef CONFIG_PPC64
236 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array); 235 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array);
237 static DEFINE_PER_CPU(unsigned long, current_dabr); 236 static DEFINE_PER_CPU(unsigned long, current_dabr);
238 #endif 237 #endif
239 238
240 struct task_struct *__switch_to(struct task_struct *prev, 239 struct task_struct *__switch_to(struct task_struct *prev,
241 struct task_struct *new) 240 struct task_struct *new)
242 { 241 {
243 struct thread_struct *new_thread, *old_thread; 242 struct thread_struct *new_thread, *old_thread;
244 unsigned long flags; 243 unsigned long flags;
245 struct task_struct *last; 244 struct task_struct *last;
246 245
247 #ifdef CONFIG_SMP 246 #ifdef CONFIG_SMP
248 /* avoid complexity of lazy save/restore of fpu 247 /* avoid complexity of lazy save/restore of fpu
249 * by just saving it every time we switch out if 248 * by just saving it every time we switch out if
250 * this task used the fpu during the last quantum. 249 * this task used the fpu during the last quantum.
251 * 250 *
252 * If it tries to use the fpu again, it'll trap and 251 * If it tries to use the fpu again, it'll trap and
253 * reload its fp regs. So we don't have to do a restore 252 * reload its fp regs. So we don't have to do a restore
254 * every switch, just a save. 253 * every switch, just a save.
255 * -- Cort 254 * -- Cort
256 */ 255 */
257 if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP)) 256 if (prev->thread.regs && (prev->thread.regs->msr & MSR_FP))
258 giveup_fpu(prev); 257 giveup_fpu(prev);
259 #ifdef CONFIG_ALTIVEC 258 #ifdef CONFIG_ALTIVEC
260 /* 259 /*
261 * If the previous thread used altivec in the last quantum 260 * If the previous thread used altivec in the last quantum
262 * (thus changing altivec regs) then save them. 261 * (thus changing altivec regs) then save them.
263 * We used to check the VRSAVE register but not all apps 262 * We used to check the VRSAVE register but not all apps
264 * set it, so we don't rely on it now (and in fact we need 263 * set it, so we don't rely on it now (and in fact we need
265 * to save & restore VSCR even if VRSAVE == 0). -- paulus 264 * to save & restore VSCR even if VRSAVE == 0). -- paulus
266 * 265 *
267 * On SMP we always save/restore altivec regs just to avoid the 266 * On SMP we always save/restore altivec regs just to avoid the
268 * complexity of changing processors. 267 * complexity of changing processors.
269 * -- Cort 268 * -- Cort
270 */ 269 */
271 if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC)) 270 if (prev->thread.regs && (prev->thread.regs->msr & MSR_VEC))
272 giveup_altivec(prev); 271 giveup_altivec(prev);
273 #endif /* CONFIG_ALTIVEC */ 272 #endif /* CONFIG_ALTIVEC */
274 #ifdef CONFIG_SPE 273 #ifdef CONFIG_SPE
275 /* 274 /*
276 * If the previous thread used spe in the last quantum 275 * If the previous thread used spe in the last quantum
277 * (thus changing spe regs) then save them. 276 * (thus changing spe regs) then save them.
278 * 277 *
279 * On SMP we always save/restore spe regs just to avoid the 278 * On SMP we always save/restore spe regs just to avoid the
280 * complexity of changing processors. 279 * complexity of changing processors.
281 */ 280 */
282 if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE))) 281 if ((prev->thread.regs && (prev->thread.regs->msr & MSR_SPE)))
283 giveup_spe(prev); 282 giveup_spe(prev);
284 #endif /* CONFIG_SPE */ 283 #endif /* CONFIG_SPE */
285 284
286 #else /* CONFIG_SMP */ 285 #else /* CONFIG_SMP */
287 #ifdef CONFIG_ALTIVEC 286 #ifdef CONFIG_ALTIVEC
288 /* Avoid the trap. On smp this never happens since 287 /* Avoid the trap. On smp this never happens since
289 * we don't set last_task_used_altivec -- Cort 288 * we don't set last_task_used_altivec -- Cort
290 */ 289 */
291 if (new->thread.regs && last_task_used_altivec == new) 290 if (new->thread.regs && last_task_used_altivec == new)
292 new->thread.regs->msr |= MSR_VEC; 291 new->thread.regs->msr |= MSR_VEC;
293 #endif /* CONFIG_ALTIVEC */ 292 #endif /* CONFIG_ALTIVEC */
294 #ifdef CONFIG_SPE 293 #ifdef CONFIG_SPE
295 /* Avoid the trap. On smp this never happens since 294 /* Avoid the trap. On smp this never happens since
296 * we don't set last_task_used_spe 295 * we don't set last_task_used_spe
297 */ 296 */
298 if (new->thread.regs && last_task_used_spe == new) 297 if (new->thread.regs && last_task_used_spe == new)
299 new->thread.regs->msr |= MSR_SPE; 298 new->thread.regs->msr |= MSR_SPE;
300 #endif /* CONFIG_SPE */ 299 #endif /* CONFIG_SPE */
301 300
302 #endif /* CONFIG_SMP */ 301 #endif /* CONFIG_SMP */
303 302
304 #ifdef CONFIG_PPC64 /* for now */ 303 #ifdef CONFIG_PPC64 /* for now */
305 if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) { 304 if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) {
306 set_dabr(new->thread.dabr); 305 set_dabr(new->thread.dabr);
307 __get_cpu_var(current_dabr) = new->thread.dabr; 306 __get_cpu_var(current_dabr) = new->thread.dabr;
308 } 307 }
309 308
310 flush_tlb_pending(); 309 flush_tlb_pending();
311 #endif 310 #endif
312 311
313 new_thread = &new->thread; 312 new_thread = &new->thread;
314 old_thread = &current->thread; 313 old_thread = &current->thread;
315 314
316 #ifdef CONFIG_PPC64 315 #ifdef CONFIG_PPC64
317 /* 316 /*
318 * Collect processor utilization data per process 317 * Collect processor utilization data per process
319 */ 318 */
320 if (firmware_has_feature(FW_FEATURE_SPLPAR)) { 319 if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
321 struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array); 320 struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
322 long unsigned start_tb, current_tb; 321 long unsigned start_tb, current_tb;
323 start_tb = old_thread->start_tb; 322 start_tb = old_thread->start_tb;
324 cu->current_tb = current_tb = mfspr(SPRN_PURR); 323 cu->current_tb = current_tb = mfspr(SPRN_PURR);
325 old_thread->accum_tb += (current_tb - start_tb); 324 old_thread->accum_tb += (current_tb - start_tb);
326 new_thread->start_tb = current_tb; 325 new_thread->start_tb = current_tb;
327 } 326 }
328 #endif 327 #endif
329 328
330 local_irq_save(flags); 329 local_irq_save(flags);
331 330
332 account_system_vtime(current); 331 account_system_vtime(current);
333 account_process_vtime(current); 332 account_process_vtime(current);
334 calculate_steal_time(); 333 calculate_steal_time();
335 334
336 last = _switch(old_thread, new_thread); 335 last = _switch(old_thread, new_thread);
337 336
338 local_irq_restore(flags); 337 local_irq_restore(flags);
339 338
340 return last; 339 return last;
341 } 340 }
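
The CONFIG_SMP block above is the "save eagerly on switch-out, restore lazily on first use" policy the comments describe: if the outgoing task's saved MSR says it touched the FPU, AltiVec or SPE during its last quantum, the live registers are flushed into its thread_struct before the switch, and the next use traps and reloads them. A minimal user-space sketch of that policy, with toy names standing in for giveup_fpu() and thread_struct (only giveup_fpu() itself appears in the hunk; everything else here is illustrative):

    #include <stdio.h>

    #define TOY_MSR_FP 0x2000UL          /* stand-in for MSR_FP from asm/reg.h */

    struct toy_task {
        unsigned long msr;               /* saved user MSR bits               */
        double live_fpr;                 /* the "live" FP register file       */
        double saved_fpr;                /* where thread_struct keeps it      */
    };

    /* Eager save on switch-out, analogous to giveup_fpu(prev) above. */
    static void toy_giveup_fpu(struct toy_task *t)
    {
        t->saved_fpr = t->live_fpr;      /* flush live state to the task      */
        t->msr &= ~TOY_MSR_FP;           /* next FP use traps and reloads     */
    }

    static void toy_switch_out(struct toy_task *prev)
    {
        if (prev->msr & TOY_MSR_FP)      /* used the FPU in its last quantum? */
            toy_giveup_fpu(prev);
    }

    int main(void)
    {
        struct toy_task t = { .msr = TOY_MSR_FP, .live_fpr = 3.14 };
        toy_switch_out(&t);
        printf("saved=%.2f msr=%#lx\n", t.saved_fpr, t.msr);
        return 0;
    }

The non-SMP branch is the lazy-restore half: if the incoming task was also the last user of the unit, its MSR bit is simply turned back on and the registers still sitting in the hardware are reused without a reload.
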
342 341
343 static int instructions_to_print = 16; 342 static int instructions_to_print = 16;
344 343
345 #ifdef CONFIG_PPC64 344 #ifdef CONFIG_PPC64
346 #define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \ 345 #define BAD_PC(pc) ((REGION_ID(pc) != KERNEL_REGION_ID) && \
347 (REGION_ID(pc) != VMALLOC_REGION_ID)) 346 (REGION_ID(pc) != VMALLOC_REGION_ID))
348 #else 347 #else
349 #define BAD_PC(pc) ((pc) < KERNELBASE) 348 #define BAD_PC(pc) ((pc) < KERNELBASE)
350 #endif 349 #endif
351 350
352 static void show_instructions(struct pt_regs *regs) 351 static void show_instructions(struct pt_regs *regs)
353 { 352 {
354 int i; 353 int i;
355 unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * 354 unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
356 sizeof(int)); 355 sizeof(int));
357 356
358 printk("Instruction dump:"); 357 printk("Instruction dump:");
359 358
360 for (i = 0; i < instructions_to_print; i++) { 359 for (i = 0; i < instructions_to_print; i++) {
361 int instr; 360 int instr;
362 361
363 if (!(i % 8)) 362 if (!(i % 8))
364 printk("\n"); 363 printk("\n");
365 364
366 if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) { 365 if (BAD_PC(pc) || __get_user(instr, (unsigned int *)pc)) {
367 printk("XXXXXXXX "); 366 printk("XXXXXXXX ");
368 } else { 367 } else {
369 if (regs->nip == pc) 368 if (regs->nip == pc)
370 printk("<%08x> ", instr); 369 printk("<%08x> ", instr);
371 else 370 else
372 printk("%08x ", instr); 371 printk("%08x ", instr);
373 } 372 }
374 373
375 pc += sizeof(int); 374 pc += sizeof(int);
376 } 375 }
377 376
378 printk("\n"); 377 printk("\n");
379 } 378 }
380 379
381 static struct regbit { 380 static struct regbit {
382 unsigned long bit; 381 unsigned long bit;
383 const char *name; 382 const char *name;
384 } msr_bits[] = { 383 } msr_bits[] = {
385 {MSR_EE, "EE"}, 384 {MSR_EE, "EE"},
386 {MSR_PR, "PR"}, 385 {MSR_PR, "PR"},
387 {MSR_FP, "FP"}, 386 {MSR_FP, "FP"},
388 {MSR_ME, "ME"}, 387 {MSR_ME, "ME"},
389 {MSR_IR, "IR"}, 388 {MSR_IR, "IR"},
390 {MSR_DR, "DR"}, 389 {MSR_DR, "DR"},
391 {0, NULL} 390 {0, NULL}
392 }; 391 };
393 392
394 static void printbits(unsigned long val, struct regbit *bits) 393 static void printbits(unsigned long val, struct regbit *bits)
395 { 394 {
396 const char *sep = ""; 395 const char *sep = "";
397 396
398 printk("<"); 397 printk("<");
399 for (; bits->bit; ++bits) 398 for (; bits->bit; ++bits)
400 if (val & bits->bit) { 399 if (val & bits->bit) {
401 printk("%s%s", sep, bits->name); 400 printk("%s%s", sep, bits->name);
402 sep = ","; 401 sep = ",";
403 } 402 }
404 printk(">"); 403 printk(">");
405 } 404 }
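
printbits() just walks the msr_bits table and prints the names of whichever bits are set, comma-separated between angle brackets; show_regs() below calls it right after printing the raw MSR value. A stand-alone copy for illustration, with made-up bit values (the real ones come from asm/reg.h):

    #include <stdio.h>

    struct regbit { unsigned long bit; const char *name; };

    static const struct regbit demo_bits[] = {
        {0x8000, "EE"}, {0x4000, "PR"}, {0x2000, "FP"},
        {0x1000, "ME"}, {0x0020, "IR"}, {0x0010, "DR"}, {0, NULL}
    };

    static void demo_printbits(unsigned long val, const struct regbit *bits)
    {
        const char *sep = "";
        printf("<");
        for (; bits->bit; ++bits)
            if (val & bits->bit) {
                printf("%s%s", sep, bits->name);
                sep = ",";
            }
        printf(">");
    }

    int main(void)
    {
        demo_printbits(0x9030, demo_bits);   /* prints <EE,ME,IR,DR> */
        printf("\n");
        return 0;
    }

So a typical kernel-mode MSR renders as something like <EE,ME,IR,DR> on the MSR: line of an oops.
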
406 405
407 #ifdef CONFIG_PPC64 406 #ifdef CONFIG_PPC64
408 #define REG "%016lX" 407 #define REG "%016lX"
409 #define REGS_PER_LINE 4 408 #define REGS_PER_LINE 4
410 #define LAST_VOLATILE 13 409 #define LAST_VOLATILE 13
411 #else 410 #else
412 #define REG "%08lX" 411 #define REG "%08lX"
413 #define REGS_PER_LINE 8 412 #define REGS_PER_LINE 8
414 #define LAST_VOLATILE 12 413 #define LAST_VOLATILE 12
415 #endif 414 #endif
416 415
417 void show_regs(struct pt_regs * regs) 416 void show_regs(struct pt_regs * regs)
418 { 417 {
419 int i, trap; 418 int i, trap;
420 419
421 printk("NIP: "REG" LR: "REG" CTR: "REG"\n", 420 printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
422 regs->nip, regs->link, regs->ctr); 421 regs->nip, regs->link, regs->ctr);
423 printk("REGS: %p TRAP: %04lx %s (%s)\n", 422 printk("REGS: %p TRAP: %04lx %s (%s)\n",
424 regs, regs->trap, print_tainted(), system_utsname.release); 423 regs, regs->trap, print_tainted(), system_utsname.release);
425 printk("MSR: "REG" ", regs->msr); 424 printk("MSR: "REG" ", regs->msr);
426 printbits(regs->msr, msr_bits); 425 printbits(regs->msr, msr_bits);
427 printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); 426 printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer);
428 trap = TRAP(regs); 427 trap = TRAP(regs);
429 if (trap == 0x300 || trap == 0x600) 428 if (trap == 0x300 || trap == 0x600)
430 printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); 429 printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr);
431 printk("TASK = %p[%d] '%s' THREAD: %p", 430 printk("TASK = %p[%d] '%s' THREAD: %p",
432 current, current->pid, current->comm, task_thread_info(current)); 431 current, current->pid, current->comm, task_thread_info(current));
433 432
434 #ifdef CONFIG_SMP 433 #ifdef CONFIG_SMP
435 printk(" CPU: %d", smp_processor_id()); 434 printk(" CPU: %d", smp_processor_id());
436 #endif /* CONFIG_SMP */ 435 #endif /* CONFIG_SMP */
437 436
438 for (i = 0; i < 32; i++) { 437 for (i = 0; i < 32; i++) {
439 if ((i % REGS_PER_LINE) == 0) 438 if ((i % REGS_PER_LINE) == 0)
440 printk("\n" KERN_INFO "GPR%02d: ", i); 439 printk("\n" KERN_INFO "GPR%02d: ", i);
441 printk(REG " ", regs->gpr[i]); 440 printk(REG " ", regs->gpr[i]);
442 if (i == LAST_VOLATILE && !FULL_REGS(regs)) 441 if (i == LAST_VOLATILE && !FULL_REGS(regs))
443 break; 442 break;
444 } 443 }
445 printk("\n"); 444 printk("\n");
446 #ifdef CONFIG_KALLSYMS 445 #ifdef CONFIG_KALLSYMS
447 /* 446 /*
448 * Lookup NIP late so we have the best chance of getting the 447 * Lookup NIP late so we have the best chance of getting the
449 * above info out without failing 448 * above info out without failing
450 */ 449 */
451 printk("NIP ["REG"] ", regs->nip); 450 printk("NIP ["REG"] ", regs->nip);
452 print_symbol("%s\n", regs->nip); 451 print_symbol("%s\n", regs->nip);
453 printk("LR ["REG"] ", regs->link); 452 printk("LR ["REG"] ", regs->link);
454 print_symbol("%s\n", regs->link); 453 print_symbol("%s\n", regs->link);
455 #endif 454 #endif
456 show_stack(current, (unsigned long *) regs->gpr[1]); 455 show_stack(current, (unsigned long *) regs->gpr[1]);
457 if (!user_mode(regs)) 456 if (!user_mode(regs))
458 show_instructions(regs); 457 show_instructions(regs);
459 } 458 }
460 459
461 void exit_thread(void) 460 void exit_thread(void)
462 { 461 {
463 kprobe_flush_task(current);
464 discard_lazy_cpu_state(); 462 discard_lazy_cpu_state();
465 } 463 }
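
This one-line deletion is the powerpc half of what the changelog describes: exit_thread() no longer flushes the task's kretprobe instances, because a probed task that exits inside schedule() never returns through its return-probe trampoline, and the unreturned instances are now recycled on the exiting task's behalf (the x86_64 hunk below drops the same call; the new recycling site is in parts of this commit that are not shown here). The sketch below only illustrates the bookkeeping involved; the names and lists are simplified stand-ins, not the kernel's kretprobe implementation:

    #include <stdio.h>

    /* Each dispatched return-probe instance remembers which task it was
     * handed to; "flushing" a task moves its unreturned instances back onto
     * the owning probe's free list so they can be reused instead of leaked. */
    struct toy_instance {
        int task_id;
        struct toy_instance *next;
    };

    struct toy_kretprobe {
        struct toy_instance *used;   /* waiting for a return that may never come */
        struct toy_instance *free;   /* available for new hits                   */
    };

    static void toy_flush_task(struct toy_kretprobe *rp, int task_id)
    {
        struct toy_instance **pp = &rp->used;

        while (*pp) {
            if ((*pp)->task_id == task_id) {
                struct toy_instance *ri = *pp;
                *pp = ri->next;          /* unlink from the used list  */
                ri->next = rp->free;     /* recycle onto the free list */
                rp->free = ri;
            } else {
                pp = &(*pp)->next;
            }
        }
    }

    int main(void)
    {
        struct toy_instance a = { .task_id = 42, .next = NULL };
        struct toy_kretprobe rp = { .used = &a, .free = NULL };

        toy_flush_task(&rp, 42);
        printf("recycled: %s\n", rp.free == &a ? "yes" : "no");
        return 0;
    }
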
466 464
467 void flush_thread(void) 465 void flush_thread(void)
468 { 466 {
469 #ifdef CONFIG_PPC64 467 #ifdef CONFIG_PPC64
470 struct thread_info *t = current_thread_info(); 468 struct thread_info *t = current_thread_info();
471 469
472 if (t->flags & _TIF_ABI_PENDING) 470 if (t->flags & _TIF_ABI_PENDING)
473 t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT); 471 t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT);
474 #endif 472 #endif
475 473
476 discard_lazy_cpu_state(); 474 discard_lazy_cpu_state();
477 475
478 #ifdef CONFIG_PPC64 /* for now */ 476 #ifdef CONFIG_PPC64 /* for now */
479 if (current->thread.dabr) { 477 if (current->thread.dabr) {
480 current->thread.dabr = 0; 478 current->thread.dabr = 0;
481 set_dabr(0); 479 set_dabr(0);
482 } 480 }
483 #endif 481 #endif
484 } 482 }
485 483
486 void 484 void
487 release_thread(struct task_struct *t) 485 release_thread(struct task_struct *t)
488 { 486 {
489 } 487 }
490 488
491 /* 489 /*
492 * This gets called before we allocate a new thread and copy 490 * This gets called before we allocate a new thread and copy
493 * the current task into it. 491 * the current task into it.
494 */ 492 */
495 void prepare_to_copy(struct task_struct *tsk) 493 void prepare_to_copy(struct task_struct *tsk)
496 { 494 {
497 flush_fp_to_thread(current); 495 flush_fp_to_thread(current);
498 flush_altivec_to_thread(current); 496 flush_altivec_to_thread(current);
499 flush_spe_to_thread(current); 497 flush_spe_to_thread(current);
500 } 498 }
501 499
502 /* 500 /*
503 * Copy a thread.. 501 * Copy a thread..
504 */ 502 */
505 int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, 503 int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
506 unsigned long unused, struct task_struct *p, 504 unsigned long unused, struct task_struct *p,
507 struct pt_regs *regs) 505 struct pt_regs *regs)
508 { 506 {
509 struct pt_regs *childregs, *kregs; 507 struct pt_regs *childregs, *kregs;
510 extern void ret_from_fork(void); 508 extern void ret_from_fork(void);
511 unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; 509 unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
512 510
513 CHECK_FULL_REGS(regs); 511 CHECK_FULL_REGS(regs);
514 /* Copy registers */ 512 /* Copy registers */
515 sp -= sizeof(struct pt_regs); 513 sp -= sizeof(struct pt_regs);
516 childregs = (struct pt_regs *) sp; 514 childregs = (struct pt_regs *) sp;
517 *childregs = *regs; 515 *childregs = *regs;
518 if ((childregs->msr & MSR_PR) == 0) { 516 if ((childregs->msr & MSR_PR) == 0) {
519 /* for kernel thread, set `current' and stackptr in new task */ 517 /* for kernel thread, set `current' and stackptr in new task */
520 childregs->gpr[1] = sp + sizeof(struct pt_regs); 518 childregs->gpr[1] = sp + sizeof(struct pt_regs);
521 #ifdef CONFIG_PPC32 519 #ifdef CONFIG_PPC32
522 childregs->gpr[2] = (unsigned long) p; 520 childregs->gpr[2] = (unsigned long) p;
523 #else 521 #else
524 clear_tsk_thread_flag(p, TIF_32BIT); 522 clear_tsk_thread_flag(p, TIF_32BIT);
525 #endif 523 #endif
526 p->thread.regs = NULL; /* no user register state */ 524 p->thread.regs = NULL; /* no user register state */
527 } else { 525 } else {
528 childregs->gpr[1] = usp; 526 childregs->gpr[1] = usp;
529 p->thread.regs = childregs; 527 p->thread.regs = childregs;
530 if (clone_flags & CLONE_SETTLS) { 528 if (clone_flags & CLONE_SETTLS) {
531 #ifdef CONFIG_PPC64 529 #ifdef CONFIG_PPC64
532 if (!test_thread_flag(TIF_32BIT)) 530 if (!test_thread_flag(TIF_32BIT))
533 childregs->gpr[13] = childregs->gpr[6]; 531 childregs->gpr[13] = childregs->gpr[6];
534 else 532 else
535 #endif 533 #endif
536 childregs->gpr[2] = childregs->gpr[6]; 534 childregs->gpr[2] = childregs->gpr[6];
537 } 535 }
538 } 536 }
539 childregs->gpr[3] = 0; /* Result from fork() */ 537 childregs->gpr[3] = 0; /* Result from fork() */
540 sp -= STACK_FRAME_OVERHEAD; 538 sp -= STACK_FRAME_OVERHEAD;
541 539
542 /* 540 /*
543 * The way this works is that at some point in the future 541 * The way this works is that at some point in the future
544 * some task will call _switch to switch to the new task. 542 * some task will call _switch to switch to the new task.
545 * That will pop off the stack frame created below and start 543 * That will pop off the stack frame created below and start
546 * the new task running at ret_from_fork. The new task will 544 * the new task running at ret_from_fork. The new task will
547 * do some house keeping and then return from the fork or clone 545 * do some house keeping and then return from the fork or clone
548 * system call, using the stack frame created above. 546 * system call, using the stack frame created above.
549 */ 547 */
550 sp -= sizeof(struct pt_regs); 548 sp -= sizeof(struct pt_regs);
551 kregs = (struct pt_regs *) sp; 549 kregs = (struct pt_regs *) sp;
552 sp -= STACK_FRAME_OVERHEAD; 550 sp -= STACK_FRAME_OVERHEAD;
553 p->thread.ksp = sp; 551 p->thread.ksp = sp;
554 552
555 #ifdef CONFIG_PPC64 553 #ifdef CONFIG_PPC64
556 if (cpu_has_feature(CPU_FTR_SLB)) { 554 if (cpu_has_feature(CPU_FTR_SLB)) {
557 unsigned long sp_vsid = get_kernel_vsid(sp); 555 unsigned long sp_vsid = get_kernel_vsid(sp);
558 unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; 556 unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
559 557
560 sp_vsid <<= SLB_VSID_SHIFT; 558 sp_vsid <<= SLB_VSID_SHIFT;
561 sp_vsid |= SLB_VSID_KERNEL | llp; 559 sp_vsid |= SLB_VSID_KERNEL | llp;
562 p->thread.ksp_vsid = sp_vsid; 560 p->thread.ksp_vsid = sp_vsid;
563 } 561 }
564 562
565 /* 563 /*
566 * The PPC64 ABI makes use of a TOC to contain function 564 * The PPC64 ABI makes use of a TOC to contain function
567 * pointers. The function (ret_from_except) is actually a pointer 565 * pointers. The function (ret_from_except) is actually a pointer
568 * to the TOC entry. The first entry is a pointer to the actual 566 * to the TOC entry. The first entry is a pointer to the actual
569 * function. 567 * function.
570 */ 568 */
571 kregs->nip = *((unsigned long *)ret_from_fork); 569 kregs->nip = *((unsigned long *)ret_from_fork);
572 #else 570 #else
573 kregs->nip = (unsigned long)ret_from_fork; 571 kregs->nip = (unsigned long)ret_from_fork;
574 p->thread.last_syscall = -1; 572 p->thread.last_syscall = -1;
575 #endif 573 #endif
576 574
577 return 0; 575 return 0;
578 } 576 }
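
The PPC64 branch of copy_thread() cannot simply take the address of ret_from_fork: as the comment says, under the 64-bit PowerPC ELF ABI a function symbol names a descriptor whose first doubleword is the real entry address and whose second is the TOC pointer, so *((unsigned long *)ret_from_fork) fetches the entry point. start_thread() further down reads the user program's _start descriptor the same way. A stand-alone illustration of that layout, with a descriptor built by hand rather than by a ppc64 toolchain:

    #include <stdio.h>

    /* Shape of a 64-bit PowerPC (ELFv1) function descriptor: a function's
     * "address" points at this triple. The values here are made up. */
    struct func_desc {
        unsigned long entry;    /* address of the first instruction     */
        unsigned long toc;      /* TOC (r2) value the function expects  */
        unsigned long env;      /* environment pointer, normally unused */
    };

    int main(void)
    {
        struct func_desc fake = { .entry = 0x10000000, .toc = 0x10018000, .env = 0 };
        unsigned long *fp = (unsigned long *)&fake;

        /* Same idiom as kregs->nip = *((unsigned long *)ret_from_fork); */
        printf("entry=%#lx toc=%#lx\n", fp[0], fp[1]);
        return 0;
    }
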
579 577
580 /* 578 /*
581 * Set up a thread for executing a new program 579 * Set up a thread for executing a new program
582 */ 580 */
583 void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) 581 void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
584 { 582 {
585 #ifdef CONFIG_PPC64 583 #ifdef CONFIG_PPC64
586 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ 584 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */
587 #endif 585 #endif
588 586
589 set_fs(USER_DS); 587 set_fs(USER_DS);
590 588
591 /* 589 /*
592 * If we exec out of a kernel thread then thread.regs will not be 590 * If we exec out of a kernel thread then thread.regs will not be
593 * set. Do it now. 591 * set. Do it now.
594 */ 592 */
595 if (!current->thread.regs) { 593 if (!current->thread.regs) {
596 struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE; 594 struct pt_regs *regs = task_stack_page(current) + THREAD_SIZE;
597 current->thread.regs = regs - 1; 595 current->thread.regs = regs - 1;
598 } 596 }
599 597
600 memset(regs->gpr, 0, sizeof(regs->gpr)); 598 memset(regs->gpr, 0, sizeof(regs->gpr));
601 regs->ctr = 0; 599 regs->ctr = 0;
602 regs->link = 0; 600 regs->link = 0;
603 regs->xer = 0; 601 regs->xer = 0;
604 regs->ccr = 0; 602 regs->ccr = 0;
605 regs->gpr[1] = sp; 603 regs->gpr[1] = sp;
606 604
607 #ifdef CONFIG_PPC32 605 #ifdef CONFIG_PPC32
608 regs->mq = 0; 606 regs->mq = 0;
609 regs->nip = start; 607 regs->nip = start;
610 regs->msr = MSR_USER; 608 regs->msr = MSR_USER;
611 #else 609 #else
612 if (!test_thread_flag(TIF_32BIT)) { 610 if (!test_thread_flag(TIF_32BIT)) {
613 unsigned long entry, toc; 611 unsigned long entry, toc;
614 612
615 /* start is a relocated pointer to the function descriptor for 613 /* start is a relocated pointer to the function descriptor for
616 * the elf _start routine. The first entry in the function 614 * the elf _start routine. The first entry in the function
617 * descriptor is the entry address of _start and the second 615 * descriptor is the entry address of _start and the second
618 * entry is the TOC value we need to use. 616 * entry is the TOC value we need to use.
619 */ 617 */
620 __get_user(entry, (unsigned long __user *)start); 618 __get_user(entry, (unsigned long __user *)start);
621 __get_user(toc, (unsigned long __user *)start+1); 619 __get_user(toc, (unsigned long __user *)start+1);
622 620
623 /* Check whether the e_entry function descriptor entries 621 /* Check whether the e_entry function descriptor entries
624 * need to be relocated before we can use them. 622 * need to be relocated before we can use them.
625 */ 623 */
626 if (load_addr != 0) { 624 if (load_addr != 0) {
627 entry += load_addr; 625 entry += load_addr;
628 toc += load_addr; 626 toc += load_addr;
629 } 627 }
630 regs->nip = entry; 628 regs->nip = entry;
631 regs->gpr[2] = toc; 629 regs->gpr[2] = toc;
632 regs->msr = MSR_USER64; 630 regs->msr = MSR_USER64;
633 } else { 631 } else {
634 regs->nip = start; 632 regs->nip = start;
635 regs->gpr[2] = 0; 633 regs->gpr[2] = 0;
636 regs->msr = MSR_USER32; 634 regs->msr = MSR_USER32;
637 } 635 }
638 #endif 636 #endif
639 637
640 discard_lazy_cpu_state(); 638 discard_lazy_cpu_state();
641 memset(current->thread.fpr, 0, sizeof(current->thread.fpr)); 639 memset(current->thread.fpr, 0, sizeof(current->thread.fpr));
642 current->thread.fpscr.val = 0; 640 current->thread.fpscr.val = 0;
643 #ifdef CONFIG_ALTIVEC 641 #ifdef CONFIG_ALTIVEC
644 memset(current->thread.vr, 0, sizeof(current->thread.vr)); 642 memset(current->thread.vr, 0, sizeof(current->thread.vr));
645 memset(&current->thread.vscr, 0, sizeof(current->thread.vscr)); 643 memset(&current->thread.vscr, 0, sizeof(current->thread.vscr));
646 current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */ 644 current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */
647 current->thread.vrsave = 0; 645 current->thread.vrsave = 0;
648 current->thread.used_vr = 0; 646 current->thread.used_vr = 0;
649 #endif /* CONFIG_ALTIVEC */ 647 #endif /* CONFIG_ALTIVEC */
650 #ifdef CONFIG_SPE 648 #ifdef CONFIG_SPE
651 memset(current->thread.evr, 0, sizeof(current->thread.evr)); 649 memset(current->thread.evr, 0, sizeof(current->thread.evr));
652 current->thread.acc = 0; 650 current->thread.acc = 0;
653 current->thread.spefscr = 0; 651 current->thread.spefscr = 0;
654 current->thread.used_spe = 0; 652 current->thread.used_spe = 0;
655 #endif /* CONFIG_SPE */ 653 #endif /* CONFIG_SPE */
656 } 654 }
657 655
658 #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \ 656 #define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \
659 | PR_FP_EXC_RES | PR_FP_EXC_INV) 657 | PR_FP_EXC_RES | PR_FP_EXC_INV)
660 658
661 int set_fpexc_mode(struct task_struct *tsk, unsigned int val) 659 int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
662 { 660 {
663 struct pt_regs *regs = tsk->thread.regs; 661 struct pt_regs *regs = tsk->thread.regs;
664 662
665 /* This is a bit hairy. If we are an SPE enabled processor 663 /* This is a bit hairy. If we are an SPE enabled processor
666 * (have embedded fp) we store the IEEE exception enable flags in 664 * (have embedded fp) we store the IEEE exception enable flags in
667 * fpexc_mode. fpexc_mode is also used for setting FP exception 665 * fpexc_mode. fpexc_mode is also used for setting FP exception
668 * mode (async, precise, disabled) for 'Classic' FP. */ 666 * mode (async, precise, disabled) for 'Classic' FP. */
669 if (val & PR_FP_EXC_SW_ENABLE) { 667 if (val & PR_FP_EXC_SW_ENABLE) {
670 #ifdef CONFIG_SPE 668 #ifdef CONFIG_SPE
671 tsk->thread.fpexc_mode = val & 669 tsk->thread.fpexc_mode = val &
672 (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT); 670 (PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
673 return 0; 671 return 0;
674 #else 672 #else
675 return -EINVAL; 673 return -EINVAL;
676 #endif 674 #endif
677 } 675 }
678 676
679 /* on a CONFIG_SPE this does not hurt us. The bits that 677 /* on a CONFIG_SPE this does not hurt us. The bits that
680 * __pack_fe01 use do not overlap with bits used for 678 * __pack_fe01 use do not overlap with bits used for
681 * PR_FP_EXC_SW_ENABLE. Additionally, the MSR[FE0,FE1] bits 679 * PR_FP_EXC_SW_ENABLE. Additionally, the MSR[FE0,FE1] bits
682 * on CONFIG_SPE implementations are reserved so writing to 680 * on CONFIG_SPE implementations are reserved so writing to
683 * them does not change anything */ 681 * them does not change anything */
684 if (val > PR_FP_EXC_PRECISE) 682 if (val > PR_FP_EXC_PRECISE)
685 return -EINVAL; 683 return -EINVAL;
686 tsk->thread.fpexc_mode = __pack_fe01(val); 684 tsk->thread.fpexc_mode = __pack_fe01(val);
687 if (regs != NULL && (regs->msr & MSR_FP) != 0) 685 if (regs != NULL && (regs->msr & MSR_FP) != 0)
688 regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1)) 686 regs->msr = (regs->msr & ~(MSR_FE0|MSR_FE1))
689 | tsk->thread.fpexc_mode; 687 | tsk->thread.fpexc_mode;
690 return 0; 688 return 0;
691 } 689 }
692 690
693 int get_fpexc_mode(struct task_struct *tsk, unsigned long adr) 691 int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
694 { 692 {
695 unsigned int val; 693 unsigned int val;
696 694
697 if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE) 695 if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
698 #ifdef CONFIG_SPE 696 #ifdef CONFIG_SPE
699 val = tsk->thread.fpexc_mode; 697 val = tsk->thread.fpexc_mode;
700 #else 698 #else
701 return -EINVAL; 699 return -EINVAL;
702 #endif 700 #endif
703 else 701 else
704 val = __unpack_fe01(tsk->thread.fpexc_mode); 702 val = __unpack_fe01(tsk->thread.fpexc_mode);
705 return put_user(val, (unsigned int __user *) adr); 703 return put_user(val, (unsigned int __user *) adr);
706 } 704 }
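
set_fpexc_mode() and get_fpexc_mode() are the powerpc backends behind the PR_SET_FPEXC / PR_GET_FPEXC prctl(2) options (that routing is asserted here, it is not visible in this hunk): classic FP parts pack the requested mode into the MSR FE0/FE1 bits via __pack_fe01(), while SPE parts keep the raw IEEE enable flags in fpexc_mode. A small user-space illustration, error handling mostly elided:

    #include <stdio.h>
    #include <sys/prctl.h>          /* pulls in the PR_* constants */

    int main(void)
    {
        unsigned int mode;

        /* Ask for precise FP exception reporting... */
        if (prctl(PR_SET_FPEXC, PR_FP_EXC_PRECISE, 0, 0, 0) != 0)
            perror("PR_SET_FPEXC");

        /* ...and read the mode back; PR_GET_FPEXC writes through the
         * pointer argument, matching the put_user() in get_fpexc_mode(). */
        if (prctl(PR_GET_FPEXC, (unsigned long)&mode, 0, 0, 0) == 0)
            printf("fpexc mode: %#x\n", mode);
        return 0;
    }
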
707 705
708 #define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff)) 706 #define TRUNC_PTR(x) ((typeof(x))(((unsigned long)(x)) & 0xffffffff))
709 707
710 int sys_clone(unsigned long clone_flags, unsigned long usp, 708 int sys_clone(unsigned long clone_flags, unsigned long usp,
711 int __user *parent_tidp, void __user *child_threadptr, 709 int __user *parent_tidp, void __user *child_threadptr,
712 int __user *child_tidp, int p6, 710 int __user *child_tidp, int p6,
713 struct pt_regs *regs) 711 struct pt_regs *regs)
714 { 712 {
715 CHECK_FULL_REGS(regs); 713 CHECK_FULL_REGS(regs);
716 if (usp == 0) 714 if (usp == 0)
717 usp = regs->gpr[1]; /* stack pointer for child */ 715 usp = regs->gpr[1]; /* stack pointer for child */
718 #ifdef CONFIG_PPC64 716 #ifdef CONFIG_PPC64
719 if (test_thread_flag(TIF_32BIT)) { 717 if (test_thread_flag(TIF_32BIT)) {
720 parent_tidp = TRUNC_PTR(parent_tidp); 718 parent_tidp = TRUNC_PTR(parent_tidp);
721 child_tidp = TRUNC_PTR(child_tidp); 719 child_tidp = TRUNC_PTR(child_tidp);
722 } 720 }
723 #endif 721 #endif
724 return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp); 722 return do_fork(clone_flags, usp, regs, 0, parent_tidp, child_tidp);
725 } 723 }
726 724
727 int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3, 725 int sys_fork(unsigned long p1, unsigned long p2, unsigned long p3,
728 unsigned long p4, unsigned long p5, unsigned long p6, 726 unsigned long p4, unsigned long p5, unsigned long p6,
729 struct pt_regs *regs) 727 struct pt_regs *regs)
730 { 728 {
731 CHECK_FULL_REGS(regs); 729 CHECK_FULL_REGS(regs);
732 return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL); 730 return do_fork(SIGCHLD, regs->gpr[1], regs, 0, NULL, NULL);
733 } 731 }
734 732
735 int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, 733 int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3,
736 unsigned long p4, unsigned long p5, unsigned long p6, 734 unsigned long p4, unsigned long p5, unsigned long p6,
737 struct pt_regs *regs) 735 struct pt_regs *regs)
738 { 736 {
739 CHECK_FULL_REGS(regs); 737 CHECK_FULL_REGS(regs);
740 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], 738 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1],
741 regs, 0, NULL, NULL); 739 regs, 0, NULL, NULL);
742 } 740 }
743 741
744 int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, 742 int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
745 unsigned long a3, unsigned long a4, unsigned long a5, 743 unsigned long a3, unsigned long a4, unsigned long a5,
746 struct pt_regs *regs) 744 struct pt_regs *regs)
747 { 745 {
748 int error; 746 int error;
749 char *filename; 747 char *filename;
750 748
751 filename = getname((char __user *) a0); 749 filename = getname((char __user *) a0);
752 error = PTR_ERR(filename); 750 error = PTR_ERR(filename);
753 if (IS_ERR(filename)) 751 if (IS_ERR(filename))
754 goto out; 752 goto out;
755 flush_fp_to_thread(current); 753 flush_fp_to_thread(current);
756 flush_altivec_to_thread(current); 754 flush_altivec_to_thread(current);
757 flush_spe_to_thread(current); 755 flush_spe_to_thread(current);
758 error = do_execve(filename, (char __user * __user *) a1, 756 error = do_execve(filename, (char __user * __user *) a1,
759 (char __user * __user *) a2, regs); 757 (char __user * __user *) a2, regs);
760 if (error == 0) { 758 if (error == 0) {
761 task_lock(current); 759 task_lock(current);
762 current->ptrace &= ~PT_DTRACE; 760 current->ptrace &= ~PT_DTRACE;
763 task_unlock(current); 761 task_unlock(current);
764 } 762 }
765 putname(filename); 763 putname(filename);
766 out: 764 out:
767 return error; 765 return error;
768 } 766 }
769 767
770 static int validate_sp(unsigned long sp, struct task_struct *p, 768 static int validate_sp(unsigned long sp, struct task_struct *p,
771 unsigned long nbytes) 769 unsigned long nbytes)
772 { 770 {
773 unsigned long stack_page = (unsigned long)task_stack_page(p); 771 unsigned long stack_page = (unsigned long)task_stack_page(p);
774 772
775 if (sp >= stack_page + sizeof(struct thread_struct) 773 if (sp >= stack_page + sizeof(struct thread_struct)
776 && sp <= stack_page + THREAD_SIZE - nbytes) 774 && sp <= stack_page + THREAD_SIZE - nbytes)
777 return 1; 775 return 1;
778 776
779 #ifdef CONFIG_IRQSTACKS 777 #ifdef CONFIG_IRQSTACKS
780 stack_page = (unsigned long) hardirq_ctx[task_cpu(p)]; 778 stack_page = (unsigned long) hardirq_ctx[task_cpu(p)];
781 if (sp >= stack_page + sizeof(struct thread_struct) 779 if (sp >= stack_page + sizeof(struct thread_struct)
782 && sp <= stack_page + THREAD_SIZE - nbytes) 780 && sp <= stack_page + THREAD_SIZE - nbytes)
783 return 1; 781 return 1;
784 782
785 stack_page = (unsigned long) softirq_ctx[task_cpu(p)]; 783 stack_page = (unsigned long) softirq_ctx[task_cpu(p)];
786 if (sp >= stack_page + sizeof(struct thread_struct) 784 if (sp >= stack_page + sizeof(struct thread_struct)
787 && sp <= stack_page + THREAD_SIZE - nbytes) 785 && sp <= stack_page + THREAD_SIZE - nbytes)
788 return 1; 786 return 1;
789 #endif 787 #endif
790 788
791 return 0; 789 return 0;
792 } 790 }
793 791
794 #ifdef CONFIG_PPC64 792 #ifdef CONFIG_PPC64
795 #define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */ 793 #define MIN_STACK_FRAME 112 /* same as STACK_FRAME_OVERHEAD, in fact */
796 #define FRAME_LR_SAVE 2 794 #define FRAME_LR_SAVE 2
797 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288) 795 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD + 288)
798 #define REGS_MARKER 0x7265677368657265ul 796 #define REGS_MARKER 0x7265677368657265ul
799 #define FRAME_MARKER 12 797 #define FRAME_MARKER 12
800 #else 798 #else
801 #define MIN_STACK_FRAME 16 799 #define MIN_STACK_FRAME 16
802 #define FRAME_LR_SAVE 1 800 #define FRAME_LR_SAVE 1
803 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) 801 #define INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD)
804 #define REGS_MARKER 0x72656773ul 802 #define REGS_MARKER 0x72656773ul
805 #define FRAME_MARKER 2 803 #define FRAME_MARKER 2
806 #endif 804 #endif
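
REGS_MARKER is less magic than it looks: 0x7265677368657265 is simply the ASCII bytes of "regshere" (and the 32-bit 0x72656773 is "regs"), which is the signature the "regshere" comment in show_stack() below looks for at FRAME_MARKER to decide that a frame holds a saved pt_regs. A quick check:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long marker = 0x7265677368657265ULL;   /* 64-bit REGS_MARKER */
        char buf[9] = { 0 };

        for (int i = 0; i < 8; i++)
            buf[i] = (char)(marker >> (56 - 8 * i));  /* most significant byte first */
        printf("%s\n", buf);                          /* prints: regshere */
        return 0;
    }
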
807 805
808 unsigned long get_wchan(struct task_struct *p) 806 unsigned long get_wchan(struct task_struct *p)
809 { 807 {
810 unsigned long ip, sp; 808 unsigned long ip, sp;
811 int count = 0; 809 int count = 0;
812 810
813 if (!p || p == current || p->state == TASK_RUNNING) 811 if (!p || p == current || p->state == TASK_RUNNING)
814 return 0; 812 return 0;
815 813
816 sp = p->thread.ksp; 814 sp = p->thread.ksp;
817 if (!validate_sp(sp, p, MIN_STACK_FRAME)) 815 if (!validate_sp(sp, p, MIN_STACK_FRAME))
818 return 0; 816 return 0;
819 817
820 do { 818 do {
821 sp = *(unsigned long *)sp; 819 sp = *(unsigned long *)sp;
822 if (!validate_sp(sp, p, MIN_STACK_FRAME)) 820 if (!validate_sp(sp, p, MIN_STACK_FRAME))
823 return 0; 821 return 0;
824 if (count > 0) { 822 if (count > 0) {
825 ip = ((unsigned long *)sp)[FRAME_LR_SAVE]; 823 ip = ((unsigned long *)sp)[FRAME_LR_SAVE];
826 if (!in_sched_functions(ip)) 824 if (!in_sched_functions(ip))
827 return ip; 825 return ip;
828 } 826 }
829 } while (count++ < 16); 827 } while (count++ < 16);
830 return 0; 828 return 0;
831 } 829 }
832 EXPORT_SYMBOL(get_wchan); 830 EXPORT_SYMBOL(get_wchan);
833 831
834 static int kstack_depth_to_print = 64; 832 static int kstack_depth_to_print = 64;
835 833
836 void show_stack(struct task_struct *tsk, unsigned long *stack) 834 void show_stack(struct task_struct *tsk, unsigned long *stack)
837 { 835 {
838 unsigned long sp, ip, lr, newsp; 836 unsigned long sp, ip, lr, newsp;
839 int count = 0; 837 int count = 0;
840 int firstframe = 1; 838 int firstframe = 1;
841 839
842 sp = (unsigned long) stack; 840 sp = (unsigned long) stack;
843 if (tsk == NULL) 841 if (tsk == NULL)
844 tsk = current; 842 tsk = current;
845 if (sp == 0) { 843 if (sp == 0) {
846 if (tsk == current) 844 if (tsk == current)
847 asm("mr %0,1" : "=r" (sp)); 845 asm("mr %0,1" : "=r" (sp));
848 else 846 else
849 sp = tsk->thread.ksp; 847 sp = tsk->thread.ksp;
850 } 848 }
851 849
852 lr = 0; 850 lr = 0;
853 printk("Call Trace:\n"); 851 printk("Call Trace:\n");
854 do { 852 do {
855 if (!validate_sp(sp, tsk, MIN_STACK_FRAME)) 853 if (!validate_sp(sp, tsk, MIN_STACK_FRAME))
856 return; 854 return;
857 855
858 stack = (unsigned long *) sp; 856 stack = (unsigned long *) sp;
859 newsp = stack[0]; 857 newsp = stack[0];
860 ip = stack[FRAME_LR_SAVE]; 858 ip = stack[FRAME_LR_SAVE];
861 if (!firstframe || ip != lr) { 859 if (!firstframe || ip != lr) {
862 printk("["REG"] ["REG"] ", sp, ip); 860 printk("["REG"] ["REG"] ", sp, ip);
863 print_symbol("%s", ip); 861 print_symbol("%s", ip);
864 if (firstframe) 862 if (firstframe)
865 printk(" (unreliable)"); 863 printk(" (unreliable)");
866 printk("\n"); 864 printk("\n");
867 } 865 }
868 firstframe = 0; 866 firstframe = 0;
869 867
870 /* 868 /*
871 * See if this is an exception frame. 869 * See if this is an exception frame.
872 * We look for the "regshere" marker in the current frame. 870 * We look for the "regshere" marker in the current frame.
873 */ 871 */
874 if (validate_sp(sp, tsk, INT_FRAME_SIZE) 872 if (validate_sp(sp, tsk, INT_FRAME_SIZE)
875 && stack[FRAME_MARKER] == REGS_MARKER) { 873 && stack[FRAME_MARKER] == REGS_MARKER) {
876 struct pt_regs *regs = (struct pt_regs *) 874 struct pt_regs *regs = (struct pt_regs *)
877 (sp + STACK_FRAME_OVERHEAD); 875 (sp + STACK_FRAME_OVERHEAD);
878 printk("--- Exception: %lx", regs->trap); 876 printk("--- Exception: %lx", regs->trap);
879 print_symbol(" at %s\n", regs->nip); 877 print_symbol(" at %s\n", regs->nip);
880 lr = regs->link; 878 lr = regs->link;
881 print_symbol(" LR = %s\n", lr); 879 print_symbol(" LR = %s\n", lr);
882 firstframe = 1; 880 firstframe = 1;
883 } 881 }
884 882
885 sp = newsp; 883 sp = newsp;
886 } while (count++ < kstack_depth_to_print); 884 } while (count++ < kstack_depth_to_print);
887 } 885 }
888 886
889 void dump_stack(void) 887 void dump_stack(void)
890 { 888 {
891 show_stack(current, NULL); 889 show_stack(current, NULL);
892 } 890 }
893 EXPORT_SYMBOL(dump_stack); 891 EXPORT_SYMBOL(dump_stack);
894 892
895 #ifdef CONFIG_PPC64 893 #ifdef CONFIG_PPC64
896 void ppc64_runlatch_on(void) 894 void ppc64_runlatch_on(void)
897 { 895 {
898 unsigned long ctrl; 896 unsigned long ctrl;
899 897
900 if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) { 898 if (cpu_has_feature(CPU_FTR_CTRL) && !test_thread_flag(TIF_RUNLATCH)) {
901 HMT_medium(); 899 HMT_medium();
902 900
903 ctrl = mfspr(SPRN_CTRLF); 901 ctrl = mfspr(SPRN_CTRLF);
904 ctrl |= CTRL_RUNLATCH; 902 ctrl |= CTRL_RUNLATCH;
905 mtspr(SPRN_CTRLT, ctrl); 903 mtspr(SPRN_CTRLT, ctrl);
906 904
907 set_thread_flag(TIF_RUNLATCH); 905 set_thread_flag(TIF_RUNLATCH);
908 } 906 }
909 } 907 }
910 908
911 void ppc64_runlatch_off(void) 909 void ppc64_runlatch_off(void)
912 { 910 {
913 unsigned long ctrl; 911 unsigned long ctrl;
914 912
915 if (cpu_has_feature(CPU_FTR_CTRL) && test_thread_flag(TIF_RUNLATCH)) { 913 if (cpu_has_feature(CPU_FTR_CTRL) && test_thread_flag(TIF_RUNLATCH)) {
916 HMT_medium(); 914 HMT_medium();
917 915
918 clear_thread_flag(TIF_RUNLATCH); 916 clear_thread_flag(TIF_RUNLATCH);
919 917
920 ctrl = mfspr(SPRN_CTRLF); 918 ctrl = mfspr(SPRN_CTRLF);
921 ctrl &= ~CTRL_RUNLATCH; 919 ctrl &= ~CTRL_RUNLATCH;
922 mtspr(SPRN_CTRLT, ctrl); 920 mtspr(SPRN_CTRLT, ctrl);
923 } 921 }
924 } 922 }
925 #endif 923 #endif
926 924
arch/x86_64/kernel/process.c
1 /* 1 /*
2 * linux/arch/x86-64/kernel/process.c 2 * linux/arch/x86-64/kernel/process.c
3 * 3 *
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 * 5 *
6 * Pentium III FXSR, SSE support 6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 * 8 *
9 * X86-64 port 9 * X86-64 port
10 * Andi Kleen. 10 * Andi Kleen.
11 * 11 *
12 * CPU hotplug support - ashok.raj@intel.com 12 * CPU hotplug support - ashok.raj@intel.com
13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ 13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
14 */ 14 */
15 15
16 /* 16 /*
17 * This file handles the architecture-dependent parts of process handling.. 17 * This file handles the architecture-dependent parts of process handling..
18 */ 18 */
19 19
20 #include <stdarg.h> 20 #include <stdarg.h>
21 21
22 #include <linux/cpu.h> 22 #include <linux/cpu.h>
23 #include <linux/errno.h> 23 #include <linux/errno.h>
24 #include <linux/sched.h> 24 #include <linux/sched.h>
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/elfcore.h> 27 #include <linux/elfcore.h>
28 #include <linux/smp.h> 28 #include <linux/smp.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/user.h> 30 #include <linux/user.h>
31 #include <linux/module.h> 31 #include <linux/module.h>
32 #include <linux/a.out.h> 32 #include <linux/a.out.h>
33 #include <linux/interrupt.h> 33 #include <linux/interrupt.h>
34 #include <linux/delay.h> 34 #include <linux/delay.h>
35 #include <linux/ptrace.h> 35 #include <linux/ptrace.h>
36 #include <linux/utsname.h> 36 #include <linux/utsname.h>
37 #include <linux/random.h> 37 #include <linux/random.h>
38 #include <linux/kprobes.h>
39 #include <linux/notifier.h> 38 #include <linux/notifier.h>
39 #include <linux/kprobes.h>
40 40
41 #include <asm/uaccess.h> 41 #include <asm/uaccess.h>
42 #include <asm/pgtable.h> 42 #include <asm/pgtable.h>
43 #include <asm/system.h> 43 #include <asm/system.h>
44 #include <asm/io.h> 44 #include <asm/io.h>
45 #include <asm/processor.h> 45 #include <asm/processor.h>
46 #include <asm/i387.h> 46 #include <asm/i387.h>
47 #include <asm/mmu_context.h> 47 #include <asm/mmu_context.h>
48 #include <asm/pda.h> 48 #include <asm/pda.h>
49 #include <asm/prctl.h> 49 #include <asm/prctl.h>
50 #include <asm/kdebug.h> 50 #include <asm/kdebug.h>
51 #include <asm/desc.h> 51 #include <asm/desc.h>
52 #include <asm/proto.h> 52 #include <asm/proto.h>
53 #include <asm/ia32.h> 53 #include <asm/ia32.h>
54 #include <asm/idle.h> 54 #include <asm/idle.h>
55 55
56 asmlinkage extern void ret_from_fork(void); 56 asmlinkage extern void ret_from_fork(void);
57 57
58 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 58 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
59 59
60 unsigned long boot_option_idle_override = 0; 60 unsigned long boot_option_idle_override = 0;
61 EXPORT_SYMBOL(boot_option_idle_override); 61 EXPORT_SYMBOL(boot_option_idle_override);
62 62
63 /* 63 /*
64 * Powermanagement idle function, if any.. 64 * Powermanagement idle function, if any..
65 */ 65 */
66 void (*pm_idle)(void); 66 void (*pm_idle)(void);
67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 67 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
68 68
69 static struct notifier_block *idle_notifier; 69 static struct notifier_block *idle_notifier;
70 static DEFINE_SPINLOCK(idle_notifier_lock); 70 static DEFINE_SPINLOCK(idle_notifier_lock);
71 71
72 void idle_notifier_register(struct notifier_block *n) 72 void idle_notifier_register(struct notifier_block *n)
73 { 73 {
74 unsigned long flags; 74 unsigned long flags;
75 spin_lock_irqsave(&idle_notifier_lock, flags); 75 spin_lock_irqsave(&idle_notifier_lock, flags);
76 notifier_chain_register(&idle_notifier, n); 76 notifier_chain_register(&idle_notifier, n);
77 spin_unlock_irqrestore(&idle_notifier_lock, flags); 77 spin_unlock_irqrestore(&idle_notifier_lock, flags);
78 } 78 }
79 EXPORT_SYMBOL_GPL(idle_notifier_register); 79 EXPORT_SYMBOL_GPL(idle_notifier_register);
80 80
81 void idle_notifier_unregister(struct notifier_block *n) 81 void idle_notifier_unregister(struct notifier_block *n)
82 { 82 {
83 unsigned long flags; 83 unsigned long flags;
84 spin_lock_irqsave(&idle_notifier_lock, flags); 84 spin_lock_irqsave(&idle_notifier_lock, flags);
85 notifier_chain_unregister(&idle_notifier, n); 85 notifier_chain_unregister(&idle_notifier, n);
86 spin_unlock_irqrestore(&idle_notifier_lock, flags); 86 spin_unlock_irqrestore(&idle_notifier_lock, flags);
87 } 87 }
88 EXPORT_SYMBOL(idle_notifier_unregister); 88 EXPORT_SYMBOL(idle_notifier_unregister);
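
idle_notifier_register()/idle_notifier_unregister() let other kernel code hook the idle transitions that enter_idle() and __exit_idle() below broadcast as IDLE_START and IDLE_END. A hedged sketch of a client, kernel-style pseudocode rather than a complete module (the callback signature is the standard notifier_block one; the hook bodies are placeholders):

    /* Sketch of an idle-notifier client built on the API above. */
    static int my_idle_event(struct notifier_block *nb, unsigned long action,
                             void *data)
    {
        if (action == IDLE_START) {
            /* this CPU is about to run its idle routine */
        } else if (action == IDLE_END) {
            /* this CPU left idle, e.g. to service an interrupt */
        }
        return 0;                        /* i.e. NOTIFY_DONE */
    }

    static struct notifier_block my_idle_nb = {
        .notifier_call = my_idle_event,
    };

    static void my_idle_hook_init(void) { idle_notifier_register(&my_idle_nb); }
    static void my_idle_hook_exit(void) { idle_notifier_unregister(&my_idle_nb); }
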
89 89
90 enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; 90 enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
91 static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; 91 static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
92 92
93 void enter_idle(void) 93 void enter_idle(void)
94 { 94 {
95 __get_cpu_var(idle_state) = CPU_IDLE; 95 __get_cpu_var(idle_state) = CPU_IDLE;
96 notifier_call_chain(&idle_notifier, IDLE_START, NULL); 96 notifier_call_chain(&idle_notifier, IDLE_START, NULL);
97 } 97 }
98 98
99 static void __exit_idle(void) 99 static void __exit_idle(void)
100 { 100 {
101 __get_cpu_var(idle_state) = CPU_NOT_IDLE; 101 __get_cpu_var(idle_state) = CPU_NOT_IDLE;
102 notifier_call_chain(&idle_notifier, IDLE_END, NULL); 102 notifier_call_chain(&idle_notifier, IDLE_END, NULL);
103 } 103 }
104 104
105 /* Called from interrupts to signify idle end */ 105 /* Called from interrupts to signify idle end */
106 void exit_idle(void) 106 void exit_idle(void)
107 { 107 {
108 if (current->pid | read_pda(irqcount)) 108 if (current->pid | read_pda(irqcount))
109 return; 109 return;
110 __exit_idle(); 110 __exit_idle();
111 } 111 }
112 112
113 /* 113 /*
114 * We use this if we don't have any better 114 * We use this if we don't have any better
115 * idle routine.. 115 * idle routine..
116 */ 116 */
117 static void default_idle(void) 117 static void default_idle(void)
118 { 118 {
119 local_irq_enable(); 119 local_irq_enable();
120 120
121 clear_thread_flag(TIF_POLLING_NRFLAG); 121 clear_thread_flag(TIF_POLLING_NRFLAG);
122 smp_mb__after_clear_bit(); 122 smp_mb__after_clear_bit();
123 while (!need_resched()) { 123 while (!need_resched()) {
124 local_irq_disable(); 124 local_irq_disable();
125 if (!need_resched()) 125 if (!need_resched())
126 safe_halt(); 126 safe_halt();
127 else 127 else
128 local_irq_enable(); 128 local_irq_enable();
129 } 129 }
130 set_thread_flag(TIF_POLLING_NRFLAG); 130 set_thread_flag(TIF_POLLING_NRFLAG);
131 } 131 }
132 132
133 /* 133 /*
134 * On SMP it's slightly faster (but much more power-consuming!) 134 * On SMP it's slightly faster (but much more power-consuming!)
135 * to poll the ->need_resched flag instead of waiting for the 135 * to poll the ->need_resched flag instead of waiting for the
136 * cross-CPU IPI to arrive. Use this option with caution. 136 * cross-CPU IPI to arrive. Use this option with caution.
137 */ 137 */
138 static void poll_idle (void) 138 static void poll_idle (void)
139 { 139 {
140 local_irq_enable(); 140 local_irq_enable();
141 141
142 asm volatile( 142 asm volatile(
143 "2:" 143 "2:"
144 "testl %0,%1;" 144 "testl %0,%1;"
145 "rep; nop;" 145 "rep; nop;"
146 "je 2b;" 146 "je 2b;"
147 : : 147 : :
148 "i" (_TIF_NEED_RESCHED), 148 "i" (_TIF_NEED_RESCHED),
149 "m" (current_thread_info()->flags)); 149 "m" (current_thread_info()->flags));
150 } 150 }
151 151
152 void cpu_idle_wait(void) 152 void cpu_idle_wait(void)
153 { 153 {
154 unsigned int cpu, this_cpu = get_cpu(); 154 unsigned int cpu, this_cpu = get_cpu();
155 cpumask_t map; 155 cpumask_t map;
156 156
157 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 157 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
158 put_cpu(); 158 put_cpu();
159 159
160 cpus_clear(map); 160 cpus_clear(map);
161 for_each_online_cpu(cpu) { 161 for_each_online_cpu(cpu) {
162 per_cpu(cpu_idle_state, cpu) = 1; 162 per_cpu(cpu_idle_state, cpu) = 1;
163 cpu_set(cpu, map); 163 cpu_set(cpu, map);
164 } 164 }
165 165
166 __get_cpu_var(cpu_idle_state) = 0; 166 __get_cpu_var(cpu_idle_state) = 0;
167 167
168 wmb(); 168 wmb();
169 do { 169 do {
170 ssleep(1); 170 ssleep(1);
171 for_each_online_cpu(cpu) { 171 for_each_online_cpu(cpu) {
172 if (cpu_isset(cpu, map) && 172 if (cpu_isset(cpu, map) &&
173 !per_cpu(cpu_idle_state, cpu)) 173 !per_cpu(cpu_idle_state, cpu))
174 cpu_clear(cpu, map); 174 cpu_clear(cpu, map);
175 } 175 }
176 cpus_and(map, map, cpu_online_map); 176 cpus_and(map, map, cpu_online_map);
177 } while (!cpus_empty(map)); 177 } while (!cpus_empty(map));
178 } 178 }
179 EXPORT_SYMBOL_GPL(cpu_idle_wait); 179 EXPORT_SYMBOL_GPL(cpu_idle_wait);
180 180
181 #ifdef CONFIG_HOTPLUG_CPU 181 #ifdef CONFIG_HOTPLUG_CPU
182 DECLARE_PER_CPU(int, cpu_state); 182 DECLARE_PER_CPU(int, cpu_state);
183 183
184 #include <asm/nmi.h> 184 #include <asm/nmi.h>
185 /* We halt the CPU with physical CPU hotplug */ 185 /* We halt the CPU with physical CPU hotplug */
186 static inline void play_dead(void) 186 static inline void play_dead(void)
187 { 187 {
188 idle_task_exit(); 188 idle_task_exit();
189 wbinvd(); 189 wbinvd();
190 mb(); 190 mb();
191 /* Ack it */ 191 /* Ack it */
192 __get_cpu_var(cpu_state) = CPU_DEAD; 192 __get_cpu_var(cpu_state) = CPU_DEAD;
193 193
194 local_irq_disable(); 194 local_irq_disable();
195 while (1) 195 while (1)
196 halt(); 196 halt();
197 } 197 }
198 #else 198 #else
199 static inline void play_dead(void) 199 static inline void play_dead(void)
200 { 200 {
201 BUG(); 201 BUG();
202 } 202 }
203 #endif /* CONFIG_HOTPLUG_CPU */ 203 #endif /* CONFIG_HOTPLUG_CPU */
204 204
205 /* 205 /*
206 * The idle thread. There's no useful work to be 206 * The idle thread. There's no useful work to be
207 * done, so just try to conserve power and have a 207 * done, so just try to conserve power and have a
208 * low exit latency (ie sit in a loop waiting for 208 * low exit latency (ie sit in a loop waiting for
209 * somebody to say that they'd like to reschedule) 209 * somebody to say that they'd like to reschedule)
210 */ 210 */
211 void cpu_idle (void) 211 void cpu_idle (void)
212 { 212 {
213 set_thread_flag(TIF_POLLING_NRFLAG); 213 set_thread_flag(TIF_POLLING_NRFLAG);
214 214
215 /* endless idle loop with no priority at all */ 215 /* endless idle loop with no priority at all */
216 while (1) { 216 while (1) {
217 while (!need_resched()) { 217 while (!need_resched()) {
218 void (*idle)(void); 218 void (*idle)(void);
219 219
220 if (__get_cpu_var(cpu_idle_state)) 220 if (__get_cpu_var(cpu_idle_state))
221 __get_cpu_var(cpu_idle_state) = 0; 221 __get_cpu_var(cpu_idle_state) = 0;
222 222
223 rmb(); 223 rmb();
224 idle = pm_idle; 224 idle = pm_idle;
225 if (!idle) 225 if (!idle)
226 idle = default_idle; 226 idle = default_idle;
227 if (cpu_is_offline(smp_processor_id())) 227 if (cpu_is_offline(smp_processor_id()))
228 play_dead(); 228 play_dead();
229 enter_idle(); 229 enter_idle();
230 idle(); 230 idle();
231 __exit_idle(); 231 __exit_idle();
232 } 232 }
233 233
234 preempt_enable_no_resched(); 234 preempt_enable_no_resched();
235 schedule(); 235 schedule();
236 preempt_disable(); 236 preempt_disable();
237 } 237 }
238 } 238 }
239 239
240 /* 240 /*
241 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 241 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
242 * which can obviate IPI to trigger checking of need_resched. 242 * which can obviate IPI to trigger checking of need_resched.
243 * We execute MONITOR against need_resched and enter optimized wait state 243 * We execute MONITOR against need_resched and enter optimized wait state
244 * through MWAIT. Whenever someone changes need_resched, we would be woken 244 * through MWAIT. Whenever someone changes need_resched, we would be woken
245 * up from MWAIT (without an IPI). 245 * up from MWAIT (without an IPI).
246 */ 246 */
247 static void mwait_idle(void) 247 static void mwait_idle(void)
248 { 248 {
249 local_irq_enable(); 249 local_irq_enable();
250 250
251 while (!need_resched()) { 251 while (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0); 252 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb(); 253 smp_mb();
254 if (need_resched()) 254 if (need_resched())
255 break; 255 break;
256 __mwait(0, 0); 256 __mwait(0, 0);
257 } 257 }
258 } 258 }
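
The ordering inside mwait_idle() is the interesting part: MONITOR is armed on the need_resched flag word first, need_resched() is re-checked, and only then does the CPU drop into MWAIT. A wakeup that arrived before the monitor was armed is caught by the re-check; one that arrives afterwards is a write to the monitored line and breaks MWAIT, so no window is left in between. Annotated restatement of the loop body above (comments added here, code unchanged):

    __monitor((void *)&current_thread_info()->flags, 0, 0); /* arm monitor on the flag word   */
    smp_mb();                                               /* order the arm vs. the re-check */
    if (need_resched())                                     /* set BEFORE arming? caught here */
        break;
    __mwait(0, 0);                                          /* set AFTER arming? the write wakes us */
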
259 259
260 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 260 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
261 { 261 {
262 static int printed; 262 static int printed;
263 if (cpu_has(c, X86_FEATURE_MWAIT)) { 263 if (cpu_has(c, X86_FEATURE_MWAIT)) {
264 /* 264 /*
265 * Skip, if setup has overridden idle. 265 * Skip, if setup has overridden idle.
266 * One CPU supports mwait => All CPUs supports mwait 266 * One CPU supports mwait => All CPUs supports mwait
267 */ 267 */
268 if (!pm_idle) { 268 if (!pm_idle) {
269 if (!printed) { 269 if (!printed) {
270 printk("using mwait in idle threads.\n"); 270 printk("using mwait in idle threads.\n");
271 printed = 1; 271 printed = 1;
272 } 272 }
273 pm_idle = mwait_idle; 273 pm_idle = mwait_idle;
274 } 274 }
275 } 275 }
276 } 276 }
277 277
278 static int __init idle_setup (char *str) 278 static int __init idle_setup (char *str)
279 { 279 {
280 if (!strncmp(str, "poll", 4)) { 280 if (!strncmp(str, "poll", 4)) {
281 printk("using polling idle threads.\n"); 281 printk("using polling idle threads.\n");
282 pm_idle = poll_idle; 282 pm_idle = poll_idle;
283 } 283 }
284 284
285 boot_option_idle_override = 1; 285 boot_option_idle_override = 1;
286 return 1; 286 return 1;
287 } 287 }
288 288
289 __setup("idle=", idle_setup); 289 __setup("idle=", idle_setup);
290 290
291 /* Prints also some state that isn't saved in the pt_regs */ 291 /* Prints also some state that isn't saved in the pt_regs */
292 void __show_regs(struct pt_regs * regs) 292 void __show_regs(struct pt_regs * regs)
293 { 293 {
294 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 294 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
295 unsigned int fsindex,gsindex; 295 unsigned int fsindex,gsindex;
296 unsigned int ds,cs,es; 296 unsigned int ds,cs,es;
297 297
298 printk("\n"); 298 printk("\n");
299 print_modules(); 299 print_modules();
300 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 300 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
301 current->pid, current->comm, print_tainted(), 301 current->pid, current->comm, print_tainted(),
302 system_utsname.release, 302 system_utsname.release,
303 (int)strcspn(system_utsname.version, " "), 303 (int)strcspn(system_utsname.version, " "),
304 system_utsname.version); 304 system_utsname.version);
305 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); 305 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
306 printk_address(regs->rip); 306 printk_address(regs->rip);
307 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, 307 printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
308 regs->eflags); 308 regs->eflags);
309 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 309 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
310 regs->rax, regs->rbx, regs->rcx); 310 regs->rax, regs->rbx, regs->rcx);
311 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 311 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
312 regs->rdx, regs->rsi, regs->rdi); 312 regs->rdx, regs->rsi, regs->rdi);
313 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 313 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
314 regs->rbp, regs->r8, regs->r9); 314 regs->rbp, regs->r8, regs->r9);
315 printk("R10: %016lx R11: %016lx R12: %016lx\n", 315 printk("R10: %016lx R11: %016lx R12: %016lx\n",
316 regs->r10, regs->r11, regs->r12); 316 regs->r10, regs->r11, regs->r12);
317 printk("R13: %016lx R14: %016lx R15: %016lx\n", 317 printk("R13: %016lx R14: %016lx R15: %016lx\n",
318 regs->r13, regs->r14, regs->r15); 318 regs->r13, regs->r14, regs->r15);
319 319
320 asm("movl %%ds,%0" : "=r" (ds)); 320 asm("movl %%ds,%0" : "=r" (ds));
321 asm("movl %%cs,%0" : "=r" (cs)); 321 asm("movl %%cs,%0" : "=r" (cs));
322 asm("movl %%es,%0" : "=r" (es)); 322 asm("movl %%es,%0" : "=r" (es));
323 asm("movl %%fs,%0" : "=r" (fsindex)); 323 asm("movl %%fs,%0" : "=r" (fsindex));
324 asm("movl %%gs,%0" : "=r" (gsindex)); 324 asm("movl %%gs,%0" : "=r" (gsindex));
325 325
326 rdmsrl(MSR_FS_BASE, fs); 326 rdmsrl(MSR_FS_BASE, fs);
327 rdmsrl(MSR_GS_BASE, gs); 327 rdmsrl(MSR_GS_BASE, gs);
328 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 328 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
329 329
330 asm("movq %%cr0, %0": "=r" (cr0)); 330 asm("movq %%cr0, %0": "=r" (cr0));
331 asm("movq %%cr2, %0": "=r" (cr2)); 331 asm("movq %%cr2, %0": "=r" (cr2));
332 asm("movq %%cr3, %0": "=r" (cr3)); 332 asm("movq %%cr3, %0": "=r" (cr3));
333 asm("movq %%cr4, %0": "=r" (cr4)); 333 asm("movq %%cr4, %0": "=r" (cr4));
334 334
335 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 335 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
336 fs,fsindex,gs,gsindex,shadowgs); 336 fs,fsindex,gs,gsindex,shadowgs);
337 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 337 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
338 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 338 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
339 } 339 }
340 340
341 void show_regs(struct pt_regs *regs) 341 void show_regs(struct pt_regs *regs)
342 { 342 {
343 printk("CPU %d:", smp_processor_id()); 343 printk("CPU %d:", smp_processor_id());
344 __show_regs(regs); 344 __show_regs(regs);
345 show_trace(&regs->rsp); 345 show_trace(&regs->rsp);
346 } 346 }
347 347
348 /* 348 /*
349 * Free current thread data structures etc.. 349 * Free current thread data structures etc..
350 */ 350 */
351 void exit_thread(void) 351 void exit_thread(void)
352 { 352 {
353 struct task_struct *me = current; 353 struct task_struct *me = current;
354 struct thread_struct *t = &me->thread; 354 struct thread_struct *t = &me->thread;
355
356 /*
357 * Remove function-return probe instances associated with this task
358 * and put them back on the free list. Do not insert an exit probe for
359 * this function, it will be disabled by kprobe_flush_task if you do.
360 */
361 kprobe_flush_task(me);
362 355
363 if (me->thread.io_bitmap_ptr) { 356 if (me->thread.io_bitmap_ptr) {
364 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 357 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
365 358
366 kfree(t->io_bitmap_ptr); 359 kfree(t->io_bitmap_ptr);
367 t->io_bitmap_ptr = NULL; 360 t->io_bitmap_ptr = NULL;
368 /* 361 /*
369 * Careful, clear this in the TSS too: 362 * Careful, clear this in the TSS too:
370 */ 363 */
371 memset(tss->io_bitmap, 0xff, t->io_bitmap_max); 364 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
372 t->io_bitmap_max = 0; 365 t->io_bitmap_max = 0;
373 put_cpu(); 366 put_cpu();
374 } 367 }
375 } 368 }
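
The x86_64 exit_thread() loses the same kprobe_flush_task() call, and its explanatory comment, as the powerpc hunk above, for the same reason. What remains is the ioperm() cleanup, and the memset of the stale TSS range to 0xff is deliberate: in the x86 I/O permission bitmap a set bit means the port is NOT accessible, so filling with 0xff revokes everything rather than leaking the dead thread's port grants. A tiny user-space illustration of that bit sense (the real bitmap of course lives in the TSS; sizes here are arbitrary):

    #include <stdio.h>
    #include <string.h>

    /* 1 bit = access denied, 0 bit = access allowed, as in the hardware
     * I/O permission bitmap. */
    static int port_allowed(const unsigned char *bitmap, unsigned int port)
    {
        return !(bitmap[port / 8] & (1u << (port % 8)));
    }

    int main(void)
    {
        unsigned char bitmap[1024];

        memset(bitmap, 0xff, sizeof(bitmap));   /* same pattern as above: deny all */
        printf("port 0x3f8 allowed: %d\n", port_allowed(bitmap, 0x3f8));  /* 0 */
        return 0;
    }
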
376 369
377 void flush_thread(void) 370 void flush_thread(void)
378 { 371 {
379 struct task_struct *tsk = current; 372 struct task_struct *tsk = current;
380 struct thread_info *t = current_thread_info(); 373 struct thread_info *t = current_thread_info();
381 374
382 if (t->flags & _TIF_ABI_PENDING) 375 if (t->flags & _TIF_ABI_PENDING)
383 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); 376 t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
384 377
385 tsk->thread.debugreg0 = 0; 378 tsk->thread.debugreg0 = 0;
386 tsk->thread.debugreg1 = 0; 379 tsk->thread.debugreg1 = 0;
387 tsk->thread.debugreg2 = 0; 380 tsk->thread.debugreg2 = 0;
388 tsk->thread.debugreg3 = 0; 381 tsk->thread.debugreg3 = 0;
389 tsk->thread.debugreg6 = 0; 382 tsk->thread.debugreg6 = 0;
390 tsk->thread.debugreg7 = 0; 383 tsk->thread.debugreg7 = 0;
391 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 384 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
392 /* 385 /*
393 * Forget coprocessor state.. 386 * Forget coprocessor state..
394 */ 387 */
395 clear_fpu(tsk); 388 clear_fpu(tsk);
396 clear_used_math(); 389 clear_used_math();
397 } 390 }
398 391
399 void release_thread(struct task_struct *dead_task) 392 void release_thread(struct task_struct *dead_task)
400 { 393 {
401 if (dead_task->mm) { 394 if (dead_task->mm) {
402 if (dead_task->mm->context.size) { 395 if (dead_task->mm->context.size) {
403 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", 396 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
404 dead_task->comm, 397 dead_task->comm,
405 dead_task->mm->context.ldt, 398 dead_task->mm->context.ldt,
406 dead_task->mm->context.size); 399 dead_task->mm->context.size);
407 BUG(); 400 BUG();
408 } 401 }
409 } 402 }
410 } 403 }
411 404
412 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) 405 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
413 { 406 {
414 struct user_desc ud = { 407 struct user_desc ud = {
415 .base_addr = addr, 408 .base_addr = addr,
416 .limit = 0xfffff, 409 .limit = 0xfffff,
417 .seg_32bit = 1, 410 .seg_32bit = 1,
418 .limit_in_pages = 1, 411 .limit_in_pages = 1,
419 .useable = 1, 412 .useable = 1,
420 }; 413 };
421 struct n_desc_struct *desc = (void *)t->thread.tls_array; 414 struct n_desc_struct *desc = (void *)t->thread.tls_array;
422 desc += tls; 415 desc += tls;
423 desc->a = LDT_entry_a(&ud); 416 desc->a = LDT_entry_a(&ud);
424 desc->b = LDT_entry_b(&ud); 417 desc->b = LDT_entry_b(&ud);
425 } 418 }
426 419
427 static inline u32 read_32bit_tls(struct task_struct *t, int tls) 420 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
428 { 421 {
429 struct desc_struct *desc = (void *)t->thread.tls_array; 422 struct desc_struct *desc = (void *)t->thread.tls_array;
430 desc += tls; 423 desc += tls;
431 return desc->base0 | 424 return desc->base0 |
432 (((u32)desc->base1) << 16) | 425 (((u32)desc->base1) << 16) |
433 (((u32)desc->base2) << 24); 426 (((u32)desc->base2) << 24);
434 } 427 }
435 428
436 /* 429 /*
437 * This gets called before we allocate a new thread and copy 430 * This gets called before we allocate a new thread and copy
438 * the current task into it. 431 * the current task into it.
439 */ 432 */
440 void prepare_to_copy(struct task_struct *tsk) 433 void prepare_to_copy(struct task_struct *tsk)
441 { 434 {
442 unlazy_fpu(tsk); 435 unlazy_fpu(tsk);
443 } 436 }
444 437
445 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 438 int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
446 unsigned long unused, 439 unsigned long unused,
447 struct task_struct * p, struct pt_regs * regs) 440 struct task_struct * p, struct pt_regs * regs)
448 { 441 {
449 int err; 442 int err;
450 struct pt_regs * childregs; 443 struct pt_regs * childregs;
451 struct task_struct *me = current; 444 struct task_struct *me = current;
452 445
453 childregs = ((struct pt_regs *) 446 childregs = ((struct pt_regs *)
454 (THREAD_SIZE + task_stack_page(p))) - 1; 447 (THREAD_SIZE + task_stack_page(p))) - 1;
455 *childregs = *regs; 448 *childregs = *regs;
456 449
457 childregs->rax = 0; 450 childregs->rax = 0;
458 childregs->rsp = rsp; 451 childregs->rsp = rsp;
459 if (rsp == ~0UL) 452 if (rsp == ~0UL)
460 childregs->rsp = (unsigned long)childregs; 453 childregs->rsp = (unsigned long)childregs;
461 454
462 p->thread.rsp = (unsigned long) childregs; 455 p->thread.rsp = (unsigned long) childregs;
463 p->thread.rsp0 = (unsigned long) (childregs+1); 456 p->thread.rsp0 = (unsigned long) (childregs+1);
464 p->thread.userrsp = me->thread.userrsp; 457 p->thread.userrsp = me->thread.userrsp;
465 458
466 set_tsk_thread_flag(p, TIF_FORK); 459 set_tsk_thread_flag(p, TIF_FORK);
467 460
468 p->thread.fs = me->thread.fs; 461 p->thread.fs = me->thread.fs;
469 p->thread.gs = me->thread.gs; 462 p->thread.gs = me->thread.gs;
470 463
471 asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); 464 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
472 asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); 465 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
473 asm("mov %%es,%0" : "=m" (p->thread.es)); 466 asm("mov %%es,%0" : "=m" (p->thread.es));
474 asm("mov %%ds,%0" : "=m" (p->thread.ds)); 467 asm("mov %%ds,%0" : "=m" (p->thread.ds));
475 468
476 if (unlikely(me->thread.io_bitmap_ptr != NULL)) { 469 if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
477 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 470 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
478 if (!p->thread.io_bitmap_ptr) { 471 if (!p->thread.io_bitmap_ptr) {
479 p->thread.io_bitmap_max = 0; 472 p->thread.io_bitmap_max = 0;
480 return -ENOMEM; 473 return -ENOMEM;
481 } 474 }
482 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 475 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
483 IO_BITMAP_BYTES); 476 IO_BITMAP_BYTES);
484 } 477 }
485 478
486 /* 479 /*
487 * Set a new TLS for the child thread? 480 * Set a new TLS for the child thread?
488 */ 481 */
489 if (clone_flags & CLONE_SETTLS) { 482 if (clone_flags & CLONE_SETTLS) {
490 #ifdef CONFIG_IA32_EMULATION 483 #ifdef CONFIG_IA32_EMULATION
491 if (test_thread_flag(TIF_IA32)) 484 if (test_thread_flag(TIF_IA32))
492 err = ia32_child_tls(p, childregs); 485 err = ia32_child_tls(p, childregs);
493 else 486 else
494 #endif 487 #endif
495 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 488 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
496 if (err) 489 if (err)
497 goto out; 490 goto out;
498 } 491 }
499 err = 0; 492 err = 0;
500 out: 493 out:
501 if (err && p->thread.io_bitmap_ptr) { 494 if (err && p->thread.io_bitmap_ptr) {
502 kfree(p->thread.io_bitmap_ptr); 495 kfree(p->thread.io_bitmap_ptr);
503 p->thread.io_bitmap_max = 0; 496 p->thread.io_bitmap_max = 0;
504 } 497 }
505 return err; 498 return err;
506 } 499 }
507 500
508 /* 501 /*
509 * This special macro can be used to load a debugging register 502 * This special macro can be used to load a debugging register
510 */ 503 */
511 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 504 #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
512 505
513 /* 506 /*
514 * switch_to(x,y) should switch tasks from x to y. 507 * switch_to(x,y) should switch tasks from x to y.
515 * 508 *
516 * This could still be optimized: 509 * This could still be optimized:
517 * - fold all the options into a flag word and test it with a single test. 510 * - fold all the options into a flag word and test it with a single test.
518 * - could test fs/gs bitsliced 511 * - could test fs/gs bitsliced
519 * 512 *
520 * Kprobes not supported here. Set the probe on schedule instead. 513 * Kprobes not supported here. Set the probe on schedule instead.
521 */ 514 */
522 __kprobes struct task_struct * 515 __kprobes struct task_struct *
523 __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 516 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
524 { 517 {
525 struct thread_struct *prev = &prev_p->thread, 518 struct thread_struct *prev = &prev_p->thread,
526 *next = &next_p->thread; 519 *next = &next_p->thread;
527 int cpu = smp_processor_id(); 520 int cpu = smp_processor_id();
528 struct tss_struct *tss = &per_cpu(init_tss, cpu); 521 struct tss_struct *tss = &per_cpu(init_tss, cpu);
529 522
530 /* 523 /*
531 * Reload esp0, LDT and the page table pointer: 524 * Reload esp0, LDT and the page table pointer:
532 */ 525 */
533 tss->rsp0 = next->rsp0; 526 tss->rsp0 = next->rsp0;
534 527
535 /* 528 /*
536 * Switch DS and ES. 529 * Switch DS and ES.
537 * This won't pick up thread selector changes, but I guess that is ok. 530 * This won't pick up thread selector changes, but I guess that is ok.
538 */ 531 */
539 asm volatile("mov %%es,%0" : "=m" (prev->es)); 532 asm volatile("mov %%es,%0" : "=m" (prev->es));
540 if (unlikely(next->es | prev->es)) 533 if (unlikely(next->es | prev->es))
541 loadsegment(es, next->es); 534 loadsegment(es, next->es);
542 535
543 asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); 536 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
544 if (unlikely(next->ds | prev->ds)) 537 if (unlikely(next->ds | prev->ds))
545 loadsegment(ds, next->ds); 538 loadsegment(ds, next->ds);
546 539
547 load_TLS(next, cpu); 540 load_TLS(next, cpu);
548 541
549 /* 542 /*
550 * Switch FS and GS. 543 * Switch FS and GS.
551 */ 544 */
552 { 545 {
553 unsigned fsindex; 546 unsigned fsindex;
554 asm volatile("movl %%fs,%0" : "=r" (fsindex)); 547 asm volatile("movl %%fs,%0" : "=r" (fsindex));
555 /* segment register != 0 always requires a reload. 548 /* segment register != 0 always requires a reload.
556 also reload when it has changed. 549 also reload when it has changed.
557 when prev process used 64bit base always reload 550 when prev process used 64bit base always reload
558 to avoid an information leak. */ 551 to avoid an information leak. */
559 if (unlikely(fsindex | next->fsindex | prev->fs)) { 552 if (unlikely(fsindex | next->fsindex | prev->fs)) {
560 loadsegment(fs, next->fsindex); 553 loadsegment(fs, next->fsindex);
561 /* check if the user used a selector != 0 554 /* check if the user used a selector != 0
562 * if yes clear 64bit base, since overloaded base 555 * if yes clear 64bit base, since overloaded base
563 * is always mapped to the Null selector 556 * is always mapped to the Null selector
564 */ 557 */
565 if (fsindex) 558 if (fsindex)
566 prev->fs = 0; 559 prev->fs = 0;
567 } 560 }
568 /* when next process has a 64bit base use it */ 561 /* when next process has a 64bit base use it */
569 if (next->fs) 562 if (next->fs)
570 wrmsrl(MSR_FS_BASE, next->fs); 563 wrmsrl(MSR_FS_BASE, next->fs);
571 prev->fsindex = fsindex; 564 prev->fsindex = fsindex;
572 } 565 }
573 { 566 {
574 unsigned gsindex; 567 unsigned gsindex;
575 asm volatile("movl %%gs,%0" : "=r" (gsindex)); 568 asm volatile("movl %%gs,%0" : "=r" (gsindex));
576 if (unlikely(gsindex | next->gsindex | prev->gs)) { 569 if (unlikely(gsindex | next->gsindex | prev->gs)) {
577 load_gs_index(next->gsindex); 570 load_gs_index(next->gsindex);
578 if (gsindex) 571 if (gsindex)
579 prev->gs = 0; 572 prev->gs = 0;
580 } 573 }
581 if (next->gs) 574 if (next->gs)
582 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 575 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
583 prev->gsindex = gsindex; 576 prev->gsindex = gsindex;
584 } 577 }
585 578
586 /* 579 /*
587 * Switch the PDA and FPU contexts. 580 * Switch the PDA and FPU contexts.
588 */ 581 */
589 prev->userrsp = read_pda(oldrsp); 582 prev->userrsp = read_pda(oldrsp);
590 write_pda(oldrsp, next->userrsp); 583 write_pda(oldrsp, next->userrsp);
591 write_pda(pcurrent, next_p); 584 write_pda(pcurrent, next_p);
592 /* This must be here to ensure both math_state_restore() and 585 /* This must be here to ensure both math_state_restore() and
593 kernel_fpu_begin() work consistently. */ 586 kernel_fpu_begin() work consistently. */
594 unlazy_fpu(prev_p); 587 unlazy_fpu(prev_p);
595 write_pda(kernelstack, 588 write_pda(kernelstack,
596 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 589 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
597 590
598 /* 591 /*
599 * Now maybe reload the debug registers 592 * Now maybe reload the debug registers
600 */ 593 */
601 if (unlikely(next->debugreg7)) { 594 if (unlikely(next->debugreg7)) {
602 loaddebug(next, 0); 595 loaddebug(next, 0);
603 loaddebug(next, 1); 596 loaddebug(next, 1);
604 loaddebug(next, 2); 597 loaddebug(next, 2);
605 loaddebug(next, 3); 598 loaddebug(next, 3);
606 /* no 4 and 5 */ 599 /* no 4 and 5 */
607 loaddebug(next, 6); 600 loaddebug(next, 6);
608 loaddebug(next, 7); 601 loaddebug(next, 7);
609 } 602 }
610 603
611 604
612 /* 605 /*
613 * Handle the IO bitmap 606 * Handle the IO bitmap
614 */ 607 */
615 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { 608 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
616 if (next->io_bitmap_ptr) 609 if (next->io_bitmap_ptr)
617 /* 610 /*
618 * Copy the relevant range of the IO bitmap. 611 * Copy the relevant range of the IO bitmap.
619 * Normally this is 128 bytes or less: 612 * Normally this is 128 bytes or less:
620 */ 613 */
621 memcpy(tss->io_bitmap, next->io_bitmap_ptr, 614 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
622 max(prev->io_bitmap_max, next->io_bitmap_max)); 615 max(prev->io_bitmap_max, next->io_bitmap_max));
623 else { 616 else {
624 /* 617 /*
625 * Clear any possible leftover bits: 618 * Clear any possible leftover bits:
626 */ 619 */
627 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 620 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
628 } 621 }
629 } 622 }
630 623
631 return prev_p; 624 return prev_p;
632 } 625 }
633 626
634 /* 627 /*
635 * sys_execve() executes a new program. 628 * sys_execve() executes a new program.
636 */ 629 */
637 asmlinkage 630 asmlinkage
638 long sys_execve(char __user *name, char __user * __user *argv, 631 long sys_execve(char __user *name, char __user * __user *argv,
639 char __user * __user *envp, struct pt_regs regs) 632 char __user * __user *envp, struct pt_regs regs)
640 { 633 {
641 long error; 634 long error;
642 char * filename; 635 char * filename;
643 636
644 filename = getname(name); 637 filename = getname(name);
645 error = PTR_ERR(filename); 638 error = PTR_ERR(filename);
646 if (IS_ERR(filename)) 639 if (IS_ERR(filename))
647 return error; 640 return error;
648 error = do_execve(filename, argv, envp, &regs); 641 error = do_execve(filename, argv, envp, &regs);
649 if (error == 0) { 642 if (error == 0) {
650 task_lock(current); 643 task_lock(current);
651 current->ptrace &= ~PT_DTRACE; 644 current->ptrace &= ~PT_DTRACE;
652 task_unlock(current); 645 task_unlock(current);
653 } 646 }
654 putname(filename); 647 putname(filename);
655 return error; 648 return error;
656 } 649 }
657 650
658 void set_personality_64bit(void) 651 void set_personality_64bit(void)
659 { 652 {
660 /* inherit personality from parent */ 653 /* inherit personality from parent */
661 654
662 /* Make sure to be in 64bit mode */ 655 /* Make sure to be in 64bit mode */
663 clear_thread_flag(TIF_IA32); 656 clear_thread_flag(TIF_IA32);
664 657
665 /* TBD: overwrites user setup. Should have two bits. 658 /* TBD: overwrites user setup. Should have two bits.
666 But 64bit processes have always behaved this way, 659 But 64bit processes have always behaved this way,
667 so it's not too bad. The main problem is just that 660 so it's not too bad. The main problem is just that
668 32bit children are affected again. */ 661 32bit children are affected again. */
669 current->personality &= ~READ_IMPLIES_EXEC; 662 current->personality &= ~READ_IMPLIES_EXEC;
670 } 663 }
671 664
672 asmlinkage long sys_fork(struct pt_regs *regs) 665 asmlinkage long sys_fork(struct pt_regs *regs)
673 { 666 {
674 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); 667 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
675 } 668 }
676 669
677 asmlinkage long 670 asmlinkage long
678 sys_clone(unsigned long clone_flags, unsigned long newsp, 671 sys_clone(unsigned long clone_flags, unsigned long newsp,
679 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 672 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
680 { 673 {
681 if (!newsp) 674 if (!newsp)
682 newsp = regs->rsp; 675 newsp = regs->rsp;
683 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 676 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
684 } 677 }
685 678
686 /* 679 /*
687 * This is trivial, and on the face of it looks like it 680 * This is trivial, and on the face of it looks like it
688 * could equally well be done in user mode. 681 * could equally well be done in user mode.
689 * 682 *
690 * Not so, for quite unobvious reasons - register pressure. 683 * Not so, for quite unobvious reasons - register pressure.
691 * In user mode vfork() cannot have a stack frame, and if 684 * In user mode vfork() cannot have a stack frame, and if
692 * done by calling the "clone()" system call directly, you 685 * done by calling the "clone()" system call directly, you
693 * do not have enough call-clobbered registers to hold all 686 * do not have enough call-clobbered registers to hold all
694 * the information you need. 687 * the information you need.
695 */ 688 */
696 asmlinkage long sys_vfork(struct pt_regs *regs) 689 asmlinkage long sys_vfork(struct pt_regs *regs)
697 { 690 {
698 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, 691 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
699 NULL, NULL); 692 NULL, NULL);
700 } 693 }
701 694
702 unsigned long get_wchan(struct task_struct *p) 695 unsigned long get_wchan(struct task_struct *p)
703 { 696 {
704 unsigned long stack; 697 unsigned long stack;
705 u64 fp,rip; 698 u64 fp,rip;
706 int count = 0; 699 int count = 0;
707 700
708 if (!p || p == current || p->state==TASK_RUNNING) 701 if (!p || p == current || p->state==TASK_RUNNING)
709 return 0; 702 return 0;
710 stack = (unsigned long)task_stack_page(p); 703 stack = (unsigned long)task_stack_page(p);
711 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) 704 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
712 return 0; 705 return 0;
713 fp = *(u64 *)(p->thread.rsp); 706 fp = *(u64 *)(p->thread.rsp);
714 do { 707 do {
715 if (fp < (unsigned long)stack || 708 if (fp < (unsigned long)stack ||
716 fp > (unsigned long)stack+THREAD_SIZE) 709 fp > (unsigned long)stack+THREAD_SIZE)
717 return 0; 710 return 0;
718 rip = *(u64 *)(fp+8); 711 rip = *(u64 *)(fp+8);
719 if (!in_sched_functions(rip)) 712 if (!in_sched_functions(rip))
720 return rip; 713 return rip;
721 fp = *(u64 *)fp; 714 fp = *(u64 *)fp;
722 } while (count++ < 16); 715 } while (count++ < 16);
723 return 0; 716 return 0;
724 } 717 }
725 718
726 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 719 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
727 { 720 {
728 int ret = 0; 721 int ret = 0;
729 int doit = task == current; 722 int doit = task == current;
730 int cpu; 723 int cpu;
731 724
732 switch (code) { 725 switch (code) {
733 case ARCH_SET_GS: 726 case ARCH_SET_GS:
734 if (addr >= TASK_SIZE_OF(task)) 727 if (addr >= TASK_SIZE_OF(task))
735 return -EPERM; 728 return -EPERM;
736 cpu = get_cpu(); 729 cpu = get_cpu();
737 /* handle small bases via the GDT because that's faster to 730 /* handle small bases via the GDT because that's faster to
738 switch. */ 731 switch. */
739 if (addr <= 0xffffffff) { 732 if (addr <= 0xffffffff) {
740 set_32bit_tls(task, GS_TLS, addr); 733 set_32bit_tls(task, GS_TLS, addr);
741 if (doit) { 734 if (doit) {
742 load_TLS(&task->thread, cpu); 735 load_TLS(&task->thread, cpu);
743 load_gs_index(GS_TLS_SEL); 736 load_gs_index(GS_TLS_SEL);
744 } 737 }
745 task->thread.gsindex = GS_TLS_SEL; 738 task->thread.gsindex = GS_TLS_SEL;
746 task->thread.gs = 0; 739 task->thread.gs = 0;
747 } else { 740 } else {
748 task->thread.gsindex = 0; 741 task->thread.gsindex = 0;
749 task->thread.gs = addr; 742 task->thread.gs = addr;
750 if (doit) { 743 if (doit) {
751 load_gs_index(0); 744 load_gs_index(0);
752 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 745 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
753 } 746 }
754 } 747 }
755 put_cpu(); 748 put_cpu();
756 break; 749 break;
757 case ARCH_SET_FS: 750 case ARCH_SET_FS:
758 /* Not strictly needed for fs, but do it for symmetry 751 /* Not strictly needed for fs, but do it for symmetry
759 with gs */ 752 with gs */
760 if (addr >= TASK_SIZE_OF(task)) 753 if (addr >= TASK_SIZE_OF(task))
761 return -EPERM; 754 return -EPERM;
762 cpu = get_cpu(); 755 cpu = get_cpu();
763 /* handle small bases via the GDT because that's faster to 756 /* handle small bases via the GDT because that's faster to
764 switch. */ 757 switch. */
765 if (addr <= 0xffffffff) { 758 if (addr <= 0xffffffff) {
766 set_32bit_tls(task, FS_TLS, addr); 759 set_32bit_tls(task, FS_TLS, addr);
767 if (doit) { 760 if (doit) {
768 load_TLS(&task->thread, cpu); 761 load_TLS(&task->thread, cpu);
769 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 762 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
770 } 763 }
771 task->thread.fsindex = FS_TLS_SEL; 764 task->thread.fsindex = FS_TLS_SEL;
772 task->thread.fs = 0; 765 task->thread.fs = 0;
773 } else { 766 } else {
774 task->thread.fsindex = 0; 767 task->thread.fsindex = 0;
775 task->thread.fs = addr; 768 task->thread.fs = addr;
776 if (doit) { 769 if (doit) {
777 /* set the selector to 0 to not confuse 770 /* set the selector to 0 to not confuse
778 __switch_to */ 771 __switch_to */
779 asm volatile("movl %0,%%fs" :: "r" (0)); 772 asm volatile("movl %0,%%fs" :: "r" (0));
780 ret = checking_wrmsrl(MSR_FS_BASE, addr); 773 ret = checking_wrmsrl(MSR_FS_BASE, addr);
781 } 774 }
782 } 775 }
783 put_cpu(); 776 put_cpu();
784 break; 777 break;
785 case ARCH_GET_FS: { 778 case ARCH_GET_FS: {
786 unsigned long base; 779 unsigned long base;
787 if (task->thread.fsindex == FS_TLS_SEL) 780 if (task->thread.fsindex == FS_TLS_SEL)
788 base = read_32bit_tls(task, FS_TLS); 781 base = read_32bit_tls(task, FS_TLS);
789 else if (doit) 782 else if (doit)
790 rdmsrl(MSR_FS_BASE, base); 783 rdmsrl(MSR_FS_BASE, base);
791 else 784 else
792 base = task->thread.fs; 785 base = task->thread.fs;
793 ret = put_user(base, (unsigned long __user *)addr); 786 ret = put_user(base, (unsigned long __user *)addr);
794 break; 787 break;
795 } 788 }
796 case ARCH_GET_GS: { 789 case ARCH_GET_GS: {
797 unsigned long base; 790 unsigned long base;
798 if (task->thread.gsindex == GS_TLS_SEL) 791 if (task->thread.gsindex == GS_TLS_SEL)
799 base = read_32bit_tls(task, GS_TLS); 792 base = read_32bit_tls(task, GS_TLS);
800 else if (doit) 793 else if (doit)
801 rdmsrl(MSR_KERNEL_GS_BASE, base); 794 rdmsrl(MSR_KERNEL_GS_BASE, base);
802 else 795 else
803 base = task->thread.gs; 796 base = task->thread.gs;
804 ret = put_user(base, (unsigned long __user *)addr); 797 ret = put_user(base, (unsigned long __user *)addr);
805 break; 798 break;
806 } 799 }
807 800
808 default: 801 default:
809 ret = -EINVAL; 802 ret = -EINVAL;
810 break; 803 break;
811 } 804 }
812 805
813 return ret; 806 return ret;
814 } 807 }
815 808
816 long sys_arch_prctl(int code, unsigned long addr) 809 long sys_arch_prctl(int code, unsigned long addr)
817 { 810 {
818 return do_arch_prctl(current, code, addr); 811 return do_arch_prctl(current, code, addr);
819 } 812 }
820 813
821 /* 814 /*
822 * Capture the user space registers if the task is not running (in user space) 815 * Capture the user space registers if the task is not running (in user space)
823 */ 816 */
824 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) 817 int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
825 { 818 {
826 struct pt_regs *pp, ptregs; 819 struct pt_regs *pp, ptregs;
827 820
828 pp = task_pt_regs(tsk); 821 pp = task_pt_regs(tsk);
829 822
830 ptregs = *pp; 823 ptregs = *pp;
831 ptregs.cs &= 0xffff; 824 ptregs.cs &= 0xffff;
832 ptregs.ss &= 0xffff; 825 ptregs.ss &= 0xffff;
833 826
834 elf_core_copy_regs(regs, &ptregs); 827 elf_core_copy_regs(regs, &ptregs);
835 828
836 return 1; 829 return 1;
837 } 830 }
838 831
839 unsigned long arch_align_stack(unsigned long sp) 832 unsigned long arch_align_stack(unsigned long sp)
840 { 833 {
841 if (randomize_va_space) 834 if (randomize_va_space)
842 sp -= get_random_int() % 8192; 835 sp -= get_random_int() % 8192;
843 return sp & ~0xf; 836 return sp & ~0xf;
844 } 837 }
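
The hunk above removes the kprobe_flush_task() call from exit_thread(); per the updated comment in kernel/kprobes.c below, the recycling now happens when the scheduler finishes switching away from a task that has died. The kernel/sched.c hunk is not part of this excerpt, so the following is only a rough sketch of that call site: the PF_DEAD flag check and the helper name are assumptions inferred from the comment above kprobe_flush_task(), not quoted from the patch.

#include <linux/sched.h>
#include <linux/kprobes.h>

/* Sketch only: recycle retprobe instances of a task that exited inside
 * schedule().  This runs in the context of the *next* task, after the
 * final switch away from prev, so prev's probed calls are known never
 * to return. */
static void recycle_dead_task_probes(struct task_struct *prev)
{
	if (unlikely(prev->flags & PF_DEAD)) {
		kprobe_flush_task(prev);	/* put instances back on the free list */
		put_task_struct(prev);		/* usual reference drop for a dead prev */
	}
}
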
kernel/kprobes.c
1 /* 1 /*
2 * Kernel Probes (KProbes) 2 * Kernel Probes (KProbes)
3 * kernel/kprobes.c 3 * kernel/kprobes.c
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or 7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version. 8 * (at your option) any later version.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 * 18 *
19 * Copyright (C) IBM Corporation, 2002, 2004 19 * Copyright (C) IBM Corporation, 2002, 2004
20 * 20 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel 21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
22 * Probes initial implementation (includes suggestions from 22 * Probes initial implementation (includes suggestions from
23 * Rusty Russell). 23 * Rusty Russell).
24 * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with 24 * 2004-Aug Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
25 * hlists and exceptions notifier as suggested by Andi Kleen. 25 * hlists and exceptions notifier as suggested by Andi Kleen.
26 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes 26 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
27 * interface to access function arguments. 27 * interface to access function arguments.
28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes 28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
29 * exceptions notifier to be first on the priority list. 29 * exceptions notifier to be first on the priority list.
30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston 30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
32 * <prasanna@in.ibm.com> added function-return probes. 32 * <prasanna@in.ibm.com> added function-return probes.
33 */ 33 */
34 #include <linux/kprobes.h> 34 #include <linux/kprobes.h>
35 #include <linux/hash.h> 35 #include <linux/hash.h>
36 #include <linux/init.h> 36 #include <linux/init.h>
37 #include <linux/slab.h> 37 #include <linux/slab.h>
38 #include <linux/module.h> 38 #include <linux/module.h>
39 #include <linux/moduleloader.h> 39 #include <linux/moduleloader.h>
40 #include <asm-generic/sections.h> 40 #include <asm-generic/sections.h>
41 #include <asm/cacheflush.h> 41 #include <asm/cacheflush.h>
42 #include <asm/errno.h> 42 #include <asm/errno.h>
43 #include <asm/kdebug.h> 43 #include <asm/kdebug.h>
44 44
45 #define KPROBE_HASH_BITS 6 45 #define KPROBE_HASH_BITS 6
46 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 46 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
47 47
48 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51 DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 51 DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52 DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 52 DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 53 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
55 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT 55 #ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56 /* 56 /*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 57 * kprobe->ainsn.insn points to the copy of the instruction to be
58 * single-stepped. x86_64, POWER4 and above have no-exec support and 58 * single-stepped. x86_64, POWER4 and above have no-exec support and
59 * stepping on the instruction on a vmalloced/kmalloced/data page 59 * stepping on the instruction on a vmalloced/kmalloced/data page
60 * is a recipe for disaster 60 * is a recipe for disaster
61 */ 61 */
62 #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 62 #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
63 63
64 struct kprobe_insn_page { 64 struct kprobe_insn_page {
65 struct hlist_node hlist; 65 struct hlist_node hlist;
66 kprobe_opcode_t *insns; /* Page of instruction slots */ 66 kprobe_opcode_t *insns; /* Page of instruction slots */
67 char slot_used[INSNS_PER_PAGE]; 67 char slot_used[INSNS_PER_PAGE];
68 int nused; 68 int nused;
69 }; 69 };
70 70
71 static struct hlist_head kprobe_insn_pages; 71 static struct hlist_head kprobe_insn_pages;
72 72
73 /** 73 /**
74 * get_insn_slot() - Find a slot on an executable page for an instruction. 74 * get_insn_slot() - Find a slot on an executable page for an instruction.
75 * We allocate an executable page if there's no room on existing ones. 75 * We allocate an executable page if there's no room on existing ones.
76 */ 76 */
77 kprobe_opcode_t __kprobes *get_insn_slot(void) 77 kprobe_opcode_t __kprobes *get_insn_slot(void)
78 { 78 {
79 struct kprobe_insn_page *kip; 79 struct kprobe_insn_page *kip;
80 struct hlist_node *pos; 80 struct hlist_node *pos;
81 81
82 hlist_for_each(pos, &kprobe_insn_pages) { 82 hlist_for_each(pos, &kprobe_insn_pages) {
83 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 83 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
84 if (kip->nused < INSNS_PER_PAGE) { 84 if (kip->nused < INSNS_PER_PAGE) {
85 int i; 85 int i;
86 for (i = 0; i < INSNS_PER_PAGE; i++) { 86 for (i = 0; i < INSNS_PER_PAGE; i++) {
87 if (!kip->slot_used[i]) { 87 if (!kip->slot_used[i]) {
88 kip->slot_used[i] = 1; 88 kip->slot_used[i] = 1;
89 kip->nused++; 89 kip->nused++;
90 return kip->insns + (i * MAX_INSN_SIZE); 90 return kip->insns + (i * MAX_INSN_SIZE);
91 } 91 }
92 } 92 }
93 /* Surprise! No unused slots. Fix kip->nused. */ 93 /* Surprise! No unused slots. Fix kip->nused. */
94 kip->nused = INSNS_PER_PAGE; 94 kip->nused = INSNS_PER_PAGE;
95 } 95 }
96 } 96 }
97 97
98 /* All out of space. Need to allocate a new page. Use slot 0.*/ 98 /* All out of space. Need to allocate a new page. Use slot 0.*/
99 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 99 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
100 if (!kip) { 100 if (!kip) {
101 return NULL; 101 return NULL;
102 } 102 }
103 103
104 /* 104 /*
105 * Use module_alloc so this page is within +/- 2GB of where the 105 * Use module_alloc so this page is within +/- 2GB of where the
106 * kernel image and loaded module images reside. This is required 106 * kernel image and loaded module images reside. This is required
107 * so x86_64 can correctly handle the %rip-relative fixups. 107 * so x86_64 can correctly handle the %rip-relative fixups.
108 */ 108 */
109 kip->insns = module_alloc(PAGE_SIZE); 109 kip->insns = module_alloc(PAGE_SIZE);
110 if (!kip->insns) { 110 if (!kip->insns) {
111 kfree(kip); 111 kfree(kip);
112 return NULL; 112 return NULL;
113 } 113 }
114 INIT_HLIST_NODE(&kip->hlist); 114 INIT_HLIST_NODE(&kip->hlist);
115 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 115 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
116 memset(kip->slot_used, 0, INSNS_PER_PAGE); 116 memset(kip->slot_used, 0, INSNS_PER_PAGE);
117 kip->slot_used[0] = 1; 117 kip->slot_used[0] = 1;
118 kip->nused = 1; 118 kip->nused = 1;
119 return kip->insns; 119 return kip->insns;
120 } 120 }
121 121
122 void __kprobes free_insn_slot(kprobe_opcode_t *slot) 122 void __kprobes free_insn_slot(kprobe_opcode_t *slot)
123 { 123 {
124 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
125 struct hlist_node *pos; 125 struct hlist_node *pos;
126 126
127 hlist_for_each(pos, &kprobe_insn_pages) { 127 hlist_for_each(pos, &kprobe_insn_pages) {
128 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 128 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
129 if (kip->insns <= slot && 129 if (kip->insns <= slot &&
130 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 130 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
131 int i = (slot - kip->insns) / MAX_INSN_SIZE; 131 int i = (slot - kip->insns) / MAX_INSN_SIZE;
132 kip->slot_used[i] = 0; 132 kip->slot_used[i] = 0;
133 kip->nused--; 133 kip->nused--;
134 if (kip->nused == 0) { 134 if (kip->nused == 0) {
135 /* 135 /*
136 * Page is no longer in use. Free it unless 136 * Page is no longer in use. Free it unless
137 * it's the last one. We keep the last one 137 * it's the last one. We keep the last one
138 * so as not to have to set it up again the 138 * so as not to have to set it up again the
139 * next time somebody inserts a probe. 139 * next time somebody inserts a probe.
140 */ 140 */
141 hlist_del(&kip->hlist); 141 hlist_del(&kip->hlist);
142 if (hlist_empty(&kprobe_insn_pages)) { 142 if (hlist_empty(&kprobe_insn_pages)) {
143 INIT_HLIST_NODE(&kip->hlist); 143 INIT_HLIST_NODE(&kip->hlist);
144 hlist_add_head(&kip->hlist, 144 hlist_add_head(&kip->hlist,
145 &kprobe_insn_pages); 145 &kprobe_insn_pages);
146 } else { 146 } else {
147 module_free(NULL, kip->insns); 147 module_free(NULL, kip->insns);
148 kfree(kip); 148 kfree(kip);
149 } 149 }
150 } 150 }
151 return; 151 return;
152 } 152 }
153 } 153 }
154 } 154 }
155 #endif 155 #endif
156 156
157 /* We have preemption disabled.. so it is safe to use __ versions */ 157 /* We have preemption disabled.. so it is safe to use __ versions */
158 static inline void set_kprobe_instance(struct kprobe *kp) 158 static inline void set_kprobe_instance(struct kprobe *kp)
159 { 159 {
160 __get_cpu_var(kprobe_instance) = kp; 160 __get_cpu_var(kprobe_instance) = kp;
161 } 161 }
162 162
163 static inline void reset_kprobe_instance(void) 163 static inline void reset_kprobe_instance(void)
164 { 164 {
165 __get_cpu_var(kprobe_instance) = NULL; 165 __get_cpu_var(kprobe_instance) = NULL;
166 } 166 }
167 167
168 /* 168 /*
169 * This routine is called either: 169 * This routine is called either:
170 * - under the kprobe_mutex - during kprobe_[un]register() 170 * - under the kprobe_mutex - during kprobe_[un]register()
171 * OR 171 * OR
172 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 172 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
173 */ 173 */
174 struct kprobe __kprobes *get_kprobe(void *addr) 174 struct kprobe __kprobes *get_kprobe(void *addr)
175 { 175 {
176 struct hlist_head *head; 176 struct hlist_head *head;
177 struct hlist_node *node; 177 struct hlist_node *node;
178 struct kprobe *p; 178 struct kprobe *p;
179 179
180 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 180 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
181 hlist_for_each_entry_rcu(p, node, head, hlist) { 181 hlist_for_each_entry_rcu(p, node, head, hlist) {
182 if (p->addr == addr) 182 if (p->addr == addr)
183 return p; 183 return p;
184 } 184 }
185 return NULL; 185 return NULL;
186 } 186 }
187 187
188 /* 188 /*
189 * Aggregate handlers for multiple kprobes support - these handlers 189 * Aggregate handlers for multiple kprobes support - these handlers
190 * take care of invoking the individual kprobe handlers on p->list 190 * take care of invoking the individual kprobe handlers on p->list
191 */ 191 */
192 static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 192 static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
193 { 193 {
194 struct kprobe *kp; 194 struct kprobe *kp;
195 195
196 list_for_each_entry_rcu(kp, &p->list, list) { 196 list_for_each_entry_rcu(kp, &p->list, list) {
197 if (kp->pre_handler) { 197 if (kp->pre_handler) {
198 set_kprobe_instance(kp); 198 set_kprobe_instance(kp);
199 if (kp->pre_handler(kp, regs)) 199 if (kp->pre_handler(kp, regs))
200 return 1; 200 return 1;
201 } 201 }
202 reset_kprobe_instance(); 202 reset_kprobe_instance();
203 } 203 }
204 return 0; 204 return 0;
205 } 205 }
206 206
207 static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 207 static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
208 unsigned long flags) 208 unsigned long flags)
209 { 209 {
210 struct kprobe *kp; 210 struct kprobe *kp;
211 211
212 list_for_each_entry_rcu(kp, &p->list, list) { 212 list_for_each_entry_rcu(kp, &p->list, list) {
213 if (kp->post_handler) { 213 if (kp->post_handler) {
214 set_kprobe_instance(kp); 214 set_kprobe_instance(kp);
215 kp->post_handler(kp, regs, flags); 215 kp->post_handler(kp, regs, flags);
216 reset_kprobe_instance(); 216 reset_kprobe_instance();
217 } 217 }
218 } 218 }
219 return; 219 return;
220 } 220 }
221 221
222 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 222 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
223 int trapnr) 223 int trapnr)
224 { 224 {
225 struct kprobe *cur = __get_cpu_var(kprobe_instance); 225 struct kprobe *cur = __get_cpu_var(kprobe_instance);
226 226
227 /* 227 /*
228 * if we faulted "during" the execution of a user specified 228 * if we faulted "during" the execution of a user specified
229 * probe handler, invoke just that probe's fault handler 229 * probe handler, invoke just that probe's fault handler
230 */ 230 */
231 if (cur && cur->fault_handler) { 231 if (cur && cur->fault_handler) {
232 if (cur->fault_handler(cur, regs, trapnr)) 232 if (cur->fault_handler(cur, regs, trapnr))
233 return 1; 233 return 1;
234 } 234 }
235 return 0; 235 return 0;
236 } 236 }
237 237
238 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 238 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
239 { 239 {
240 struct kprobe *cur = __get_cpu_var(kprobe_instance); 240 struct kprobe *cur = __get_cpu_var(kprobe_instance);
241 int ret = 0; 241 int ret = 0;
242 242
243 if (cur && cur->break_handler) { 243 if (cur && cur->break_handler) {
244 if (cur->break_handler(cur, regs)) 244 if (cur->break_handler(cur, regs))
245 ret = 1; 245 ret = 1;
246 } 246 }
247 reset_kprobe_instance(); 247 reset_kprobe_instance();
248 return ret; 248 return ret;
249 } 249 }
250 250
251 /* Walks the list and increments nmissed count for multiprobe case */ 251 /* Walks the list and increments nmissed count for multiprobe case */
252 void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 252 void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
253 { 253 {
254 struct kprobe *kp; 254 struct kprobe *kp;
255 if (p->pre_handler != aggr_pre_handler) { 255 if (p->pre_handler != aggr_pre_handler) {
256 p->nmissed++; 256 p->nmissed++;
257 } else { 257 } else {
258 list_for_each_entry_rcu(kp, &p->list, list) 258 list_for_each_entry_rcu(kp, &p->list, list)
259 kp->nmissed++; 259 kp->nmissed++;
260 } 260 }
261 return; 261 return;
262 } 262 }
263 263
264 /* Called with kretprobe_lock held */ 264 /* Called with kretprobe_lock held */
265 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) 265 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
266 { 266 {
267 struct hlist_node *node; 267 struct hlist_node *node;
268 struct kretprobe_instance *ri; 268 struct kretprobe_instance *ri;
269 hlist_for_each_entry(ri, node, &rp->free_instances, uflist) 269 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
270 return ri; 270 return ri;
271 return NULL; 271 return NULL;
272 } 272 }
273 273
274 /* Called with kretprobe_lock held */ 274 /* Called with kretprobe_lock held */
275 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe 275 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp) 276 *rp)
277 { 277 {
278 struct hlist_node *node; 278 struct hlist_node *node;
279 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
280 hlist_for_each_entry(ri, node, &rp->used_instances, uflist) 280 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
281 return ri; 281 return ri;
282 return NULL; 282 return NULL;
283 } 283 }
284 284
285 /* Called with kretprobe_lock held */ 285 /* Called with kretprobe_lock held */
286 void __kprobes add_rp_inst(struct kretprobe_instance *ri) 286 void __kprobes add_rp_inst(struct kretprobe_instance *ri)
287 { 287 {
288 /* 288 /*
289 * Remove rp inst off the free list - 289 * Remove rp inst off the free list -
290 * Add it back when probed function returns 290 * Add it back when probed function returns
291 */ 291 */
292 hlist_del(&ri->uflist); 292 hlist_del(&ri->uflist);
293 293
294 /* Add rp inst onto table */ 294 /* Add rp inst onto table */
295 INIT_HLIST_NODE(&ri->hlist); 295 INIT_HLIST_NODE(&ri->hlist);
296 hlist_add_head(&ri->hlist, 296 hlist_add_head(&ri->hlist,
297 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); 297 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
298 298
299 /* Also add this rp inst to the used list. */ 299 /* Also add this rp inst to the used list. */
300 INIT_HLIST_NODE(&ri->uflist); 300 INIT_HLIST_NODE(&ri->uflist);
301 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 301 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
302 } 302 }
303 303
304 /* Called with kretprobe_lock held */ 304 /* Called with kretprobe_lock held */
305 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) 305 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
306 { 306 {
307 /* remove rp inst off the rprobe_inst_table */ 307 /* remove rp inst off the rprobe_inst_table */
308 hlist_del(&ri->hlist); 308 hlist_del(&ri->hlist);
309 if (ri->rp) { 309 if (ri->rp) {
310 /* remove rp inst off the used list */ 310 /* remove rp inst off the used list */
311 hlist_del(&ri->uflist); 311 hlist_del(&ri->uflist);
312 /* put rp inst back onto the free list */ 312 /* put rp inst back onto the free list */
313 INIT_HLIST_NODE(&ri->uflist); 313 INIT_HLIST_NODE(&ri->uflist);
314 hlist_add_head(&ri->uflist, &ri->rp->free_instances); 314 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
315 } else 315 } else
316 /* Unregistering */ 316 /* Unregistering */
317 kfree(ri); 317 kfree(ri);
318 } 318 }
319 319
320 struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 320 struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
321 { 321 {
322 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 322 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
323 } 323 }
324 324
325 /* 325 /*
326 * This function is called from exit_thread or flush_thread when task tk's 326 * This function is called from finish_task_switch when task tk becomes dead,
327 * stack is being recycled so that we can recycle any function-return probe 327 * so that we can recycle any function-return probe instances associated
328 * instances associated with this task. These left over instances represent 328 * with this task. These left over instances represent probed functions
329 * probed functions that have been called but will never return. 329 * that have been called but will never return.
330 */ 330 */
331 void __kprobes kprobe_flush_task(struct task_struct *tk) 331 void __kprobes kprobe_flush_task(struct task_struct *tk)
332 { 332 {
333 struct kretprobe_instance *ri; 333 struct kretprobe_instance *ri;
334 struct hlist_head *head; 334 struct hlist_head *head;
335 struct hlist_node *node, *tmp; 335 struct hlist_node *node, *tmp;
336 unsigned long flags = 0; 336 unsigned long flags = 0;
337 337
338 spin_lock_irqsave(&kretprobe_lock, flags); 338 spin_lock_irqsave(&kretprobe_lock, flags);
339 head = kretprobe_inst_table_head(current); 339 head = kretprobe_inst_table_head(tk);
340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 340 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
341 if (ri->task == tk) 341 if (ri->task == tk)
342 recycle_rp_inst(ri); 342 recycle_rp_inst(ri);
343 } 343 }
344 spin_unlock_irqrestore(&kretprobe_lock, flags); 344 spin_unlock_irqrestore(&kretprobe_lock, flags);
345 } 345 }
346 346
347 static inline void free_rp_inst(struct kretprobe *rp) 347 static inline void free_rp_inst(struct kretprobe *rp)
348 { 348 {
349 struct kretprobe_instance *ri; 349 struct kretprobe_instance *ri;
350 while ((ri = get_free_rp_inst(rp)) != NULL) { 350 while ((ri = get_free_rp_inst(rp)) != NULL) {
351 hlist_del(&ri->uflist); 351 hlist_del(&ri->uflist);
352 kfree(ri); 352 kfree(ri);
353 } 353 }
354 } 354 }
355 355
356 /* 356 /*
357 * Keep all fields in the kprobe consistent 357 * Keep all fields in the kprobe consistent
358 */ 358 */
359 static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) 359 static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
360 { 360 {
361 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); 361 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
362 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); 362 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
363 } 363 }
364 364
365 /* 365 /*
366 * Add the new probe to old_p->list. Fail if this is the 366 * Add the new probe to old_p->list. Fail if this is the
367 * second jprobe at the address - two jprobes can't coexist 367 * second jprobe at the address - two jprobes can't coexist
368 */ 368 */
369 static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 369 static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370 { 370 {
371 struct kprobe *kp; 371 struct kprobe *kp;
372 372
373 if (p->break_handler) { 373 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 374 list_for_each_entry_rcu(kp, &old_p->list, list) {
375 if (kp->break_handler) 375 if (kp->break_handler)
376 return -EEXIST; 376 return -EEXIST;
377 } 377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 378 list_add_tail_rcu(&p->list, &old_p->list);
379 } else 379 } else
380 list_add_rcu(&p->list, &old_p->list); 380 list_add_rcu(&p->list, &old_p->list);
381 return 0; 381 return 0;
382 } 382 }
383 383
384 /* 384 /*
385 * Fill in the required fields of the "manager kprobe". Replace the 385 * Fill in the required fields of the "manager kprobe". Replace the
386 * earlier kprobe in the hlist with the manager kprobe 386 * earlier kprobe in the hlist with the manager kprobe
387 */ 387 */
388 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 388 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
389 { 389 {
390 copy_kprobe(p, ap); 390 copy_kprobe(p, ap);
391 ap->addr = p->addr; 391 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 392 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler; 393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 394 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 395 ap->break_handler = aggr_break_handler;
396 396
397 INIT_LIST_HEAD(&ap->list); 397 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 398 list_add_rcu(&p->list, &ap->list);
399 399
400 hlist_replace_rcu(&p->hlist, &ap->hlist); 400 hlist_replace_rcu(&p->hlist, &ap->hlist);
401 } 401 }
402 402
403 /* 403 /*
404 * This is the second or subsequent kprobe at the address - handle 404 * This is the second or subsequent kprobe at the address - handle
405 * the intricacies 405 * the intricacies
406 */ 406 */
407 static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 407 static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
408 struct kprobe *p) 408 struct kprobe *p)
409 { 409 {
410 int ret = 0; 410 int ret = 0;
411 struct kprobe *ap; 411 struct kprobe *ap;
412 412
413 if (old_p->pre_handler == aggr_pre_handler) { 413 if (old_p->pre_handler == aggr_pre_handler) {
414 copy_kprobe(old_p, p); 414 copy_kprobe(old_p, p);
415 ret = add_new_kprobe(old_p, p); 415 ret = add_new_kprobe(old_p, p);
416 } else { 416 } else {
417 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 417 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
418 if (!ap) 418 if (!ap)
419 return -ENOMEM; 419 return -ENOMEM;
420 add_aggr_kprobe(ap, old_p); 420 add_aggr_kprobe(ap, old_p);
421 copy_kprobe(ap, p); 421 copy_kprobe(ap, p);
422 ret = add_new_kprobe(ap, p); 422 ret = add_new_kprobe(ap, p);
423 } 423 }
424 return ret; 424 return ret;
425 } 425 }
426 426
427 static int __kprobes in_kprobes_functions(unsigned long addr) 427 static int __kprobes in_kprobes_functions(unsigned long addr)
428 { 428 {
429 if (addr >= (unsigned long)__kprobes_text_start 429 if (addr >= (unsigned long)__kprobes_text_start
430 && addr < (unsigned long)__kprobes_text_end) 430 && addr < (unsigned long)__kprobes_text_end)
431 return -EINVAL; 431 return -EINVAL;
432 return 0; 432 return 0;
433 } 433 }
434 434
435 static int __kprobes __register_kprobe(struct kprobe *p, 435 static int __kprobes __register_kprobe(struct kprobe *p,
436 unsigned long called_from) 436 unsigned long called_from)
437 { 437 {
438 int ret = 0; 438 int ret = 0;
439 struct kprobe *old_p; 439 struct kprobe *old_p;
440 struct module *probed_mod; 440 struct module *probed_mod;
441 441
442 if ((!kernel_text_address((unsigned long) p->addr)) || 442 if ((!kernel_text_address((unsigned long) p->addr)) ||
443 in_kprobes_functions((unsigned long) p->addr)) 443 in_kprobes_functions((unsigned long) p->addr))
444 return -EINVAL; 444 return -EINVAL;
445 445
446 p->mod_refcounted = 0; 446 p->mod_refcounted = 0;
447 /* Check if we are probing a module */ 447 /* Check if we are probing a module */
448 if ((probed_mod = module_text_address((unsigned long) p->addr))) { 448 if ((probed_mod = module_text_address((unsigned long) p->addr))) {
449 struct module *calling_mod = module_text_address(called_from); 449 struct module *calling_mod = module_text_address(called_from);
450 /* We must allow modules to probe themselves and 450 /* We must allow modules to probe themselves and
451 * in this case avoid incrementing the module refcount, 451 * in this case avoid incrementing the module refcount,
452 * so as to allow unloading of self probing modules. 452 * so as to allow unloading of self probing modules.
453 */ 453 */
454 if (calling_mod && (calling_mod != probed_mod)) { 454 if (calling_mod && (calling_mod != probed_mod)) {
455 if (unlikely(!try_module_get(probed_mod))) 455 if (unlikely(!try_module_get(probed_mod)))
456 return -EINVAL; 456 return -EINVAL;
457 p->mod_refcounted = 1; 457 p->mod_refcounted = 1;
458 } else 458 } else
459 probed_mod = NULL; 459 probed_mod = NULL;
460 } 460 }
461 461
462 p->nmissed = 0; 462 p->nmissed = 0;
463 mutex_lock(&kprobe_mutex); 463 mutex_lock(&kprobe_mutex);
464 old_p = get_kprobe(p->addr); 464 old_p = get_kprobe(p->addr);
465 if (old_p) { 465 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 466 ret = register_aggr_kprobe(old_p, p);
467 goto out; 467 goto out;
468 } 468 }
469 469
470 if ((ret = arch_prepare_kprobe(p)) != 0) 470 if ((ret = arch_prepare_kprobe(p)) != 0)
471 goto out; 471 goto out;
472 472
473 INIT_HLIST_NODE(&p->hlist); 473 INIT_HLIST_NODE(&p->hlist);
474 hlist_add_head_rcu(&p->hlist, 474 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 476
477 arch_arm_kprobe(p); 477 arch_arm_kprobe(p);
478 478
479 out: 479 out:
480 mutex_unlock(&kprobe_mutex); 480 mutex_unlock(&kprobe_mutex);
481 481
482 if (ret && probed_mod) 482 if (ret && probed_mod)
483 module_put(probed_mod); 483 module_put(probed_mod);
484 return ret; 484 return ret;
485 } 485 }
486 486
487 int __kprobes register_kprobe(struct kprobe *p) 487 int __kprobes register_kprobe(struct kprobe *p)
488 { 488 {
489 return __register_kprobe(p, 489 return __register_kprobe(p,
490 (unsigned long)__builtin_return_address(0)); 490 (unsigned long)__builtin_return_address(0));
491 } 491 }
492 492
493 void __kprobes unregister_kprobe(struct kprobe *p) 493 void __kprobes unregister_kprobe(struct kprobe *p)
494 { 494 {
495 struct module *mod; 495 struct module *mod;
496 struct kprobe *old_p, *list_p; 496 struct kprobe *old_p, *list_p;
497 int cleanup_p; 497 int cleanup_p;
498 498
499 mutex_lock(&kprobe_mutex); 499 mutex_lock(&kprobe_mutex);
500 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
501 if (unlikely(!old_p)) { 501 if (unlikely(!old_p)) {
502 mutex_unlock(&kprobe_mutex); 502 mutex_unlock(&kprobe_mutex);
503 return; 503 return;
504 } 504 }
505 if (p != old_p) { 505 if (p != old_p) {
506 list_for_each_entry_rcu(list_p, &old_p->list, list) 506 list_for_each_entry_rcu(list_p, &old_p->list, list)
507 if (list_p == p) 507 if (list_p == p)
508 /* kprobe p is a valid probe */ 508 /* kprobe p is a valid probe */
509 goto valid_p; 509 goto valid_p;
510 mutex_unlock(&kprobe_mutex); 510 mutex_unlock(&kprobe_mutex);
511 return; 511 return;
512 } 512 }
513 valid_p: 513 valid_p:
514 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && 514 if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
515 (p->list.next == &old_p->list) && 515 (p->list.next == &old_p->list) &&
516 (p->list.prev == &old_p->list))) { 516 (p->list.prev == &old_p->list))) {
517 /* Only probe on the hash list */ 517 /* Only probe on the hash list */
518 arch_disarm_kprobe(p); 518 arch_disarm_kprobe(p);
519 hlist_del_rcu(&old_p->hlist); 519 hlist_del_rcu(&old_p->hlist);
520 cleanup_p = 1; 520 cleanup_p = 1;
521 } else { 521 } else {
522 list_del_rcu(&p->list); 522 list_del_rcu(&p->list);
523 cleanup_p = 0; 523 cleanup_p = 0;
524 } 524 }
525 525
526 mutex_unlock(&kprobe_mutex); 526 mutex_unlock(&kprobe_mutex);
527 527
528 synchronize_sched(); 528 synchronize_sched();
529 if (p->mod_refcounted && 529 if (p->mod_refcounted &&
530 (mod = module_text_address((unsigned long)p->addr))) 530 (mod = module_text_address((unsigned long)p->addr)))
531 module_put(mod); 531 module_put(mod);
532 532
533 if (cleanup_p) { 533 if (cleanup_p) {
534 if (p != old_p) { 534 if (p != old_p) {
535 list_del_rcu(&p->list); 535 list_del_rcu(&p->list);
536 kfree(old_p); 536 kfree(old_p);
537 } 537 }
538 arch_remove_kprobe(p); 538 arch_remove_kprobe(p);
539 } 539 }
540 } 540 }
541 541
542 static struct notifier_block kprobe_exceptions_nb = { 542 static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 543 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to be notified first */ 544 .priority = 0x7fffffff /* we need to be notified first */
545 }; 545 };
546 546
547 int __kprobes register_jprobe(struct jprobe *jp) 547 int __kprobes register_jprobe(struct jprobe *jp)
548 { 548 {
549 /* Todo: Verify probepoint is a function entry point */ 549 /* Todo: Verify probepoint is a function entry point */
550 jp->kp.pre_handler = setjmp_pre_handler; 550 jp->kp.pre_handler = setjmp_pre_handler;
551 jp->kp.break_handler = longjmp_break_handler; 551 jp->kp.break_handler = longjmp_break_handler;
552 552
553 return __register_kprobe(&jp->kp, 553 return __register_kprobe(&jp->kp,
554 (unsigned long)__builtin_return_address(0)); 554 (unsigned long)__builtin_return_address(0));
555 } 555 }
556 556
557 void __kprobes unregister_jprobe(struct jprobe *jp) 557 void __kprobes unregister_jprobe(struct jprobe *jp)
558 { 558 {
559 unregister_kprobe(&jp->kp); 559 unregister_kprobe(&jp->kp);
560 } 560 }
561 561
562 #ifdef ARCH_SUPPORTS_KRETPROBES 562 #ifdef ARCH_SUPPORTS_KRETPROBES
563 563
564 /* 564 /*
565 * This kprobe pre_handler is registered with every kretprobe. When probe 565 * This kprobe pre_handler is registered with every kretprobe. When probe
566 * hits it will set up the return probe. 566 * hits it will set up the return probe.
567 */ 567 */
568 static int __kprobes pre_handler_kretprobe(struct kprobe *p, 568 static int __kprobes pre_handler_kretprobe(struct kprobe *p,
569 struct pt_regs *regs) 569 struct pt_regs *regs)
570 { 570 {
571 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 571 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
572 unsigned long flags = 0; 572 unsigned long flags = 0;
573 573
574 /*TODO: consider to only swap the RA after the last pre_handler fired */ 574 /*TODO: consider to only swap the RA after the last pre_handler fired */
575 spin_lock_irqsave(&kretprobe_lock, flags); 575 spin_lock_irqsave(&kretprobe_lock, flags);
576 arch_prepare_kretprobe(rp, regs); 576 arch_prepare_kretprobe(rp, regs);
577 spin_unlock_irqrestore(&kretprobe_lock, flags); 577 spin_unlock_irqrestore(&kretprobe_lock, flags);
578 return 0; 578 return 0;
579 } 579 }
580 580
581 int __kprobes register_kretprobe(struct kretprobe *rp) 581 int __kprobes register_kretprobe(struct kretprobe *rp)
582 { 582 {
583 int ret = 0; 583 int ret = 0;
584 struct kretprobe_instance *inst; 584 struct kretprobe_instance *inst;
585 int i; 585 int i;
586 586
587 rp->kp.pre_handler = pre_handler_kretprobe; 587 rp->kp.pre_handler = pre_handler_kretprobe;
588 588
589 /* Pre-allocate memory for max kretprobe instances */ 589 /* Pre-allocate memory for max kretprobe instances */
590 if (rp->maxactive <= 0) { 590 if (rp->maxactive <= 0) {
591 #ifdef CONFIG_PREEMPT 591 #ifdef CONFIG_PREEMPT
592 rp->maxactive = max(10, 2 * NR_CPUS); 592 rp->maxactive = max(10, 2 * NR_CPUS);
593 #else 593 #else
594 rp->maxactive = NR_CPUS; 594 rp->maxactive = NR_CPUS;
595 #endif 595 #endif
596 } 596 }
597 INIT_HLIST_HEAD(&rp->used_instances); 597 INIT_HLIST_HEAD(&rp->used_instances);
598 INIT_HLIST_HEAD(&rp->free_instances); 598 INIT_HLIST_HEAD(&rp->free_instances);
599 for (i = 0; i < rp->maxactive; i++) { 599 for (i = 0; i < rp->maxactive; i++) {
600 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); 600 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
601 if (inst == NULL) { 601 if (inst == NULL) {
602 free_rp_inst(rp); 602 free_rp_inst(rp);
603 return -ENOMEM; 603 return -ENOMEM;
604 } 604 }
605 INIT_HLIST_NODE(&inst->uflist); 605 INIT_HLIST_NODE(&inst->uflist);
606 hlist_add_head(&inst->uflist, &rp->free_instances); 606 hlist_add_head(&inst->uflist, &rp->free_instances);
607 } 607 }
608 608
609 rp->nmissed = 0; 609 rp->nmissed = 0;
610 /* Establish function entry probe point */ 610 /* Establish function entry probe point */
611 if ((ret = __register_kprobe(&rp->kp, 611 if ((ret = __register_kprobe(&rp->kp,
612 (unsigned long)__builtin_return_address(0))) != 0) 612 (unsigned long)__builtin_return_address(0))) != 0)
613 free_rp_inst(rp); 613 free_rp_inst(rp);
614 return ret; 614 return ret;
615 } 615 }
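For orientation, a minimal, hypothetical consumer of the API above (not part of this patch): the probed address, the module names and the use of %eax as the i386 return-value register are assumptions of this sketch; the instance pre-allocation and recycling shown in register_kretprobe()/unregister_kretprobe() happen behind these two calls.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static unsigned long probed_addr;	/* hypothetical: filled in from System.map or kallsyms */

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	/* on i386 the probed function's return value is in %eax */
	printk(KERN_INFO "probed function returned %ld\n", regs->eax);
	return 0;
}

static struct kretprobe my_rp = {
	.handler   = my_ret_handler,
	.maxactive = 20,	/* pre-allocates 20 kretprobe_instance objects */
};

static int __init kret_example_init(void)
{
	my_rp.kp.addr = (kprobe_opcode_t *)probed_addr;
	return register_kretprobe(&my_rp);
}

static void __exit kret_example_exit(void)
{
	unregister_kretprobe(&my_rp);	/* drains used_instances, frees free_instances */
}

module_init(kret_example_init);
module_exit(kret_example_exit);
MODULE_LICENSE("GPL");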
616 616
617 #else /* ARCH_SUPPORTS_KRETPROBES */ 617 #else /* ARCH_SUPPORTS_KRETPROBES */
618 618
619 int __kprobes register_kretprobe(struct kretprobe *rp) 619 int __kprobes register_kretprobe(struct kretprobe *rp)
620 { 620 {
621 return -ENOSYS; 621 return -ENOSYS;
622 } 622 }
623 623
624 #endif /* ARCH_SUPPORTS_KRETPROBES */ 624 #endif /* ARCH_SUPPORTS_KRETPROBES */
625 625
626 void __kprobes unregister_kretprobe(struct kretprobe *rp) 626 void __kprobes unregister_kretprobe(struct kretprobe *rp)
627 { 627 {
628 unsigned long flags; 628 unsigned long flags;
629 struct kretprobe_instance *ri; 629 struct kretprobe_instance *ri;
630 630
631 unregister_kprobe(&rp->kp); 631 unregister_kprobe(&rp->kp);
632 /* No race here */ 632 /* No race here */
633 spin_lock_irqsave(&kretprobe_lock, flags); 633 spin_lock_irqsave(&kretprobe_lock, flags);
634 while ((ri = get_used_rp_inst(rp)) != NULL) { 634 while ((ri = get_used_rp_inst(rp)) != NULL) {
635 ri->rp = NULL; 635 ri->rp = NULL;
636 hlist_del(&ri->uflist); 636 hlist_del(&ri->uflist);
637 } 637 }
638 spin_unlock_irqrestore(&kretprobe_lock, flags); 638 spin_unlock_irqrestore(&kretprobe_lock, flags);
639 free_rp_inst(rp); 639 free_rp_inst(rp);
640 } 640 }
641 641
642 static int __init init_kprobes(void) 642 static int __init init_kprobes(void)
643 { 643 {
644 int i, err = 0; 644 int i, err = 0;
645 645
646 /* FIXME allocate the probe table, currently defined statically */ 646 /* FIXME allocate the probe table, currently defined statically */
647 /* initialize all list heads */ 647 /* initialize all list heads */
648 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 648 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
649 INIT_HLIST_HEAD(&kprobe_table[i]); 649 INIT_HLIST_HEAD(&kprobe_table[i]);
650 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 650 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
651 } 651 }
652 652
653 err = arch_init_kprobes(); 653 err = arch_init_kprobes();
654 if (!err) 654 if (!err)
655 err = register_die_notifier(&kprobe_exceptions_nb); 655 err = register_die_notifier(&kprobe_exceptions_nb);
656 656
657 return err; 657 return err;
658 } 658 }
659 659
660 __initcall(init_kprobes); 660 __initcall(init_kprobes);
661 661
662 EXPORT_SYMBOL_GPL(register_kprobe); 662 EXPORT_SYMBOL_GPL(register_kprobe);
663 EXPORT_SYMBOL_GPL(unregister_kprobe); 663 EXPORT_SYMBOL_GPL(unregister_kprobe);
664 EXPORT_SYMBOL_GPL(register_jprobe); 664 EXPORT_SYMBOL_GPL(register_jprobe);
665 EXPORT_SYMBOL_GPL(unregister_jprobe); 665 EXPORT_SYMBOL_GPL(unregister_jprobe);
666 EXPORT_SYMBOL_GPL(jprobe_return); 666 EXPORT_SYMBOL_GPL(jprobe_return);
667 EXPORT_SYMBOL_GPL(register_kretprobe); 667 EXPORT_SYMBOL_GPL(register_kretprobe);
668 EXPORT_SYMBOL_GPL(unregister_kretprobe); 668 EXPORT_SYMBOL_GPL(unregister_kretprobe);
669 669
670 670
kernel/sched.c
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/nmi.h> 23 #include <linux/nmi.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/highmem.h> 26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h> 27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h> 28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/capability.h> 30 #include <linux/capability.h>
31 #include <linux/completion.h> 31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h> 32 #include <linux/kernel_stat.h>
33 #include <linux/security.h> 33 #include <linux/security.h>
34 #include <linux/notifier.h> 34 #include <linux/notifier.h>
35 #include <linux/profile.h> 35 #include <linux/profile.h>
36 #include <linux/suspend.h> 36 #include <linux/suspend.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h> 38 #include <linux/blkdev.h>
39 #include <linux/delay.h> 39 #include <linux/delay.h>
40 #include <linux/smp.h> 40 #include <linux/smp.h>
41 #include <linux/threads.h> 41 #include <linux/threads.h>
42 #include <linux/timer.h> 42 #include <linux/timer.h>
43 #include <linux/rcupdate.h> 43 #include <linux/rcupdate.h>
44 #include <linux/cpu.h> 44 #include <linux/cpu.h>
45 #include <linux/cpuset.h> 45 #include <linux/cpuset.h>
46 #include <linux/percpu.h> 46 #include <linux/percpu.h>
47 #include <linux/kthread.h> 47 #include <linux/kthread.h>
48 #include <linux/seq_file.h> 48 #include <linux/seq_file.h>
49 #include <linux/syscalls.h> 49 #include <linux/syscalls.h>
50 #include <linux/times.h> 50 #include <linux/times.h>
51 #include <linux/acct.h> 51 #include <linux/acct.h>
52 #include <linux/kprobes.h>
52 #include <asm/tlb.h> 53 #include <asm/tlb.h>
53 54
54 #include <asm/unistd.h> 55 #include <asm/unistd.h>
55 56
56 /* 57 /*
57 * Convert user-nice values [ -20 ... 0 ... 19 ] 58 * Convert user-nice values [ -20 ... 0 ... 19 ]
58 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 59 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
59 * and back. 60 * and back.
60 */ 61 */
61 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 62 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
62 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 63 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
63 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 64 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
64 65
65 /* 66 /*
66 * 'User priority' is the nice value converted to something we 67 * 'User priority' is the nice value converted to something we
67 * can work with better when scaling various scheduler parameters, 68 * can work with better when scaling various scheduler parameters,
68 * it's a [ 0 ... 39 ] range. 69 * it's a [ 0 ... 39 ] range.
69 */ 70 */
70 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 71 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
71 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 72 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
72 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 73 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
73 74
74 /* 75 /*
75 * Some helpers for converting nanosecond timing to jiffy resolution 76 * Some helpers for converting nanosecond timing to jiffy resolution
76 */ 77 */
77 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 78 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
78 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 79 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
79 80
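A few spot checks of the conversions above, assuming the usual values MAX_RT_PRIO = 100, MAX_PRIO = 140 and HZ = 1000:

NICE_TO_PRIO(-20) = 100 - 20 + 20  = 100   /* best user priority  */
NICE_TO_PRIO(0)   = 100 +  0 + 20  = 120   /* default             */
NICE_TO_PRIO(19)  = 100 + 19 + 20  = 139   /* worst user priority */
USER_PRIO(120)    = 120 - 100      = 20
MAX_USER_PRIO     = USER_PRIO(140) = 40
NS_TO_JIFFIES(10000000) = 10000000 / (1000000000 / 1000) = 10 jiffies (10 ms)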
80 /* 81 /*
81 * These are the 'tuning knobs' of the scheduler: 82 * These are the 'tuning knobs' of the scheduler:
82 * 83 *
83 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 84 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
84 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 85 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
85 * Timeslices get refilled after they expire. 86 * Timeslices get refilled after they expire.
86 */ 87 */
87 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 88 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
88 #define DEF_TIMESLICE (100 * HZ / 1000) 89 #define DEF_TIMESLICE (100 * HZ / 1000)
89 #define ON_RUNQUEUE_WEIGHT 30 90 #define ON_RUNQUEUE_WEIGHT 30
90 #define CHILD_PENALTY 95 91 #define CHILD_PENALTY 95
91 #define PARENT_PENALTY 100 92 #define PARENT_PENALTY 100
92 #define EXIT_WEIGHT 3 93 #define EXIT_WEIGHT 3
93 #define PRIO_BONUS_RATIO 25 94 #define PRIO_BONUS_RATIO 25
94 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) 95 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
95 #define INTERACTIVE_DELTA 2 96 #define INTERACTIVE_DELTA 2
96 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) 97 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
97 #define STARVATION_LIMIT (MAX_SLEEP_AVG) 98 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
98 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) 99 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
99 100
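Plugging HZ = 1000 and MAX_USER_PRIO = 40 into the knobs above gives the concrete values the rest of this file is tuned around (a worked example, not new code):

MIN_TIMESLICE    = max(5 * 1000 / 1000, 1) = 5 jiffies   =   5 ms
DEF_TIMESLICE    = 100 * 1000 / 1000       = 100 jiffies = 100 ms
MAX_BONUS        = 40 * 25 / 100           = 10
MAX_SLEEP_AVG    = 100 * 10                = 1000 jiffies = 1 s
NS_MAX_SLEEP_AVG = JIFFIES_TO_NS(1000)     = 1000000000 ns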
100 /* 101 /*
101 * If a task is 'interactive' then we reinsert it in the active 102 * If a task is 'interactive' then we reinsert it in the active
102 * array after it has expired its current timeslice. (it will not 103 * array after it has expired its current timeslice. (it will not
103 * continue to run immediately, it will still roundrobin with 104 * continue to run immediately, it will still roundrobin with
104 * other interactive tasks.) 105 * other interactive tasks.)
105 * 106 *
106 * This part scales the interactivity limit depending on niceness. 107 * This part scales the interactivity limit depending on niceness.
107 * 108 *
108 * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 109 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
109 * Here are a few examples of different nice levels: 110 * Here are a few examples of different nice levels:
110 * 111 *
111 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] 112 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
112 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] 113 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
113 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] 114 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
114 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] 115 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
115 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] 116 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
116 * 117 *
117 * (the X axis represents the possible -5 ... 0 ... +5 dynamic 118 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
118 * priority range a task can explore, a value of '1' means the 119 * priority range a task can explore, a value of '1' means the
119 * task is rated interactive.) 120 * task is rated interactive.)
120 * 121 *
121 * Ie. nice +19 tasks can never get 'interactive' enough to be 122 * Ie. nice +19 tasks can never get 'interactive' enough to be
122 * reinserted into the active array. And only heavily CPU-hog nice -20 123 * reinserted into the active array. And only heavily CPU-hog nice -20
123 * tasks will be expired. Default nice 0 tasks are somewhere between, 124 * tasks will be expired. Default nice 0 tasks are somewhere between,
124 * it takes some effort for them to get interactive, but it's not 125 * it takes some effort for them to get interactive, but it's not
125 * too hard. 126 * too hard.
126 */ 127 */
127 128
128 #define CURRENT_BONUS(p) \ 129 #define CURRENT_BONUS(p) \
129 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 130 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
130 MAX_SLEEP_AVG) 131 MAX_SLEEP_AVG)
131 132
132 #define GRANULARITY (10 * HZ / 1000 ? : 1) 133 #define GRANULARITY (10 * HZ / 1000 ? : 1)
133 134
134 #ifdef CONFIG_SMP 135 #ifdef CONFIG_SMP
135 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 136 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
136 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 137 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
137 num_online_cpus()) 138 num_online_cpus())
138 #else 139 #else
139 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 140 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
140 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 141 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
141 #endif 142 #endif
142 143
143 #define SCALE(v1,v1_max,v2_max) \ 144 #define SCALE(v1,v1_max,v2_max) \
144 (v1) * (v2_max) / (v1_max) 145 (v1) * (v2_max) / (v1_max)
145 146
146 #define DELTA(p) \ 147 #define DELTA(p) \
147 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) 148 (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
148 149
149 #define TASK_INTERACTIVE(p) \ 150 #define TASK_INTERACTIVE(p) \
150 ((p)->prio <= (p)->static_prio - DELTA(p)) 151 ((p)->prio <= (p)->static_prio - DELTA(p))
151 152
152 #define INTERACTIVE_SLEEP(p) \ 153 #define INTERACTIVE_SLEEP(p) \
153 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ 154 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
154 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 155 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
155 156
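Continuing the HZ = 1000 example for a default nice-0 task (static_prio 120):

DELTA(p)             = SCALE(0, 40, 10) + 2 = 2
TASK_INTERACTIVE(p)  = (p->prio <= 120 - 2 = 118)
INTERACTIVE_SLEEP(p) = JIFFIES_TO_NS(1000 * (5 + 2 + 1) / 10 - 1) = JIFFIES_TO_NS(799), about 799 ms

Since effective_prio() further down computes p->prio as 120 - (CURRENT_BONUS(p) - 5), the interactivity test is equivalent to CURRENT_BONUS(p) >= 7, i.e. roughly 700 ms of accumulated sleep_avg.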
156 #define TASK_PREEMPTS_CURR(p, rq) \ 157 #define TASK_PREEMPTS_CURR(p, rq) \
157 ((p)->prio < (rq)->curr->prio) 158 ((p)->prio < (rq)->curr->prio)
158 159
159 /* 160 /*
160 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 161 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
161 * to time slice values: [800ms ... 100ms ... 5ms] 162 * to time slice values: [800ms ... 100ms ... 5ms]
162 * 163 *
163 * The higher a thread's priority, the bigger timeslices 164 * The higher a thread's priority, the bigger timeslices
164 * it gets during one round of execution. But even the lowest 165 * it gets during one round of execution. But even the lowest
165 * priority thread gets MIN_TIMESLICE worth of execution time. 166 * priority thread gets MIN_TIMESLICE worth of execution time.
166 */ 167 */
167 168
168 #define SCALE_PRIO(x, prio) \ 169 #define SCALE_PRIO(x, prio) \
169 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 170 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
170 171
171 static unsigned int task_timeslice(task_t *p) 172 static unsigned int task_timeslice(task_t *p)
172 { 173 {
173 if (p->static_prio < NICE_TO_PRIO(0)) 174 if (p->static_prio < NICE_TO_PRIO(0))
174 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 175 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
175 else 176 else
176 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 177 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
177 } 178 }
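With the same HZ = 1000 values, task_timeslice() reproduces the 800 ms / 100 ms / 5 ms range quoted above:

nice -20 (static_prio 100): SCALE_PRIO(400, 100) = max(400 * (140 - 100) / 20, 5) = 800 jiffies = 800 ms
nice   0 (static_prio 120): SCALE_PRIO(100, 120) = max(100 * (140 - 120) / 20, 5) = 100 jiffies = 100 ms
nice +19 (static_prio 139): SCALE_PRIO(100, 139) = max(100 * (140 - 139) / 20, 5) =   5 jiffies =   5 ms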
178 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 179 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
179 < (long long) (sd)->cache_hot_time) 180 < (long long) (sd)->cache_hot_time)
180 181
181 /* 182 /*
182 * These are the runqueue data structures: 183 * These are the runqueue data structures:
183 */ 184 */
184 185
185 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) 186 #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
186 187
187 typedef struct runqueue runqueue_t; 188 typedef struct runqueue runqueue_t;
188 189
189 struct prio_array { 190 struct prio_array {
190 unsigned int nr_active; 191 unsigned int nr_active;
191 unsigned long bitmap[BITMAP_SIZE]; 192 unsigned long bitmap[BITMAP_SIZE];
192 struct list_head queue[MAX_PRIO]; 193 struct list_head queue[MAX_PRIO];
193 }; 194 };
194 195
195 /* 196 /*
196 * This is the main, per-CPU runqueue data structure. 197 * This is the main, per-CPU runqueue data structure.
197 * 198 *
198 * Locking rule: those places that want to lock multiple runqueues 199 * Locking rule: those places that want to lock multiple runqueues
199 * (such as the load balancing or the thread migration code), lock 200 * (such as the load balancing or the thread migration code), lock
200 * acquire operations must be ordered by ascending &runqueue. 201 * acquire operations must be ordered by ascending &runqueue.
201 */ 202 */
202 struct runqueue { 203 struct runqueue {
203 spinlock_t lock; 204 spinlock_t lock;
204 205
205 /* 206 /*
206 * nr_running and cpu_load should be in the same cacheline because 207 * nr_running and cpu_load should be in the same cacheline because
207 * remote CPUs use both these fields when doing load calculation. 208 * remote CPUs use both these fields when doing load calculation.
208 */ 209 */
209 unsigned long nr_running; 210 unsigned long nr_running;
210 #ifdef CONFIG_SMP 211 #ifdef CONFIG_SMP
211 unsigned long cpu_load[3]; 212 unsigned long cpu_load[3];
212 #endif 213 #endif
213 unsigned long long nr_switches; 214 unsigned long long nr_switches;
214 215
215 /* 216 /*
216 * This is part of a global counter where only the total sum 217 * This is part of a global counter where only the total sum
217 * over all CPUs matters. A task can increase this counter on 218 * over all CPUs matters. A task can increase this counter on
218 * one CPU and if it got migrated afterwards it may decrease 219 * one CPU and if it got migrated afterwards it may decrease
219 * it on another CPU. Always updated under the runqueue lock: 220 * it on another CPU. Always updated under the runqueue lock:
220 */ 221 */
221 unsigned long nr_uninterruptible; 222 unsigned long nr_uninterruptible;
222 223
223 unsigned long expired_timestamp; 224 unsigned long expired_timestamp;
224 unsigned long long timestamp_last_tick; 225 unsigned long long timestamp_last_tick;
225 task_t *curr, *idle; 226 task_t *curr, *idle;
226 struct mm_struct *prev_mm; 227 struct mm_struct *prev_mm;
227 prio_array_t *active, *expired, arrays[2]; 228 prio_array_t *active, *expired, arrays[2];
228 int best_expired_prio; 229 int best_expired_prio;
229 atomic_t nr_iowait; 230 atomic_t nr_iowait;
230 231
231 #ifdef CONFIG_SMP 232 #ifdef CONFIG_SMP
232 struct sched_domain *sd; 233 struct sched_domain *sd;
233 234
234 /* For active balancing */ 235 /* For active balancing */
235 int active_balance; 236 int active_balance;
236 int push_cpu; 237 int push_cpu;
237 238
238 task_t *migration_thread; 239 task_t *migration_thread;
239 struct list_head migration_queue; 240 struct list_head migration_queue;
240 int cpu; 241 int cpu;
241 #endif 242 #endif
242 243
243 #ifdef CONFIG_SCHEDSTATS 244 #ifdef CONFIG_SCHEDSTATS
244 /* latency stats */ 245 /* latency stats */
245 struct sched_info rq_sched_info; 246 struct sched_info rq_sched_info;
246 247
247 /* sys_sched_yield() stats */ 248 /* sys_sched_yield() stats */
248 unsigned long yld_exp_empty; 249 unsigned long yld_exp_empty;
249 unsigned long yld_act_empty; 250 unsigned long yld_act_empty;
250 unsigned long yld_both_empty; 251 unsigned long yld_both_empty;
251 unsigned long yld_cnt; 252 unsigned long yld_cnt;
252 253
253 /* schedule() stats */ 254 /* schedule() stats */
254 unsigned long sched_switch; 255 unsigned long sched_switch;
255 unsigned long sched_cnt; 256 unsigned long sched_cnt;
256 unsigned long sched_goidle; 257 unsigned long sched_goidle;
257 258
258 /* try_to_wake_up() stats */ 259 /* try_to_wake_up() stats */
259 unsigned long ttwu_cnt; 260 unsigned long ttwu_cnt;
260 unsigned long ttwu_local; 261 unsigned long ttwu_local;
261 #endif 262 #endif
262 }; 263 };
263 264
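The 'ascending &runqueue' rule in the locking comment above is what the double-runqueue helpers elsewhere in this file follow; a minimal illustrative sketch of the idea (not code added by this patch):

static void lock_two_runqueues(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);		/* same queue: take it only once */
	} else if (rq1 < rq2) {			/* lower address is always locked first */
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}

Taking the locks in a single global order (here, by runqueue address) is what prevents an AB-BA deadlock between two CPUs balancing against each other.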
264 static DEFINE_PER_CPU(struct runqueue, runqueues); 265 static DEFINE_PER_CPU(struct runqueue, runqueues);
265 266
266 /* 267 /*
267 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 268 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
268 * See detach_destroy_domains: synchronize_sched for details. 269 * See detach_destroy_domains: synchronize_sched for details.
269 * 270 *
270 * The domain tree of any CPU may only be accessed from within 271 * The domain tree of any CPU may only be accessed from within
271 * preempt-disabled sections. 272 * preempt-disabled sections.
272 */ 273 */
273 #define for_each_domain(cpu, domain) \ 274 #define for_each_domain(cpu, domain) \
274 for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) 275 for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
275 276
276 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 277 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
277 #define this_rq() (&__get_cpu_var(runqueues)) 278 #define this_rq() (&__get_cpu_var(runqueues))
278 #define task_rq(p) cpu_rq(task_cpu(p)) 279 #define task_rq(p) cpu_rq(task_cpu(p))
279 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 280 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
280 281
281 #ifndef prepare_arch_switch 282 #ifndef prepare_arch_switch
282 # define prepare_arch_switch(next) do { } while (0) 283 # define prepare_arch_switch(next) do { } while (0)
283 #endif 284 #endif
284 #ifndef finish_arch_switch 285 #ifndef finish_arch_switch
285 # define finish_arch_switch(prev) do { } while (0) 286 # define finish_arch_switch(prev) do { } while (0)
286 #endif 287 #endif
287 288
288 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 289 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
289 static inline int task_running(runqueue_t *rq, task_t *p) 290 static inline int task_running(runqueue_t *rq, task_t *p)
290 { 291 {
291 return rq->curr == p; 292 return rq->curr == p;
292 } 293 }
293 294
294 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 295 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
295 { 296 {
296 } 297 }
297 298
298 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 299 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
299 { 300 {
300 #ifdef CONFIG_DEBUG_SPINLOCK 301 #ifdef CONFIG_DEBUG_SPINLOCK
301 /* this is a valid case when another task releases the spinlock */ 302 /* this is a valid case when another task releases the spinlock */
302 rq->lock.owner = current; 303 rq->lock.owner = current;
303 #endif 304 #endif
304 spin_unlock_irq(&rq->lock); 305 spin_unlock_irq(&rq->lock);
305 } 306 }
306 307
307 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 308 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
308 static inline int task_running(runqueue_t *rq, task_t *p) 309 static inline int task_running(runqueue_t *rq, task_t *p)
309 { 310 {
310 #ifdef CONFIG_SMP 311 #ifdef CONFIG_SMP
311 return p->oncpu; 312 return p->oncpu;
312 #else 313 #else
313 return rq->curr == p; 314 return rq->curr == p;
314 #endif 315 #endif
315 } 316 }
316 317
317 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 318 static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
318 { 319 {
319 #ifdef CONFIG_SMP 320 #ifdef CONFIG_SMP
320 /* 321 /*
321 * We can optimise this out completely for !SMP, because the 322 * We can optimise this out completely for !SMP, because the
322 * SMP rebalancing from interrupt is the only thing that cares 323 * SMP rebalancing from interrupt is the only thing that cares
323 * here. 324 * here.
324 */ 325 */
325 next->oncpu = 1; 326 next->oncpu = 1;
326 #endif 327 #endif
327 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 328 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
328 spin_unlock_irq(&rq->lock); 329 spin_unlock_irq(&rq->lock);
329 #else 330 #else
330 spin_unlock(&rq->lock); 331 spin_unlock(&rq->lock);
331 #endif 332 #endif
332 } 333 }
333 334
334 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 335 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
335 { 336 {
336 #ifdef CONFIG_SMP 337 #ifdef CONFIG_SMP
337 /* 338 /*
338 * After ->oncpu is cleared, the task can be moved to a different CPU. 339 * After ->oncpu is cleared, the task can be moved to a different CPU.
339 * We must ensure this doesn't happen until the switch is completely 340 * We must ensure this doesn't happen until the switch is completely
340 * finished. 341 * finished.
341 */ 342 */
342 smp_wmb(); 343 smp_wmb();
343 prev->oncpu = 0; 344 prev->oncpu = 0;
344 #endif 345 #endif
345 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 346 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
346 local_irq_enable(); 347 local_irq_enable();
347 #endif 348 #endif
348 } 349 }
349 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 350 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
350 351
351 /* 352 /*
352 * task_rq_lock - lock the runqueue a given task resides on and disable 353 * task_rq_lock - lock the runqueue a given task resides on and disable
353 * interrupts. Note the ordering: we can safely lookup the task_rq without 354 * interrupts. Note the ordering: we can safely lookup the task_rq without
354 * explicitly disabling preemption. 355 * explicitly disabling preemption.
355 */ 356 */
356 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 357 static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
357 __acquires(rq->lock) 358 __acquires(rq->lock)
358 { 359 {
359 struct runqueue *rq; 360 struct runqueue *rq;
360 361
361 repeat_lock_task: 362 repeat_lock_task:
362 local_irq_save(*flags); 363 local_irq_save(*flags);
363 rq = task_rq(p); 364 rq = task_rq(p);
364 spin_lock(&rq->lock); 365 spin_lock(&rq->lock);
365 if (unlikely(rq != task_rq(p))) { 366 if (unlikely(rq != task_rq(p))) {
366 spin_unlock_irqrestore(&rq->lock, *flags); 367 spin_unlock_irqrestore(&rq->lock, *flags);
367 goto repeat_lock_task; 368 goto repeat_lock_task;
368 } 369 }
369 return rq; 370 return rq;
370 } 371 }
371 372
372 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 373 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
373 __releases(rq->lock) 374 __releases(rq->lock)
374 { 375 {
375 spin_unlock_irqrestore(&rq->lock, *flags); 376 spin_unlock_irqrestore(&rq->lock, *flags);
376 } 377 }
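A typical caller of this pair looks like the following (illustrative pattern; the real callers appear later in this file):

	unsigned long flags;
	runqueue_t *rq;

	rq = task_rq_lock(p, &flags);
	/* while the lock is held, p cannot be moved to another runqueue */
	/* ... inspect or modify p's scheduling state ... */
	task_rq_unlock(rq, &flags);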
377 378
378 #ifdef CONFIG_SCHEDSTATS 379 #ifdef CONFIG_SCHEDSTATS
379 /* 380 /*
380 * bump this up when changing the output format or the meaning of an existing 381 * bump this up when changing the output format or the meaning of an existing
381 * format, so that tools can adapt (or abort) 382 * format, so that tools can adapt (or abort)
382 */ 383 */
383 #define SCHEDSTAT_VERSION 12 384 #define SCHEDSTAT_VERSION 12
384 385
385 static int show_schedstat(struct seq_file *seq, void *v) 386 static int show_schedstat(struct seq_file *seq, void *v)
386 { 387 {
387 int cpu; 388 int cpu;
388 389
389 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 390 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
390 seq_printf(seq, "timestamp %lu\n", jiffies); 391 seq_printf(seq, "timestamp %lu\n", jiffies);
391 for_each_online_cpu(cpu) { 392 for_each_online_cpu(cpu) {
392 runqueue_t *rq = cpu_rq(cpu); 393 runqueue_t *rq = cpu_rq(cpu);
393 #ifdef CONFIG_SMP 394 #ifdef CONFIG_SMP
394 struct sched_domain *sd; 395 struct sched_domain *sd;
395 int dcnt = 0; 396 int dcnt = 0;
396 #endif 397 #endif
397 398
398 /* runqueue-specific stats */ 399 /* runqueue-specific stats */
399 seq_printf(seq, 400 seq_printf(seq,
400 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", 401 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
401 cpu, rq->yld_both_empty, 402 cpu, rq->yld_both_empty,
402 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 403 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
403 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 404 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
404 rq->ttwu_cnt, rq->ttwu_local, 405 rq->ttwu_cnt, rq->ttwu_local,
405 rq->rq_sched_info.cpu_time, 406 rq->rq_sched_info.cpu_time,
406 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 407 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
407 408
408 seq_printf(seq, "\n"); 409 seq_printf(seq, "\n");
409 410
410 #ifdef CONFIG_SMP 411 #ifdef CONFIG_SMP
411 /* domain-specific stats */ 412 /* domain-specific stats */
412 preempt_disable(); 413 preempt_disable();
413 for_each_domain(cpu, sd) { 414 for_each_domain(cpu, sd) {
414 enum idle_type itype; 415 enum idle_type itype;
415 char mask_str[NR_CPUS]; 416 char mask_str[NR_CPUS];
416 417
417 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 418 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
418 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 419 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
419 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 420 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
420 itype++) { 421 itype++) {
421 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 422 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
422 sd->lb_cnt[itype], 423 sd->lb_cnt[itype],
423 sd->lb_balanced[itype], 424 sd->lb_balanced[itype],
424 sd->lb_failed[itype], 425 sd->lb_failed[itype],
425 sd->lb_imbalance[itype], 426 sd->lb_imbalance[itype],
426 sd->lb_gained[itype], 427 sd->lb_gained[itype],
427 sd->lb_hot_gained[itype], 428 sd->lb_hot_gained[itype],
428 sd->lb_nobusyq[itype], 429 sd->lb_nobusyq[itype],
429 sd->lb_nobusyg[itype]); 430 sd->lb_nobusyg[itype]);
430 } 431 }
431 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 432 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
432 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 433 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
433 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 434 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
434 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 435 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
435 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 436 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
436 } 437 }
437 preempt_enable(); 438 preempt_enable();
438 #endif 439 #endif
439 } 440 }
440 return 0; 441 return 0;
441 } 442 }
442 443
443 static int schedstat_open(struct inode *inode, struct file *file) 444 static int schedstat_open(struct inode *inode, struct file *file)
444 { 445 {
445 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 446 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
446 char *buf = kmalloc(size, GFP_KERNEL); 447 char *buf = kmalloc(size, GFP_KERNEL);
447 struct seq_file *m; 448 struct seq_file *m;
448 int res; 449 int res;
449 450
450 if (!buf) 451 if (!buf)
451 return -ENOMEM; 452 return -ENOMEM;
452 res = single_open(file, show_schedstat, NULL); 453 res = single_open(file, show_schedstat, NULL);
453 if (!res) { 454 if (!res) {
454 m = file->private_data; 455 m = file->private_data;
455 m->buf = buf; 456 m->buf = buf;
456 m->size = size; 457 m->size = size;
457 } else 458 } else
458 kfree(buf); 459 kfree(buf);
459 return res; 460 return res;
460 } 461 }
461 462
462 struct file_operations proc_schedstat_operations = { 463 struct file_operations proc_schedstat_operations = {
463 .open = schedstat_open, 464 .open = schedstat_open,
464 .read = seq_read, 465 .read = seq_read,
465 .llseek = seq_lseek, 466 .llseek = seq_lseek,
466 .release = single_release, 467 .release = single_release,
467 }; 468 };
468 469
469 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 470 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
470 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 471 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
471 #else /* !CONFIG_SCHEDSTATS */ 472 #else /* !CONFIG_SCHEDSTATS */
472 # define schedstat_inc(rq, field) do { } while (0) 473 # define schedstat_inc(rq, field) do { } while (0)
473 # define schedstat_add(rq, field, amt) do { } while (0) 474 # define schedstat_add(rq, field, amt) do { } while (0)
474 #endif 475 #endif
475 476
476 /* 477 /*
477 * this_rq_lock - lock this CPU's runqueue and disable interrupts. 478 * this_rq_lock - lock this CPU's runqueue and disable interrupts.
478 */ 479 */
479 static inline runqueue_t *this_rq_lock(void) 480 static inline runqueue_t *this_rq_lock(void)
480 __acquires(rq->lock) 481 __acquires(rq->lock)
481 { 482 {
482 runqueue_t *rq; 483 runqueue_t *rq;
483 484
484 local_irq_disable(); 485 local_irq_disable();
485 rq = this_rq(); 486 rq = this_rq();
486 spin_lock(&rq->lock); 487 spin_lock(&rq->lock);
487 488
488 return rq; 489 return rq;
489 } 490 }
490 491
491 #ifdef CONFIG_SCHEDSTATS 492 #ifdef CONFIG_SCHEDSTATS
492 /* 493 /*
493 * Called when a process is dequeued from the active array and given 494 * Called when a process is dequeued from the active array and given
494 * the cpu. We should note that with the exception of interactive 495 * the cpu. We should note that with the exception of interactive
495 * tasks, the expired queue will become the active queue after the active 496 * tasks, the expired queue will become the active queue after the active
496 * queue is empty, without explicitly dequeuing and requeuing tasks in the 497 * queue is empty, without explicitly dequeuing and requeuing tasks in the
497 * expired queue. (Interactive tasks may be requeued directly to the 498 * expired queue. (Interactive tasks may be requeued directly to the
498 * active queue, thus delaying tasks in the expired queue from running; 499 * active queue, thus delaying tasks in the expired queue from running;
499 * see scheduler_tick()). 500 * see scheduler_tick()).
500 * 501 *
501 * This function is only called from sched_info_arrive(), rather than 502 * This function is only called from sched_info_arrive(), rather than
502 * dequeue_task(). Even though a task may be queued and dequeued multiple 503 * dequeue_task(). Even though a task may be queued and dequeued multiple
503 * times as it is shuffled about, we're really interested in knowing how 504 * times as it is shuffled about, we're really interested in knowing how
504 * long it was from the *first* time it was queued to the time that it 505 * long it was from the *first* time it was queued to the time that it
505 * finally hit a cpu. 506 * finally hit a cpu.
506 */ 507 */
507 static inline void sched_info_dequeued(task_t *t) 508 static inline void sched_info_dequeued(task_t *t)
508 { 509 {
509 t->sched_info.last_queued = 0; 510 t->sched_info.last_queued = 0;
510 } 511 }
511 512
512 /* 513 /*
513 * Called when a task finally hits the cpu. We can now calculate how 514 * Called when a task finally hits the cpu. We can now calculate how
514 * long it was waiting to run. We also note when it began so that we 515 * long it was waiting to run. We also note when it began so that we
515 * can keep stats on how long its timeslice is. 516 * can keep stats on how long its timeslice is.
516 */ 517 */
517 static void sched_info_arrive(task_t *t) 518 static void sched_info_arrive(task_t *t)
518 { 519 {
519 unsigned long now = jiffies, diff = 0; 520 unsigned long now = jiffies, diff = 0;
520 struct runqueue *rq = task_rq(t); 521 struct runqueue *rq = task_rq(t);
521 522
522 if (t->sched_info.last_queued) 523 if (t->sched_info.last_queued)
523 diff = now - t->sched_info.last_queued; 524 diff = now - t->sched_info.last_queued;
524 sched_info_dequeued(t); 525 sched_info_dequeued(t);
525 t->sched_info.run_delay += diff; 526 t->sched_info.run_delay += diff;
526 t->sched_info.last_arrival = now; 527 t->sched_info.last_arrival = now;
527 t->sched_info.pcnt++; 528 t->sched_info.pcnt++;
528 529
529 if (!rq) 530 if (!rq)
530 return; 531 return;
531 532
532 rq->rq_sched_info.run_delay += diff; 533 rq->rq_sched_info.run_delay += diff;
533 rq->rq_sched_info.pcnt++; 534 rq->rq_sched_info.pcnt++;
534 } 535 }
535 536
536 /* 537 /*
537 * Called when a process is queued into either the active or expired 538 * Called when a process is queued into either the active or expired
538 * array. The time is noted and later used to determine how long we 539 * array. The time is noted and later used to determine how long we
539 * had to wait for us to reach the cpu. Since the expired queue will 540 * had to wait for us to reach the cpu. Since the expired queue will
540 * become the active queue after active queue is empty, without dequeuing 541 * become the active queue after active queue is empty, without dequeuing
541 * and requeuing any tasks, we are interested in queuing to either. It 542 * and requeuing any tasks, we are interested in queuing to either. It
542 * is unusual but not impossible for tasks to be dequeued and immediately 543 * is unusual but not impossible for tasks to be dequeued and immediately
543 * requeued in the same or another array: this can happen in sched_yield(), 544 * requeued in the same or another array: this can happen in sched_yield(),
544 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 545 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
545 * to runqueue. 546 * to runqueue.
546 * 547 *
547 * This function is only called from enqueue_task(), but also only updates 548 * This function is only called from enqueue_task(), but also only updates
548 * the timestamp if it is not already set. It's assumed that 549 * the timestamp if it is not already set. It's assumed that
549 * sched_info_dequeued() will clear that stamp when appropriate. 550 * sched_info_dequeued() will clear that stamp when appropriate.
550 */ 551 */
551 static inline void sched_info_queued(task_t *t) 552 static inline void sched_info_queued(task_t *t)
552 { 553 {
553 if (!t->sched_info.last_queued) 554 if (!t->sched_info.last_queued)
554 t->sched_info.last_queued = jiffies; 555 t->sched_info.last_queued = jiffies;
555 } 556 }
556 557
557 /* 558 /*
558 * Called when a process ceases being the active-running process, either 559 * Called when a process ceases being the active-running process, either
559 * voluntarily or involuntarily. Now we can calculate how long we ran. 560 * voluntarily or involuntarily. Now we can calculate how long we ran.
560 */ 561 */
561 static inline void sched_info_depart(task_t *t) 562 static inline void sched_info_depart(task_t *t)
562 { 563 {
563 struct runqueue *rq = task_rq(t); 564 struct runqueue *rq = task_rq(t);
564 unsigned long diff = jiffies - t->sched_info.last_arrival; 565 unsigned long diff = jiffies - t->sched_info.last_arrival;
565 566
566 t->sched_info.cpu_time += diff; 567 t->sched_info.cpu_time += diff;
567 568
568 if (rq) 569 if (rq)
569 rq->rq_sched_info.cpu_time += diff; 570 rq->rq_sched_info.cpu_time += diff;
570 } 571 }
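A worked timeline for the hooks above, in jiffies (hypothetical numbers): a task is queued at t = 1000, first reaches a CPU at t = 1040 and leaves it at t = 1100.

sched_info_queued()  at 1000:  last_queued = 1000
sched_info_arrive()  at 1040:  run_delay += 1040 - 1000 = 40;  last_arrival = 1040;  pcnt++
sched_info_depart()  at 1100:  cpu_time  += 1100 - 1040 = 60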
571 572
572 /* 573 /*
573 * Called when tasks are switched involuntarily due, typically, to expiring 574 * Called when tasks are switched involuntarily due, typically, to expiring
574 * their time slice. (This may also be called when switching to or from 575 * their time slice. (This may also be called when switching to or from
575 * the idle task.) We are only called when prev != next. 576 * the idle task.) We are only called when prev != next.
576 */ 577 */
577 static inline void sched_info_switch(task_t *prev, task_t *next) 578 static inline void sched_info_switch(task_t *prev, task_t *next)
578 { 579 {
579 struct runqueue *rq = task_rq(prev); 580 struct runqueue *rq = task_rq(prev);
580 581
581 /* 582 /*
582 * prev now departs the cpu. It's not interesting to record 583 * prev now departs the cpu. It's not interesting to record
583 * stats about how efficient we were at scheduling the idle 584 * stats about how efficient we were at scheduling the idle
584 * process, however. 585 * process, however.
585 */ 586 */
586 if (prev != rq->idle) 587 if (prev != rq->idle)
587 sched_info_depart(prev); 588 sched_info_depart(prev);
588 589
589 if (next != rq->idle) 590 if (next != rq->idle)
590 sched_info_arrive(next); 591 sched_info_arrive(next);
591 } 592 }
592 #else 593 #else
593 #define sched_info_queued(t) do { } while (0) 594 #define sched_info_queued(t) do { } while (0)
594 #define sched_info_switch(t, next) do { } while (0) 595 #define sched_info_switch(t, next) do { } while (0)
595 #endif /* CONFIG_SCHEDSTATS */ 596 #endif /* CONFIG_SCHEDSTATS */
596 597
597 /* 598 /*
598 * Adding/removing a task to/from a priority array: 599 * Adding/removing a task to/from a priority array:
599 */ 600 */
600 static void dequeue_task(struct task_struct *p, prio_array_t *array) 601 static void dequeue_task(struct task_struct *p, prio_array_t *array)
601 { 602 {
602 array->nr_active--; 603 array->nr_active--;
603 list_del(&p->run_list); 604 list_del(&p->run_list);
604 if (list_empty(array->queue + p->prio)) 605 if (list_empty(array->queue + p->prio))
605 __clear_bit(p->prio, array->bitmap); 606 __clear_bit(p->prio, array->bitmap);
606 } 607 }
607 608
608 static void enqueue_task(struct task_struct *p, prio_array_t *array) 609 static void enqueue_task(struct task_struct *p, prio_array_t *array)
609 { 610 {
610 sched_info_queued(p); 611 sched_info_queued(p);
611 list_add_tail(&p->run_list, array->queue + p->prio); 612 list_add_tail(&p->run_list, array->queue + p->prio);
612 __set_bit(p->prio, array->bitmap); 613 __set_bit(p->prio, array->bitmap);
613 array->nr_active++; 614 array->nr_active++;
614 p->array = array; 615 p->array = array;
615 } 616 }
616 617
617 /* 618 /*
618 * Put task to the end of the run list without the overhead of dequeue 619 * Put task to the end of the run list without the overhead of dequeue
619 * followed by enqueue. 620 * followed by enqueue.
620 */ 621 */
621 static void requeue_task(struct task_struct *p, prio_array_t *array) 622 static void requeue_task(struct task_struct *p, prio_array_t *array)
622 { 623 {
623 list_move_tail(&p->run_list, array->queue + p->prio); 624 list_move_tail(&p->run_list, array->queue + p->prio);
624 } 625 }
625 626
626 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 627 static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
627 { 628 {
628 list_add(&p->run_list, array->queue + p->prio); 629 list_add(&p->run_list, array->queue + p->prio);
629 __set_bit(p->prio, array->bitmap); 630 __set_bit(p->prio, array->bitmap);
630 array->nr_active++; 631 array->nr_active++;
631 p->array = array; 632 p->array = array;
632 } 633 }
633 634
634 /* 635 /*
635 * effective_prio - return the priority that is based on the static 636 * effective_prio - return the priority that is based on the static
636 * priority but is modified by bonuses/penalties. 637 * priority but is modified by bonuses/penalties.
637 * 638 *
638 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 639 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
639 * into the -5 ... 0 ... +5 bonus/penalty range. 640 * into the -5 ... 0 ... +5 bonus/penalty range.
640 * 641 *
641 * We use 25% of the full 0...39 priority range so that: 642 * We use 25% of the full 0...39 priority range so that:
642 * 643 *
643 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 644 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
644 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 645 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
645 * 646 *
646 * Both properties are important to certain workloads. 647 * Both properties are important to certain workloads.
647 */ 648 */
648 static int effective_prio(task_t *p) 649 static int effective_prio(task_t *p)
649 { 650 {
650 int bonus, prio; 651 int bonus, prio;
651 652
652 if (rt_task(p)) 653 if (rt_task(p))
653 return p->prio; 654 return p->prio;
654 655
655 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 656 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
656 657
657 prio = p->static_prio - bonus; 658 prio = p->static_prio - bonus;
658 if (prio < MAX_RT_PRIO) 659 if (prio < MAX_RT_PRIO)
659 prio = MAX_RT_PRIO; 660 prio = MAX_RT_PRIO;
660 if (prio > MAX_PRIO-1) 661 if (prio > MAX_PRIO-1)
661 prio = MAX_PRIO-1; 662 prio = MAX_PRIO-1;
662 return prio; 663 return prio;
663 } 664 }
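Worked values of effective_prio() for a nice-0 task (static_prio 120), again assuming HZ = 1000 so that MAX_SLEEP_AVG is 1000 jiffies:

sleep_avg =    0 ms:  CURRENT_BONUS =  0,  bonus = -5,  prio = 125   /* penalised CPU hog  */
sleep_avg =  500 ms:  CURRENT_BONUS =  5,  bonus =  0,  prio = 120   /* neutral            */
sleep_avg = 1000 ms:  CURRENT_BONUS = 10,  bonus = +5,  prio = 115   /* fully interactive  */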
664 665
665 /* 666 /*
666 * __activate_task - move a task to the runqueue. 667 * __activate_task - move a task to the runqueue.
667 */ 668 */
668 static inline void __activate_task(task_t *p, runqueue_t *rq) 669 static inline void __activate_task(task_t *p, runqueue_t *rq)
669 { 670 {
670 enqueue_task(p, rq->active); 671 enqueue_task(p, rq->active);
671 rq->nr_running++; 672 rq->nr_running++;
672 } 673 }
673 674
674 /* 675 /*
675 * __activate_idle_task - move idle task to the _front_ of runqueue. 676 * __activate_idle_task - move idle task to the _front_ of runqueue.
676 */ 677 */
677 static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 678 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
678 { 679 {
679 enqueue_task_head(p, rq->active); 680 enqueue_task_head(p, rq->active);
680 rq->nr_running++; 681 rq->nr_running++;
681 } 682 }
682 683
683 static int recalc_task_prio(task_t *p, unsigned long long now) 684 static int recalc_task_prio(task_t *p, unsigned long long now)
684 { 685 {
685 /* Caller must always ensure 'now >= p->timestamp' */ 686 /* Caller must always ensure 'now >= p->timestamp' */
686 unsigned long long __sleep_time = now - p->timestamp; 687 unsigned long long __sleep_time = now - p->timestamp;
687 unsigned long sleep_time; 688 unsigned long sleep_time;
688 689
689 if (unlikely(p->policy == SCHED_BATCH)) 690 if (unlikely(p->policy == SCHED_BATCH))
690 sleep_time = 0; 691 sleep_time = 0;
691 else { 692 else {
692 if (__sleep_time > NS_MAX_SLEEP_AVG) 693 if (__sleep_time > NS_MAX_SLEEP_AVG)
693 sleep_time = NS_MAX_SLEEP_AVG; 694 sleep_time = NS_MAX_SLEEP_AVG;
694 else 695 else
695 sleep_time = (unsigned long)__sleep_time; 696 sleep_time = (unsigned long)__sleep_time;
696 } 697 }
697 698
698 if (likely(sleep_time > 0)) { 699 if (likely(sleep_time > 0)) {
699 /* 700 /*
700 * User tasks that sleep a long time are categorised as 701 * User tasks that sleep a long time are categorised as
701 * idle and will get just interactive status to stay active & 702 * idle and will get just interactive status to stay active &
702 * prevent them suddenly becoming cpu hogs and starving 703 * prevent them suddenly becoming cpu hogs and starving
703 * other processes. 704 * other processes.
704 */ 705 */
705 if (p->mm && p->activated != -1 && 706 if (p->mm && p->activated != -1 &&
706 sleep_time > INTERACTIVE_SLEEP(p)) { 707 sleep_time > INTERACTIVE_SLEEP(p)) {
707 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - 708 p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
708 DEF_TIMESLICE); 709 DEF_TIMESLICE);
709 } else { 710 } else {
710 /* 711 /*
711 * Tasks waking from uninterruptible sleep are 712 * Tasks waking from uninterruptible sleep are
712 * limited in their sleep_avg rise as they 713 * limited in their sleep_avg rise as they
713 * are likely to be waiting on I/O 714 * are likely to be waiting on I/O
714 */ 715 */
715 if (p->activated == -1 && p->mm) { 716 if (p->activated == -1 && p->mm) {
716 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 717 if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
717 sleep_time = 0; 718 sleep_time = 0;
718 else if (p->sleep_avg + sleep_time >= 719 else if (p->sleep_avg + sleep_time >=
719 INTERACTIVE_SLEEP(p)) { 720 INTERACTIVE_SLEEP(p)) {
720 p->sleep_avg = INTERACTIVE_SLEEP(p); 721 p->sleep_avg = INTERACTIVE_SLEEP(p);
721 sleep_time = 0; 722 sleep_time = 0;
722 } 723 }
723 } 724 }
724 725
725 /* 726 /*
726 * This code gives a bonus to interactive tasks. 727 * This code gives a bonus to interactive tasks.
727 * 728 *
728 * The boost works by updating the 'average sleep time' 729 * The boost works by updating the 'average sleep time'
729 * value here, based on ->timestamp. The more time a 730 * value here, based on ->timestamp. The more time a
730 * task spends sleeping, the higher the average gets - 731 * task spends sleeping, the higher the average gets -
731 * and the higher the priority boost gets as well. 732 * and the higher the priority boost gets as well.
732 */ 733 */
733 p->sleep_avg += sleep_time; 734 p->sleep_avg += sleep_time;
734 735
735 if (p->sleep_avg > NS_MAX_SLEEP_AVG) 736 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
736 p->sleep_avg = NS_MAX_SLEEP_AVG; 737 p->sleep_avg = NS_MAX_SLEEP_AVG;
737 } 738 }
738 } 739 }
739 740
740 return effective_prio(p); 741 return effective_prio(p);
741 } 742 }
742 743
743 /* 744 /*
744 * activate_task - move a task to the runqueue and do priority recalculation 745 * activate_task - move a task to the runqueue and do priority recalculation
745 * 746 *
746 * Update all the scheduling statistics stuff. (sleep average 747 * Update all the scheduling statistics stuff. (sleep average
747 * calculation, priority modifiers, etc.) 748 * calculation, priority modifiers, etc.)
748 */ 749 */
749 static void activate_task(task_t *p, runqueue_t *rq, int local) 750 static void activate_task(task_t *p, runqueue_t *rq, int local)
750 { 751 {
751 unsigned long long now; 752 unsigned long long now;
752 753
753 now = sched_clock(); 754 now = sched_clock();
754 #ifdef CONFIG_SMP 755 #ifdef CONFIG_SMP
755 if (!local) { 756 if (!local) {
756 /* Compensate for drifting sched_clock */ 757 /* Compensate for drifting sched_clock */
757 runqueue_t *this_rq = this_rq(); 758 runqueue_t *this_rq = this_rq();
758 now = (now - this_rq->timestamp_last_tick) 759 now = (now - this_rq->timestamp_last_tick)
759 + rq->timestamp_last_tick; 760 + rq->timestamp_last_tick;
760 } 761 }
761 #endif 762 #endif
762 763
763 if (!rt_task(p)) 764 if (!rt_task(p))
764 p->prio = recalc_task_prio(p, now); 765 p->prio = recalc_task_prio(p, now);
765 766
766 /* 767 /*
767 * This checks to make sure it's not an uninterruptible task 768 * This checks to make sure it's not an uninterruptible task
768 * that is now waking up. 769 * that is now waking up.
769 */ 770 */
770 if (!p->activated) { 771 if (!p->activated) {
771 /* 772 /*
772 * Tasks which were woken up by interrupts (ie. hw events) 773 * Tasks which were woken up by interrupts (ie. hw events)
773 * are most likely of interactive nature. So we give them 774 * are most likely of interactive nature. So we give them
774 * the credit of extending their sleep time to the period 775 * the credit of extending their sleep time to the period
775 * of time they spend on the runqueue, waiting for execution 776 * of time they spend on the runqueue, waiting for execution
776 * on a CPU, first time around: 777 * on a CPU, first time around:
777 */ 778 */
778 if (in_interrupt()) 779 if (in_interrupt())
779 p->activated = 2; 780 p->activated = 2;
780 else { 781 else {
781 /* 782 /*
782 * Normal first-time wakeups get a credit too for 783 * Normal first-time wakeups get a credit too for
783 * on-runqueue time, but it will be weighted down: 784 * on-runqueue time, but it will be weighted down:
784 */ 785 */
785 p->activated = 1; 786 p->activated = 1;
786 } 787 }
787 } 788 }
788 p->timestamp = now; 789 p->timestamp = now;
789 790
790 __activate_task(p, rq); 791 __activate_task(p, rq);
791 } 792 }
792 793
793 /* 794 /*
794 * deactivate_task - remove a task from the runqueue. 795 * deactivate_task - remove a task from the runqueue.
795 */ 796 */
796 static void deactivate_task(struct task_struct *p, runqueue_t *rq) 797 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
797 { 798 {
798 rq->nr_running--; 799 rq->nr_running--;
799 dequeue_task(p, p->array); 800 dequeue_task(p, p->array);
800 p->array = NULL; 801 p->array = NULL;
801 } 802 }
802 803
803 /* 804 /*
804 * resched_task - mark a task 'to be rescheduled now'. 805 * resched_task - mark a task 'to be rescheduled now'.
805 * 806 *
806 * On UP this means the setting of the need_resched flag, on SMP it 807 * On UP this means the setting of the need_resched flag, on SMP it
807 * might also involve a cross-CPU call to trigger the scheduler on 808 * might also involve a cross-CPU call to trigger the scheduler on
808 * the target CPU. 809 * the target CPU.
809 */ 810 */
810 #ifdef CONFIG_SMP 811 #ifdef CONFIG_SMP
811 static void resched_task(task_t *p) 812 static void resched_task(task_t *p)
812 { 813 {
813 int cpu; 814 int cpu;
814 815
815 assert_spin_locked(&task_rq(p)->lock); 816 assert_spin_locked(&task_rq(p)->lock);
816 817
817 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 818 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
818 return; 819 return;
819 820
820 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 821 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
821 822
822 cpu = task_cpu(p); 823 cpu = task_cpu(p);
823 if (cpu == smp_processor_id()) 824 if (cpu == smp_processor_id())
824 return; 825 return;
825 826
826 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 827 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
827 smp_mb(); 828 smp_mb();
828 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 829 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
829 smp_send_reschedule(cpu); 830 smp_send_reschedule(cpu);
830 } 831 }
831 #else 832 #else
832 static inline void resched_task(task_t *p) 833 static inline void resched_task(task_t *p)
833 { 834 {
834 assert_spin_locked(&task_rq(p)->lock); 835 assert_spin_locked(&task_rq(p)->lock);
835 set_tsk_need_resched(p); 836 set_tsk_need_resched(p);
836 } 837 }
837 #endif 838 #endif
838 839
839 /** 840 /**
840 * task_curr - is this task currently executing on a CPU? 841 * task_curr - is this task currently executing on a CPU?
841 * @p: the task in question. 842 * @p: the task in question.
842 */ 843 */
843 inline int task_curr(const task_t *p) 844 inline int task_curr(const task_t *p)
844 { 845 {
845 return cpu_curr(task_cpu(p)) == p; 846 return cpu_curr(task_cpu(p)) == p;
846 } 847 }
847 848
848 #ifdef CONFIG_SMP 849 #ifdef CONFIG_SMP
849 typedef struct { 850 typedef struct {
850 struct list_head list; 851 struct list_head list;
851 852
852 task_t *task; 853 task_t *task;
853 int dest_cpu; 854 int dest_cpu;
854 855
855 struct completion done; 856 struct completion done;
856 } migration_req_t; 857 } migration_req_t;
857 858
858 /* 859 /*
859 * The task's runqueue lock must be held. 860 * The task's runqueue lock must be held.
860 * Returns true if you have to wait for migration thread. 861 * Returns true if you have to wait for migration thread.
861 */ 862 */
862 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 863 static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
863 { 864 {
864 runqueue_t *rq = task_rq(p); 865 runqueue_t *rq = task_rq(p);
865 866
866 /* 867 /*
867 * If the task is not on a runqueue (and not running), then 868 * If the task is not on a runqueue (and not running), then
868 * it is sufficient to simply update the task's cpu field. 869 * it is sufficient to simply update the task's cpu field.
869 */ 870 */
870 if (!p->array && !task_running(rq, p)) { 871 if (!p->array && !task_running(rq, p)) {
871 set_task_cpu(p, dest_cpu); 872 set_task_cpu(p, dest_cpu);
872 return 0; 873 return 0;
873 } 874 }
874 875
875 init_completion(&req->done); 876 init_completion(&req->done);
876 req->task = p; 877 req->task = p;
877 req->dest_cpu = dest_cpu; 878 req->dest_cpu = dest_cpu;
878 list_add(&req->list, &rq->migration_queue); 879 list_add(&req->list, &rq->migration_queue);
879 return 1; 880 return 1;
880 } 881 }
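The return value of migrate_task() tells the caller whether the per-CPU migration thread has to finish the job; the usual calling pattern (mirroring what set_cpus_allowed() does later in this file, shown here only as an illustration) is:

	migration_req_t req;
	unsigned long flags;
	runqueue_t *rq = task_rq_lock(p, &flags);

	if (migrate_task(p, dest_cpu, &req)) {
		/* task is running or runnable: let the migration thread move it */
		task_rq_unlock(rq, &flags);
		wake_up_process(rq->migration_thread);
		wait_for_completion(&req.done);
	} else {
		task_rq_unlock(rq, &flags);
	}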
881 882
882 /* 883 /*
883 * wait_task_inactive - wait for a thread to unschedule. 884 * wait_task_inactive - wait for a thread to unschedule.
884 * 885 *
885 * The caller must ensure that the task *will* unschedule sometime soon, 886 * The caller must ensure that the task *will* unschedule sometime soon,
886 * else this function might spin for a *long* time. This function can't 887 * else this function might spin for a *long* time. This function can't
887 * be called with interrupts off, or it may introduce deadlock with 888 * be called with interrupts off, or it may introduce deadlock with
888 * smp_call_function() if an IPI is sent by the same process we are 889 * smp_call_function() if an IPI is sent by the same process we are
889 * waiting to become inactive. 890 * waiting to become inactive.
890 */ 891 */
891 void wait_task_inactive(task_t *p) 892 void wait_task_inactive(task_t *p)
892 { 893 {
893 unsigned long flags; 894 unsigned long flags;
894 runqueue_t *rq; 895 runqueue_t *rq;
895 int preempted; 896 int preempted;
896 897
897 repeat: 898 repeat:
898 rq = task_rq_lock(p, &flags); 899 rq = task_rq_lock(p, &flags);
899 /* Must be off runqueue entirely, not preempted. */ 900 /* Must be off runqueue entirely, not preempted. */
900 if (unlikely(p->array || task_running(rq, p))) { 901 if (unlikely(p->array || task_running(rq, p))) {
901 /* If it's preempted, we yield. It could be a while. */ 902 /* If it's preempted, we yield. It could be a while. */
902 preempted = !task_running(rq, p); 903 preempted = !task_running(rq, p);
903 task_rq_unlock(rq, &flags); 904 task_rq_unlock(rq, &flags);
904 cpu_relax(); 905 cpu_relax();
905 if (preempted) 906 if (preempted)
906 yield(); 907 yield();
907 goto repeat; 908 goto repeat;
908 } 909 }
909 task_rq_unlock(rq, &flags); 910 task_rq_unlock(rq, &flags);
910 } 911 }
911 912
912 /*** 913 /***
913 * kick_process - kick a running thread to enter/exit the kernel 914 * kick_process - kick a running thread to enter/exit the kernel
914 * @p: the to-be-kicked thread 915 * @p: the to-be-kicked thread
915 * 916 *
916 * Cause a process which is running on another CPU to enter 917 * Cause a process which is running on another CPU to enter
917 * kernel-mode, without any delay. (to get signals handled.) 918 * kernel-mode, without any delay. (to get signals handled.)
918 * 919 *
919 * NOTE: this function doesn't have to take the runqueue lock, 920 * NOTE: this function doesn't have to take the runqueue lock,
920 * because all it wants to ensure is that the remote task enters 921 * because all it wants to ensure is that the remote task enters
921 * the kernel. If the IPI races and the task has been migrated 922 * the kernel. If the IPI races and the task has been migrated
922 * to another CPU then no harm is done and the purpose has been 923 * to another CPU then no harm is done and the purpose has been
923 * achieved as well. 924 * achieved as well.
924 */ 925 */
925 void kick_process(task_t *p) 926 void kick_process(task_t *p)
926 { 927 {
927 int cpu; 928 int cpu;
928 929
929 preempt_disable(); 930 preempt_disable();
930 cpu = task_cpu(p); 931 cpu = task_cpu(p);
931 if ((cpu != smp_processor_id()) && task_curr(p)) 932 if ((cpu != smp_processor_id()) && task_curr(p))
932 smp_send_reschedule(cpu); 933 smp_send_reschedule(cpu);
933 preempt_enable(); 934 preempt_enable();
934 } 935 }
935 936
936 /* 937 /*
937 * Return a low guess at the load of a migration-source cpu. 938 * Return a low guess at the load of a migration-source cpu.
938 * 939 *
939 * We want to under-estimate the load of migration sources, to 940 * We want to under-estimate the load of migration sources, to
940 * balance conservatively. 941 * balance conservatively.
941 */ 942 */
942 static inline unsigned long source_load(int cpu, int type) 943 static inline unsigned long source_load(int cpu, int type)
943 { 944 {
944 runqueue_t *rq = cpu_rq(cpu); 945 runqueue_t *rq = cpu_rq(cpu);
945 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 946 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
946 if (type == 0) 947 if (type == 0)
947 return load_now; 948 return load_now;
948 949
949 return min(rq->cpu_load[type-1], load_now); 950 return min(rq->cpu_load[type-1], load_now);
950 } 951 }
951 952
952 /* 953 /*
953 * Return a high guess at the load of a migration-target cpu 954 * Return a high guess at the load of a migration-target cpu
954 */ 955 */
955 static inline unsigned long target_load(int cpu, int type) 956 static inline unsigned long target_load(int cpu, int type)
956 { 957 {
957 runqueue_t *rq = cpu_rq(cpu); 958 runqueue_t *rq = cpu_rq(cpu);
958 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 959 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
959 if (type == 0) 960 if (type == 0)
960 return load_now; 961 return load_now;
961 962
962 return max(rq->cpu_load[type-1], load_now); 963 return max(rq->cpu_load[type-1], load_now);
963 } 964 }
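
source_load() and target_load() above deliberately bias their guesses in opposite directions: a migration source reports the smaller of its decayed cpu_load history and its instantaneous load, while a migration target reports the larger, so balancing decisions err on the conservative side. A minimal userspace sketch of that min/max biasing; the SCHED_LOAD_SCALE value and the load figures below are illustrative, not taken from a real runqueue:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* illustrative scale factor */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* low guess: under-estimate a migration source, as source_load() does */
static unsigned long source_load_model(unsigned long nr_running, unsigned long hist_load)
{
	unsigned long load_now = nr_running * SCHED_LOAD_SCALE;
	return min_ul(hist_load, load_now);
}

/* high guess: over-estimate a migration target, as target_load() does */
static unsigned long target_load_model(unsigned long nr_running, unsigned long hist_load)
{
	unsigned long load_now = nr_running * SCHED_LOAD_SCALE;
	return max_ul(hist_load, load_now);
}

int main(void)
{
	/* a cpu with 2 runnable tasks but a decayed load history of ~1.5 tasks */
	unsigned long hist = 3 * SCHED_LOAD_SCALE / 2;

	printf("source guess: %lu, target guess: %lu\n",
	       source_load_model(2, hist), target_load_model(2, hist));
	return 0;
}
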
964 965
965 /* 966 /*
966 * find_idlest_group finds and returns the least busy CPU group within the 967 * find_idlest_group finds and returns the least busy CPU group within the
967 * domain. 968 * domain.
968 */ 969 */
969 static struct sched_group * 970 static struct sched_group *
970 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 971 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
971 { 972 {
972 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 973 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
973 unsigned long min_load = ULONG_MAX, this_load = 0; 974 unsigned long min_load = ULONG_MAX, this_load = 0;
974 int load_idx = sd->forkexec_idx; 975 int load_idx = sd->forkexec_idx;
975 int imbalance = 100 + (sd->imbalance_pct-100)/2; 976 int imbalance = 100 + (sd->imbalance_pct-100)/2;
976 977
977 do { 978 do {
978 unsigned long load, avg_load; 979 unsigned long load, avg_load;
979 int local_group; 980 int local_group;
980 int i; 981 int i;
981 982
982 /* Skip over this group if it has no CPUs allowed */ 983 /* Skip over this group if it has no CPUs allowed */
983 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 984 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
984 goto nextgroup; 985 goto nextgroup;
985 986
986 local_group = cpu_isset(this_cpu, group->cpumask); 987 local_group = cpu_isset(this_cpu, group->cpumask);
987 988
988 /* Tally up the load of all CPUs in the group */ 989 /* Tally up the load of all CPUs in the group */
989 avg_load = 0; 990 avg_load = 0;
990 991
991 for_each_cpu_mask(i, group->cpumask) { 992 for_each_cpu_mask(i, group->cpumask) {
992 /* Bias balancing toward cpus of our domain */ 993 /* Bias balancing toward cpus of our domain */
993 if (local_group) 994 if (local_group)
994 load = source_load(i, load_idx); 995 load = source_load(i, load_idx);
995 else 996 else
996 load = target_load(i, load_idx); 997 load = target_load(i, load_idx);
997 998
998 avg_load += load; 999 avg_load += load;
999 } 1000 }
1000 1001
1001 /* Adjust by relative CPU power of the group */ 1002 /* Adjust by relative CPU power of the group */
1002 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1003 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1003 1004
1004 if (local_group) { 1005 if (local_group) {
1005 this_load = avg_load; 1006 this_load = avg_load;
1006 this = group; 1007 this = group;
1007 } else if (avg_load < min_load) { 1008 } else if (avg_load < min_load) {
1008 min_load = avg_load; 1009 min_load = avg_load;
1009 idlest = group; 1010 idlest = group;
1010 } 1011 }
1011 nextgroup: 1012 nextgroup:
1012 group = group->next; 1013 group = group->next;
1013 } while (group != sd->groups); 1014 } while (group != sd->groups);
1014 1015
1015 if (!idlest || 100*this_load < imbalance*min_load) 1016 if (!idlest || 100*this_load < imbalance*min_load)
1016 return NULL; 1017 return NULL;
1017 return idlest; 1018 return idlest;
1018 } 1019 }
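
The closing test, 100*this_load < imbalance*min_load with imbalance = 100 + (imbalance_pct-100)/2, means find_idlest_group() only nominates a remote group when it is lighter than the local one by more than half of the domain's imbalance_pct margin. A worked example of that threshold, using made-up loads and an assumed imbalance_pct of 125:

#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 125;			/* assumed domain setting */
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;	/* = 112 */
	unsigned long this_load = 300;				/* local group's scaled load */
	unsigned long remote[] = { 280, 250 };			/* candidate idlest-group loads */

	for (int i = 0; i < 2; i++) {
		unsigned long min_load = remote[i];
		int keep_local = 100 * this_load < imbalance * min_load;

		printf("min_load=%lu -> %s\n", min_load,
		       keep_local ? "return NULL (stay in local group)"
				  : "return idlest (balance away)");
	}
	return 0;
}
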
1019 1020
1020 /* 1021 /*
1021 * find_idlest_cpu - find the idlest runqueue among the cpus in group. 1022 * find_idlest_cpu - find the idlest runqueue among the cpus in group.
1022 */ 1023 */
1023 static int 1024 static int
1024 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1025 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1025 { 1026 {
1026 cpumask_t tmp; 1027 cpumask_t tmp;
1027 unsigned long load, min_load = ULONG_MAX; 1028 unsigned long load, min_load = ULONG_MAX;
1028 int idlest = -1; 1029 int idlest = -1;
1029 int i; 1030 int i;
1030 1031
1031 /* Traverse only the allowed CPUs */ 1032 /* Traverse only the allowed CPUs */
1032 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1033 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1033 1034
1034 for_each_cpu_mask(i, tmp) { 1035 for_each_cpu_mask(i, tmp) {
1035 load = source_load(i, 0); 1036 load = source_load(i, 0);
1036 1037
1037 if (load < min_load || (load == min_load && i == this_cpu)) { 1038 if (load < min_load || (load == min_load && i == this_cpu)) {
1038 min_load = load; 1039 min_load = load;
1039 idlest = i; 1040 idlest = i;
1040 } 1041 }
1041 } 1042 }
1042 1043
1043 return idlest; 1044 return idlest;
1044 } 1045 }
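
Within the chosen group, find_idlest_cpu() considers only the task's allowed CPUs and breaks load ties in favour of this_cpu, so the task stays put unless some CPU is strictly lighter. A small standalone model of that tie-break; the per-cpu loads and the allowed mask are invented:

#include <stdio.h>
#include <limits.h>

/* Pick the least-loaded allowed cpu, preferring this_cpu on a tie. */
static int idlest_cpu(const unsigned long *load, const int *allowed,
		      int ncpus, int this_cpu)
{
	unsigned long min_load = ULONG_MAX;
	int idlest = -1;

	for (int i = 0; i < ncpus; i++) {
		if (!allowed[i])
			continue;
		if (load[i] < min_load || (load[i] == min_load && i == this_cpu)) {
			min_load = load[i];
			idlest = i;
		}
	}
	return idlest;
}

int main(void)
{
	unsigned long load[] = { 256, 128, 128, 512 };	/* invented per-cpu loads */
	int allowed[] = { 1, 1, 1, 1 };

	/* cpu2 ties cpu1 at 128, but the caller runs on cpu2, so cpu2 wins */
	printf("idlest cpu: %d\n", idlest_cpu(load, allowed, 4, 2));
	return 0;
}
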
1045 1046
1046 /* 1047 /*
1047 * sched_balance_self: balance the current task (running on cpu) in domains 1048 * sched_balance_self: balance the current task (running on cpu) in domains
1048 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1049 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1049 * SD_BALANCE_EXEC. 1050 * SD_BALANCE_EXEC.
1050 * 1051 *
1051 * Balance, ie. select the least loaded group. 1052 * Balance, ie. select the least loaded group.
1052 * 1053 *
1053 * Returns the target CPU number, or the same CPU if no balancing is needed. 1054 * Returns the target CPU number, or the same CPU if no balancing is needed.
1054 * 1055 *
1055 * preempt must be disabled. 1056 * preempt must be disabled.
1056 */ 1057 */
1057 static int sched_balance_self(int cpu, int flag) 1058 static int sched_balance_self(int cpu, int flag)
1058 { 1059 {
1059 struct task_struct *t = current; 1060 struct task_struct *t = current;
1060 struct sched_domain *tmp, *sd = NULL; 1061 struct sched_domain *tmp, *sd = NULL;
1061 1062
1062 for_each_domain(cpu, tmp) 1063 for_each_domain(cpu, tmp)
1063 if (tmp->flags & flag) 1064 if (tmp->flags & flag)
1064 sd = tmp; 1065 sd = tmp;
1065 1066
1066 while (sd) { 1067 while (sd) {
1067 cpumask_t span; 1068 cpumask_t span;
1068 struct sched_group *group; 1069 struct sched_group *group;
1069 int new_cpu; 1070 int new_cpu;
1070 int weight; 1071 int weight;
1071 1072
1072 span = sd->span; 1073 span = sd->span;
1073 group = find_idlest_group(sd, t, cpu); 1074 group = find_idlest_group(sd, t, cpu);
1074 if (!group) 1075 if (!group)
1075 goto nextlevel; 1076 goto nextlevel;
1076 1077
1077 new_cpu = find_idlest_cpu(group, t, cpu); 1078 new_cpu = find_idlest_cpu(group, t, cpu);
1078 if (new_cpu == -1 || new_cpu == cpu) 1079 if (new_cpu == -1 || new_cpu == cpu)
1079 goto nextlevel; 1080 goto nextlevel;
1080 1081
1081 /* Now try balancing at a lower domain level */ 1082 /* Now try balancing at a lower domain level */
1082 cpu = new_cpu; 1083 cpu = new_cpu;
1083 nextlevel: 1084 nextlevel:
1084 sd = NULL; 1085 sd = NULL;
1085 weight = cpus_weight(span); 1086 weight = cpus_weight(span);
1086 for_each_domain(cpu, tmp) { 1087 for_each_domain(cpu, tmp) {
1087 if (weight <= cpus_weight(tmp->span)) 1088 if (weight <= cpus_weight(tmp->span))
1088 break; 1089 break;
1089 if (tmp->flags & flag) 1090 if (tmp->flags & flag)
1090 sd = tmp; 1091 sd = tmp;
1091 } 1092 }
1092 /* while loop will break here if sd == NULL */ 1093 /* while loop will break here if sd == NULL */
1093 } 1094 }
1094 1095
1095 return cpu; 1096 return cpu;
1096 } 1097 }
1097 1098
1098 #endif /* CONFIG_SMP */ 1099 #endif /* CONFIG_SMP */
1099 1100
1100 /* 1101 /*
1101 * wake_idle() will wake a task on an idle cpu if task->cpu is 1102 * wake_idle() will wake a task on an idle cpu if task->cpu is
1102 * not idle and an idle cpu is available. The span of cpus to 1103 * not idle and an idle cpu is available. The span of cpus to
1103 * search starts with cpus closest then further out as needed, 1104 * search starts with cpus closest then further out as needed,
1104 * so we always favor a closer, idle cpu. 1105 * so we always favor a closer, idle cpu.
1105 * 1106 *
1106 * Returns the CPU we should wake onto. 1107 * Returns the CPU we should wake onto.
1107 */ 1108 */
1108 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1109 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1109 static int wake_idle(int cpu, task_t *p) 1110 static int wake_idle(int cpu, task_t *p)
1110 { 1111 {
1111 cpumask_t tmp; 1112 cpumask_t tmp;
1112 struct sched_domain *sd; 1113 struct sched_domain *sd;
1113 int i; 1114 int i;
1114 1115
1115 if (idle_cpu(cpu)) 1116 if (idle_cpu(cpu))
1116 return cpu; 1117 return cpu;
1117 1118
1118 for_each_domain(cpu, sd) { 1119 for_each_domain(cpu, sd) {
1119 if (sd->flags & SD_WAKE_IDLE) { 1120 if (sd->flags & SD_WAKE_IDLE) {
1120 cpus_and(tmp, sd->span, p->cpus_allowed); 1121 cpus_and(tmp, sd->span, p->cpus_allowed);
1121 for_each_cpu_mask(i, tmp) { 1122 for_each_cpu_mask(i, tmp) {
1122 if (idle_cpu(i)) 1123 if (idle_cpu(i))
1123 return i; 1124 return i;
1124 } 1125 }
1125 } 1126 }
1126 else 1127 else
1127 break; 1128 break;
1128 } 1129 }
1129 return cpu; 1130 return cpu;
1130 } 1131 }
1131 #else 1132 #else
1132 static inline int wake_idle(int cpu, task_t *p) 1133 static inline int wake_idle(int cpu, task_t *p)
1133 { 1134 {
1134 return cpu; 1135 return cpu;
1135 } 1136 }
1136 #endif 1137 #endif
1137 1138
1138 /*** 1139 /***
1139 * try_to_wake_up - wake up a thread 1140 * try_to_wake_up - wake up a thread
1140 * @p: the to-be-woken-up thread 1141 * @p: the to-be-woken-up thread
1141 * @state: the mask of task states that can be woken 1142 * @state: the mask of task states that can be woken
1142 * @sync: do a synchronous wakeup? 1143 * @sync: do a synchronous wakeup?
1143 * 1144 *
1144 * Put it on the run-queue if it's not already there. The "current" 1145 * Put it on the run-queue if it's not already there. The "current"
1145 * thread is always on the run-queue (except when the actual 1146 * thread is always on the run-queue (except when the actual
1146 * re-schedule is in progress), and as such you're allowed to do 1147 * re-schedule is in progress), and as such you're allowed to do
1147 * the simpler "current->state = TASK_RUNNING" to mark yourself 1148 * the simpler "current->state = TASK_RUNNING" to mark yourself
1148 * runnable without the overhead of this. 1149 * runnable without the overhead of this.
1149 * 1150 *
1150 * returns failure only if the task is already active. 1151 * returns failure only if the task is already active.
1151 */ 1152 */
1152 static int try_to_wake_up(task_t *p, unsigned int state, int sync) 1153 static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1153 { 1154 {
1154 int cpu, this_cpu, success = 0; 1155 int cpu, this_cpu, success = 0;
1155 unsigned long flags; 1156 unsigned long flags;
1156 long old_state; 1157 long old_state;
1157 runqueue_t *rq; 1158 runqueue_t *rq;
1158 #ifdef CONFIG_SMP 1159 #ifdef CONFIG_SMP
1159 unsigned long load, this_load; 1160 unsigned long load, this_load;
1160 struct sched_domain *sd, *this_sd = NULL; 1161 struct sched_domain *sd, *this_sd = NULL;
1161 int new_cpu; 1162 int new_cpu;
1162 #endif 1163 #endif
1163 1164
1164 rq = task_rq_lock(p, &flags); 1165 rq = task_rq_lock(p, &flags);
1165 old_state = p->state; 1166 old_state = p->state;
1166 if (!(old_state & state)) 1167 if (!(old_state & state))
1167 goto out; 1168 goto out;
1168 1169
1169 if (p->array) 1170 if (p->array)
1170 goto out_running; 1171 goto out_running;
1171 1172
1172 cpu = task_cpu(p); 1173 cpu = task_cpu(p);
1173 this_cpu = smp_processor_id(); 1174 this_cpu = smp_processor_id();
1174 1175
1175 #ifdef CONFIG_SMP 1176 #ifdef CONFIG_SMP
1176 if (unlikely(task_running(rq, p))) 1177 if (unlikely(task_running(rq, p)))
1177 goto out_activate; 1178 goto out_activate;
1178 1179
1179 new_cpu = cpu; 1180 new_cpu = cpu;
1180 1181
1181 schedstat_inc(rq, ttwu_cnt); 1182 schedstat_inc(rq, ttwu_cnt);
1182 if (cpu == this_cpu) { 1183 if (cpu == this_cpu) {
1183 schedstat_inc(rq, ttwu_local); 1184 schedstat_inc(rq, ttwu_local);
1184 goto out_set_cpu; 1185 goto out_set_cpu;
1185 } 1186 }
1186 1187
1187 for_each_domain(this_cpu, sd) { 1188 for_each_domain(this_cpu, sd) {
1188 if (cpu_isset(cpu, sd->span)) { 1189 if (cpu_isset(cpu, sd->span)) {
1189 schedstat_inc(sd, ttwu_wake_remote); 1190 schedstat_inc(sd, ttwu_wake_remote);
1190 this_sd = sd; 1191 this_sd = sd;
1191 break; 1192 break;
1192 } 1193 }
1193 } 1194 }
1194 1195
1195 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1196 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1196 goto out_set_cpu; 1197 goto out_set_cpu;
1197 1198
1198 /* 1199 /*
1199 * Check for affine wakeup and passive balancing possibilities. 1200 * Check for affine wakeup and passive balancing possibilities.
1200 */ 1201 */
1201 if (this_sd) { 1202 if (this_sd) {
1202 int idx = this_sd->wake_idx; 1203 int idx = this_sd->wake_idx;
1203 unsigned int imbalance; 1204 unsigned int imbalance;
1204 1205
1205 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1206 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1206 1207
1207 load = source_load(cpu, idx); 1208 load = source_load(cpu, idx);
1208 this_load = target_load(this_cpu, idx); 1209 this_load = target_load(this_cpu, idx);
1209 1210
1210 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1211 new_cpu = this_cpu; /* Wake to this CPU if we can */
1211 1212
1212 if (this_sd->flags & SD_WAKE_AFFINE) { 1213 if (this_sd->flags & SD_WAKE_AFFINE) {
1213 unsigned long tl = this_load; 1214 unsigned long tl = this_load;
1214 /* 1215 /*
1215 * If sync wakeup then subtract the (maximum possible) 1216 * If sync wakeup then subtract the (maximum possible)
1216 * effect of the currently running task from the load 1217 * effect of the currently running task from the load
1217 * of the current CPU: 1218 * of the current CPU:
1218 */ 1219 */
1219 if (sync) 1220 if (sync)
1220 tl -= SCHED_LOAD_SCALE; 1221 tl -= SCHED_LOAD_SCALE;
1221 1222
1222 if ((tl <= load && 1223 if ((tl <= load &&
1223 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1224 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1224 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1225 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1225 /* 1226 /*
1226 * This domain has SD_WAKE_AFFINE and 1227 * This domain has SD_WAKE_AFFINE and
1227 * p is cache cold in this domain, and 1228 * p is cache cold in this domain, and
1228 * there is no bad imbalance. 1229 * there is no bad imbalance.
1229 */ 1230 */
1230 schedstat_inc(this_sd, ttwu_move_affine); 1231 schedstat_inc(this_sd, ttwu_move_affine);
1231 goto out_set_cpu; 1232 goto out_set_cpu;
1232 } 1233 }
1233 } 1234 }
1234 1235
1235 /* 1236 /*
1236 * Start passive balancing when half the imbalance_pct 1237 * Start passive balancing when half the imbalance_pct
1237 * limit is reached. 1238 * limit is reached.
1238 */ 1239 */
1239 if (this_sd->flags & SD_WAKE_BALANCE) { 1240 if (this_sd->flags & SD_WAKE_BALANCE) {
1240 if (imbalance*this_load <= 100*load) { 1241 if (imbalance*this_load <= 100*load) {
1241 schedstat_inc(this_sd, ttwu_move_balance); 1242 schedstat_inc(this_sd, ttwu_move_balance);
1242 goto out_set_cpu; 1243 goto out_set_cpu;
1243 } 1244 }
1244 } 1245 }
1245 } 1246 }
1246 1247
1247 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1248 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1248 out_set_cpu: 1249 out_set_cpu:
1249 new_cpu = wake_idle(new_cpu, p); 1250 new_cpu = wake_idle(new_cpu, p);
1250 if (new_cpu != cpu) { 1251 if (new_cpu != cpu) {
1251 set_task_cpu(p, new_cpu); 1252 set_task_cpu(p, new_cpu);
1252 task_rq_unlock(rq, &flags); 1253 task_rq_unlock(rq, &flags);
1253 /* might preempt at this point */ 1254 /* might preempt at this point */
1254 rq = task_rq_lock(p, &flags); 1255 rq = task_rq_lock(p, &flags);
1255 old_state = p->state; 1256 old_state = p->state;
1256 if (!(old_state & state)) 1257 if (!(old_state & state))
1257 goto out; 1258 goto out;
1258 if (p->array) 1259 if (p->array)
1259 goto out_running; 1260 goto out_running;
1260 1261
1261 this_cpu = smp_processor_id(); 1262 this_cpu = smp_processor_id();
1262 cpu = task_cpu(p); 1263 cpu = task_cpu(p);
1263 } 1264 }
1264 1265
1265 out_activate: 1266 out_activate:
1266 #endif /* CONFIG_SMP */ 1267 #endif /* CONFIG_SMP */
1267 if (old_state == TASK_UNINTERRUPTIBLE) { 1268 if (old_state == TASK_UNINTERRUPTIBLE) {
1268 rq->nr_uninterruptible--; 1269 rq->nr_uninterruptible--;
1269 /* 1270 /*
1270 * Tasks on involuntary sleep don't earn 1271 * Tasks on involuntary sleep don't earn
1271 * sleep_avg beyond just interactive state. 1272 * sleep_avg beyond just interactive state.
1272 */ 1273 */
1273 p->activated = -1; 1274 p->activated = -1;
1274 } 1275 }
1275 1276
1276 /* 1277 /*
1277 * Tasks that have marked their sleep as noninteractive get 1278 * Tasks that have marked their sleep as noninteractive get
1278 * woken up without updating their sleep average. (i.e. their 1279 * woken up without updating their sleep average. (i.e. their
1279 * sleep is handled in a priority-neutral manner, no priority 1280 * sleep is handled in a priority-neutral manner, no priority
1280 * boost and no penalty.) 1281 * boost and no penalty.)
1281 */ 1282 */
1282 if (old_state & TASK_NONINTERACTIVE) 1283 if (old_state & TASK_NONINTERACTIVE)
1283 __activate_task(p, rq); 1284 __activate_task(p, rq);
1284 else 1285 else
1285 activate_task(p, rq, cpu == this_cpu); 1286 activate_task(p, rq, cpu == this_cpu);
1286 /* 1287 /*
1287 * Sync wakeups (i.e. those types of wakeups where the waker 1288 * Sync wakeups (i.e. those types of wakeups where the waker
1288 * has indicated that it will leave the CPU in short order) 1289 * has indicated that it will leave the CPU in short order)
1289 * don't trigger a preemption, if the woken up task will run on 1290 * don't trigger a preemption, if the woken up task will run on
1290 * this cpu. (in this case the 'I will reschedule' promise of 1291 * this cpu. (in this case the 'I will reschedule' promise of
1291 * the waker guarantees that the freshly woken up task is going 1292 * the waker guarantees that the freshly woken up task is going
1292 * to be considered on this CPU.) 1293 * to be considered on this CPU.)
1293 */ 1294 */
1294 if (!sync || cpu != this_cpu) { 1295 if (!sync || cpu != this_cpu) {
1295 if (TASK_PREEMPTS_CURR(p, rq)) 1296 if (TASK_PREEMPTS_CURR(p, rq))
1296 resched_task(rq->curr); 1297 resched_task(rq->curr);
1297 } 1298 }
1298 success = 1; 1299 success = 1;
1299 1300
1300 out_running: 1301 out_running:
1301 p->state = TASK_RUNNING; 1302 p->state = TASK_RUNNING;
1302 out: 1303 out:
1303 task_rq_unlock(rq, &flags); 1304 task_rq_unlock(rq, &flags);
1304 1305
1305 return success; 1306 return success;
1306 } 1307 }
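
The SD_WAKE_AFFINE branch above pulls the wakee to the waking CPU only when that does not create a bad imbalance: either both CPUs are nearly idle, or the waker's load (with one SCHED_LOAD_SCALE subtracted for a sync wakeup, and one added back for the wakee it is about to gain) stays within the imbalance margin of the wakee's old CPU. A standalone model of that test; the load values, imbalance_pct and scale below are invented for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* illustrative scale factor */

/* Standalone model of the SD_WAKE_AFFINE test; every input is invented. */
static int wake_affine(unsigned long this_load, unsigned long remote_load,
		       unsigned long remote_target_load,
		       unsigned int imbalance_pct, int sync)
{
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;
	unsigned long tl = this_load;

	if (sync)			/* waker has promised to sleep shortly */
		tl -= SCHED_LOAD_SCALE;

	return (tl <= remote_load &&
		tl + remote_target_load <= SCHED_LOAD_SCALE) ||
	       100 * (tl + SCHED_LOAD_SCALE) <= imbalance * remote_load;
}

int main(void)
{
	/* both CPUs currently carry two tasks' worth of load */
	unsigned long here = 2 * SCHED_LOAD_SCALE, there = 2 * SCHED_LOAD_SCALE;

	printf("sync wakeup:  %s\n", wake_affine(here, there, there, 125, 1) ?
	       "pull to waking CPU" : "leave on old CPU");
	printf("async wakeup: %s\n", wake_affine(here, there, there, 125, 0) ?
	       "pull to waking CPU" : "leave on old CPU");
	return 0;
}

With these numbers only the sync wakeup is pulled to the waking CPU, which is the point of the sync hint: the waker's own load is about to disappear, so pulling the wakee there is cheap.
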
1307 1308
1308 int fastcall wake_up_process(task_t *p) 1309 int fastcall wake_up_process(task_t *p)
1309 { 1310 {
1310 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1311 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1311 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1312 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1312 } 1313 }
1313 1314
1314 EXPORT_SYMBOL(wake_up_process); 1315 EXPORT_SYMBOL(wake_up_process);
1315 1316
1316 int fastcall wake_up_state(task_t *p, unsigned int state) 1317 int fastcall wake_up_state(task_t *p, unsigned int state)
1317 { 1318 {
1318 return try_to_wake_up(p, state, 0); 1319 return try_to_wake_up(p, state, 0);
1319 } 1320 }
1320 1321
1321 /* 1322 /*
1322 * Perform scheduler related setup for a newly forked process p. 1323 * Perform scheduler related setup for a newly forked process p.
1323 * p is forked by current. 1324 * p is forked by current.
1324 */ 1325 */
1325 void fastcall sched_fork(task_t *p, int clone_flags) 1326 void fastcall sched_fork(task_t *p, int clone_flags)
1326 { 1327 {
1327 int cpu = get_cpu(); 1328 int cpu = get_cpu();
1328 1329
1329 #ifdef CONFIG_SMP 1330 #ifdef CONFIG_SMP
1330 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1331 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1331 #endif 1332 #endif
1332 set_task_cpu(p, cpu); 1333 set_task_cpu(p, cpu);
1333 1334
1334 /* 1335 /*
1335 * We mark the process as running here, but have not actually 1336 * We mark the process as running here, but have not actually
1336 * inserted it onto the runqueue yet. This guarantees that 1337 * inserted it onto the runqueue yet. This guarantees that
1337 * nobody will actually run it, and a signal or other external 1338 * nobody will actually run it, and a signal or other external
1338 * event cannot wake it up and insert it on the runqueue either. 1339 * event cannot wake it up and insert it on the runqueue either.
1339 */ 1340 */
1340 p->state = TASK_RUNNING; 1341 p->state = TASK_RUNNING;
1341 INIT_LIST_HEAD(&p->run_list); 1342 INIT_LIST_HEAD(&p->run_list);
1342 p->array = NULL; 1343 p->array = NULL;
1343 #ifdef CONFIG_SCHEDSTATS 1344 #ifdef CONFIG_SCHEDSTATS
1344 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1345 memset(&p->sched_info, 0, sizeof(p->sched_info));
1345 #endif 1346 #endif
1346 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1347 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1347 p->oncpu = 0; 1348 p->oncpu = 0;
1348 #endif 1349 #endif
1349 #ifdef CONFIG_PREEMPT 1350 #ifdef CONFIG_PREEMPT
1350 /* Want to start with kernel preemption disabled. */ 1351 /* Want to start with kernel preemption disabled. */
1351 task_thread_info(p)->preempt_count = 1; 1352 task_thread_info(p)->preempt_count = 1;
1352 #endif 1353 #endif
1353 /* 1354 /*
1354 * Share the timeslice between parent and child, thus the 1355 * Share the timeslice between parent and child, thus the
1355 * total amount of pending timeslices in the system doesn't change, 1356 * total amount of pending timeslices in the system doesn't change,
1356 * resulting in more scheduling fairness. 1357 * resulting in more scheduling fairness.
1357 */ 1358 */
1358 local_irq_disable(); 1359 local_irq_disable();
1359 p->time_slice = (current->time_slice + 1) >> 1; 1360 p->time_slice = (current->time_slice + 1) >> 1;
1360 /* 1361 /*
1361 * The remainder of the first timeslice might be recovered by 1362 * The remainder of the first timeslice might be recovered by
1362 * the parent if the child exits early enough. 1363 * the parent if the child exits early enough.
1363 */ 1364 */
1364 p->first_time_slice = 1; 1365 p->first_time_slice = 1;
1365 current->time_slice >>= 1; 1366 current->time_slice >>= 1;
1366 p->timestamp = sched_clock(); 1367 p->timestamp = sched_clock();
1367 if (unlikely(!current->time_slice)) { 1368 if (unlikely(!current->time_slice)) {
1368 /* 1369 /*
1369 * This case is rare, it happens when the parent has only 1370 * This case is rare, it happens when the parent has only
1370 * a single jiffy left from its timeslice. Taking the 1371 * a single jiffy left from its timeslice. Taking the
1371 * runqueue lock is not a problem. 1372 * runqueue lock is not a problem.
1372 */ 1373 */
1373 current->time_slice = 1; 1374 current->time_slice = 1;
1374 scheduler_tick(); 1375 scheduler_tick();
1375 } 1376 }
1376 local_irq_enable(); 1377 local_irq_enable();
1377 put_cpu(); 1378 put_cpu();
1378 } 1379 }
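
The timeslice split in sched_fork() gives the child the rounded-up half of the parent's remaining slice and leaves the parent the rounded-down half, so no ticks are invented; when the parent ends up with nothing, it is granted a single jiffy that is immediately charged via scheduler_tick(). A few lines of arithmetic, with made-up tick counts, showing the split:

#include <stdio.h>

/* Models sched_fork()'s parent/child split for a few made-up timeslices. */
int main(void)
{
	int samples[] = { 7, 2, 1 };

	for (int i = 0; i < 3; i++) {
		int parent = samples[i];
		int child = (parent + 1) >> 1;	/* rounded-up half goes to the child */

		parent >>= 1;			/* rounded-down half stays with the parent */
		printf("parent had %d -> child %d, parent %d%s\n",
		       samples[i], child, parent,
		       parent ? "" : " (parent is given 1 tick and scheduler_tick() runs)");
	}
	return 0;
}
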
1379 1380
1380 /* 1381 /*
1381 * wake_up_new_task - wake up a newly created task for the first time. 1382 * wake_up_new_task - wake up a newly created task for the first time.
1382 * 1383 *
1383 * This function will do some initial scheduler statistics housekeeping 1384 * This function will do some initial scheduler statistics housekeeping
1384 * that must be done for every newly created context, then puts the task 1385 * that must be done for every newly created context, then puts the task
1385 * on the runqueue and wakes it. 1386 * on the runqueue and wakes it.
1386 */ 1387 */
1387 void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) 1388 void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1388 { 1389 {
1389 unsigned long flags; 1390 unsigned long flags;
1390 int this_cpu, cpu; 1391 int this_cpu, cpu;
1391 runqueue_t *rq, *this_rq; 1392 runqueue_t *rq, *this_rq;
1392 1393
1393 rq = task_rq_lock(p, &flags); 1394 rq = task_rq_lock(p, &flags);
1394 BUG_ON(p->state != TASK_RUNNING); 1395 BUG_ON(p->state != TASK_RUNNING);
1395 this_cpu = smp_processor_id(); 1396 this_cpu = smp_processor_id();
1396 cpu = task_cpu(p); 1397 cpu = task_cpu(p);
1397 1398
1398 /* 1399 /*
1399 * We decrease the sleep average of forking parents 1400 * We decrease the sleep average of forking parents
1400 * and children as well, to keep max-interactive tasks 1401 * and children as well, to keep max-interactive tasks
1401 * from forking tasks that are max-interactive. The parent 1402 * from forking tasks that are max-interactive. The parent
1402 * (current) is done further down, under its lock. 1403 * (current) is done further down, under its lock.
1403 */ 1404 */
1404 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 1405 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1405 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1406 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1406 1407
1407 p->prio = effective_prio(p); 1408 p->prio = effective_prio(p);
1408 1409
1409 if (likely(cpu == this_cpu)) { 1410 if (likely(cpu == this_cpu)) {
1410 if (!(clone_flags & CLONE_VM)) { 1411 if (!(clone_flags & CLONE_VM)) {
1411 /* 1412 /*
1412 * The VM isn't cloned, so we're in a good position to 1413 * The VM isn't cloned, so we're in a good position to
1413 * do child-runs-first in anticipation of an exec. This 1414 * do child-runs-first in anticipation of an exec. This
1414 * usually avoids a lot of COW overhead. 1415 * usually avoids a lot of COW overhead.
1415 */ 1416 */
1416 if (unlikely(!current->array)) 1417 if (unlikely(!current->array))
1417 __activate_task(p, rq); 1418 __activate_task(p, rq);
1418 else { 1419 else {
1419 p->prio = current->prio; 1420 p->prio = current->prio;
1420 list_add_tail(&p->run_list, &current->run_list); 1421 list_add_tail(&p->run_list, &current->run_list);
1421 p->array = current->array; 1422 p->array = current->array;
1422 p->array->nr_active++; 1423 p->array->nr_active++;
1423 rq->nr_running++; 1424 rq->nr_running++;
1424 } 1425 }
1425 set_need_resched(); 1426 set_need_resched();
1426 } else 1427 } else
1427 /* Run child last */ 1428 /* Run child last */
1428 __activate_task(p, rq); 1429 __activate_task(p, rq);
1429 /* 1430 /*
1430 * We skip the following code due to cpu == this_cpu 1431 * We skip the following code due to cpu == this_cpu
1431 * 1432 *
1432 * task_rq_unlock(rq, &flags); 1433 * task_rq_unlock(rq, &flags);
1433 * this_rq = task_rq_lock(current, &flags); 1434 * this_rq = task_rq_lock(current, &flags);
1434 */ 1435 */
1435 this_rq = rq; 1436 this_rq = rq;
1436 } else { 1437 } else {
1437 this_rq = cpu_rq(this_cpu); 1438 this_rq = cpu_rq(this_cpu);
1438 1439
1439 /* 1440 /*
1440 * Not the local CPU - must adjust timestamp. This should 1441 * Not the local CPU - must adjust timestamp. This should
1441 * get optimised away in the !CONFIG_SMP case. 1442 * get optimised away in the !CONFIG_SMP case.
1442 */ 1443 */
1443 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1444 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
1444 + rq->timestamp_last_tick; 1445 + rq->timestamp_last_tick;
1445 __activate_task(p, rq); 1446 __activate_task(p, rq);
1446 if (TASK_PREEMPTS_CURR(p, rq)) 1447 if (TASK_PREEMPTS_CURR(p, rq))
1447 resched_task(rq->curr); 1448 resched_task(rq->curr);
1448 1449
1449 /* 1450 /*
1450 * Parent and child are on different CPUs, now get the 1451 * Parent and child are on different CPUs, now get the
1451 * parent runqueue to update the parent's ->sleep_avg: 1452 * parent runqueue to update the parent's ->sleep_avg:
1452 */ 1453 */
1453 task_rq_unlock(rq, &flags); 1454 task_rq_unlock(rq, &flags);
1454 this_rq = task_rq_lock(current, &flags); 1455 this_rq = task_rq_lock(current, &flags);
1455 } 1456 }
1456 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1457 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1457 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1458 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1458 task_rq_unlock(this_rq, &flags); 1459 task_rq_unlock(this_rq, &flags);
1459 } 1460 }
1460 1461
1461 /* 1462 /*
1462 * Potentially available exiting-child timeslices are 1463 * Potentially available exiting-child timeslices are
1463 * retrieved here - this way the parent does not get 1464 * retrieved here - this way the parent does not get
1464 * penalized for creating too many threads. 1465 * penalized for creating too many threads.
1465 * 1466 *
1466 * (this cannot be used to 'generate' timeslices 1467 * (this cannot be used to 'generate' timeslices
1467 * artificially, because any timeslice recovered here 1468 * artificially, because any timeslice recovered here
1468 * was given away by the parent in the first place.) 1469 * was given away by the parent in the first place.)
1469 */ 1470 */
1470 void fastcall sched_exit(task_t *p) 1471 void fastcall sched_exit(task_t *p)
1471 { 1472 {
1472 unsigned long flags; 1473 unsigned long flags;
1473 runqueue_t *rq; 1474 runqueue_t *rq;
1474 1475
1475 /* 1476 /*
1476 * If the child was a (relative-) CPU hog then decrease 1477 * If the child was a (relative-) CPU hog then decrease
1477 * the sleep_avg of the parent as well. 1478 * the sleep_avg of the parent as well.
1478 */ 1479 */
1479 rq = task_rq_lock(p->parent, &flags); 1480 rq = task_rq_lock(p->parent, &flags);
1480 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { 1481 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1481 p->parent->time_slice += p->time_slice; 1482 p->parent->time_slice += p->time_slice;
1482 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1483 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1483 p->parent->time_slice = task_timeslice(p); 1484 p->parent->time_slice = task_timeslice(p);
1484 } 1485 }
1485 if (p->sleep_avg < p->parent->sleep_avg) 1486 if (p->sleep_avg < p->parent->sleep_avg)
1486 p->parent->sleep_avg = p->parent->sleep_avg / 1487 p->parent->sleep_avg = p->parent->sleep_avg /
1487 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / 1488 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1488 (EXIT_WEIGHT + 1); 1489 (EXIT_WEIGHT + 1);
1489 task_rq_unlock(rq, &flags); 1490 task_rq_unlock(rq, &flags);
1490 } 1491 }
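
When the exiting child looks like a CPU hog (its sleep_avg is below the parent's), sched_exit() drags the parent's interactivity average toward the child's with an EXIT_WEIGHT:1 weighted mean. A quick numeric illustration; EXIT_WEIGHT = 3 is assumed here, since the macro is defined elsewhere in this file, and the sleep_avg figures are arbitrary:

#include <stdio.h>

#define EXIT_WEIGHT 3	/* assumed value; the real constant is defined earlier in sched.c */

int main(void)
{
	unsigned long parent = 800, child = 200;	/* arbitrary sleep_avg figures */

	if (child < parent)
		parent = parent / (EXIT_WEIGHT + 1) * EXIT_WEIGHT +
			 child / (EXIT_WEIGHT + 1);

	/* 800 -> 650: three parts old parent average, one part hoggish child */
	printf("parent sleep_avg after child exit: %lu\n", parent);
	return 0;
}
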
1491 1492
1492 /** 1493 /**
1493 * prepare_task_switch - prepare to switch tasks 1494 * prepare_task_switch - prepare to switch tasks
1494 * @rq: the runqueue preparing to switch 1495 * @rq: the runqueue preparing to switch
1495 * @next: the task we are going to switch to. 1496 * @next: the task we are going to switch to.
1496 * 1497 *
1497 * This is called with the rq lock held and interrupts off. It must 1498 * This is called with the rq lock held and interrupts off. It must
1498 * be paired with a subsequent finish_task_switch after the context 1499 * be paired with a subsequent finish_task_switch after the context
1499 * switch. 1500 * switch.
1500 * 1501 *
1501 * prepare_task_switch sets up locking and calls architecture specific 1502 * prepare_task_switch sets up locking and calls architecture specific
1502 * hooks. 1503 * hooks.
1503 */ 1504 */
1504 static inline void prepare_task_switch(runqueue_t *rq, task_t *next) 1505 static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1505 { 1506 {
1506 prepare_lock_switch(rq, next); 1507 prepare_lock_switch(rq, next);
1507 prepare_arch_switch(next); 1508 prepare_arch_switch(next);
1508 } 1509 }
1509 1510
1510 /** 1511 /**
1511 * finish_task_switch - clean up after a task-switch 1512 * finish_task_switch - clean up after a task-switch
1512 * @rq: runqueue associated with task-switch 1513 * @rq: runqueue associated with task-switch
1513 * @prev: the thread we just switched away from. 1514 * @prev: the thread we just switched away from.
1514 * 1515 *
1515 * finish_task_switch must be called after the context switch, paired 1516 * finish_task_switch must be called after the context switch, paired
1516 * with a prepare_task_switch call before the context switch. 1517 * with a prepare_task_switch call before the context switch.
1517 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1518 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1518 * and do any other architecture-specific cleanup actions. 1519 * and do any other architecture-specific cleanup actions.
1519 * 1520 *
1520 * Note that we may have delayed dropping an mm in context_switch(). If 1521 * Note that we may have delayed dropping an mm in context_switch(). If
1521 * so, we finish that here outside of the runqueue lock. (Doing it 1522 * so, we finish that here outside of the runqueue lock. (Doing it
1522 * with the lock held can cause deadlocks; see schedule() for 1523 * with the lock held can cause deadlocks; see schedule() for
1523 * details.) 1524 * details.)
1524 */ 1525 */
1525 static inline void finish_task_switch(runqueue_t *rq, task_t *prev) 1526 static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1526 __releases(rq->lock) 1527 __releases(rq->lock)
1527 { 1528 {
1528 struct mm_struct *mm = rq->prev_mm; 1529 struct mm_struct *mm = rq->prev_mm;
1529 unsigned long prev_task_flags; 1530 unsigned long prev_task_flags;
1530 1531
1531 rq->prev_mm = NULL; 1532 rq->prev_mm = NULL;
1532 1533
1533 /* 1534 /*
1534 * A task struct has one reference for the use as "current". 1535 * A task struct has one reference for the use as "current".
1535 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1536 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
1536 * calls schedule one last time. The schedule call will never return, 1537 * calls schedule one last time. The schedule call will never return,
1537 * and the scheduled task must drop that reference. 1538 * and the scheduled task must drop that reference.
1538 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1539 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
1539 * still held, otherwise prev could be scheduled on another cpu, die 1540 * still held, otherwise prev could be scheduled on another cpu, die
1540 * there before we look at prev->state, and then the reference would 1541 * there before we look at prev->state, and then the reference would
1541 * be dropped twice. 1542 * be dropped twice.
1542 * Manfred Spraul <manfred@colorfullife.com> 1543 * Manfred Spraul <manfred@colorfullife.com>
1543 */ 1544 */
1544 prev_task_flags = prev->flags; 1545 prev_task_flags = prev->flags;
1545 finish_arch_switch(prev); 1546 finish_arch_switch(prev);
1546 finish_lock_switch(rq, prev); 1547 finish_lock_switch(rq, prev);
1547 if (mm) 1548 if (mm)
1548 mmdrop(mm); 1549 mmdrop(mm);
1549 if (unlikely(prev_task_flags & PF_DEAD)) 1550 if (unlikely(prev_task_flags & PF_DEAD)) {
1551 /*
1552 * Remove function-return probe instances associated with this
1553 * task and put them back on the free list.
1554 */
1555 kprobe_flush_task(prev);
1550 put_task_struct(prev); 1556 put_task_struct(prev);
1557 }
1551 } 1558 }
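
This hunk is the scheduler half of the fix described in the commit message: once a PF_DEAD task is switched away from for the last time, kprobe_flush_task() returns any function-return probe instances still charged to that task to the kretprobe free list before the final task reference is dropped, so an exiting task that never returns from a probed function (schedule() being the motivating case) cannot leak instances. A rough userspace model of that flush-on-final-switch idea; the type names and list handling below are illustrative only, not the actual kprobes implementation:

#include <stdio.h>

/* Illustrative stand-ins only; these are not the real kernel types. */
struct task { int pid; };

struct ri {				/* models one kretprobe instance */
	struct task *owner;
	struct ri *next;
};

static struct ri *free_list;		/* models the kretprobe free list */
static struct ri *used_list;		/* instances handed out at function entry */

static void flush_task(struct task *tk)
{
	struct ri **pp = &used_list;

	/* recycle every in-use instance still owned by the dead task */
	while (*pp) {
		struct ri *ri = *pp;

		if (ri->owner == tk) {
			*pp = ri->next;
			ri->owner = NULL;
			ri->next = free_list;
			free_list = ri;
		} else {
			pp = &ri->next;
		}
	}
}

int main(void)
{
	struct task dead = { .pid = 42 };
	struct ri a = { .owner = &dead, .next = NULL };

	used_list = &a;			/* grabbed on entry, never returned */
	flush_task(&dead);		/* what finish_task_switch() now triggers */
	printf("instance recycled: %s\n", free_list == &a ? "yes" : "no");
	return 0;
}

Note that in the hunk above the flush runs before put_task_struct(), while prev is still guaranteed to be a valid task pointer on this CPU.
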
1552 1559
1553 /** 1560 /**
1554 * schedule_tail - first thing a freshly forked thread must call. 1561 * schedule_tail - first thing a freshly forked thread must call.
1555 * @prev: the thread we just switched away from. 1562 * @prev: the thread we just switched away from.
1556 */ 1563 */
1557 asmlinkage void schedule_tail(task_t *prev) 1564 asmlinkage void schedule_tail(task_t *prev)
1558 __releases(rq->lock) 1565 __releases(rq->lock)
1559 { 1566 {
1560 runqueue_t *rq = this_rq(); 1567 runqueue_t *rq = this_rq();
1561 finish_task_switch(rq, prev); 1568 finish_task_switch(rq, prev);
1562 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1569 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1563 /* In this case, finish_task_switch does not reenable preemption */ 1570 /* In this case, finish_task_switch does not reenable preemption */
1564 preempt_enable(); 1571 preempt_enable();
1565 #endif 1572 #endif
1566 if (current->set_child_tid) 1573 if (current->set_child_tid)
1567 put_user(current->pid, current->set_child_tid); 1574 put_user(current->pid, current->set_child_tid);
1568 } 1575 }
1569 1576
1570 /* 1577 /*
1571 * context_switch - switch to the new MM and the new 1578 * context_switch - switch to the new MM and the new
1572 * thread's register state. 1579 * thread's register state.
1573 */ 1580 */
1574 static inline 1581 static inline
1575 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1582 task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1576 { 1583 {
1577 struct mm_struct *mm = next->mm; 1584 struct mm_struct *mm = next->mm;
1578 struct mm_struct *oldmm = prev->active_mm; 1585 struct mm_struct *oldmm = prev->active_mm;
1579 1586
1580 if (unlikely(!mm)) { 1587 if (unlikely(!mm)) {
1581 next->active_mm = oldmm; 1588 next->active_mm = oldmm;
1582 atomic_inc(&oldmm->mm_count); 1589 atomic_inc(&oldmm->mm_count);
1583 enter_lazy_tlb(oldmm, next); 1590 enter_lazy_tlb(oldmm, next);
1584 } else 1591 } else
1585 switch_mm(oldmm, mm, next); 1592 switch_mm(oldmm, mm, next);
1586 1593
1587 if (unlikely(!prev->mm)) { 1594 if (unlikely(!prev->mm)) {
1588 prev->active_mm = NULL; 1595 prev->active_mm = NULL;
1589 WARN_ON(rq->prev_mm); 1596 WARN_ON(rq->prev_mm);
1590 rq->prev_mm = oldmm; 1597 rq->prev_mm = oldmm;
1591 } 1598 }
1592 1599
1593 /* Here we just switch the register state and the stack. */ 1600 /* Here we just switch the register state and the stack. */
1594 switch_to(prev, next, prev); 1601 switch_to(prev, next, prev);
1595 1602
1596 return prev; 1603 return prev;
1597 } 1604 }
1598 1605
1599 /* 1606 /*
1600 * nr_running, nr_uninterruptible and nr_context_switches: 1607 * nr_running, nr_uninterruptible and nr_context_switches:
1601 * 1608 *
1602 * externally visible scheduler statistics: current number of runnable 1609 * externally visible scheduler statistics: current number of runnable
1603 * threads, current number of uninterruptible-sleeping threads, total 1610 * threads, current number of uninterruptible-sleeping threads, total
1604 * number of context switches performed since bootup. 1611 * number of context switches performed since bootup.
1605 */ 1612 */
1606 unsigned long nr_running(void) 1613 unsigned long nr_running(void)
1607 { 1614 {
1608 unsigned long i, sum = 0; 1615 unsigned long i, sum = 0;
1609 1616
1610 for_each_online_cpu(i) 1617 for_each_online_cpu(i)
1611 sum += cpu_rq(i)->nr_running; 1618 sum += cpu_rq(i)->nr_running;
1612 1619
1613 return sum; 1620 return sum;
1614 } 1621 }
1615 1622
1616 unsigned long nr_uninterruptible(void) 1623 unsigned long nr_uninterruptible(void)
1617 { 1624 {
1618 unsigned long i, sum = 0; 1625 unsigned long i, sum = 0;
1619 1626
1620 for_each_cpu(i) 1627 for_each_cpu(i)
1621 sum += cpu_rq(i)->nr_uninterruptible; 1628 sum += cpu_rq(i)->nr_uninterruptible;
1622 1629
1623 /* 1630 /*
1624 * Since we read the counters lockless, it might be slightly 1631 * Since we read the counters lockless, it might be slightly
1625 * inaccurate. Do not allow it to go below zero though: 1632 * inaccurate. Do not allow it to go below zero though:
1626 */ 1633 */
1627 if (unlikely((long)sum < 0)) 1634 if (unlikely((long)sum < 0))
1628 sum = 0; 1635 sum = 0;
1629 1636
1630 return sum; 1637 return sum;
1631 } 1638 }
1632 1639
1633 unsigned long long nr_context_switches(void) 1640 unsigned long long nr_context_switches(void)
1634 { 1641 {
1635 unsigned long long i, sum = 0; 1642 unsigned long long i, sum = 0;
1636 1643
1637 for_each_cpu(i) 1644 for_each_cpu(i)
1638 sum += cpu_rq(i)->nr_switches; 1645 sum += cpu_rq(i)->nr_switches;
1639 1646
1640 return sum; 1647 return sum;
1641 } 1648 }
1642 1649
1643 unsigned long nr_iowait(void) 1650 unsigned long nr_iowait(void)
1644 { 1651 {
1645 unsigned long i, sum = 0; 1652 unsigned long i, sum = 0;
1646 1653
1647 for_each_cpu(i) 1654 for_each_cpu(i)
1648 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1655 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1649 1656
1650 return sum; 1657 return sum;
1651 } 1658 }
1652 1659
1653 #ifdef CONFIG_SMP 1660 #ifdef CONFIG_SMP
1654 1661
1655 /* 1662 /*
1656 * double_rq_lock - safely lock two runqueues 1663 * double_rq_lock - safely lock two runqueues
1657 * 1664 *
1658 * We must take them in cpu order to match code in 1665 * We must take them in cpu order to match code in
1659 * dependent_sleeper and wake_dependent_sleeper. 1666 * dependent_sleeper and wake_dependent_sleeper.
1660 * 1667 *
1661 * Note this does not disable interrupts like task_rq_lock, 1668 * Note this does not disable interrupts like task_rq_lock,
1662 * you need to do so manually before calling. 1669 * you need to do so manually before calling.
1663 */ 1670 */
1664 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1671 static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1665 __acquires(rq1->lock) 1672 __acquires(rq1->lock)
1666 __acquires(rq2->lock) 1673 __acquires(rq2->lock)
1667 { 1674 {
1668 if (rq1 == rq2) { 1675 if (rq1 == rq2) {
1669 spin_lock(&rq1->lock); 1676 spin_lock(&rq1->lock);
1670 __acquire(rq2->lock); /* Fake it out ;) */ 1677 __acquire(rq2->lock); /* Fake it out ;) */
1671 } else { 1678 } else {
1672 if (rq1->cpu < rq2->cpu) { 1679 if (rq1->cpu < rq2->cpu) {
1673 spin_lock(&rq1->lock); 1680 spin_lock(&rq1->lock);
1674 spin_lock(&rq2->lock); 1681 spin_lock(&rq2->lock);
1675 } else { 1682 } else {
1676 spin_lock(&rq2->lock); 1683 spin_lock(&rq2->lock);
1677 spin_lock(&rq1->lock); 1684 spin_lock(&rq1->lock);
1678 } 1685 }
1679 } 1686 }
1680 } 1687 }
1681 1688
1682 /* 1689 /*
1683 * double_rq_unlock - safely unlock two runqueues 1690 * double_rq_unlock - safely unlock two runqueues
1684 * 1691 *
1685 * Note this does not restore interrupts like task_rq_unlock, 1692 * Note this does not restore interrupts like task_rq_unlock,
1686 * you need to do so manually after calling. 1693 * you need to do so manually after calling.
1687 */ 1694 */
1688 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1695 static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1689 __releases(rq1->lock) 1696 __releases(rq1->lock)
1690 __releases(rq2->lock) 1697 __releases(rq2->lock)
1691 { 1698 {
1692 spin_unlock(&rq1->lock); 1699 spin_unlock(&rq1->lock);
1693 if (rq1 != rq2) 1700 if (rq1 != rq2)
1694 spin_unlock(&rq2->lock); 1701 spin_unlock(&rq2->lock);
1695 else 1702 else
1696 __release(rq2->lock); 1703 __release(rq2->lock);
1697 } 1704 }
1698 1705
1699 /* 1706 /*
1700 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1707 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1701 */ 1708 */
1702 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1709 static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1703 __releases(this_rq->lock) 1710 __releases(this_rq->lock)
1704 __acquires(busiest->lock) 1711 __acquires(busiest->lock)
1705 __acquires(this_rq->lock) 1712 __acquires(this_rq->lock)
1706 { 1713 {
1707 if (unlikely(!spin_trylock(&busiest->lock))) { 1714 if (unlikely(!spin_trylock(&busiest->lock))) {
1708 if (busiest->cpu < this_rq->cpu) { 1715 if (busiest->cpu < this_rq->cpu) {
1709 spin_unlock(&this_rq->lock); 1716 spin_unlock(&this_rq->lock);
1710 spin_lock(&busiest->lock); 1717 spin_lock(&busiest->lock);
1711 spin_lock(&this_rq->lock); 1718 spin_lock(&this_rq->lock);
1712 } else 1719 } else
1713 spin_lock(&busiest->lock); 1720 spin_lock(&busiest->lock);
1714 } 1721 }
1715 } 1722 }
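
double_rq_lock() and double_lock_balance() above avoid an AB/BA deadlock the same way: whenever two runqueue locks are needed, they are taken in ascending cpu order, so two CPUs locking each other's queues can never end up holding one lock apiece and spinning forever. A minimal pthread sketch of that ordering rule; the rq structure and lock pair are invented for the example:

#include <pthread.h>
#include <stdio.h>

/* Invented stand-in for a runqueue: just an index and a lock. */
struct rq {
	int cpu;
	pthread_mutex_t lock;
};

/* Always lock the lower-numbered queue first, as double_rq_lock() does. */
static void double_lock(struct rq *a, struct rq *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if (a->cpu < b->cpu) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock(struct rq *a, struct rq *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct rq rq0 = { 0, PTHREAD_MUTEX_INITIALIZER };
	struct rq rq1 = { 1, PTHREAD_MUTEX_INITIALIZER };

	/* whichever order the caller names them in, rq0 is locked first */
	double_lock(&rq1, &rq0);
	puts("both runqueue locks held, in cpu order");
	double_unlock(&rq1, &rq0);
	return 0;
}

The same ascending-cpu rule is why double_lock_balance() drops this_rq->lock and reacquires both locks when its trylock fails against a lower-numbered busiest queue.
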
1716 1723
1717 /* 1724 /*
1718 * If dest_cpu is allowed for this process, migrate the task to it. 1725 * If dest_cpu is allowed for this process, migrate the task to it.
1719 * This is accomplished by forcing the cpu_allowed mask to only 1726 * This is accomplished by forcing the cpu_allowed mask to only
1720 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1727 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1721 * the cpu_allowed mask is restored. 1728 * the cpu_allowed mask is restored.
1722 */ 1729 */
1723 static void sched_migrate_task(task_t *p, int dest_cpu) 1730 static void sched_migrate_task(task_t *p, int dest_cpu)
1724 { 1731 {
1725 migration_req_t req; 1732 migration_req_t req;
1726 runqueue_t *rq; 1733 runqueue_t *rq;
1727 unsigned long flags; 1734 unsigned long flags;
1728 1735
1729 rq = task_rq_lock(p, &flags); 1736 rq = task_rq_lock(p, &flags);
1730 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1737 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1731 || unlikely(cpu_is_offline(dest_cpu))) 1738 || unlikely(cpu_is_offline(dest_cpu)))
1732 goto out; 1739 goto out;
1733 1740
1734 /* force the process onto the specified CPU */ 1741 /* force the process onto the specified CPU */
1735 if (migrate_task(p, dest_cpu, &req)) { 1742 if (migrate_task(p, dest_cpu, &req)) {
1736 /* Need to wait for migration thread (might exit: take ref). */ 1743 /* Need to wait for migration thread (might exit: take ref). */
1737 struct task_struct *mt = rq->migration_thread; 1744 struct task_struct *mt = rq->migration_thread;
1738 get_task_struct(mt); 1745 get_task_struct(mt);
1739 task_rq_unlock(rq, &flags); 1746 task_rq_unlock(rq, &flags);
1740 wake_up_process(mt); 1747 wake_up_process(mt);
1741 put_task_struct(mt); 1748 put_task_struct(mt);
1742 wait_for_completion(&req.done); 1749 wait_for_completion(&req.done);
1743 return; 1750 return;
1744 } 1751 }
1745 out: 1752 out:
1746 task_rq_unlock(rq, &flags); 1753 task_rq_unlock(rq, &flags);
1747 } 1754 }
1748 1755
1749 /* 1756 /*
1750 * sched_exec - execve() is a valuable balancing opportunity, because at 1757 * sched_exec - execve() is a valuable balancing opportunity, because at
1751 * this point the task has the smallest effective memory and cache footprint. 1758 * this point the task has the smallest effective memory and cache footprint.
1752 */ 1759 */
1753 void sched_exec(void) 1760 void sched_exec(void)
1754 { 1761 {
1755 int new_cpu, this_cpu = get_cpu(); 1762 int new_cpu, this_cpu = get_cpu();
1756 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 1763 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1757 put_cpu(); 1764 put_cpu();
1758 if (new_cpu != this_cpu) 1765 if (new_cpu != this_cpu)
1759 sched_migrate_task(current, new_cpu); 1766 sched_migrate_task(current, new_cpu);
1760 } 1767 }
1761 1768
1762 /* 1769 /*
1763 * pull_task - move a task from a remote runqueue to the local runqueue. 1770 * pull_task - move a task from a remote runqueue to the local runqueue.
1764 * Both runqueues must be locked. 1771 * Both runqueues must be locked.
1765 */ 1772 */
1766 static 1773 static
1767 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 1774 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1768 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1775 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1769 { 1776 {
1770 dequeue_task(p, src_array); 1777 dequeue_task(p, src_array);
1771 src_rq->nr_running--; 1778 src_rq->nr_running--;
1772 set_task_cpu(p, this_cpu); 1779 set_task_cpu(p, this_cpu);
1773 this_rq->nr_running++; 1780 this_rq->nr_running++;
1774 enqueue_task(p, this_array); 1781 enqueue_task(p, this_array);
1775 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1782 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1776 + this_rq->timestamp_last_tick; 1783 + this_rq->timestamp_last_tick;
1777 /* 1784 /*
1778 * Note that idle threads have a prio of MAX_PRIO, for this test 1785 * Note that idle threads have a prio of MAX_PRIO, for this test
1779 * to be always true for them. 1786 * to be always true for them.
1780 */ 1787 */
1781 if (TASK_PREEMPTS_CURR(p, this_rq)) 1788 if (TASK_PREEMPTS_CURR(p, this_rq))
1782 resched_task(this_rq->curr); 1789 resched_task(this_rq->curr);
1783 } 1790 }
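
pull_task() (like wake_up_new_task() earlier) rebases the task's timestamp when it changes runqueues, because each runqueue carries its own timestamp_last_tick reference clock; the task keeps the same age relative to the new queue's last tick as it had relative to the old one's. A two-line illustration with invented clock values:

#include <stdio.h>

int main(void)
{
	/* invented nanosecond clocks for source/destination runqueues and one task */
	long long src_last_tick = 1000000, dst_last_tick = 1003000;
	long long timestamp = 999000;	/* task last ran 1000ns before src's last tick */

	timestamp = (timestamp - src_last_tick) + dst_last_tick;

	/* prints 1002000: still 1000ns before the destination's last tick */
	printf("rebased timestamp: %lld\n", timestamp);
	return 0;
}
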
1784 1791
1785 /* 1792 /*
1786 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 1793 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1787 */ 1794 */
1788 static 1795 static
1789 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1796 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1790 struct sched_domain *sd, enum idle_type idle, 1797 struct sched_domain *sd, enum idle_type idle,
1791 int *all_pinned) 1798 int *all_pinned)
1792 { 1799 {
1793 /* 1800 /*
1794 * We do not migrate tasks that are: 1801 * We do not migrate tasks that are:
1795 * 1) running (obviously), or 1802 * 1) running (obviously), or
1796 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1803 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1797 * 3) are cache-hot on their current CPU. 1804 * 3) are cache-hot on their current CPU.
1798 */ 1805 */
1799 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1806 if (!cpu_isset(this_cpu, p->cpus_allowed))
1800 return 0; 1807 return 0;
1801 *all_pinned = 0; 1808 *all_pinned = 0;
1802 1809
1803 if (task_running(rq, p)) 1810 if (task_running(rq, p))
1804 return 0; 1811 return 0;
1805 1812
1806 /* 1813 /*
1807 * Aggressive migration if: 1814 * Aggressive migration if:
1808 * 1) task is cache cold, or 1815 * 1) task is cache cold, or
1809 * 2) too many balance attempts have failed. 1816 * 2) too many balance attempts have failed.
1810 */ 1817 */
1811 1818
1812 if (sd->nr_balance_failed > sd->cache_nice_tries) 1819 if (sd->nr_balance_failed > sd->cache_nice_tries)
1813 return 1; 1820 return 1;
1814 1821
1815 if (task_hot(p, rq->timestamp_last_tick, sd)) 1822 if (task_hot(p, rq->timestamp_last_tick, sd))
1816 return 0; 1823 return 0;
1817 return 1; 1824 return 1;
1818 } 1825 }
1819 1826
1820 /* 1827 /*
1821 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 1828 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
1822 * as part of a balancing operation within "domain". Returns the number of 1829 * as part of a balancing operation within "domain". Returns the number of
1823 * tasks moved. 1830 * tasks moved.
1824 * 1831 *
1825 * Called with both runqueues locked. 1832 * Called with both runqueues locked.
1826 */ 1833 */
1827 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1834 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1828 unsigned long max_nr_move, struct sched_domain *sd, 1835 unsigned long max_nr_move, struct sched_domain *sd,
1829 enum idle_type idle, int *all_pinned) 1836 enum idle_type idle, int *all_pinned)
1830 { 1837 {
1831 prio_array_t *array, *dst_array; 1838 prio_array_t *array, *dst_array;
1832 struct list_head *head, *curr; 1839 struct list_head *head, *curr;
1833 int idx, pulled = 0, pinned = 0; 1840 int idx, pulled = 0, pinned = 0;
1834 task_t *tmp; 1841 task_t *tmp;
1835 1842
1836 if (max_nr_move == 0) 1843 if (max_nr_move == 0)
1837 goto out; 1844 goto out;
1838 1845
1839 pinned = 1; 1846 pinned = 1;
1840 1847
1841 /* 1848 /*
1842 * We first consider expired tasks. Those will likely not be 1849 * We first consider expired tasks. Those will likely not be
1843 * executed in the near future, and they are most likely to 1850 * executed in the near future, and they are most likely to
1844 * be cache-cold, thus switching CPUs has the least effect 1851 * be cache-cold, thus switching CPUs has the least effect
1845 * on them. 1852 * on them.
1846 */ 1853 */
1847 if (busiest->expired->nr_active) { 1854 if (busiest->expired->nr_active) {
1848 array = busiest->expired; 1855 array = busiest->expired;
1849 dst_array = this_rq->expired; 1856 dst_array = this_rq->expired;
1850 } else { 1857 } else {
1851 array = busiest->active; 1858 array = busiest->active;
1852 dst_array = this_rq->active; 1859 dst_array = this_rq->active;
1853 } 1860 }
1854 1861
1855 new_array: 1862 new_array:
1856 /* Start searching at priority 0: */ 1863 /* Start searching at priority 0: */
1857 idx = 0; 1864 idx = 0;
1858 skip_bitmap: 1865 skip_bitmap:
1859 if (!idx) 1866 if (!idx)
1860 idx = sched_find_first_bit(array->bitmap); 1867 idx = sched_find_first_bit(array->bitmap);
1861 else 1868 else
1862 idx = find_next_bit(array->bitmap, MAX_PRIO, idx); 1869 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
1863 if (idx >= MAX_PRIO) { 1870 if (idx >= MAX_PRIO) {
1864 if (array == busiest->expired && busiest->active->nr_active) { 1871 if (array == busiest->expired && busiest->active->nr_active) {
1865 array = busiest->active; 1872 array = busiest->active;
1866 dst_array = this_rq->active; 1873 dst_array = this_rq->active;
1867 goto new_array; 1874 goto new_array;
1868 } 1875 }
1869 goto out; 1876 goto out;
1870 } 1877 }
1871 1878
1872 head = array->queue + idx; 1879 head = array->queue + idx;
1873 curr = head->prev; 1880 curr = head->prev;
1874 skip_queue: 1881 skip_queue:
1875 tmp = list_entry(curr, task_t, run_list); 1882 tmp = list_entry(curr, task_t, run_list);
1876 1883
1877 curr = curr->prev; 1884 curr = curr->prev;
1878 1885
1879 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 1886 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1880 if (curr != head) 1887 if (curr != head)
1881 goto skip_queue; 1888 goto skip_queue;
1882 idx++; 1889 idx++;
1883 goto skip_bitmap; 1890 goto skip_bitmap;
1884 } 1891 }
1885 1892
1886 #ifdef CONFIG_SCHEDSTATS 1893 #ifdef CONFIG_SCHEDSTATS
1887 if (task_hot(tmp, busiest->timestamp_last_tick, sd)) 1894 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
1888 schedstat_inc(sd, lb_hot_gained[idle]); 1895 schedstat_inc(sd, lb_hot_gained[idle]);
1889 #endif 1896 #endif
1890 1897
1891 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 1898 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1892 pulled++; 1899 pulled++;
1893 1900
1894 /* We only want to steal up to the prescribed number of tasks. */ 1901 /* We only want to steal up to the prescribed number of tasks. */
1895 if (pulled < max_nr_move) { 1902 if (pulled < max_nr_move) {
1896 if (curr != head) 1903 if (curr != head)
1897 goto skip_queue; 1904 goto skip_queue;
1898 idx++; 1905 idx++;
1899 goto skip_bitmap; 1906 goto skip_bitmap;
1900 } 1907 }
1901 out: 1908 out:
1902 /* 1909 /*
1903 * Right now, this is the only place pull_task() is called, 1910 * Right now, this is the only place pull_task() is called,
1904 * so we can safely collect pull_task() stats here rather than 1911 * so we can safely collect pull_task() stats here rather than
1905 * inside pull_task(). 1912 * inside pull_task().
1906 */ 1913 */
1907 schedstat_add(sd, lb_gained[idle], pulled); 1914 schedstat_add(sd, lb_gained[idle], pulled);
1908 1915
1909 if (all_pinned) 1916 if (all_pinned)
1910 *all_pinned = pinned; 1917 *all_pinned = pinned;
1911 return pulled; 1918 return pulled;
1912 } 1919 }
1913 1920
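For readers unfamiliar with the O(1) scheduler's per-priority queues, here is a stand-alone sketch (illustration only, not part of this diff) of the scan order move_tasks() uses above: drain the expired array first, then the active one, lowest priority index first, stopping at max_nr_move. The toy_array type and the linear find_next_nonempty() helper are hypothetical stand-ins for the real prio_array_t and its bitmap search.

#include <stdio.h>

#define MAX_PRIO 140

struct toy_array {
        const char *name;
        int nr_queued[MAX_PRIO];        /* tasks queued at each priority */
};

/* linear scan standing in for sched_find_first_bit()/find_next_bit() */
static int find_next_nonempty(struct toy_array *a, int idx)
{
        while (idx < MAX_PRIO && !a->nr_queued[idx])
                idx++;
        return idx;
}

int main(void)
{
        struct toy_array expired = { "expired", { 0 } };
        struct toy_array active  = { "active",  { 0 } };
        struct toy_array *order[2] = { &expired, &active };
        int max_nr_move = 3, pulled = 0;
        int i, idx;

        expired.nr_queued[120] = 2;     /* two cache-cold tasks at prio 120 */
        active.nr_queued[110] = 1;
        active.nr_queued[130] = 4;

        for (i = 0; i < 2 && pulled < max_nr_move; i++) {
                for (idx = find_next_nonempty(order[i], 0);
                     idx < MAX_PRIO && pulled < max_nr_move;
                     idx = find_next_nonempty(order[i], idx)) {
                        order[i]->nr_queued[idx]--;
                        pulled++;
                        printf("pulled prio %d from %s array\n",
                               idx, order[i]->name);
                }
        }
        printf("moved %d task(s)\n", pulled);
        return 0;
}

The expired tasks go first for the cache-coldness reason given in the comment above; the active array is only touched once the expired one runs dry.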
1914 /* 1921 /*
1915 * find_busiest_group finds and returns the busiest CPU group within the 1922 * find_busiest_group finds and returns the busiest CPU group within the
1916 * domain. It calculates and returns the number of tasks which should be 1923 * domain. It calculates and returns the number of tasks which should be
1917 * moved to restore balance via the imbalance parameter. 1924 * moved to restore balance via the imbalance parameter.
1918 */ 1925 */
1919 static struct sched_group * 1926 static struct sched_group *
1920 find_busiest_group(struct sched_domain *sd, int this_cpu, 1927 find_busiest_group(struct sched_domain *sd, int this_cpu,
1921 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 1928 unsigned long *imbalance, enum idle_type idle, int *sd_idle)
1922 { 1929 {
1923 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1930 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1924 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1931 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1925 unsigned long max_pull; 1932 unsigned long max_pull;
1926 int load_idx; 1933 int load_idx;
1927 1934
1928 max_load = this_load = total_load = total_pwr = 0; 1935 max_load = this_load = total_load = total_pwr = 0;
1929 if (idle == NOT_IDLE) 1936 if (idle == NOT_IDLE)
1930 load_idx = sd->busy_idx; 1937 load_idx = sd->busy_idx;
1931 else if (idle == NEWLY_IDLE) 1938 else if (idle == NEWLY_IDLE)
1932 load_idx = sd->newidle_idx; 1939 load_idx = sd->newidle_idx;
1933 else 1940 else
1934 load_idx = sd->idle_idx; 1941 load_idx = sd->idle_idx;
1935 1942
1936 do { 1943 do {
1937 unsigned long load; 1944 unsigned long load;
1938 int local_group; 1945 int local_group;
1939 int i; 1946 int i;
1940 1947
1941 local_group = cpu_isset(this_cpu, group->cpumask); 1948 local_group = cpu_isset(this_cpu, group->cpumask);
1942 1949
1943 /* Tally up the load of all CPUs in the group */ 1950 /* Tally up the load of all CPUs in the group */
1944 avg_load = 0; 1951 avg_load = 0;
1945 1952
1946 for_each_cpu_mask(i, group->cpumask) { 1953 for_each_cpu_mask(i, group->cpumask) {
1947 if (*sd_idle && !idle_cpu(i)) 1954 if (*sd_idle && !idle_cpu(i))
1948 *sd_idle = 0; 1955 *sd_idle = 0;
1949 1956
1950 /* Bias balancing toward cpus of our domain */ 1957 /* Bias balancing toward cpus of our domain */
1951 if (local_group) 1958 if (local_group)
1952 load = target_load(i, load_idx); 1959 load = target_load(i, load_idx);
1953 else 1960 else
1954 load = source_load(i, load_idx); 1961 load = source_load(i, load_idx);
1955 1962
1956 avg_load += load; 1963 avg_load += load;
1957 } 1964 }
1958 1965
1959 total_load += avg_load; 1966 total_load += avg_load;
1960 total_pwr += group->cpu_power; 1967 total_pwr += group->cpu_power;
1961 1968
1962 /* Adjust by relative CPU power of the group */ 1969 /* Adjust by relative CPU power of the group */
1963 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1970 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1964 1971
1965 if (local_group) { 1972 if (local_group) {
1966 this_load = avg_load; 1973 this_load = avg_load;
1967 this = group; 1974 this = group;
1968 } else if (avg_load > max_load) { 1975 } else if (avg_load > max_load) {
1969 max_load = avg_load; 1976 max_load = avg_load;
1970 busiest = group; 1977 busiest = group;
1971 } 1978 }
1972 group = group->next; 1979 group = group->next;
1973 } while (group != sd->groups); 1980 } while (group != sd->groups);
1974 1981
1975 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 1982 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
1976 goto out_balanced; 1983 goto out_balanced;
1977 1984
1978 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1985 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
1979 1986
1980 if (this_load >= avg_load || 1987 if (this_load >= avg_load ||
1981 100*max_load <= sd->imbalance_pct*this_load) 1988 100*max_load <= sd->imbalance_pct*this_load)
1982 goto out_balanced; 1989 goto out_balanced;
1983 1990
1984 /* 1991 /*
1985 * We're trying to get all the cpus to the average_load, so we don't 1992 * We're trying to get all the cpus to the average_load, so we don't
1986 * want to push ourselves above the average load, nor do we wish to 1993 * want to push ourselves above the average load, nor do we wish to
1987 * reduce the max loaded cpu below the average load, as either of these 1994 * reduce the max loaded cpu below the average load, as either of these
1988 * actions would just result in more rebalancing later, and ping-pong 1995 * actions would just result in more rebalancing later, and ping-pong
1989 * tasks around. Thus we look for the minimum possible imbalance. 1996 * tasks around. Thus we look for the minimum possible imbalance.
1990 * Negative imbalances (*we* are more loaded than anyone else) will 1997 * Negative imbalances (*we* are more loaded than anyone else) will
1991 * be counted as no imbalance for these purposes -- we can't fix that 1998 * be counted as no imbalance for these purposes -- we can't fix that
1992 * by pulling tasks to us. Be careful of negative numbers as they'll 1999 * by pulling tasks to us. Be careful of negative numbers as they'll
1993 * appear as very large values with unsigned longs. 2000 * appear as very large values with unsigned longs.
1994 */ 2001 */
1995 2002
1996 /* Don't want to pull so many tasks that a group would go idle */ 2003 /* Don't want to pull so many tasks that a group would go idle */
1997 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2004 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
1998 2005
1999 /* How much load to actually move to equalise the imbalance */ 2006 /* How much load to actually move to equalise the imbalance */
2000 *imbalance = min(max_pull * busiest->cpu_power, 2007 *imbalance = min(max_pull * busiest->cpu_power,
2001 (avg_load - this_load) * this->cpu_power) 2008 (avg_load - this_load) * this->cpu_power)
2002 / SCHED_LOAD_SCALE; 2009 / SCHED_LOAD_SCALE;
2003 2010
2004 if (*imbalance < SCHED_LOAD_SCALE) { 2011 if (*imbalance < SCHED_LOAD_SCALE) {
2005 unsigned long pwr_now = 0, pwr_move = 0; 2012 unsigned long pwr_now = 0, pwr_move = 0;
2006 unsigned long tmp; 2013 unsigned long tmp;
2007 2014
2008 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2015 if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
2009 *imbalance = 1; 2016 *imbalance = 1;
2010 return busiest; 2017 return busiest;
2011 } 2018 }
2012 2019
2013 /* 2020 /*
2014 * OK, we don't have enough imbalance to justify moving tasks, 2021 * OK, we don't have enough imbalance to justify moving tasks,
2015 * however we may be able to increase total CPU power used by 2022 * however we may be able to increase total CPU power used by
2016 * moving them. 2023 * moving them.
2017 */ 2024 */
2018 2025
2019 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2026 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
2020 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2027 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
2021 pwr_now /= SCHED_LOAD_SCALE; 2028 pwr_now /= SCHED_LOAD_SCALE;
2022 2029
2023 /* Amount of load we'd subtract */ 2030 /* Amount of load we'd subtract */
2024 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2031 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
2025 if (max_load > tmp) 2032 if (max_load > tmp)
2026 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2033 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
2027 max_load - tmp); 2034 max_load - tmp);
2028 2035
2029 /* Amount of load we'd add */ 2036 /* Amount of load we'd add */
2030 if (max_load*busiest->cpu_power < 2037 if (max_load*busiest->cpu_power <
2031 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2038 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
2032 tmp = max_load*busiest->cpu_power/this->cpu_power; 2039 tmp = max_load*busiest->cpu_power/this->cpu_power;
2033 else 2040 else
2034 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2041 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
2035 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2042 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
2036 pwr_move /= SCHED_LOAD_SCALE; 2043 pwr_move /= SCHED_LOAD_SCALE;
2037 2044
2038 /* Move if we gain throughput */ 2045 /* Move if we gain throughput */
2039 if (pwr_move <= pwr_now) 2046 if (pwr_move <= pwr_now)
2040 goto out_balanced; 2047 goto out_balanced;
2041 2048
2042 *imbalance = 1; 2049 *imbalance = 1;
2043 return busiest; 2050 return busiest;
2044 } 2051 }
2045 2052
2046 /* Get rid of the scaling factor, rounding down as we divide */ 2053 /* Get rid of the scaling factor, rounding down as we divide */
2047 *imbalance = *imbalance / SCHED_LOAD_SCALE; 2054 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2048 return busiest; 2055 return busiest;
2049 2056
2050 out_balanced: 2057 out_balanced:
2051 2058
2052 *imbalance = 0; 2059 *imbalance = 0;
2053 return NULL; 2060 return NULL;
2054 } 2061 }
2055 2062
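To make the imbalance arithmetic above concrete, the stand-alone sketch below (illustration only) works the formula through for two equal-power groups carrying 3 tasks and 1 task respectively. SCHED_LOAD_SCALE is assumed to be 128 and the group loads are made up; only the scaling and min() steps mirror the code above.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL          /* assumed value for the example */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* made-up raw load and cpu_power for a "busiest" and a local group */
        unsigned long busiest_load = 3 * SCHED_LOAD_SCALE, busiest_pwr = SCHED_LOAD_SCALE;
        unsigned long local_load   = 1 * SCHED_LOAD_SCALE, local_pwr   = SCHED_LOAD_SCALE;
        unsigned long total_load = busiest_load + local_load;
        unsigned long total_pwr  = busiest_pwr + local_pwr;

        /* adjust by relative cpu power, as done per group above */
        unsigned long max_load  = busiest_load * SCHED_LOAD_SCALE / busiest_pwr;
        unsigned long this_load = local_load * SCHED_LOAD_SCALE / local_pwr;
        unsigned long avg_load  = SCHED_LOAD_SCALE * total_load / total_pwr;

        /* don't pull the busiest group below the average, or below one task */
        unsigned long max_pull = min_ul(max_load - avg_load,
                                        max_load - SCHED_LOAD_SCALE);

        unsigned long imbalance = min_ul(max_pull * busiest_pwr,
                                         (avg_load - this_load) * local_pwr)
                                        / SCHED_LOAD_SCALE;

        printf("avg_load=%lu max_pull=%lu imbalance=%lu\n",
               avg_load, max_pull, imbalance);
        printf("-> roughly %lu task(s) worth of load to move\n",
               imbalance / SCHED_LOAD_SCALE);
        return 0;
}

With 3 tasks against 1 on equal-power groups this comes out to one task's worth of load, which is exactly the move that evens the two queues out without over-pulling.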
2056 /* 2063 /*
2057 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2064 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2058 */ 2065 */
2059 static runqueue_t *find_busiest_queue(struct sched_group *group, 2066 static runqueue_t *find_busiest_queue(struct sched_group *group,
2060 enum idle_type idle) 2067 enum idle_type idle)
2061 { 2068 {
2062 unsigned long load, max_load = 0; 2069 unsigned long load, max_load = 0;
2063 runqueue_t *busiest = NULL; 2070 runqueue_t *busiest = NULL;
2064 int i; 2071 int i;
2065 2072
2066 for_each_cpu_mask(i, group->cpumask) { 2073 for_each_cpu_mask(i, group->cpumask) {
2067 load = source_load(i, 0); 2074 load = source_load(i, 0);
2068 2075
2069 if (load > max_load) { 2076 if (load > max_load) {
2070 max_load = load; 2077 max_load = load;
2071 busiest = cpu_rq(i); 2078 busiest = cpu_rq(i);
2072 } 2079 }
2073 } 2080 }
2074 2081
2075 return busiest; 2082 return busiest;
2076 } 2083 }
2077 2084
2078 /* 2085 /*
2079 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 2086 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2080 * it does not matter so long as it is large enough. 2087 * it does not matter so long as it is large enough.
2081 */ 2088 */
2082 #define MAX_PINNED_INTERVAL 512 2089 #define MAX_PINNED_INTERVAL 512
2083 2090
2084 /* 2091 /*
2085 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2092 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2086 * tasks if there is an imbalance. 2093 * tasks if there is an imbalance.
2087 * 2094 *
2088 * Called with this_rq unlocked. 2095 * Called with this_rq unlocked.
2089 */ 2096 */
2090 static int load_balance(int this_cpu, runqueue_t *this_rq, 2097 static int load_balance(int this_cpu, runqueue_t *this_rq,
2091 struct sched_domain *sd, enum idle_type idle) 2098 struct sched_domain *sd, enum idle_type idle)
2092 { 2099 {
2093 struct sched_group *group; 2100 struct sched_group *group;
2094 runqueue_t *busiest; 2101 runqueue_t *busiest;
2095 unsigned long imbalance; 2102 unsigned long imbalance;
2096 int nr_moved, all_pinned = 0; 2103 int nr_moved, all_pinned = 0;
2097 int active_balance = 0; 2104 int active_balance = 0;
2098 int sd_idle = 0; 2105 int sd_idle = 0;
2099 2106
2100 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2107 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
2101 sd_idle = 1; 2108 sd_idle = 1;
2102 2109
2103 schedstat_inc(sd, lb_cnt[idle]); 2110 schedstat_inc(sd, lb_cnt[idle]);
2104 2111
2105 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2112 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
2106 if (!group) { 2113 if (!group) {
2107 schedstat_inc(sd, lb_nobusyg[idle]); 2114 schedstat_inc(sd, lb_nobusyg[idle]);
2108 goto out_balanced; 2115 goto out_balanced;
2109 } 2116 }
2110 2117
2111 busiest = find_busiest_queue(group, idle); 2118 busiest = find_busiest_queue(group, idle);
2112 if (!busiest) { 2119 if (!busiest) {
2113 schedstat_inc(sd, lb_nobusyq[idle]); 2120 schedstat_inc(sd, lb_nobusyq[idle]);
2114 goto out_balanced; 2121 goto out_balanced;
2115 } 2122 }
2116 2123
2117 BUG_ON(busiest == this_rq); 2124 BUG_ON(busiest == this_rq);
2118 2125
2119 schedstat_add(sd, lb_imbalance[idle], imbalance); 2126 schedstat_add(sd, lb_imbalance[idle], imbalance);
2120 2127
2121 nr_moved = 0; 2128 nr_moved = 0;
2122 if (busiest->nr_running > 1) { 2129 if (busiest->nr_running > 1) {
2123 /* 2130 /*
2124 * Attempt to move tasks. If find_busiest_group has found 2131 * Attempt to move tasks. If find_busiest_group has found
2125 * an imbalance but busiest->nr_running <= 1, the group is 2132 * an imbalance but busiest->nr_running <= 1, the group is
2126 * still unbalanced. nr_moved simply stays zero, so it is 2133 * still unbalanced. nr_moved simply stays zero, so it is
2127 * correctly treated as an imbalance. 2134 * correctly treated as an imbalance.
2128 */ 2135 */
2129 double_rq_lock(this_rq, busiest); 2136 double_rq_lock(this_rq, busiest);
2130 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2137 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2131 imbalance, sd, idle, &all_pinned); 2138 imbalance, sd, idle, &all_pinned);
2132 double_rq_unlock(this_rq, busiest); 2139 double_rq_unlock(this_rq, busiest);
2133 2140
2134 /* All tasks on this runqueue were pinned by CPU affinity */ 2141 /* All tasks on this runqueue were pinned by CPU affinity */
2135 if (unlikely(all_pinned)) 2142 if (unlikely(all_pinned))
2136 goto out_balanced; 2143 goto out_balanced;
2137 } 2144 }
2138 2145
2139 if (!nr_moved) { 2146 if (!nr_moved) {
2140 schedstat_inc(sd, lb_failed[idle]); 2147 schedstat_inc(sd, lb_failed[idle]);
2141 sd->nr_balance_failed++; 2148 sd->nr_balance_failed++;
2142 2149
2143 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2150 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2144 2151
2145 spin_lock(&busiest->lock); 2152 spin_lock(&busiest->lock);
2146 2153
2147 /* don't kick the migration_thread, if the curr 2154 /* don't kick the migration_thread, if the curr
2148 * task on busiest cpu can't be moved to this_cpu 2155 * task on busiest cpu can't be moved to this_cpu
2149 */ 2156 */
2150 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2157 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2151 spin_unlock(&busiest->lock); 2158 spin_unlock(&busiest->lock);
2152 all_pinned = 1; 2159 all_pinned = 1;
2153 goto out_one_pinned; 2160 goto out_one_pinned;
2154 } 2161 }
2155 2162
2156 if (!busiest->active_balance) { 2163 if (!busiest->active_balance) {
2157 busiest->active_balance = 1; 2164 busiest->active_balance = 1;
2158 busiest->push_cpu = this_cpu; 2165 busiest->push_cpu = this_cpu;
2159 active_balance = 1; 2166 active_balance = 1;
2160 } 2167 }
2161 spin_unlock(&busiest->lock); 2168 spin_unlock(&busiest->lock);
2162 if (active_balance) 2169 if (active_balance)
2163 wake_up_process(busiest->migration_thread); 2170 wake_up_process(busiest->migration_thread);
2164 2171
2165 /* 2172 /*
2166 * We've kicked active balancing, reset the failure 2173 * We've kicked active balancing, reset the failure
2167 * counter. 2174 * counter.
2168 */ 2175 */
2169 sd->nr_balance_failed = sd->cache_nice_tries+1; 2176 sd->nr_balance_failed = sd->cache_nice_tries+1;
2170 } 2177 }
2171 } else 2178 } else
2172 sd->nr_balance_failed = 0; 2179 sd->nr_balance_failed = 0;
2173 2180
2174 if (likely(!active_balance)) { 2181 if (likely(!active_balance)) {
2175 /* We were unbalanced, so reset the balancing interval */ 2182 /* We were unbalanced, so reset the balancing interval */
2176 sd->balance_interval = sd->min_interval; 2183 sd->balance_interval = sd->min_interval;
2177 } else { 2184 } else {
2178 /* 2185 /*
2179 * If we've begun active balancing, start to back off. This 2186 * If we've begun active balancing, start to back off. This
2180 * case may not be covered by the all_pinned logic if there 2187 * case may not be covered by the all_pinned logic if there
2181 * is only 1 task on the busy runqueue (because we don't call 2188 * is only 1 task on the busy runqueue (because we don't call
2182 * move_tasks). 2189 * move_tasks).
2183 */ 2190 */
2184 if (sd->balance_interval < sd->max_interval) 2191 if (sd->balance_interval < sd->max_interval)
2185 sd->balance_interval *= 2; 2192 sd->balance_interval *= 2;
2186 } 2193 }
2187 2194
2188 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2195 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2189 return -1; 2196 return -1;
2190 return nr_moved; 2197 return nr_moved;
2191 2198
2192 out_balanced: 2199 out_balanced:
2193 schedstat_inc(sd, lb_balanced[idle]); 2200 schedstat_inc(sd, lb_balanced[idle]);
2194 2201
2195 sd->nr_balance_failed = 0; 2202 sd->nr_balance_failed = 0;
2196 2203
2197 out_one_pinned: 2204 out_one_pinned:
2198 /* tune up the balancing interval */ 2205 /* tune up the balancing interval */
2199 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2206 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2200 (sd->balance_interval < sd->max_interval)) 2207 (sd->balance_interval < sd->max_interval))
2201 sd->balance_interval *= 2; 2208 sd->balance_interval *= 2;
2202 2209
2203 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2210 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2204 return -1; 2211 return -1;
2205 return 0; 2212 return 0;
2206 } 2213 }
2207 2214
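As a small illustration of the interval handling at the end of load_balance() above, the sketch below (not kernel code) doubles a made-up balance_interval after repeated failed attempts with every task pinned and shows it saturating at MAX_PINNED_INTERVAL; the min_interval and max_interval values are invented for the example.

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

int main(void)
{
        unsigned int balance_interval = 8;      /* ms, stand-in for sd->min_interval */
        unsigned int max_interval = 64;         /* ms, stand-in for sd->max_interval */
        int all_pinned = 1;
        int attempt;

        for (attempt = 1; attempt <= 8; attempt++) {
                /* same backoff condition as the out_one_pinned path above */
                if ((all_pinned && balance_interval < MAX_PINNED_INTERVAL) ||
                    (balance_interval < max_interval))
                        balance_interval *= 2;
                printf("after failed attempt %d: interval=%ums\n",
                       attempt, balance_interval);
        }
        return 0;
}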
2208 /* 2215 /*
2209 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2216 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2210 * tasks if there is an imbalance. 2217 * tasks if there is an imbalance.
2211 * 2218 *
2212 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2219 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2213 * this_rq is locked. 2220 * this_rq is locked.
2214 */ 2221 */
2215 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2222 static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2216 struct sched_domain *sd) 2223 struct sched_domain *sd)
2217 { 2224 {
2218 struct sched_group *group; 2225 struct sched_group *group;
2219 runqueue_t *busiest = NULL; 2226 runqueue_t *busiest = NULL;
2220 unsigned long imbalance; 2227 unsigned long imbalance;
2221 int nr_moved = 0; 2228 int nr_moved = 0;
2222 int sd_idle = 0; 2229 int sd_idle = 0;
2223 2230
2224 if (sd->flags & SD_SHARE_CPUPOWER) 2231 if (sd->flags & SD_SHARE_CPUPOWER)
2225 sd_idle = 1; 2232 sd_idle = 1;
2226 2233
2227 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2234 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2228 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2235 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
2229 if (!group) { 2236 if (!group) {
2230 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2237 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2231 goto out_balanced; 2238 goto out_balanced;
2232 } 2239 }
2233 2240
2234 busiest = find_busiest_queue(group, NEWLY_IDLE); 2241 busiest = find_busiest_queue(group, NEWLY_IDLE);
2235 if (!busiest) { 2242 if (!busiest) {
2236 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2243 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2237 goto out_balanced; 2244 goto out_balanced;
2238 } 2245 }
2239 2246
2240 BUG_ON(busiest == this_rq); 2247 BUG_ON(busiest == this_rq);
2241 2248
2242 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2249 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2243 2250
2244 nr_moved = 0; 2251 nr_moved = 0;
2245 if (busiest->nr_running > 1) { 2252 if (busiest->nr_running > 1) {
2246 /* Attempt to move tasks */ 2253 /* Attempt to move tasks */
2247 double_lock_balance(this_rq, busiest); 2254 double_lock_balance(this_rq, busiest);
2248 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2255 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2249 imbalance, sd, NEWLY_IDLE, NULL); 2256 imbalance, sd, NEWLY_IDLE, NULL);
2250 spin_unlock(&busiest->lock); 2257 spin_unlock(&busiest->lock);
2251 } 2258 }
2252 2259
2253 if (!nr_moved) { 2260 if (!nr_moved) {
2254 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2261 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2255 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2262 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2256 return -1; 2263 return -1;
2257 } else 2264 } else
2258 sd->nr_balance_failed = 0; 2265 sd->nr_balance_failed = 0;
2259 2266
2260 return nr_moved; 2267 return nr_moved;
2261 2268
2262 out_balanced: 2269 out_balanced:
2263 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2270 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2264 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2271 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2265 return -1; 2272 return -1;
2266 sd->nr_balance_failed = 0; 2273 sd->nr_balance_failed = 0;
2267 return 0; 2274 return 0;
2268 } 2275 }
2269 2276
2270 /* 2277 /*
2271 * idle_balance is called by schedule() if this_cpu is about to become 2278 * idle_balance is called by schedule() if this_cpu is about to become
2272 * idle. Attempts to pull tasks from other CPUs. 2279 * idle. Attempts to pull tasks from other CPUs.
2273 */ 2280 */
2274 static void idle_balance(int this_cpu, runqueue_t *this_rq) 2281 static void idle_balance(int this_cpu, runqueue_t *this_rq)
2275 { 2282 {
2276 struct sched_domain *sd; 2283 struct sched_domain *sd;
2277 2284
2278 for_each_domain(this_cpu, sd) { 2285 for_each_domain(this_cpu, sd) {
2279 if (sd->flags & SD_BALANCE_NEWIDLE) { 2286 if (sd->flags & SD_BALANCE_NEWIDLE) {
2280 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2287 if (load_balance_newidle(this_cpu, this_rq, sd)) {
2281 /* We've pulled tasks over so stop searching */ 2288 /* We've pulled tasks over so stop searching */
2282 break; 2289 break;
2283 } 2290 }
2284 } 2291 }
2285 } 2292 }
2286 } 2293 }
2287 2294
2288 /* 2295 /*
2289 * active_load_balance is run by migration threads. It pushes running tasks 2296 * active_load_balance is run by migration threads. It pushes running tasks
2290 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2297 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2291 * running on each physical CPU where possible, and avoids physical / 2298 * running on each physical CPU where possible, and avoids physical /
2292 * logical imbalances. 2299 * logical imbalances.
2293 * 2300 *
2294 * Called with busiest_rq locked. 2301 * Called with busiest_rq locked.
2295 */ 2302 */
2296 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2303 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2297 { 2304 {
2298 struct sched_domain *sd; 2305 struct sched_domain *sd;
2299 runqueue_t *target_rq; 2306 runqueue_t *target_rq;
2300 int target_cpu = busiest_rq->push_cpu; 2307 int target_cpu = busiest_rq->push_cpu;
2301 2308
2302 if (busiest_rq->nr_running <= 1) 2309 if (busiest_rq->nr_running <= 1)
2303 /* no task to move */ 2310 /* no task to move */
2304 return; 2311 return;
2305 2312
2306 target_rq = cpu_rq(target_cpu); 2313 target_rq = cpu_rq(target_cpu);
2307 2314
2308 /* 2315 /*
2309 * This condition is "impossible"; if it occurs 2316 * This condition is "impossible"; if it occurs
2309 * This condition is "impossible"; if it occurs 2316 * This condition is "impossible"; if it occurs
2310 * we need to fix it. Originally reported by 2317 * we need to fix it. Originally reported by
2311 * Bjorn Helgaas on a 128-cpu setup. 2318 * Bjorn Helgaas on a 128-cpu setup.
2312 */ 2319 */
2313 BUG_ON(busiest_rq == target_rq); 2320 BUG_ON(busiest_rq == target_rq);
2314 2321
2315 /* move a task from busiest_rq to target_rq */ 2322 /* move a task from busiest_rq to target_rq */
2316 double_lock_balance(busiest_rq, target_rq); 2323 double_lock_balance(busiest_rq, target_rq);
2317 2324
2318 /* Search for an sd spanning us and the target CPU. */ 2325 /* Search for an sd spanning us and the target CPU. */
2319 for_each_domain(target_cpu, sd) 2326 for_each_domain(target_cpu, sd)
2320 if ((sd->flags & SD_LOAD_BALANCE) && 2327 if ((sd->flags & SD_LOAD_BALANCE) &&
2321 cpu_isset(busiest_cpu, sd->span)) 2328 cpu_isset(busiest_cpu, sd->span))
2322 break; 2329 break;
2323 2330
2324 if (unlikely(sd == NULL)) 2331 if (unlikely(sd == NULL))
2325 goto out; 2332 goto out;
2326 2333
2327 schedstat_inc(sd, alb_cnt); 2334 schedstat_inc(sd, alb_cnt);
2328 2335
2329 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2336 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2330 schedstat_inc(sd, alb_pushed); 2337 schedstat_inc(sd, alb_pushed);
2331 else 2338 else
2332 schedstat_inc(sd, alb_failed); 2339 schedstat_inc(sd, alb_failed);
2333 out: 2340 out:
2334 spin_unlock(&target_rq->lock); 2341 spin_unlock(&target_rq->lock);
2335 } 2342 }
2336 2343
2337 /* 2344 /*
2338 * rebalance_tick will get called every timer tick, on every CPU. 2345 * rebalance_tick will get called every timer tick, on every CPU.
2339 * 2346 *
2340 * It checks each scheduling domain to see if it is due to be balanced, 2347 * It checks each scheduling domain to see if it is due to be balanced,
2341 * and initiates a balancing operation if so. 2348 * and initiates a balancing operation if so.
2342 * 2349 *
2343 * Balancing parameters are set up in arch_init_sched_domains. 2350 * Balancing parameters are set up in arch_init_sched_domains.
2344 */ 2351 */
2345 2352
2346 /* Don't have all balancing operations going off at once */ 2353 /* Don't have all balancing operations going off at once */
2347 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2354 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
2348 2355
2349 static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2356 static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2350 enum idle_type idle) 2357 enum idle_type idle)
2351 { 2358 {
2352 unsigned long old_load, this_load; 2359 unsigned long old_load, this_load;
2353 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2360 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2354 struct sched_domain *sd; 2361 struct sched_domain *sd;
2355 int i; 2362 int i;
2356 2363
2357 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2364 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2358 /* Update our load */ 2365 /* Update our load */
2359 for (i = 0; i < 3; i++) { 2366 for (i = 0; i < 3; i++) {
2360 unsigned long new_load = this_load; 2367 unsigned long new_load = this_load;
2361 int scale = 1 << i; 2368 int scale = 1 << i;
2362 old_load = this_rq->cpu_load[i]; 2369 old_load = this_rq->cpu_load[i];
2363 /* 2370 /*
2364 * Round up the averaging division if load is increasing. This 2371 * Round up the averaging division if load is increasing. This
2365 * prevents us from getting stuck on 9 if the load is 10, for 2372 * prevents us from getting stuck on 9 if the load is 10, for
2366 * example. 2373 * example.
2367 */ 2374 */
2368 if (new_load > old_load) 2375 if (new_load > old_load)
2369 new_load += scale-1; 2376 new_load += scale-1;
2370 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2377 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2371 } 2378 }
2372 2379
2373 for_each_domain(this_cpu, sd) { 2380 for_each_domain(this_cpu, sd) {
2374 unsigned long interval; 2381 unsigned long interval;
2375 2382
2376 if (!(sd->flags & SD_LOAD_BALANCE)) 2383 if (!(sd->flags & SD_LOAD_BALANCE))
2377 continue; 2384 continue;
2378 2385
2379 interval = sd->balance_interval; 2386 interval = sd->balance_interval;
2380 if (idle != SCHED_IDLE) 2387 if (idle != SCHED_IDLE)
2381 interval *= sd->busy_factor; 2388 interval *= sd->busy_factor;
2382 2389
2383 /* scale ms to jiffies */ 2390 /* scale ms to jiffies */
2384 interval = msecs_to_jiffies(interval); 2391 interval = msecs_to_jiffies(interval);
2385 if (unlikely(!interval)) 2392 if (unlikely(!interval))
2386 interval = 1; 2393 interval = 1;
2387 2394
2388 if (j - sd->last_balance >= interval) { 2395 if (j - sd->last_balance >= interval) {
2389 if (load_balance(this_cpu, this_rq, sd, idle)) { 2396 if (load_balance(this_cpu, this_rq, sd, idle)) {
2390 /* 2397 /*
2391 * We've pulled tasks over so either we're no 2398 * We've pulled tasks over so either we're no
2392 * longer idle, or one of our SMT siblings is 2399 * longer idle, or one of our SMT siblings is
2393 * not idle. 2400 * not idle.
2394 */ 2401 */
2395 idle = NOT_IDLE; 2402 idle = NOT_IDLE;
2396 } 2403 }
2397 sd->last_balance += interval; 2404 sd->last_balance += interval;
2398 } 2405 }
2399 } 2406 }
2400 } 2407 }
2401 #else 2408 #else
2402 /* 2409 /*
2403 * on UP we do not need to balance between CPUs: 2410 * on UP we do not need to balance between CPUs:
2404 */ 2411 */
2405 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2412 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
2406 { 2413 {
2407 } 2414 }
2408 static inline void idle_balance(int cpu, runqueue_t *rq) 2415 static inline void idle_balance(int cpu, runqueue_t *rq)
2409 { 2416 {
2410 } 2417 }
2411 #endif 2418 #endif
2412 2419
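The cpu_load[] update in rebalance_tick() above is a set of exponentially-weighted averages, one per index, each over a window of 2^i ticks; the scale-1 bump rounds up while load is rising so the average cannot get stuck just below the new value. The stand-alone sketch below (illustration only) replays that arithmetic on made-up load samples.

#include <stdio.h>

int main(void)
{
        unsigned long cpu_load[3] = { 0, 0, 0 };
        unsigned long samples[] = { 10, 10, 10, 0, 0, 0 };      /* made-up per-tick load */
        int t, i;

        for (t = 0; t < 6; t++) {
                unsigned long this_load = samples[t];

                for (i = 0; i < 3; i++) {
                        unsigned long old_load = cpu_load[i];
                        unsigned long new_load = this_load;
                        int scale = 1 << i;

                        if (new_load > old_load)
                                new_load += scale - 1;  /* round up on the way up */
                        cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
                }
                printf("tick %d: load[0]=%lu load[1]=%lu load[2]=%lu\n",
                       t, cpu_load[0], cpu_load[1], cpu_load[2]);
        }
        return 0;
}

Running it shows index 0 tracking the instantaneous load exactly while indices 1 and 2 lag behind, which is why the different load_idx values chosen earlier in find_busiest_group() give progressively more conservative views of the load.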
2413 static inline int wake_priority_sleeper(runqueue_t *rq) 2420 static inline int wake_priority_sleeper(runqueue_t *rq)
2414 { 2421 {
2415 int ret = 0; 2422 int ret = 0;
2416 #ifdef CONFIG_SCHED_SMT 2423 #ifdef CONFIG_SCHED_SMT
2417 spin_lock(&rq->lock); 2424 spin_lock(&rq->lock);
2418 /* 2425 /*
2419 * If an SMT sibling task has been put to sleep for priority 2426 * If an SMT sibling task has been put to sleep for priority
2420 * reasons reschedule the idle task to see if it can now run. 2427 * reasons reschedule the idle task to see if it can now run.
2421 */ 2428 */
2422 if (rq->nr_running) { 2429 if (rq->nr_running) {
2423 resched_task(rq->idle); 2430 resched_task(rq->idle);
2424 ret = 1; 2431 ret = 1;
2425 } 2432 }
2426 spin_unlock(&rq->lock); 2433 spin_unlock(&rq->lock);
2427 #endif 2434 #endif
2428 return ret; 2435 return ret;
2429 } 2436 }
2430 2437
2431 DEFINE_PER_CPU(struct kernel_stat, kstat); 2438 DEFINE_PER_CPU(struct kernel_stat, kstat);
2432 2439
2433 EXPORT_PER_CPU_SYMBOL(kstat); 2440 EXPORT_PER_CPU_SYMBOL(kstat);
2434 2441
2435 /* 2442 /*
2436 * This is called on clock ticks and on context switches. 2443 * This is called on clock ticks and on context switches.
2437 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2444 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2438 */ 2445 */
2439 static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2446 static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
2440 unsigned long long now) 2447 unsigned long long now)
2441 { 2448 {
2442 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2449 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
2443 p->sched_time += now - last; 2450 p->sched_time += now - last;
2444 } 2451 }
2445 2452
2446 /* 2453 /*
2447 * Return current->sched_time plus any more ns on the sched_clock 2454 * Return current->sched_time plus any more ns on the sched_clock
2448 * that have not yet been banked. 2455 * that have not yet been banked.
2449 */ 2456 */
2450 unsigned long long current_sched_time(const task_t *tsk) 2457 unsigned long long current_sched_time(const task_t *tsk)
2451 { 2458 {
2452 unsigned long long ns; 2459 unsigned long long ns;
2453 unsigned long flags; 2460 unsigned long flags;
2454 local_irq_save(flags); 2461 local_irq_save(flags);
2455 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2462 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
2456 ns = tsk->sched_time + (sched_clock() - ns); 2463 ns = tsk->sched_time + (sched_clock() - ns);
2457 local_irq_restore(flags); 2464 local_irq_restore(flags);
2458 return ns; 2465 return ns;
2459 } 2466 }
2460 2467
2461 /* 2468 /*
2462 * We place interactive tasks back into the active array, if possible. 2469 * We place interactive tasks back into the active array, if possible.
2463 * 2470 *
2464 * To guarantee that this does not starve expired tasks we ignore the 2471 * To guarantee that this does not starve expired tasks we ignore the
2465 * interactivity of a task if the first expired task had to wait more 2472 * interactivity of a task if the first expired task had to wait more
2466 * than a 'reasonable' amount of time. This deadline timeout is 2473 * than a 'reasonable' amount of time. This deadline timeout is
2467 * load-dependent, as the frequency of array switches decreases with 2474 * load-dependent, as the frequency of array switches decreases with
2468 * increasing number of running tasks. We also ignore the interactivity 2475 * increasing number of running tasks. We also ignore the interactivity
2469 * if a better static_prio task has expired: 2476 * if a better static_prio task has expired:
2470 */ 2477 */
2471 #define EXPIRED_STARVING(rq) \ 2478 #define EXPIRED_STARVING(rq) \
2472 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2479 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
2473 (jiffies - (rq)->expired_timestamp >= \ 2480 (jiffies - (rq)->expired_timestamp >= \
2474 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2481 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
2475 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2482 ((rq)->curr->static_prio > (rq)->best_expired_prio))
2476 2483
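Ignoring the best_expired_prio clause, EXPIRED_STARVING() above trips once jiffies - expired_timestamp reaches STARVATION_LIMIT * nr_running + 1, so the tolerated wait scales with the number of runnable tasks. The sketch below just tabulates that deadline; the STARVATION_LIMIT of one second's worth of jiffies is an assumption for the example, not taken from this diff.

#include <stdio.h>

#define HZ 1000
#define STARVATION_LIMIT (1 * HZ)       /* assumed, in jiffies */

int main(void)
{
        unsigned long nr_running;

        for (nr_running = 1; nr_running <= 8; nr_running *= 2)
                printf("nr_running=%lu: array switch forced after %lu jiffies\n",
                       nr_running, STARVATION_LIMIT * nr_running + 1);
        return 0;
}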
2477 /* 2484 /*
2478 * Account user cpu time to a process. 2485 * Account user cpu time to a process.
2479 * @p: the process that the cpu time gets accounted to 2486 * @p: the process that the cpu time gets accounted to
2480 * @hardirq_offset: the offset to subtract from hardirq_count() 2487 * @hardirq_offset: the offset to subtract from hardirq_count()
2481 * @cputime: the cpu time spent in user space since the last update 2488 * @cputime: the cpu time spent in user space since the last update
2482 */ 2489 */
2483 void account_user_time(struct task_struct *p, cputime_t cputime) 2490 void account_user_time(struct task_struct *p, cputime_t cputime)
2484 { 2491 {
2485 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2492 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2486 cputime64_t tmp; 2493 cputime64_t tmp;
2487 2494
2488 p->utime = cputime_add(p->utime, cputime); 2495 p->utime = cputime_add(p->utime, cputime);
2489 2496
2490 /* Add user time to cpustat. */ 2497 /* Add user time to cpustat. */
2491 tmp = cputime_to_cputime64(cputime); 2498 tmp = cputime_to_cputime64(cputime);
2492 if (TASK_NICE(p) > 0) 2499 if (TASK_NICE(p) > 0)
2493 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2500 cpustat->nice = cputime64_add(cpustat->nice, tmp);
2494 else 2501 else
2495 cpustat->user = cputime64_add(cpustat->user, tmp); 2502 cpustat->user = cputime64_add(cpustat->user, tmp);
2496 } 2503 }
2497 2504
2498 /* 2505 /*
2499 * Account system cpu time to a process. 2506 * Account system cpu time to a process.
2500 * @p: the process that the cpu time gets accounted to 2507 * @p: the process that the cpu time gets accounted to
2501 * @hardirq_offset: the offset to subtract from hardirq_count() 2508 * @hardirq_offset: the offset to subtract from hardirq_count()
2502 * @cputime: the cpu time spent in kernel space since the last update 2509 * @cputime: the cpu time spent in kernel space since the last update
2503 */ 2510 */
2504 void account_system_time(struct task_struct *p, int hardirq_offset, 2511 void account_system_time(struct task_struct *p, int hardirq_offset,
2505 cputime_t cputime) 2512 cputime_t cputime)
2506 { 2513 {
2507 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2514 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2508 runqueue_t *rq = this_rq(); 2515 runqueue_t *rq = this_rq();
2509 cputime64_t tmp; 2516 cputime64_t tmp;
2510 2517
2511 p->stime = cputime_add(p->stime, cputime); 2518 p->stime = cputime_add(p->stime, cputime);
2512 2519
2513 /* Add system time to cpustat. */ 2520 /* Add system time to cpustat. */
2514 tmp = cputime_to_cputime64(cputime); 2521 tmp = cputime_to_cputime64(cputime);
2515 if (hardirq_count() - hardirq_offset) 2522 if (hardirq_count() - hardirq_offset)
2516 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2523 cpustat->irq = cputime64_add(cpustat->irq, tmp);
2517 else if (softirq_count()) 2524 else if (softirq_count())
2518 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2525 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
2519 else if (p != rq->idle) 2526 else if (p != rq->idle)
2520 cpustat->system = cputime64_add(cpustat->system, tmp); 2527 cpustat->system = cputime64_add(cpustat->system, tmp);
2521 else if (atomic_read(&rq->nr_iowait) > 0) 2528 else if (atomic_read(&rq->nr_iowait) > 0)
2522 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2529 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2523 else 2530 else
2524 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2531 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2525 /* Account for system time used */ 2532 /* Account for system time used */
2526 acct_update_integrals(p); 2533 acct_update_integrals(p);
2527 } 2534 }
2528 2535
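The if/else chain in account_system_time() above is a strict precedence order: hardirq beats softirq, softirq beats ordinary system time, and only the idle task's time is split between iowait and idle. The sketch below (illustration only) reduces that to a pure decision function, with plain booleans standing in for the preempt-count and runqueue checks.

#include <stdio.h>

static const char *system_bucket(int in_hardirq, int in_softirq,
                                 int is_idle_task, int nr_iowait)
{
        if (in_hardirq)
                return "irq";
        if (in_softirq)
                return "softirq";
        if (!is_idle_task)
                return "system";
        if (nr_iowait > 0)
                return "iowait";
        return "idle";
}

int main(void)
{
        printf("%s\n", system_bucket(1, 0, 0, 0));      /* irq */
        printf("%s\n", system_bucket(0, 1, 0, 0));      /* softirq */
        printf("%s\n", system_bucket(0, 0, 0, 0));      /* system */
        printf("%s\n", system_bucket(0, 0, 1, 3));      /* iowait */
        printf("%s\n", system_bucket(0, 0, 1, 0));      /* idle */
        return 0;
}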
2529 /* 2536 /*
2530 * Account for involuntary wait time. 2537 * Account for involuntary wait time.
2531 * @p: the process from which the cpu time has been stolen 2538 * @p: the process from which the cpu time has been stolen
2532 * @steal: the cpu time spent in involuntary wait 2539 * @steal: the cpu time spent in involuntary wait
2533 */ 2540 */
2534 void account_steal_time(struct task_struct *p, cputime_t steal) 2541 void account_steal_time(struct task_struct *p, cputime_t steal)
2535 { 2542 {
2536 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2543 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2537 cputime64_t tmp = cputime_to_cputime64(steal); 2544 cputime64_t tmp = cputime_to_cputime64(steal);
2538 runqueue_t *rq = this_rq(); 2545 runqueue_t *rq = this_rq();
2539 2546
2540 if (p == rq->idle) { 2547 if (p == rq->idle) {
2541 p->stime = cputime_add(p->stime, steal); 2548 p->stime = cputime_add(p->stime, steal);
2542 if (atomic_read(&rq->nr_iowait) > 0) 2549 if (atomic_read(&rq->nr_iowait) > 0)
2543 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 2550 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
2544 else 2551 else
2545 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2552 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2546 } else 2553 } else
2547 cpustat->steal = cputime64_add(cpustat->steal, tmp); 2554 cpustat->steal = cputime64_add(cpustat->steal, tmp);
2548 } 2555 }
2549 2556
2550 /* 2557 /*
2551 * This function gets called by the timer code, with HZ frequency. 2558 * This function gets called by the timer code, with HZ frequency.
2552 * We call it with interrupts disabled. 2559 * We call it with interrupts disabled.
2553 * 2560 *
2554 * It also gets called by the fork code, when changing the parent's 2561 * It also gets called by the fork code, when changing the parent's
2555 * timeslices. 2562 * timeslices.
2556 */ 2563 */
2557 void scheduler_tick(void) 2564 void scheduler_tick(void)
2558 { 2565 {
2559 int cpu = smp_processor_id(); 2566 int cpu = smp_processor_id();
2560 runqueue_t *rq = this_rq(); 2567 runqueue_t *rq = this_rq();
2561 task_t *p = current; 2568 task_t *p = current;
2562 unsigned long long now = sched_clock(); 2569 unsigned long long now = sched_clock();
2563 2570
2564 update_cpu_clock(p, rq, now); 2571 update_cpu_clock(p, rq, now);
2565 2572
2566 rq->timestamp_last_tick = now; 2573 rq->timestamp_last_tick = now;
2567 2574
2568 if (p == rq->idle) { 2575 if (p == rq->idle) {
2569 if (wake_priority_sleeper(rq)) 2576 if (wake_priority_sleeper(rq))
2570 goto out; 2577 goto out;
2571 rebalance_tick(cpu, rq, SCHED_IDLE); 2578 rebalance_tick(cpu, rq, SCHED_IDLE);
2572 return; 2579 return;
2573 } 2580 }
2574 2581
2575 /* Task might have expired already, but not scheduled off yet */ 2582 /* Task might have expired already, but not scheduled off yet */
2576 if (p->array != rq->active) { 2583 if (p->array != rq->active) {
2577 set_tsk_need_resched(p); 2584 set_tsk_need_resched(p);
2578 goto out; 2585 goto out;
2579 } 2586 }
2580 spin_lock(&rq->lock); 2587 spin_lock(&rq->lock);
2581 /* 2588 /*
2582 * The task was running during this tick - update the 2589 * The task was running during this tick - update the
2583 * time slice counter. Note: we do not update a thread's 2590 * time slice counter. Note: we do not update a thread's
2584 * priority until it either goes to sleep or uses up its 2591 * priority until it either goes to sleep or uses up its
2585 * timeslice. This makes it possible for interactive tasks 2592 * timeslice. This makes it possible for interactive tasks
2586 * to use up their timeslices at their highest priority levels. 2593 * to use up their timeslices at their highest priority levels.
2587 */ 2594 */
2588 if (rt_task(p)) { 2595 if (rt_task(p)) {
2589 /* 2596 /*
2590 * RR tasks need a special form of timeslice management. 2597 * RR tasks need a special form of timeslice management.
2591 * FIFO tasks have no timeslices. 2598 * FIFO tasks have no timeslices.
2592 */ 2599 */
2593 if ((p->policy == SCHED_RR) && !--p->time_slice) { 2600 if ((p->policy == SCHED_RR) && !--p->time_slice) {
2594 p->time_slice = task_timeslice(p); 2601 p->time_slice = task_timeslice(p);
2595 p->first_time_slice = 0; 2602 p->first_time_slice = 0;
2596 set_tsk_need_resched(p); 2603 set_tsk_need_resched(p);
2597 2604
2598 /* put it at the end of the queue: */ 2605 /* put it at the end of the queue: */
2599 requeue_task(p, rq->active); 2606 requeue_task(p, rq->active);
2600 } 2607 }
2601 goto out_unlock; 2608 goto out_unlock;
2602 } 2609 }
2603 if (!--p->time_slice) { 2610 if (!--p->time_slice) {
2604 dequeue_task(p, rq->active); 2611 dequeue_task(p, rq->active);
2605 set_tsk_need_resched(p); 2612 set_tsk_need_resched(p);
2606 p->prio = effective_prio(p); 2613 p->prio = effective_prio(p);
2607 p->time_slice = task_timeslice(p); 2614 p->time_slice = task_timeslice(p);
2608 p->first_time_slice = 0; 2615 p->first_time_slice = 0;
2609 2616
2610 if (!rq->expired_timestamp) 2617 if (!rq->expired_timestamp)
2611 rq->expired_timestamp = jiffies; 2618 rq->expired_timestamp = jiffies;
2612 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 2619 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
2613 enqueue_task(p, rq->expired); 2620 enqueue_task(p, rq->expired);
2614 if (p->static_prio < rq->best_expired_prio) 2621 if (p->static_prio < rq->best_expired_prio)
2615 rq->best_expired_prio = p->static_prio; 2622 rq->best_expired_prio = p->static_prio;
2616 } else 2623 } else
2617 enqueue_task(p, rq->active); 2624 enqueue_task(p, rq->active);
2618 } else { 2625 } else {
2619 /* 2626 /*
2620 * Prevent a too long timeslice allowing a task to monopolize 2627 * Prevent a too long timeslice allowing a task to monopolize
2621 * the CPU. We do this by splitting up the timeslice into 2628 * the CPU. We do this by splitting up the timeslice into
2622 * smaller pieces. 2629 * smaller pieces.
2623 * 2630 *
2624 * Note: this does not mean the task's timeslices expire or 2631 * Note: this does not mean the task's timeslices expire or
2625 * get lost in any way, they just might be preempted by 2632 * get lost in any way, they just might be preempted by
2626 * another task of equal priority. (one with higher 2633 * another task of equal priority. (one with higher
2627 * priority would have preempted this task already.) We 2634 * priority would have preempted this task already.) We
2628 * requeue this task to the end of the list on this priority 2635 * requeue this task to the end of the list on this priority
2629 * level, which is in essence a round-robin of tasks with 2636 * level, which is in essence a round-robin of tasks with
2630 * equal priority. 2637 * equal priority.
2631 * 2638 *
2632 * This only applies to tasks in the interactive 2639 * This only applies to tasks in the interactive
2633 * delta range with at least TIMESLICE_GRANULARITY to requeue. 2640 * delta range with at least TIMESLICE_GRANULARITY to requeue.
2634 */ 2641 */
2635 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - 2642 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
2636 p->time_slice) % TIMESLICE_GRANULARITY(p)) && 2643 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
2637 (p->time_slice >= TIMESLICE_GRANULARITY(p)) && 2644 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
2638 (p->array == rq->active)) { 2645 (p->array == rq->active)) {
2639 2646
2640 requeue_task(p, rq->active); 2647 requeue_task(p, rq->active);
2641 set_tsk_need_resched(p); 2648 set_tsk_need_resched(p);
2642 } 2649 }
2643 } 2650 }
2644 out_unlock: 2651 out_unlock:
2645 spin_unlock(&rq->lock); 2652 spin_unlock(&rq->lock);
2646 out: 2653 out:
2647 rebalance_tick(cpu, rq, NOT_IDLE); 2654 rebalance_tick(cpu, rq, NOT_IDLE);
2648 } 2655 }
2649 2656
2650 #ifdef CONFIG_SCHED_SMT 2657 #ifdef CONFIG_SCHED_SMT
2651 static inline void wakeup_busy_runqueue(runqueue_t *rq) 2658 static inline void wakeup_busy_runqueue(runqueue_t *rq)
2652 { 2659 {
2653 /* If an SMT runqueue is sleeping due to priority reasons wake it up */ 2660 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2654 if (rq->curr == rq->idle && rq->nr_running) 2661 if (rq->curr == rq->idle && rq->nr_running)
2655 resched_task(rq->idle); 2662 resched_task(rq->idle);
2656 } 2663 }
2657 2664
2658 static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2665 static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2659 { 2666 {
2660 struct sched_domain *tmp, *sd = NULL; 2667 struct sched_domain *tmp, *sd = NULL;
2661 cpumask_t sibling_map; 2668 cpumask_t sibling_map;
2662 int i; 2669 int i;
2663 2670
2664 for_each_domain(this_cpu, tmp) 2671 for_each_domain(this_cpu, tmp)
2665 if (tmp->flags & SD_SHARE_CPUPOWER) 2672 if (tmp->flags & SD_SHARE_CPUPOWER)
2666 sd = tmp; 2673 sd = tmp;
2667 2674
2668 if (!sd) 2675 if (!sd)
2669 return; 2676 return;
2670 2677
2671 /* 2678 /*
2672 * Unlock the current runqueue because we have to lock in 2679 * Unlock the current runqueue because we have to lock in
2673 * CPU order to avoid deadlocks. Caller knows that we might 2680 * CPU order to avoid deadlocks. Caller knows that we might
2674 * unlock. We keep IRQs disabled. 2681 * unlock. We keep IRQs disabled.
2675 */ 2682 */
2676 spin_unlock(&this_rq->lock); 2683 spin_unlock(&this_rq->lock);
2677 2684
2678 sibling_map = sd->span; 2685 sibling_map = sd->span;
2679 2686
2680 for_each_cpu_mask(i, sibling_map) 2687 for_each_cpu_mask(i, sibling_map)
2681 spin_lock(&cpu_rq(i)->lock); 2688 spin_lock(&cpu_rq(i)->lock);
2682 /* 2689 /*
2683 * We clear this CPU from the mask. This both simplifies the 2690 * We clear this CPU from the mask. This both simplifies the
2684 * inner loop and keeps this_rq locked when we exit: 2691 * inner loop and keeps this_rq locked when we exit:
2685 */ 2692 */
2686 cpu_clear(this_cpu, sibling_map); 2693 cpu_clear(this_cpu, sibling_map);
2687 2694
2688 for_each_cpu_mask(i, sibling_map) { 2695 for_each_cpu_mask(i, sibling_map) {
2689 runqueue_t *smt_rq = cpu_rq(i); 2696 runqueue_t *smt_rq = cpu_rq(i);
2690 2697
2691 wakeup_busy_runqueue(smt_rq); 2698 wakeup_busy_runqueue(smt_rq);
2692 } 2699 }
2693 2700
2694 for_each_cpu_mask(i, sibling_map) 2701 for_each_cpu_mask(i, sibling_map)
2695 spin_unlock(&cpu_rq(i)->lock); 2702 spin_unlock(&cpu_rq(i)->lock);
2696 /* 2703 /*
2697 * We exit with this_cpu's rq still held and IRQs 2704 * We exit with this_cpu's rq still held and IRQs
2698 * still disabled: 2705 * still disabled:
2699 */ 2706 */
2700 } 2707 }
2701 2708
2702 /* 2709 /*
2703 * number of 'lost' timeslices this task won't be able to fully 2710 * number of 'lost' timeslices this task won't be able to fully
2704 * utilize, if another task runs on a sibling. This models the 2711 * utilize, if another task runs on a sibling. This models the
2705 * slowdown effect of other tasks running on siblings: 2712 * slowdown effect of other tasks running on siblings:
2706 */ 2713 */
2707 static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) 2714 static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2708 { 2715 {
2709 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 2716 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2710 } 2717 }
2711 2718
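smt_slice() above treats (100 - per_cpu_gain)% of a task's timeslice as lost to the sibling. Evaluated with an assumed per_cpu_gain of 25 (a plausible SMT setting, not taken from this diff), that is 75% of the slice; the sketch below tabulates a few made-up timeslice lengths.

#include <stdio.h>

static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
{
        return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
        int per_cpu_gain = 25;                          /* assumed */
        unsigned long slices[] = { 25, 100, 200 };      /* made-up, in ticks */
        int i;

        for (i = 0; i < 3; i++)
                printf("time_slice=%lu -> smt_slice=%lu\n",
                       slices[i], smt_slice(slices[i], per_cpu_gain));
        return 0;
}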
2712 static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2719 static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2713 { 2720 {
2714 struct sched_domain *tmp, *sd = NULL; 2721 struct sched_domain *tmp, *sd = NULL;
2715 cpumask_t sibling_map; 2722 cpumask_t sibling_map;
2716 prio_array_t *array; 2723 prio_array_t *array;
2717 int ret = 0, i; 2724 int ret = 0, i;
2718 task_t *p; 2725 task_t *p;
2719 2726
2720 for_each_domain(this_cpu, tmp) 2727 for_each_domain(this_cpu, tmp)
2721 if (tmp->flags & SD_SHARE_CPUPOWER) 2728 if (tmp->flags & SD_SHARE_CPUPOWER)
2722 sd = tmp; 2729 sd = tmp;
2723 2730
2724 if (!sd) 2731 if (!sd)
2725 return 0; 2732 return 0;
2726 2733
2727 /* 2734 /*
2728 * The same locking rules and details apply as for 2735 * The same locking rules and details apply as for
2729 * wake_sleeping_dependent(): 2736 * wake_sleeping_dependent():
2730 */ 2737 */
2731 spin_unlock(&this_rq->lock); 2738 spin_unlock(&this_rq->lock);
2732 sibling_map = sd->span; 2739 sibling_map = sd->span;
2733 for_each_cpu_mask(i, sibling_map) 2740 for_each_cpu_mask(i, sibling_map)
2734 spin_lock(&cpu_rq(i)->lock); 2741 spin_lock(&cpu_rq(i)->lock);
2735 cpu_clear(this_cpu, sibling_map); 2742 cpu_clear(this_cpu, sibling_map);
2736 2743
2737 /* 2744 /*
2738 * Establish next task to be run - it might have gone away because 2745 * Establish next task to be run - it might have gone away because
2739 * we released the runqueue lock above: 2746 * we released the runqueue lock above:
2740 */ 2747 */
2741 if (!this_rq->nr_running) 2748 if (!this_rq->nr_running)
2742 goto out_unlock; 2749 goto out_unlock;
2743 array = this_rq->active; 2750 array = this_rq->active;
2744 if (!array->nr_active) 2751 if (!array->nr_active)
2745 array = this_rq->expired; 2752 array = this_rq->expired;
2746 BUG_ON(!array->nr_active); 2753 BUG_ON(!array->nr_active);
2747 2754
2748 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 2755 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
2749 task_t, run_list); 2756 task_t, run_list);
2750 2757
2751 for_each_cpu_mask(i, sibling_map) { 2758 for_each_cpu_mask(i, sibling_map) {
2752 runqueue_t *smt_rq = cpu_rq(i); 2759 runqueue_t *smt_rq = cpu_rq(i);
2753 task_t *smt_curr = smt_rq->curr; 2760 task_t *smt_curr = smt_rq->curr;
2754 2761
2755 /* Kernel threads do not participate in dependent sleeping */ 2762 /* Kernel threads do not participate in dependent sleeping */
2756 if (!p->mm || !smt_curr->mm || rt_task(p)) 2763 if (!p->mm || !smt_curr->mm || rt_task(p))
2757 goto check_smt_task; 2764 goto check_smt_task;
2758 2765
2759 /* 2766 /*
2760 * If a user task with lower static priority than the 2767 * If a user task with lower static priority than the
2761 * running task on the SMT sibling is trying to schedule, 2768 * running task on the SMT sibling is trying to schedule,
2762 * delay it till there is proportionately less timeslice 2769 * delay it till there is proportionately less timeslice
2763 * left of the sibling task to prevent a lower priority 2770 * left of the sibling task to prevent a lower priority
2764 * task from using an unfair proportion of the 2771 * task from using an unfair proportion of the
2765 * physical cpu's resources. -ck 2772 * physical cpu's resources. -ck
2766 */ 2773 */
2767 if (rt_task(smt_curr)) { 2774 if (rt_task(smt_curr)) {
2768 /* 2775 /*
2769 * With real time tasks we run non-rt tasks only 2776 * With real time tasks we run non-rt tasks only
2770 * per_cpu_gain% of the time. 2777 * per_cpu_gain% of the time.
2771 */ 2778 */
2772 if ((jiffies % DEF_TIMESLICE) > 2779 if ((jiffies % DEF_TIMESLICE) >
2773 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 2780 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2774 ret = 1; 2781 ret = 1;
2775 } else 2782 } else
2776 if (smt_curr->static_prio < p->static_prio && 2783 if (smt_curr->static_prio < p->static_prio &&
2777 !TASK_PREEMPTS_CURR(p, smt_rq) && 2784 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2778 smt_slice(smt_curr, sd) > task_timeslice(p)) 2785 smt_slice(smt_curr, sd) > task_timeslice(p))
2779 ret = 1; 2786 ret = 1;
2780 2787
2781 check_smt_task: 2788 check_smt_task:
2782 if ((!smt_curr->mm && smt_curr != smt_rq->idle) || 2789 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2783 rt_task(smt_curr)) 2790 rt_task(smt_curr))
2784 continue; 2791 continue;
2785 if (!p->mm) { 2792 if (!p->mm) {
2786 wakeup_busy_runqueue(smt_rq); 2793 wakeup_busy_runqueue(smt_rq);
2787 continue; 2794 continue;
2788 } 2795 }
2789 2796
2790 /* 2797 /*
2791 * Reschedule a lower priority task on the SMT sibling for 2798 * Reschedule a lower priority task on the SMT sibling for
2792 * it to be put to sleep, or wake it up if it has been put to 2799 * it to be put to sleep, or wake it up if it has been put to
2793 * sleep for priority reasons to see if it should run now. 2800 * sleep for priority reasons to see if it should run now.
2794 */ 2801 */
2795 if (rt_task(p)) { 2802 if (rt_task(p)) {
2796 if ((jiffies % DEF_TIMESLICE) > 2803 if ((jiffies % DEF_TIMESLICE) >
2797 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2798 resched_task(smt_curr); 2805 resched_task(smt_curr);
2799 } else { 2806 } else {
2800 if (TASK_PREEMPTS_CURR(p, smt_rq) && 2807 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2801 smt_slice(p, sd) > task_timeslice(smt_curr)) 2808 smt_slice(p, sd) > task_timeslice(smt_curr))
2802 resched_task(smt_curr); 2809 resched_task(smt_curr);
2803 else 2810 else
2804 wakeup_busy_runqueue(smt_rq); 2811 wakeup_busy_runqueue(smt_rq);
2805 } 2812 }
2806 } 2813 }
2807 out_unlock: 2814 out_unlock:
2808 for_each_cpu_mask(i, sibling_map) 2815 for_each_cpu_mask(i, sibling_map)
2809 spin_unlock(&cpu_rq(i)->lock); 2816 spin_unlock(&cpu_rq(i)->lock);
2810 return ret; 2817 return ret;
2811 } 2818 }
2812 #else 2819 #else
2813 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2820 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2814 { 2821 {
2815 } 2822 }
2816 2823
2817 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2824 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2818 { 2825 {
2819 return 0; 2826 return 0;
2820 } 2827 }
2821 #endif 2828 #endif
2822 2829
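Both uses of the (jiffies % DEF_TIMESLICE) test in dependent_sleeper() above implement a duty cycle: while an RT task owns the sibling, a non-RT task is only allowed to run during roughly the first per_cpu_gain% of each DEF_TIMESLICE window. The sketch below counts that window; the DEF_TIMESLICE of 100 jiffies and per_cpu_gain of 25 are assumptions for the example, not taken from this diff.

#include <stdio.h>

#define DEF_TIMESLICE 100       /* assumed, in jiffies */

int main(void)
{
        int per_cpu_gain = 25;  /* assumed */
        unsigned long jiffies;
        int allowed = 0;

        /* allowed whenever the kernel's "delay it" test above is false */
        for (jiffies = 0; jiffies < DEF_TIMESLICE; jiffies++)
                if (!((jiffies % DEF_TIMESLICE) >
                      (per_cpu_gain * DEF_TIMESLICE / 100)))
                        allowed++;

        printf("non-RT sibling may run %d of every %d jiffies (%d%%)\n",
               allowed, DEF_TIMESLICE, allowed * 100 / DEF_TIMESLICE);
        return 0;
}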
2823 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 2830 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
2824 2831
2825 void fastcall add_preempt_count(int val) 2832 void fastcall add_preempt_count(int val)
2826 { 2833 {
2827 /* 2834 /*
2828 * Underflow? 2835 * Underflow?
2829 */ 2836 */
2830 BUG_ON((preempt_count() < 0)); 2837 BUG_ON((preempt_count() < 0));
2831 preempt_count() += val; 2838 preempt_count() += val;
2832 /* 2839 /*
2833 * Spinlock count overflowing soon? 2840 * Spinlock count overflowing soon?
2834 */ 2841 */
2835 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 2842 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2836 } 2843 }
2837 EXPORT_SYMBOL(add_preempt_count); 2844 EXPORT_SYMBOL(add_preempt_count);
2838 2845
2839 void fastcall sub_preempt_count(int val) 2846 void fastcall sub_preempt_count(int val)
2840 { 2847 {
2841 /* 2848 /*
2842 * Underflow? 2849 * Underflow?
2843 */ 2850 */
2844 BUG_ON(val > preempt_count()); 2851 BUG_ON(val > preempt_count());
2845 /* 2852 /*
2846 * Is the spinlock portion underflowing? 2853 * Is the spinlock portion underflowing?
2847 */ 2854 */
2848 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 2855 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
2849 preempt_count() -= val; 2856 preempt_count() -= val;
2850 } 2857 }
2851 EXPORT_SYMBOL(sub_preempt_count); 2858 EXPORT_SYMBOL(sub_preempt_count);
2852 2859
2853 #endif 2860 #endif
2854 2861
2855 /* 2862 /*
2856 * schedule() is the main scheduler function. 2863 * schedule() is the main scheduler function.
2857 */ 2864 */
2858 asmlinkage void __sched schedule(void) 2865 asmlinkage void __sched schedule(void)
2859 { 2866 {
2860 long *switch_count; 2867 long *switch_count;
2861 task_t *prev, *next; 2868 task_t *prev, *next;
2862 runqueue_t *rq; 2869 runqueue_t *rq;
2863 prio_array_t *array; 2870 prio_array_t *array;
2864 struct list_head *queue; 2871 struct list_head *queue;
2865 unsigned long long now; 2872 unsigned long long now;
2866 unsigned long run_time; 2873 unsigned long run_time;
2867 int cpu, idx, new_prio; 2874 int cpu, idx, new_prio;
2868 2875
2869 /* 2876 /*
2870 * Test if we are atomic. Since do_exit() needs to call into 2877 * Test if we are atomic. Since do_exit() needs to call into
2871 * schedule() atomically, we ignore that path for now. 2878 * schedule() atomically, we ignore that path for now.
2872 * Otherwise, whine if we are scheduling when we should not be. 2879 * Otherwise, whine if we are scheduling when we should not be.
2873 */ 2880 */
2874 if (likely(!current->exit_state)) { 2881 if (likely(!current->exit_state)) {
2875 if (unlikely(in_atomic())) { 2882 if (unlikely(in_atomic())) {
2876 printk(KERN_ERR "BUG: scheduling while atomic: " 2883 printk(KERN_ERR "BUG: scheduling while atomic: "
2877 "%s/0x%08x/%d\n", 2884 "%s/0x%08x/%d\n",
2878 current->comm, preempt_count(), current->pid); 2885 current->comm, preempt_count(), current->pid);
2879 dump_stack(); 2886 dump_stack();
2880 } 2887 }
2881 } 2888 }
2882 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2889 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2883 2890
2884 need_resched: 2891 need_resched:
2885 preempt_disable(); 2892 preempt_disable();
2886 prev = current; 2893 prev = current;
2887 release_kernel_lock(prev); 2894 release_kernel_lock(prev);
2888 need_resched_nonpreemptible: 2895 need_resched_nonpreemptible:
2889 rq = this_rq(); 2896 rq = this_rq();
2890 2897
2891 /* 2898 /*
2892 * The idle thread is not allowed to schedule! 2899 * The idle thread is not allowed to schedule!
2893 * Remove this check after it has been exercised a bit. 2900 * Remove this check after it has been exercised a bit.
2894 */ 2901 */
2895 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 2902 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
2896 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 2903 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
2897 dump_stack(); 2904 dump_stack();
2898 } 2905 }
2899 2906
2900 schedstat_inc(rq, sched_cnt); 2907 schedstat_inc(rq, sched_cnt);
2901 now = sched_clock(); 2908 now = sched_clock();
2902 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 2909 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
2903 run_time = now - prev->timestamp; 2910 run_time = now - prev->timestamp;
2904 if (unlikely((long long)(now - prev->timestamp) < 0)) 2911 if (unlikely((long long)(now - prev->timestamp) < 0))
2905 run_time = 0; 2912 run_time = 0;
2906 } else 2913 } else
2907 run_time = NS_MAX_SLEEP_AVG; 2914 run_time = NS_MAX_SLEEP_AVG;
2908 2915
2909 /* 2916 /*
2910 * Tasks are charged proportionately less run_time at high sleep_avg to 2917 * Tasks are charged proportionately less run_time at high sleep_avg to
2911 * delay them losing their interactive status 2918 * delay them losing their interactive status
2912 */ 2919 */
2913 run_time /= (CURRENT_BONUS(prev) ? : 1); 2920 run_time /= (CURRENT_BONUS(prev) ? : 1);
2914 2921
2915 spin_lock_irq(&rq->lock); 2922 spin_lock_irq(&rq->lock);
2916 2923
2917 if (unlikely(prev->flags & PF_DEAD)) 2924 if (unlikely(prev->flags & PF_DEAD))
2918 prev->state = EXIT_DEAD; 2925 prev->state = EXIT_DEAD;
2919 2926
2920 switch_count = &prev->nivcsw; 2927 switch_count = &prev->nivcsw;
2921 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2928 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2922 switch_count = &prev->nvcsw; 2929 switch_count = &prev->nvcsw;
2923 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 2930 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
2924 unlikely(signal_pending(prev)))) 2931 unlikely(signal_pending(prev))))
2925 prev->state = TASK_RUNNING; 2932 prev->state = TASK_RUNNING;
2926 else { 2933 else {
2927 if (prev->state == TASK_UNINTERRUPTIBLE) 2934 if (prev->state == TASK_UNINTERRUPTIBLE)
2928 rq->nr_uninterruptible++; 2935 rq->nr_uninterruptible++;
2929 deactivate_task(prev, rq); 2936 deactivate_task(prev, rq);
2930 } 2937 }
2931 } 2938 }
2932 2939
2933 cpu = smp_processor_id(); 2940 cpu = smp_processor_id();
2934 if (unlikely(!rq->nr_running)) { 2941 if (unlikely(!rq->nr_running)) {
2935 go_idle: 2942 go_idle:
2936 idle_balance(cpu, rq); 2943 idle_balance(cpu, rq);
2937 if (!rq->nr_running) { 2944 if (!rq->nr_running) {
2938 next = rq->idle; 2945 next = rq->idle;
2939 rq->expired_timestamp = 0; 2946 rq->expired_timestamp = 0;
2940 wake_sleeping_dependent(cpu, rq); 2947 wake_sleeping_dependent(cpu, rq);
2941 /* 2948 /*
2942 * wake_sleeping_dependent() might have released 2949 * wake_sleeping_dependent() might have released
2943 * the runqueue, so break out if we got new 2950 * the runqueue, so break out if we got new
2944 * tasks meanwhile: 2951 * tasks meanwhile:
2945 */ 2952 */
2946 if (!rq->nr_running) 2953 if (!rq->nr_running)
2947 goto switch_tasks; 2954 goto switch_tasks;
2948 } 2955 }
2949 } else { 2956 } else {
2950 if (dependent_sleeper(cpu, rq)) { 2957 if (dependent_sleeper(cpu, rq)) {
2951 next = rq->idle; 2958 next = rq->idle;
2952 goto switch_tasks; 2959 goto switch_tasks;
2953 } 2960 }
2954 /* 2961 /*
2955 * dependent_sleeper() releases and reacquires the runqueue 2962 * dependent_sleeper() releases and reacquires the runqueue
2956 * lock, hence go into the idle loop if the rq went 2963 * lock, hence go into the idle loop if the rq went
2957 * empty meanwhile: 2964 * empty meanwhile:
2958 */ 2965 */
2959 if (unlikely(!rq->nr_running)) 2966 if (unlikely(!rq->nr_running))
2960 goto go_idle; 2967 goto go_idle;
2961 } 2968 }
2962 2969
2963 array = rq->active; 2970 array = rq->active;
2964 if (unlikely(!array->nr_active)) { 2971 if (unlikely(!array->nr_active)) {
2965 /* 2972 /*
2966 * Switch the active and expired arrays. 2973 * Switch the active and expired arrays.
2967 */ 2974 */
2968 schedstat_inc(rq, sched_switch); 2975 schedstat_inc(rq, sched_switch);
2969 rq->active = rq->expired; 2976 rq->active = rq->expired;
2970 rq->expired = array; 2977 rq->expired = array;
2971 array = rq->active; 2978 array = rq->active;
2972 rq->expired_timestamp = 0; 2979 rq->expired_timestamp = 0;
2973 rq->best_expired_prio = MAX_PRIO; 2980 rq->best_expired_prio = MAX_PRIO;
2974 } 2981 }
2975 2982
2976 idx = sched_find_first_bit(array->bitmap); 2983 idx = sched_find_first_bit(array->bitmap);
2977 queue = array->queue + idx; 2984 queue = array->queue + idx;
2978 next = list_entry(queue->next, task_t, run_list); 2985 next = list_entry(queue->next, task_t, run_list);
2979 2986
2980 if (!rt_task(next) && next->activated > 0) { 2987 if (!rt_task(next) && next->activated > 0) {
2981 unsigned long long delta = now - next->timestamp; 2988 unsigned long long delta = now - next->timestamp;
2982 if (unlikely((long long)(now - next->timestamp) < 0)) 2989 if (unlikely((long long)(now - next->timestamp) < 0))
2983 delta = 0; 2990 delta = 0;
2984 2991
2985 if (next->activated == 1) 2992 if (next->activated == 1)
2986 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2993 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2987 2994
2988 array = next->array; 2995 array = next->array;
2989 new_prio = recalc_task_prio(next, next->timestamp + delta); 2996 new_prio = recalc_task_prio(next, next->timestamp + delta);
2990 2997
2991 if (unlikely(next->prio != new_prio)) { 2998 if (unlikely(next->prio != new_prio)) {
2992 dequeue_task(next, array); 2999 dequeue_task(next, array);
2993 next->prio = new_prio; 3000 next->prio = new_prio;
2994 enqueue_task(next, array); 3001 enqueue_task(next, array);
2995 } else 3002 } else
2996 requeue_task(next, array); 3003 requeue_task(next, array);
2997 } 3004 }
2998 next->activated = 0; 3005 next->activated = 0;
2999 switch_tasks: 3006 switch_tasks:
3000 if (next == rq->idle) 3007 if (next == rq->idle)
3001 schedstat_inc(rq, sched_goidle); 3008 schedstat_inc(rq, sched_goidle);
3002 prefetch(next); 3009 prefetch(next);
3003 prefetch_stack(next); 3010 prefetch_stack(next);
3004 clear_tsk_need_resched(prev); 3011 clear_tsk_need_resched(prev);
3005 rcu_qsctr_inc(task_cpu(prev)); 3012 rcu_qsctr_inc(task_cpu(prev));
3006 3013
3007 update_cpu_clock(prev, rq, now); 3014 update_cpu_clock(prev, rq, now);
3008 3015
3009 prev->sleep_avg -= run_time; 3016 prev->sleep_avg -= run_time;
3010 if ((long)prev->sleep_avg <= 0) 3017 if ((long)prev->sleep_avg <= 0)
3011 prev->sleep_avg = 0; 3018 prev->sleep_avg = 0;
3012 prev->timestamp = prev->last_ran = now; 3019 prev->timestamp = prev->last_ran = now;
3013 3020
3014 sched_info_switch(prev, next); 3021 sched_info_switch(prev, next);
3015 if (likely(prev != next)) { 3022 if (likely(prev != next)) {
3016 next->timestamp = now; 3023 next->timestamp = now;
3017 rq->nr_switches++; 3024 rq->nr_switches++;
3018 rq->curr = next; 3025 rq->curr = next;
3019 ++*switch_count; 3026 ++*switch_count;
3020 3027
3021 prepare_task_switch(rq, next); 3028 prepare_task_switch(rq, next);
3022 prev = context_switch(rq, prev, next); 3029 prev = context_switch(rq, prev, next);
3023 barrier(); 3030 barrier();
3024 /* 3031 /*
3025 * this_rq must be evaluated again because prev may have moved 3032 * this_rq must be evaluated again because prev may have moved
3026 * CPUs since it called schedule(), thus the 'rq' on its stack 3033 * CPUs since it called schedule(), thus the 'rq' on its stack
3027 * frame will be invalid. 3034 * frame will be invalid.
3028 */ 3035 */
3029 finish_task_switch(this_rq(), prev); 3036 finish_task_switch(this_rq(), prev);
3030 } else 3037 } else
3031 spin_unlock_irq(&rq->lock); 3038 spin_unlock_irq(&rq->lock);
3032 3039
3033 prev = current; 3040 prev = current;
3034 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3041 if (unlikely(reacquire_kernel_lock(prev) < 0))
3035 goto need_resched_nonpreemptible; 3042 goto need_resched_nonpreemptible;
3036 preempt_enable_no_resched(); 3043 preempt_enable_no_resched();
3037 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3044 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3038 goto need_resched; 3045 goto need_resched;
3039 } 3046 }
3040 3047
3041 EXPORT_SYMBOL(schedule); 3048 EXPORT_SYMBOL(schedule);
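schedule() is normally reached from a wait loop rather than called in isolation. A minimal sketch of the canonical pattern, assuming a hypothetical driver condition my_event_ready and wait queue my_wq (neither name appears in this file):

    #include <linux/sched.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);      /* hypothetical wait queue */
    static int my_event_ready;                  /* hypothetical condition  */

    static int wait_for_my_event(void)
    {
            DEFINE_WAIT(wait);

            for (;;) {
                    /* Register on the queue and set the task state before
                     * testing the condition, so a wakeup that races with
                     * the check is not lost. */
                    prepare_to_wait(&my_wq, &wait, TASK_INTERRUPTIBLE);
                    if (my_event_ready)
                            break;
                    if (signal_pending(current)) {
                            finish_wait(&my_wq, &wait);
                            return -ERESTARTSYS;
                    }
                    schedule();                 /* sleep until woken up */
            }
            finish_wait(&my_wq, &wait);
            return 0;
    }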
3042 3049
3043 #ifdef CONFIG_PREEMPT 3050 #ifdef CONFIG_PREEMPT
3044 /* 3051 /*
3045 * this is the entry point to schedule() from in-kernel preemption 3052 * this is the entry point to schedule() from in-kernel preemption
3046 * off of preempt_enable. Kernel preemptions off return from interrupt 3053 * off of preempt_enable. Kernel preemptions off return from interrupt
3047 * occur there and call schedule directly. 3054 * occur there and call schedule directly.
3048 */ 3055 */
3049 asmlinkage void __sched preempt_schedule(void) 3056 asmlinkage void __sched preempt_schedule(void)
3050 { 3057 {
3051 struct thread_info *ti = current_thread_info(); 3058 struct thread_info *ti = current_thread_info();
3052 #ifdef CONFIG_PREEMPT_BKL 3059 #ifdef CONFIG_PREEMPT_BKL
3053 struct task_struct *task = current; 3060 struct task_struct *task = current;
3054 int saved_lock_depth; 3061 int saved_lock_depth;
3055 #endif 3062 #endif
3056 /* 3063 /*
3057 * If there is a non-zero preempt_count or interrupts are disabled, 3064 * If there is a non-zero preempt_count or interrupts are disabled,
3058 * we do not want to preempt the current task. Just return.. 3065 * we do not want to preempt the current task. Just return..
3059 */ 3066 */
3060 if (unlikely(ti->preempt_count || irqs_disabled())) 3067 if (unlikely(ti->preempt_count || irqs_disabled()))
3061 return; 3068 return;
3062 3069
3063 need_resched: 3070 need_resched:
3064 add_preempt_count(PREEMPT_ACTIVE); 3071 add_preempt_count(PREEMPT_ACTIVE);
3065 /* 3072 /*
3066 * We keep the big kernel semaphore locked, but we 3073 * We keep the big kernel semaphore locked, but we
3067 * clear ->lock_depth so that schedule() doesn't 3074 * clear ->lock_depth so that schedule() doesn't
3068 * auto-release the semaphore: 3075 * auto-release the semaphore:
3069 */ 3076 */
3070 #ifdef CONFIG_PREEMPT_BKL 3077 #ifdef CONFIG_PREEMPT_BKL
3071 saved_lock_depth = task->lock_depth; 3078 saved_lock_depth = task->lock_depth;
3072 task->lock_depth = -1; 3079 task->lock_depth = -1;
3073 #endif 3080 #endif
3074 schedule(); 3081 schedule();
3075 #ifdef CONFIG_PREEMPT_BKL 3082 #ifdef CONFIG_PREEMPT_BKL
3076 task->lock_depth = saved_lock_depth; 3083 task->lock_depth = saved_lock_depth;
3077 #endif 3084 #endif
3078 sub_preempt_count(PREEMPT_ACTIVE); 3085 sub_preempt_count(PREEMPT_ACTIVE);
3079 3086
3080 /* we could miss a preemption opportunity between schedule and now */ 3087 /* we could miss a preemption opportunity between schedule and now */
3081 barrier(); 3088 barrier();
3082 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3089 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3083 goto need_resched; 3090 goto need_resched;
3084 } 3091 }
3085 3092
3086 EXPORT_SYMBOL(preempt_schedule); 3093 EXPORT_SYMBOL(preempt_schedule);
3087 3094
3088 /* 3095 /*
3089 * this is the entry point to schedule() from kernel preemption 3096 * this is the entry point to schedule() from kernel preemption
3090 * off of irq context. 3097 * off of irq context.
3091 * Note that this is called and returns with irqs disabled. This will 3098 * Note that this is called and returns with irqs disabled. This will
3092 * protect us against recursive calling from irq. 3099 * protect us against recursive calling from irq.
3093 */ 3100 */
3094 asmlinkage void __sched preempt_schedule_irq(void) 3101 asmlinkage void __sched preempt_schedule_irq(void)
3095 { 3102 {
3096 struct thread_info *ti = current_thread_info(); 3103 struct thread_info *ti = current_thread_info();
3097 #ifdef CONFIG_PREEMPT_BKL 3104 #ifdef CONFIG_PREEMPT_BKL
3098 struct task_struct *task = current; 3105 struct task_struct *task = current;
3099 int saved_lock_depth; 3106 int saved_lock_depth;
3100 #endif 3107 #endif
3101 /* Catch callers which need to be fixed */ 3108 /* Catch callers which need to be fixed */
3102 BUG_ON(ti->preempt_count || !irqs_disabled()); 3109 BUG_ON(ti->preempt_count || !irqs_disabled());
3103 3110
3104 need_resched: 3111 need_resched:
3105 add_preempt_count(PREEMPT_ACTIVE); 3112 add_preempt_count(PREEMPT_ACTIVE);
3106 /* 3113 /*
3107 * We keep the big kernel semaphore locked, but we 3114 * We keep the big kernel semaphore locked, but we
3108 * clear ->lock_depth so that schedule() doesn't 3115 * clear ->lock_depth so that schedule() doesn't
3109 * auto-release the semaphore: 3116 * auto-release the semaphore:
3110 */ 3117 */
3111 #ifdef CONFIG_PREEMPT_BKL 3118 #ifdef CONFIG_PREEMPT_BKL
3112 saved_lock_depth = task->lock_depth; 3119 saved_lock_depth = task->lock_depth;
3113 task->lock_depth = -1; 3120 task->lock_depth = -1;
3114 #endif 3121 #endif
3115 local_irq_enable(); 3122 local_irq_enable();
3116 schedule(); 3123 schedule();
3117 local_irq_disable(); 3124 local_irq_disable();
3118 #ifdef CONFIG_PREEMPT_BKL 3125 #ifdef CONFIG_PREEMPT_BKL
3119 task->lock_depth = saved_lock_depth; 3126 task->lock_depth = saved_lock_depth;
3120 #endif 3127 #endif
3121 sub_preempt_count(PREEMPT_ACTIVE); 3128 sub_preempt_count(PREEMPT_ACTIVE);
3122 3129
3123 /* we could miss a preemption opportunity between schedule and now */ 3130 /* we could miss a preemption opportunity between schedule and now */
3124 barrier(); 3131 barrier();
3125 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3132 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3126 goto need_resched; 3133 goto need_resched;
3127 } 3134 }
3128 3135
3129 #endif /* CONFIG_PREEMPT */ 3136 #endif /* CONFIG_PREEMPT */
3130 3137
3131 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3138 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3132 void *key) 3139 void *key)
3133 { 3140 {
3134 task_t *p = curr->private; 3141 task_t *p = curr->private;
3135 return try_to_wake_up(p, mode, sync); 3142 return try_to_wake_up(p, mode, sync);
3136 } 3143 }
3137 3144
3138 EXPORT_SYMBOL(default_wake_function); 3145 EXPORT_SYMBOL(default_wake_function);
3139 3146
3140 /* 3147 /*
3141 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3148 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3142 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3149 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3143 * number) then we wake all the non-exclusive tasks and one exclusive task. 3150 * number) then we wake all the non-exclusive tasks and one exclusive task.
3144 * 3151 *
3145 * There are circumstances in which we can try to wake a task which has already 3152 * There are circumstances in which we can try to wake a task which has already
3146 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3153 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3147 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3154 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3148 */ 3155 */
3149 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3156 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3150 int nr_exclusive, int sync, void *key) 3157 int nr_exclusive, int sync, void *key)
3151 { 3158 {
3152 struct list_head *tmp, *next; 3159 struct list_head *tmp, *next;
3153 3160
3154 list_for_each_safe(tmp, next, &q->task_list) { 3161 list_for_each_safe(tmp, next, &q->task_list) {
3155 wait_queue_t *curr; 3162 wait_queue_t *curr;
3156 unsigned flags; 3163 unsigned flags;
3157 curr = list_entry(tmp, wait_queue_t, task_list); 3164 curr = list_entry(tmp, wait_queue_t, task_list);
3158 flags = curr->flags; 3165 flags = curr->flags;
3159 if (curr->func(curr, mode, sync, key) && 3166 if (curr->func(curr, mode, sync, key) &&
3160 (flags & WQ_FLAG_EXCLUSIVE) && 3167 (flags & WQ_FLAG_EXCLUSIVE) &&
3161 !--nr_exclusive) 3168 !--nr_exclusive)
3162 break; 3169 break;
3163 } 3170 }
3164 } 3171 }
3165 3172
3166 /** 3173 /**
3167 * __wake_up - wake up threads blocked on a waitqueue. 3174 * __wake_up - wake up threads blocked on a waitqueue.
3168 * @q: the waitqueue 3175 * @q: the waitqueue
3169 * @mode: which threads 3176 * @mode: which threads
3170 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3177 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3171 * @key: is directly passed to the wakeup function 3178 * @key: is directly passed to the wakeup function
3172 */ 3179 */
3173 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3180 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3174 int nr_exclusive, void *key) 3181 int nr_exclusive, void *key)
3175 { 3182 {
3176 unsigned long flags; 3183 unsigned long flags;
3177 3184
3178 spin_lock_irqsave(&q->lock, flags); 3185 spin_lock_irqsave(&q->lock, flags);
3179 __wake_up_common(q, mode, nr_exclusive, 0, key); 3186 __wake_up_common(q, mode, nr_exclusive, 0, key);
3180 spin_unlock_irqrestore(&q->lock, flags); 3187 spin_unlock_irqrestore(&q->lock, flags);
3181 } 3188 }
3182 3189
3183 EXPORT_SYMBOL(__wake_up); 3190 EXPORT_SYMBOL(__wake_up);
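The waker side typically sets the condition and then calls one of the wake_up*() wrappers, all of which funnel into __wake_up(). A minimal sketch, reusing the hypothetical my_wq/my_event_ready names from the sketch after schedule() above:

    static void signal_my_event(void)
    {
            my_event_ready = 1;
            /* Non-exclusive wakeup: every TASK_INTERRUPTIBLE waiter on the
             * queue is woken; nr_exclusive (1 here) only limits waiters that
             * set WQ_FLAG_EXCLUSIVE, and the waiter sketch registers none.
             * The wait-queue lock taken inside __wake_up() orders the store
             * to my_event_ready against the waiter's re-check. */
            wake_up_interruptible(&my_wq);
    }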
3184 3191
3185 /* 3192 /*
3186 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3193 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3187 */ 3194 */
3188 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3195 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3189 { 3196 {
3190 __wake_up_common(q, mode, 1, 0, NULL); 3197 __wake_up_common(q, mode, 1, 0, NULL);
3191 } 3198 }
3192 3199
3193 /** 3200 /**
3194 * __wake_up_sync - wake up threads blocked on a waitqueue. 3201 * __wake_up_sync - wake up threads blocked on a waitqueue.
3195 * @q: the waitqueue 3202 * @q: the waitqueue
3196 * @mode: which threads 3203 * @mode: which threads
3197 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3204 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3198 * 3205 *
3199 * The sync wakeup differs in that the waker knows that it will schedule 3206 * The sync wakeup differs in that the waker knows that it will schedule
3200 * away soon, so while the target thread will be woken up, it will not 3207 * away soon, so while the target thread will be woken up, it will not
3201 * be migrated to another CPU - ie. the two threads are 'synchronized' 3208 * be migrated to another CPU - ie. the two threads are 'synchronized'
3202 * with each other. This can prevent needless bouncing between CPUs. 3209 * with each other. This can prevent needless bouncing between CPUs.
3203 * 3210 *
3204 * On UP it can prevent extra preemption. 3211 * On UP it can prevent extra preemption.
3205 */ 3212 */
3206 void fastcall 3213 void fastcall
3207 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3214 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3208 { 3215 {
3209 unsigned long flags; 3216 unsigned long flags;
3210 int sync = 1; 3217 int sync = 1;
3211 3218
3212 if (unlikely(!q)) 3219 if (unlikely(!q))
3213 return; 3220 return;
3214 3221
3215 if (unlikely(!nr_exclusive)) 3222 if (unlikely(!nr_exclusive))
3216 sync = 0; 3223 sync = 0;
3217 3224
3218 spin_lock_irqsave(&q->lock, flags); 3225 spin_lock_irqsave(&q->lock, flags);
3219 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 3226 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3220 spin_unlock_irqrestore(&q->lock, flags); 3227 spin_unlock_irqrestore(&q->lock, flags);
3221 } 3228 }
3222 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3229 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3223 3230
3224 void fastcall complete(struct completion *x) 3231 void fastcall complete(struct completion *x)
3225 { 3232 {
3226 unsigned long flags; 3233 unsigned long flags;
3227 3234
3228 spin_lock_irqsave(&x->wait.lock, flags); 3235 spin_lock_irqsave(&x->wait.lock, flags);
3229 x->done++; 3236 x->done++;
3230 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3237 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3231 1, 0, NULL); 3238 1, 0, NULL);
3232 spin_unlock_irqrestore(&x->wait.lock, flags); 3239 spin_unlock_irqrestore(&x->wait.lock, flags);
3233 } 3240 }
3234 EXPORT_SYMBOL(complete); 3241 EXPORT_SYMBOL(complete);
3235 3242
3236 void fastcall complete_all(struct completion *x) 3243 void fastcall complete_all(struct completion *x)
3237 { 3244 {
3238 unsigned long flags; 3245 unsigned long flags;
3239 3246
3240 spin_lock_irqsave(&x->wait.lock, flags); 3247 spin_lock_irqsave(&x->wait.lock, flags);
3241 x->done += UINT_MAX/2; 3248 x->done += UINT_MAX/2;
3242 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3249 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3243 0, 0, NULL); 3250 0, 0, NULL);
3244 spin_unlock_irqrestore(&x->wait.lock, flags); 3251 spin_unlock_irqrestore(&x->wait.lock, flags);
3245 } 3252 }
3246 EXPORT_SYMBOL(complete_all); 3253 EXPORT_SYMBOL(complete_all);
3247 3254
3248 void fastcall __sched wait_for_completion(struct completion *x) 3255 void fastcall __sched wait_for_completion(struct completion *x)
3249 { 3256 {
3250 might_sleep(); 3257 might_sleep();
3251 spin_lock_irq(&x->wait.lock); 3258 spin_lock_irq(&x->wait.lock);
3252 if (!x->done) { 3259 if (!x->done) {
3253 DECLARE_WAITQUEUE(wait, current); 3260 DECLARE_WAITQUEUE(wait, current);
3254 3261
3255 wait.flags |= WQ_FLAG_EXCLUSIVE; 3262 wait.flags |= WQ_FLAG_EXCLUSIVE;
3256 __add_wait_queue_tail(&x->wait, &wait); 3263 __add_wait_queue_tail(&x->wait, &wait);
3257 do { 3264 do {
3258 __set_current_state(TASK_UNINTERRUPTIBLE); 3265 __set_current_state(TASK_UNINTERRUPTIBLE);
3259 spin_unlock_irq(&x->wait.lock); 3266 spin_unlock_irq(&x->wait.lock);
3260 schedule(); 3267 schedule();
3261 spin_lock_irq(&x->wait.lock); 3268 spin_lock_irq(&x->wait.lock);
3262 } while (!x->done); 3269 } while (!x->done);
3263 __remove_wait_queue(&x->wait, &wait); 3270 __remove_wait_queue(&x->wait, &wait);
3264 } 3271 }
3265 x->done--; 3272 x->done--;
3266 spin_unlock_irq(&x->wait.lock); 3273 spin_unlock_irq(&x->wait.lock);
3267 } 3274 }
3268 EXPORT_SYMBOL(wait_for_completion); 3275 EXPORT_SYMBOL(wait_for_completion);
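A minimal sketch of the completion API defined here, assuming a hypothetical worker started with kthread_run(); the names setup_done, worker_fn and start_and_wait are illustrative and not taken from this file:

    #include <linux/completion.h>
    #include <linux/kthread.h>
    #include <linux/err.h>

    static DECLARE_COMPLETION(setup_done);      /* hypothetical completion */

    static int worker_fn(void *data)
    {
            /* ... perform one-time initialisation ... */
            complete(&setup_done);              /* wake exactly one waiter */
            return 0;
    }

    static int start_and_wait(void)
    {
            struct task_struct *tsk = kthread_run(worker_fn, NULL, "worker");

            if (IS_ERR(tsk))
                    return PTR_ERR(tsk);
            wait_for_completion(&setup_done);   /* sleeps TASK_UNINTERRUPTIBLE */
            return 0;
    }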
3269 3276
3270 unsigned long fastcall __sched 3277 unsigned long fastcall __sched
3271 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3278 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3272 { 3279 {
3273 might_sleep(); 3280 might_sleep();
3274 3281
3275 spin_lock_irq(&x->wait.lock); 3282 spin_lock_irq(&x->wait.lock);
3276 if (!x->done) { 3283 if (!x->done) {
3277 DECLARE_WAITQUEUE(wait, current); 3284 DECLARE_WAITQUEUE(wait, current);
3278 3285
3279 wait.flags |= WQ_FLAG_EXCLUSIVE; 3286 wait.flags |= WQ_FLAG_EXCLUSIVE;
3280 __add_wait_queue_tail(&x->wait, &wait); 3287 __add_wait_queue_tail(&x->wait, &wait);
3281 do { 3288 do {
3282 __set_current_state(TASK_UNINTERRUPTIBLE); 3289 __set_current_state(TASK_UNINTERRUPTIBLE);
3283 spin_unlock_irq(&x->wait.lock); 3290 spin_unlock_irq(&x->wait.lock);
3284 timeout = schedule_timeout(timeout); 3291 timeout = schedule_timeout(timeout);
3285 spin_lock_irq(&x->wait.lock); 3292 spin_lock_irq(&x->wait.lock);
3286 if (!timeout) { 3293 if (!timeout) {
3287 __remove_wait_queue(&x->wait, &wait); 3294 __remove_wait_queue(&x->wait, &wait);
3288 goto out; 3295 goto out;
3289 } 3296 }
3290 } while (!x->done); 3297 } while (!x->done);
3291 __remove_wait_queue(&x->wait, &wait); 3298 __remove_wait_queue(&x->wait, &wait);
3292 } 3299 }
3293 x->done--; 3300 x->done--;
3294 out: 3301 out:
3295 spin_unlock_irq(&x->wait.lock); 3302 spin_unlock_irq(&x->wait.lock);
3296 return timeout; 3303 return timeout;
3297 } 3304 }
3298 EXPORT_SYMBOL(wait_for_completion_timeout); 3305 EXPORT_SYMBOL(wait_for_completion_timeout);
3299 3306
3300 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3307 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3301 { 3308 {
3302 int ret = 0; 3309 int ret = 0;
3303 3310
3304 might_sleep(); 3311 might_sleep();
3305 3312
3306 spin_lock_irq(&x->wait.lock); 3313 spin_lock_irq(&x->wait.lock);
3307 if (!x->done) { 3314 if (!x->done) {
3308 DECLARE_WAITQUEUE(wait, current); 3315 DECLARE_WAITQUEUE(wait, current);
3309 3316
3310 wait.flags |= WQ_FLAG_EXCLUSIVE; 3317 wait.flags |= WQ_FLAG_EXCLUSIVE;
3311 __add_wait_queue_tail(&x->wait, &wait); 3318 __add_wait_queue_tail(&x->wait, &wait);
3312 do { 3319 do {
3313 if (signal_pending(current)) { 3320 if (signal_pending(current)) {
3314 ret = -ERESTARTSYS; 3321 ret = -ERESTARTSYS;
3315 __remove_wait_queue(&x->wait, &wait); 3322 __remove_wait_queue(&x->wait, &wait);
3316 goto out; 3323 goto out;
3317 } 3324 }
3318 __set_current_state(TASK_INTERRUPTIBLE); 3325 __set_current_state(TASK_INTERRUPTIBLE);
3319 spin_unlock_irq(&x->wait.lock); 3326 spin_unlock_irq(&x->wait.lock);
3320 schedule(); 3327 schedule();
3321 spin_lock_irq(&x->wait.lock); 3328 spin_lock_irq(&x->wait.lock);
3322 } while (!x->done); 3329 } while (!x->done);
3323 __remove_wait_queue(&x->wait, &wait); 3330 __remove_wait_queue(&x->wait, &wait);
3324 } 3331 }
3325 x->done--; 3332 x->done--;
3326 out: 3333 out:
3327 spin_unlock_irq(&x->wait.lock); 3334 spin_unlock_irq(&x->wait.lock);
3328 3335
3329 return ret; 3336 return ret;
3330 } 3337 }
3331 EXPORT_SYMBOL(wait_for_completion_interruptible); 3338 EXPORT_SYMBOL(wait_for_completion_interruptible);
3332 3339
3333 unsigned long fastcall __sched 3340 unsigned long fastcall __sched
3334 wait_for_completion_interruptible_timeout(struct completion *x, 3341 wait_for_completion_interruptible_timeout(struct completion *x,
3335 unsigned long timeout) 3342 unsigned long timeout)
3336 { 3343 {
3337 might_sleep(); 3344 might_sleep();
3338 3345
3339 spin_lock_irq(&x->wait.lock); 3346 spin_lock_irq(&x->wait.lock);
3340 if (!x->done) { 3347 if (!x->done) {
3341 DECLARE_WAITQUEUE(wait, current); 3348 DECLARE_WAITQUEUE(wait, current);
3342 3349
3343 wait.flags |= WQ_FLAG_EXCLUSIVE; 3350 wait.flags |= WQ_FLAG_EXCLUSIVE;
3344 __add_wait_queue_tail(&x->wait, &wait); 3351 __add_wait_queue_tail(&x->wait, &wait);
3345 do { 3352 do {
3346 if (signal_pending(current)) { 3353 if (signal_pending(current)) {
3347 timeout = -ERESTARTSYS; 3354 timeout = -ERESTARTSYS;
3348 __remove_wait_queue(&x->wait, &wait); 3355 __remove_wait_queue(&x->wait, &wait);
3349 goto out; 3356 goto out;
3350 } 3357 }
3351 __set_current_state(TASK_INTERRUPTIBLE); 3358 __set_current_state(TASK_INTERRUPTIBLE);
3352 spin_unlock_irq(&x->wait.lock); 3359 spin_unlock_irq(&x->wait.lock);
3353 timeout = schedule_timeout(timeout); 3360 timeout = schedule_timeout(timeout);
3354 spin_lock_irq(&x->wait.lock); 3361 spin_lock_irq(&x->wait.lock);
3355 if (!timeout) { 3362 if (!timeout) {
3356 __remove_wait_queue(&x->wait, &wait); 3363 __remove_wait_queue(&x->wait, &wait);
3357 goto out; 3364 goto out;
3358 } 3365 }
3359 } while (!x->done); 3366 } while (!x->done);
3360 __remove_wait_queue(&x->wait, &wait); 3367 __remove_wait_queue(&x->wait, &wait);
3361 } 3368 }
3362 x->done--; 3369 x->done--;
3363 out: 3370 out:
3364 spin_unlock_irq(&x->wait.lock); 3371 spin_unlock_irq(&x->wait.lock);
3365 return timeout; 3372 return timeout;
3366 } 3373 }
3367 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3374 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
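The timeout/interruptible variants fold three outcomes into a single return value: 0 on timeout, a negative errno cast into the unsigned return on signal, and the remaining jiffies on completion. A minimal sketch of how a caller would typically distinguish them, reusing the hypothetical setup_done completion from the previous sketch and assuming msecs_to_jiffies() is available:

    static int wait_for_setup(void)
    {
            unsigned long ret;

            ret = wait_for_completion_interruptible_timeout(&setup_done,
                                                    msecs_to_jiffies(500));
            if (ret == 0)
                    return -ETIMEDOUT;          /* timed out, not completed   */
            if ((long)ret < 0)
                    return (long)ret;           /* -ERESTARTSYS: signal arrived */
            return 0;                           /* completed; ret = jiffies left */
    }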
3368 3375
3369 3376
3370 #define SLEEP_ON_VAR \ 3377 #define SLEEP_ON_VAR \
3371 unsigned long flags; \ 3378 unsigned long flags; \
3372 wait_queue_t wait; \ 3379 wait_queue_t wait; \
3373 init_waitqueue_entry(&wait, current); 3380 init_waitqueue_entry(&wait, current);
3374 3381
3375 #define SLEEP_ON_HEAD \ 3382 #define SLEEP_ON_HEAD \
3376 spin_lock_irqsave(&q->lock,flags); \ 3383 spin_lock_irqsave(&q->lock,flags); \
3377 __add_wait_queue(q, &wait); \ 3384 __add_wait_queue(q, &wait); \
3378 spin_unlock(&q->lock); 3385 spin_unlock(&q->lock);
3379 3386
3380 #define SLEEP_ON_TAIL \ 3387 #define SLEEP_ON_TAIL \
3381 spin_lock_irq(&q->lock); \ 3388 spin_lock_irq(&q->lock); \
3382 __remove_wait_queue(q, &wait); \ 3389 __remove_wait_queue(q, &wait); \
3383 spin_unlock_irqrestore(&q->lock, flags); 3390 spin_unlock_irqrestore(&q->lock, flags);
3384 3391
3385 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3392 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3386 { 3393 {
3387 SLEEP_ON_VAR 3394 SLEEP_ON_VAR
3388 3395
3389 current->state = TASK_INTERRUPTIBLE; 3396 current->state = TASK_INTERRUPTIBLE;
3390 3397
3391 SLEEP_ON_HEAD 3398 SLEEP_ON_HEAD
3392 schedule(); 3399 schedule();
3393 SLEEP_ON_TAIL 3400 SLEEP_ON_TAIL
3394 } 3401 }
3395 3402
3396 EXPORT_SYMBOL(interruptible_sleep_on); 3403 EXPORT_SYMBOL(interruptible_sleep_on);
3397 3404
3398 long fastcall __sched 3405 long fastcall __sched
3399 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3406 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3400 { 3407 {
3401 SLEEP_ON_VAR 3408 SLEEP_ON_VAR
3402 3409
3403 current->state = TASK_INTERRUPTIBLE; 3410 current->state = TASK_INTERRUPTIBLE;
3404 3411
3405 SLEEP_ON_HEAD 3412 SLEEP_ON_HEAD
3406 timeout = schedule_timeout(timeout); 3413 timeout = schedule_timeout(timeout);
3407 SLEEP_ON_TAIL 3414 SLEEP_ON_TAIL
3408 3415
3409 return timeout; 3416 return timeout;
3410 } 3417 }
3411 3418
3412 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3419 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3413 3420
3414 void fastcall __sched sleep_on(wait_queue_head_t *q) 3421 void fastcall __sched sleep_on(wait_queue_head_t *q)
3415 { 3422 {
3416 SLEEP_ON_VAR 3423 SLEEP_ON_VAR
3417 3424
3418 current->state = TASK_UNINTERRUPTIBLE; 3425 current->state = TASK_UNINTERRUPTIBLE;
3419 3426
3420 SLEEP_ON_HEAD 3427 SLEEP_ON_HEAD
3421 schedule(); 3428 schedule();
3422 SLEEP_ON_TAIL 3429 SLEEP_ON_TAIL
3423 } 3430 }
3424 3431
3425 EXPORT_SYMBOL(sleep_on); 3432 EXPORT_SYMBOL(sleep_on);
3426 3433
3427 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3434 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3428 { 3435 {
3429 SLEEP_ON_VAR 3436 SLEEP_ON_VAR
3430 3437
3431 current->state = TASK_UNINTERRUPTIBLE; 3438 current->state = TASK_UNINTERRUPTIBLE;
3432 3439
3433 SLEEP_ON_HEAD 3440 SLEEP_ON_HEAD
3434 timeout = schedule_timeout(timeout); 3441 timeout = schedule_timeout(timeout);
3435 SLEEP_ON_TAIL 3442 SLEEP_ON_TAIL
3436 3443
3437 return timeout; 3444 return timeout;
3438 } 3445 }
3439 3446
3440 EXPORT_SYMBOL(sleep_on_timeout); 3447 EXPORT_SYMBOL(sleep_on_timeout);
3441 3448
3442 void set_user_nice(task_t *p, long nice) 3449 void set_user_nice(task_t *p, long nice)
3443 { 3450 {
3444 unsigned long flags; 3451 unsigned long flags;
3445 prio_array_t *array; 3452 prio_array_t *array;
3446 runqueue_t *rq; 3453 runqueue_t *rq;
3447 int old_prio, new_prio, delta; 3454 int old_prio, new_prio, delta;
3448 3455
3449 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3456 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3450 return; 3457 return;
3451 /* 3458 /*
3452 * We have to be careful, if called from sys_setpriority(), 3459 * We have to be careful, if called from sys_setpriority(),
3453 * the task might be in the middle of scheduling on another CPU. 3460 * the task might be in the middle of scheduling on another CPU.
3454 */ 3461 */
3455 rq = task_rq_lock(p, &flags); 3462 rq = task_rq_lock(p, &flags);
3456 /* 3463 /*
3457 * The RT priorities are set via sched_setscheduler(), but we still 3464 * The RT priorities are set via sched_setscheduler(), but we still
3458 * allow the 'normal' nice value to be set - but as expected 3465 * allow the 'normal' nice value to be set - but as expected
3459 * it won't have any effect on scheduling until the task is 3466 * it won't have any effect on scheduling until the task is
3460 * SCHED_NORMAL/SCHED_BATCH again: 3467 * SCHED_NORMAL/SCHED_BATCH again:
3461 */ 3468 */
3462 if (rt_task(p)) { 3469 if (rt_task(p)) {
3463 p->static_prio = NICE_TO_PRIO(nice); 3470 p->static_prio = NICE_TO_PRIO(nice);
3464 goto out_unlock; 3471 goto out_unlock;
3465 } 3472 }
3466 array = p->array; 3473 array = p->array;
3467 if (array) 3474 if (array)
3468 dequeue_task(p, array); 3475 dequeue_task(p, array);
3469 3476
3470 old_prio = p->prio; 3477 old_prio = p->prio;
3471 new_prio = NICE_TO_PRIO(nice); 3478 new_prio = NICE_TO_PRIO(nice);
3472 delta = new_prio - old_prio; 3479 delta = new_prio - old_prio;
3473 p->static_prio = NICE_TO_PRIO(nice); 3480 p->static_prio = NICE_TO_PRIO(nice);
3474 p->prio += delta; 3481 p->prio += delta;
3475 3482
3476 if (array) { 3483 if (array) {
3477 enqueue_task(p, array); 3484 enqueue_task(p, array);
3478 /* 3485 /*
3479 * If the task increased its priority or is running and 3486 * If the task increased its priority or is running and
3480 * lowered its priority, then reschedule its CPU: 3487 * lowered its priority, then reschedule its CPU:
3481 */ 3488 */
3482 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3489 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3483 resched_task(rq->curr); 3490 resched_task(rq->curr);
3484 } 3491 }
3485 out_unlock: 3492 out_unlock:
3486 task_rq_unlock(rq, &flags); 3493 task_rq_unlock(rq, &flags);
3487 } 3494 }
3488 3495
3489 EXPORT_SYMBOL(set_user_nice); 3496 EXPORT_SYMBOL(set_user_nice);
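set_user_nice() is the in-kernel way to renice a task without going through sys_nice(). A minimal sketch, assuming a hypothetical housekeeping kthread that deprioritises itself to nice +10 (the function name and nice value are illustrative):

    #include <linux/sched.h>
    #include <linux/kthread.h>

    static int background_thread(void *unused)
    {
            set_user_nice(current, 10);         /* lower this kthread's priority */

            while (!kthread_should_stop()) {
                    /* ... low-priority housekeeping ... */
                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule_timeout(HZ);       /* nap for about one second */
            }
            return 0;
    }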
3490 3497
3491 /* 3498 /*
3492 * can_nice - check if a task can reduce its nice value 3499 * can_nice - check if a task can reduce its nice value
3493 * @p: task 3500 * @p: task
3494 * @nice: nice value 3501 * @nice: nice value
3495 */ 3502 */
3496 int can_nice(const task_t *p, const int nice) 3503 int can_nice(const task_t *p, const int nice)
3497 { 3504 {
3498 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3505 /* convert nice value [19,-20] to rlimit style value [1,40] */
3499 int nice_rlim = 20 - nice; 3506 int nice_rlim = 20 - nice;
3500 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3507 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3501 capable(CAP_SYS_NICE)); 3508 capable(CAP_SYS_NICE));
3502 } 3509 }
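The [19,-20] to [1,40] conversion in can_nice() is easier to see with concrete numbers. A short worked illustration (the rlim_cur value is an example, not from this diff):

    /* nice_rlim = 20 - nice
     *   nice  19  ->  nice_rlim  1   (least privileged request)
     *   nice   0  ->  nice_rlim 20
     *   nice -20  ->  nice_rlim 40   (most privileged request)
     * The request is allowed when nice_rlim <= RLIMIT_NICE's rlim_cur,
     * so e.g. rlim_cur == 30 permits any nice value down to -10. */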
3503 3510
3504 #ifdef __ARCH_WANT_SYS_NICE 3511 #ifdef __ARCH_WANT_SYS_NICE
3505 3512
3506 /* 3513 /*
3507 * sys_nice - change the priority of the current process. 3514 * sys_nice - change the priority of the current process.
3508 * @increment: priority increment 3515 * @increment: priority increment
3509 * 3516 *
3510 * sys_setpriority is a more generic, but much slower function that 3517 * sys_setpriority is a more generic, but much slower function that
3511 * does similar things. 3518 * does similar things.
3512 */ 3519 */
3513 asmlinkage long sys_nice(int increment) 3520 asmlinkage long sys_nice(int increment)
3514 { 3521 {
3515 int retval; 3522 int retval;
3516 long nice; 3523 long nice;
3517 3524
3518 /* 3525 /*
3519 * Setpriority might change our priority at the same moment. 3526 * Setpriority might change our priority at the same moment.
3520 * We don't have to worry. Conceptually one call occurs first 3527 * We don't have to worry. Conceptually one call occurs first
3521 * and we have a single winner. 3528 * and we have a single winner.
3522 */ 3529 */
3523 if (increment < -40) 3530 if (increment < -40)
3524 increment = -40; 3531 increment = -40;
3525 if (increment > 40) 3532 if (increment > 40)
3526 increment = 40; 3533 increment = 40;
3527 3534
3528 nice = PRIO_TO_NICE(current->static_prio) + increment; 3535 nice = PRIO_TO_NICE(current->static_prio) + increment;
3529 if (nice < -20) 3536 if (nice < -20)
3530 nice = -20; 3537 nice = -20;
3531 if (nice > 19) 3538 if (nice > 19)
3532 nice = 19; 3539 nice = 19;
3533 3540
3534 if (increment < 0 && !can_nice(current, nice)) 3541 if (increment < 0 && !can_nice(current, nice))
3535 return -EPERM; 3542 return -EPERM;
3536 3543
3537 retval = security_task_setnice(current, nice); 3544 retval = security_task_setnice(current, nice);
3538 if (retval) 3545 if (retval)
3539 return retval; 3546 return retval;
3540 3547
3541 set_user_nice(current, nice); 3548 set_user_nice(current, nice);
3542 return 0; 3549 return 0;
3543 } 3550 }
3544 3551
3545 #endif 3552 #endif
3546 3553
3547 /** 3554 /**
3548 * task_prio - return the priority value of a given task. 3555 * task_prio - return the priority value of a given task.
3549 * @p: the task in question. 3556 * @p: the task in question.
3550 * 3557 *
3551 * This is the priority value as seen by users in /proc. 3558 * This is the priority value as seen by users in /proc.
3552 * RT tasks are offset by -200. Normal tasks are centered 3559 * RT tasks are offset by -200. Normal tasks are centered
3553 * around 0, value goes from -16 to +15. 3560 * around 0, value goes from -16 to +15.
3554 */ 3561 */
3555 int task_prio(const task_t *p) 3562 int task_prio(const task_t *p)
3556 { 3563 {
3557 return p->prio - MAX_RT_PRIO; 3564 return p->prio - MAX_RT_PRIO;
3558 } 3565 }
3559 3566
3560 /** 3567 /**
3561 * task_nice - return the nice value of a given task. 3568 * task_nice - return the nice value of a given task.
3562 * @p: the task in question. 3569 * @p: the task in question.
3563 */ 3570 */
3564 int task_nice(const task_t *p) 3571 int task_nice(const task_t *p)
3565 { 3572 {
3566 return TASK_NICE(p); 3573 return TASK_NICE(p);
3567 } 3574 }
3568 EXPORT_SYMBOL_GPL(task_nice); 3575 EXPORT_SYMBOL_GPL(task_nice);
3569 3576
3570 /** 3577 /**
3571 * idle_cpu - is a given cpu idle currently? 3578 * idle_cpu - is a given cpu idle currently?
3572 * @cpu: the processor in question. 3579 * @cpu: the processor in question.
3573 */ 3580 */
3574 int idle_cpu(int cpu) 3581 int idle_cpu(int cpu)
3575 { 3582 {
3576 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3583 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3577 } 3584 }
3578 3585
3579 /** 3586 /**
3580 * idle_task - return the idle task for a given cpu. 3587 * idle_task - return the idle task for a given cpu.
3581 * @cpu: the processor in question. 3588 * @cpu: the processor in question.
3582 */ 3589 */
3583 task_t *idle_task(int cpu) 3590 task_t *idle_task(int cpu)
3584 { 3591 {
3585 return cpu_rq(cpu)->idle; 3592 return cpu_rq(cpu)->idle;
3586 } 3593 }
3587 3594
3588 /** 3595 /**
3589 * find_process_by_pid - find a process with a matching PID value. 3596 * find_process_by_pid - find a process with a matching PID value.
3590 * @pid: the pid in question. 3597 * @pid: the pid in question.
3591 */ 3598 */
3592 static inline task_t *find_process_by_pid(pid_t pid) 3599 static inline task_t *find_process_by_pid(pid_t pid)
3593 { 3600 {
3594 return pid ? find_task_by_pid(pid) : current; 3601 return pid ? find_task_by_pid(pid) : current;
3595 } 3602 }
3596 3603
3597 /* Actually do priority change: must hold rq lock. */ 3604 /* Actually do priority change: must hold rq lock. */
3598 static void __setscheduler(struct task_struct *p, int policy, int prio) 3605 static void __setscheduler(struct task_struct *p, int policy, int prio)
3599 { 3606 {
3600 BUG_ON(p->array); 3607 BUG_ON(p->array);
3601 p->policy = policy; 3608 p->policy = policy;
3602 p->rt_priority = prio; 3609 p->rt_priority = prio;
3603 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3610 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
3604 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3611 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
3605 } else { 3612 } else {
3606 p->prio = p->static_prio; 3613 p->prio = p->static_prio;
3607 /* 3614 /*
3608 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3615 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3609 */ 3616 */
3610 if (policy == SCHED_BATCH) 3617 if (policy == SCHED_BATCH)
3611 p->sleep_avg = 0; 3618 p->sleep_avg = 0;
3612 } 3619 }
3613 } 3620 }
3614 3621
3615 /** 3622 /**
3616 * sched_setscheduler - change the scheduling policy and/or RT priority of 3623 * sched_setscheduler - change the scheduling policy and/or RT priority of
3617 * a thread. 3624 * a thread.
3618 * @p: the task in question. 3625 * @p: the task in question.
3619 * @policy: new policy. 3626 * @policy: new policy.
3620 * @param: structure containing the new RT priority. 3627 * @param: structure containing the new RT priority.
3621 */ 3628 */
3622 int sched_setscheduler(struct task_struct *p, int policy, 3629 int sched_setscheduler(struct task_struct *p, int policy,
3623 struct sched_param *param) 3630 struct sched_param *param)
3624 { 3631 {
3625 int retval; 3632 int retval;
3626 int oldprio, oldpolicy = -1; 3633 int oldprio, oldpolicy = -1;
3627 prio_array_t *array; 3634 prio_array_t *array;
3628 unsigned long flags; 3635 unsigned long flags;
3629 runqueue_t *rq; 3636 runqueue_t *rq;
3630 3637
3631 recheck: 3638 recheck:
3632 /* double check policy once rq lock held */ 3639 /* double check policy once rq lock held */
3633 if (policy < 0) 3640 if (policy < 0)
3634 policy = oldpolicy = p->policy; 3641 policy = oldpolicy = p->policy;
3635 else if (policy != SCHED_FIFO && policy != SCHED_RR && 3642 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
3636 policy != SCHED_NORMAL && policy != SCHED_BATCH) 3643 policy != SCHED_NORMAL && policy != SCHED_BATCH)
3637 return -EINVAL; 3644 return -EINVAL;
3638 /* 3645 /*
3639 * Valid priorities for SCHED_FIFO and SCHED_RR are 3646 * Valid priorities for SCHED_FIFO and SCHED_RR are
3640 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 3647 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
3641 * SCHED_BATCH is 0. 3648 * SCHED_BATCH is 0.
3642 */ 3649 */
3643 if (param->sched_priority < 0 || 3650 if (param->sched_priority < 0 ||
3644 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3651 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3645 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3652 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3646 return -EINVAL; 3653 return -EINVAL;
3647 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 3654 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
3648 != (param->sched_priority == 0)) 3655 != (param->sched_priority == 0))
3649 return -EINVAL; 3656 return -EINVAL;
3650 3657
3651 /* 3658 /*
3652 * Allow unprivileged RT tasks to decrease priority: 3659 * Allow unprivileged RT tasks to decrease priority:
3653 */ 3660 */
3654 if (!capable(CAP_SYS_NICE)) { 3661 if (!capable(CAP_SYS_NICE)) {
3655 /* 3662 /*
3656 * can't change policy, except between SCHED_NORMAL 3663 * can't change policy, except between SCHED_NORMAL
3657 * and SCHED_BATCH: 3664 * and SCHED_BATCH:
3658 */ 3665 */
3659 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 3666 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
3660 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 3667 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
3661 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3668 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3662 return -EPERM; 3669 return -EPERM;
3663 /* can't increase priority */ 3670 /* can't increase priority */
3664 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 3671 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
3665 param->sched_priority > p->rt_priority && 3672 param->sched_priority > p->rt_priority &&
3666 param->sched_priority > 3673 param->sched_priority >
3667 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 3674 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3668 return -EPERM; 3675 return -EPERM;
3669 /* can't change other user's priorities */ 3676 /* can't change other user's priorities */
3670 if ((current->euid != p->euid) && 3677 if ((current->euid != p->euid) &&
3671 (current->euid != p->uid)) 3678 (current->euid != p->uid))
3672 return -EPERM; 3679 return -EPERM;
3673 } 3680 }
3674 3681
3675 retval = security_task_setscheduler(p, policy, param); 3682 retval = security_task_setscheduler(p, policy, param);
3676 if (retval) 3683 if (retval)
3677 return retval; 3684 return retval;
3678 /* 3685 /*
3679 * To be able to change p->policy safely, the appropriate 3686 * To be able to change p->policy safely, the appropriate
3680 * runqueue lock must be held. 3687 * runqueue lock must be held.
3681 */ 3688 */
3682 rq = task_rq_lock(p, &flags); 3689 rq = task_rq_lock(p, &flags);
3683 /* recheck policy now with rq lock held */ 3690 /* recheck policy now with rq lock held */
3684 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3691 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3685 policy = oldpolicy = -1; 3692 policy = oldpolicy = -1;
3686 task_rq_unlock(rq, &flags); 3693 task_rq_unlock(rq, &flags);
3687 goto recheck; 3694 goto recheck;
3688 } 3695 }
3689 array = p->array; 3696 array = p->array;
3690 if (array) 3697 if (array)
3691 deactivate_task(p, rq); 3698 deactivate_task(p, rq);
3692 oldprio = p->prio; 3699 oldprio = p->prio;
3693 __setscheduler(p, policy, param->sched_priority); 3700 __setscheduler(p, policy, param->sched_priority);
3694 if (array) { 3701 if (array) {
3695 __activate_task(p, rq); 3702 __activate_task(p, rq);
3696 /* 3703 /*
3697 * Reschedule if we are currently running on this runqueue and 3704 * Reschedule if we are currently running on this runqueue and
3698 * our priority decreased, or if we are not currently running on 3705 * our priority decreased, or if we are not currently running on
3699 * this runqueue and our priority is higher than the current's 3706 * this runqueue and our priority is higher than the current's
3700 */ 3707 */
3701 if (task_running(rq, p)) { 3708 if (task_running(rq, p)) {
3702 if (p->prio > oldprio) 3709 if (p->prio > oldprio)
3703 resched_task(rq->curr); 3710 resched_task(rq->curr);
3704 } else if (TASK_PREEMPTS_CURR(p, rq)) 3711 } else if (TASK_PREEMPTS_CURR(p, rq))
3705 resched_task(rq->curr); 3712 resched_task(rq->curr);
3706 } 3713 }
3707 task_rq_unlock(rq, &flags); 3714 task_rq_unlock(rq, &flags);
3708 return 0; 3715 return 0;
3709 } 3716 }
3710 EXPORT_SYMBOL_GPL(sched_setscheduler); 3717 EXPORT_SYMBOL_GPL(sched_setscheduler);
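sched_setscheduler() is also used from inside the kernel to give a thread real-time priority. A minimal sketch, assuming a hypothetical watchdog kthread and an arbitrary SCHED_FIFO priority of 50 (both are assumptions):

    #include <linux/sched.h>

    static void make_watchdog_rt(struct task_struct *tsk)
    {
            struct sched_param param = { .sched_priority = 50 };  /* assumed value */

            /* Switch the thread to SCHED_FIFO; returns 0 on success,
             * -EINVAL or -EPERM on failure. */
            if (sched_setscheduler(tsk, SCHED_FIFO, &param))
                    printk(KERN_WARNING "could not make watchdog thread RT\n");
    }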
3711 3718
3712 static int 3719 static int
3713 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3720 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3714 { 3721 {
3715 int retval; 3722 int retval;
3716 struct sched_param lparam; 3723 struct sched_param lparam;
3717 struct task_struct *p; 3724 struct task_struct *p;
3718 3725
3719 if (!param || pid < 0) 3726 if (!param || pid < 0)
3720 return -EINVAL; 3727 return -EINVAL;
3721 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3728 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3722 return -EFAULT; 3729 return -EFAULT;
3723 read_lock_irq(&tasklist_lock); 3730 read_lock_irq(&tasklist_lock);
3724 p = find_process_by_pid(pid); 3731 p = find_process_by_pid(pid);
3725 if (!p) { 3732 if (!p) {
3726 read_unlock_irq(&tasklist_lock); 3733 read_unlock_irq(&tasklist_lock);
3727 return -ESRCH; 3734 return -ESRCH;
3728 } 3735 }
3729 retval = sched_setscheduler(p, policy, &lparam); 3736 retval = sched_setscheduler(p, policy, &lparam);
3730 read_unlock_irq(&tasklist_lock); 3737 read_unlock_irq(&tasklist_lock);
3731 return retval; 3738 return retval;
3732 } 3739 }
3733 3740
3734 /** 3741 /**
3735 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3742 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3736 * @pid: the pid in question. 3743 * @pid: the pid in question.
3737 * @policy: new policy. 3744 * @policy: new policy.
3738 * @param: structure containing the new RT priority. 3745 * @param: structure containing the new RT priority.
3739 */ 3746 */
3740 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 3747 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
3741 struct sched_param __user *param) 3748 struct sched_param __user *param)
3742 { 3749 {
3743 /* negative values for policy are not valid */ 3750 /* negative values for policy are not valid */
3744 if (policy < 0) 3751 if (policy < 0)
3745 return -EINVAL; 3752 return -EINVAL;
3746 3753
3747 return do_sched_setscheduler(pid, policy, param); 3754 return do_sched_setscheduler(pid, policy, param);
3748 } 3755 }
3749 3756
3750 /** 3757 /**
3751 * sys_sched_setparam - set/change the RT priority of a thread 3758 * sys_sched_setparam - set/change the RT priority of a thread
3752 * @pid: the pid in question. 3759 * @pid: the pid in question.
3753 * @param: structure containing the new RT priority. 3760 * @param: structure containing the new RT priority.
3754 */ 3761 */
3755 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 3762 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3756 { 3763 {
3757 return do_sched_setscheduler(pid, -1, param); 3764 return do_sched_setscheduler(pid, -1, param);
3758 } 3765 }
3759 3766
3760 /** 3767 /**
3761 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3768 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3762 * @pid: the pid in question. 3769 * @pid: the pid in question.
3763 */ 3770 */
3764 asmlinkage long sys_sched_getscheduler(pid_t pid) 3771 asmlinkage long sys_sched_getscheduler(pid_t pid)
3765 { 3772 {
3766 int retval = -EINVAL; 3773 int retval = -EINVAL;
3767 task_t *p; 3774 task_t *p;
3768 3775
3769 if (pid < 0) 3776 if (pid < 0)
3770 goto out_nounlock; 3777 goto out_nounlock;
3771 3778
3772 retval = -ESRCH; 3779 retval = -ESRCH;
3773 read_lock(&tasklist_lock); 3780 read_lock(&tasklist_lock);
3774 p = find_process_by_pid(pid); 3781 p = find_process_by_pid(pid);
3775 if (p) { 3782 if (p) {
3776 retval = security_task_getscheduler(p); 3783 retval = security_task_getscheduler(p);
3777 if (!retval) 3784 if (!retval)
3778 retval = p->policy; 3785 retval = p->policy;
3779 } 3786 }
3780 read_unlock(&tasklist_lock); 3787 read_unlock(&tasklist_lock);
3781 3788
3782 out_nounlock: 3789 out_nounlock:
3783 return retval; 3790 return retval;
3784 } 3791 }
3785 3792
3786 /** 3793 /**
3787 * sys_sched_getparam - get the RT priority of a thread 3794 * sys_sched_getparam - get the RT priority of a thread
3788 * @pid: the pid in question. 3795 * @pid: the pid in question.
3789 * @param: structure containing the RT priority. 3796 * @param: structure containing the RT priority.
3790 */ 3797 */
3791 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 3798 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3792 { 3799 {
3793 struct sched_param lp; 3800 struct sched_param lp;
3794 int retval = -EINVAL; 3801 int retval = -EINVAL;
3795 task_t *p; 3802 task_t *p;
3796 3803
3797 if (!param || pid < 0) 3804 if (!param || pid < 0)
3798 goto out_nounlock; 3805 goto out_nounlock;
3799 3806
3800 read_lock(&tasklist_lock); 3807 read_lock(&tasklist_lock);
3801 p = find_process_by_pid(pid); 3808 p = find_process_by_pid(pid);
3802 retval = -ESRCH; 3809 retval = -ESRCH;
3803 if (!p) 3810 if (!p)
3804 goto out_unlock; 3811 goto out_unlock;
3805 3812
3806 retval = security_task_getscheduler(p); 3813 retval = security_task_getscheduler(p);
3807 if (retval) 3814 if (retval)
3808 goto out_unlock; 3815 goto out_unlock;
3809 3816
3810 lp.sched_priority = p->rt_priority; 3817 lp.sched_priority = p->rt_priority;
3811 read_unlock(&tasklist_lock); 3818 read_unlock(&tasklist_lock);
3812 3819
3813 /* 3820 /*
3814 * This one might sleep, we cannot do it with a spinlock held ... 3821 * This one might sleep, we cannot do it with a spinlock held ...
3815 */ 3822 */
3816 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3823 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3817 3824
3818 out_nounlock: 3825 out_nounlock:
3819 return retval; 3826 return retval;
3820 3827
3821 out_unlock: 3828 out_unlock:
3822 read_unlock(&tasklist_lock); 3829 read_unlock(&tasklist_lock);
3823 return retval; 3830 return retval;
3824 } 3831 }
3825 3832
3826 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 3833 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3827 { 3834 {
3828 task_t *p; 3835 task_t *p;
3829 int retval; 3836 int retval;
3830 cpumask_t cpus_allowed; 3837 cpumask_t cpus_allowed;
3831 3838
3832 lock_cpu_hotplug(); 3839 lock_cpu_hotplug();
3833 read_lock(&tasklist_lock); 3840 read_lock(&tasklist_lock);
3834 3841
3835 p = find_process_by_pid(pid); 3842 p = find_process_by_pid(pid);
3836 if (!p) { 3843 if (!p) {
3837 read_unlock(&tasklist_lock); 3844 read_unlock(&tasklist_lock);
3838 unlock_cpu_hotplug(); 3845 unlock_cpu_hotplug();
3839 return -ESRCH; 3846 return -ESRCH;
3840 } 3847 }
3841 3848
3842 /* 3849 /*
3843 * It is not safe to call set_cpus_allowed with the 3850 * It is not safe to call set_cpus_allowed with the
3844 * tasklist_lock held. We will bump the task_struct's 3851 * tasklist_lock held. We will bump the task_struct's
3845 * usage count and then drop tasklist_lock. 3852 * usage count and then drop tasklist_lock.
3846 */ 3853 */
3847 get_task_struct(p); 3854 get_task_struct(p);
3848 read_unlock(&tasklist_lock); 3855 read_unlock(&tasklist_lock);
3849 3856
3850 retval = -EPERM; 3857 retval = -EPERM;
3851 if ((current->euid != p->euid) && (current->euid != p->uid) && 3858 if ((current->euid != p->euid) && (current->euid != p->uid) &&
3852 !capable(CAP_SYS_NICE)) 3859 !capable(CAP_SYS_NICE))
3853 goto out_unlock; 3860 goto out_unlock;
3854 3861
3855 cpus_allowed = cpuset_cpus_allowed(p); 3862 cpus_allowed = cpuset_cpus_allowed(p);
3856 cpus_and(new_mask, new_mask, cpus_allowed); 3863 cpus_and(new_mask, new_mask, cpus_allowed);
3857 retval = set_cpus_allowed(p, new_mask); 3864 retval = set_cpus_allowed(p, new_mask);
3858 3865
3859 out_unlock: 3866 out_unlock:
3860 put_task_struct(p); 3867 put_task_struct(p);
3861 unlock_cpu_hotplug(); 3868 unlock_cpu_hotplug();
3862 return retval; 3869 return retval;
3863 } 3870 }
3864 3871
3865 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3872 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3866 cpumask_t *new_mask) 3873 cpumask_t *new_mask)
3867 { 3874 {
3868 if (len < sizeof(cpumask_t)) { 3875 if (len < sizeof(cpumask_t)) {
3869 memset(new_mask, 0, sizeof(cpumask_t)); 3876 memset(new_mask, 0, sizeof(cpumask_t));
3870 } else if (len > sizeof(cpumask_t)) { 3877 } else if (len > sizeof(cpumask_t)) {
3871 len = sizeof(cpumask_t); 3878 len = sizeof(cpumask_t);
3872 } 3879 }
3873 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3880 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3874 } 3881 }
3875 3882
3876 /** 3883 /**
3877 * sys_sched_setaffinity - set the cpu affinity of a process 3884 * sys_sched_setaffinity - set the cpu affinity of a process
3878 * @pid: pid of the process 3885 * @pid: pid of the process
3879 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3886 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3880 * @user_mask_ptr: user-space pointer to the new cpu mask 3887 * @user_mask_ptr: user-space pointer to the new cpu mask
3881 */ 3888 */
3882 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 3889 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
3883 unsigned long __user *user_mask_ptr) 3890 unsigned long __user *user_mask_ptr)
3884 { 3891 {
3885 cpumask_t new_mask; 3892 cpumask_t new_mask;
3886 int retval; 3893 int retval;
3887 3894
3888 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 3895 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
3889 if (retval) 3896 if (retval)
3890 return retval; 3897 return retval;
3891 3898
3892 return sched_setaffinity(pid, new_mask); 3899 return sched_setaffinity(pid, new_mask);
3893 } 3900 }
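
A user-space caller does not usually build the raw bitmask by hand: the glibc wrapper passes a cpu_set_t plus its size, and get_user_cpu_mask() above zero-fills short masks and truncates anything longer than sizeof(cpumask_t). A minimal sketch under that assumption (pinning to CPU 0 is just an example):

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);	/* restrict the calling process to CPU 0 */

	if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	return 0;
}
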
3894 3901
3895 /* 3902 /*
3896 * Represents all CPUs present in the system 3903 * Represents all CPUs present in the system
3897 * In systems capable of hotplug, this map could dynamically grow 3904 * In systems capable of hotplug, this map could dynamically grow
3898 * as new CPUs are detected in the system via any platform-specific 3905 * as new CPUs are detected in the system via any platform-specific
3899 * method, such as ACPI, for example. 3906 * method, such as ACPI, for example.
3900 */ 3907 */
3901 3908
3902 cpumask_t cpu_present_map __read_mostly; 3909 cpumask_t cpu_present_map __read_mostly;
3903 EXPORT_SYMBOL(cpu_present_map); 3910 EXPORT_SYMBOL(cpu_present_map);
3904 3911
3905 #ifndef CONFIG_SMP 3912 #ifndef CONFIG_SMP
3906 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 3913 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
3907 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 3914 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3908 #endif 3915 #endif
3909 3916
3910 long sched_getaffinity(pid_t pid, cpumask_t *mask) 3917 long sched_getaffinity(pid_t pid, cpumask_t *mask)
3911 { 3918 {
3912 int retval; 3919 int retval;
3913 task_t *p; 3920 task_t *p;
3914 3921
3915 lock_cpu_hotplug(); 3922 lock_cpu_hotplug();
3916 read_lock(&tasklist_lock); 3923 read_lock(&tasklist_lock);
3917 3924
3918 retval = -ESRCH; 3925 retval = -ESRCH;
3919 p = find_process_by_pid(pid); 3926 p = find_process_by_pid(pid);
3920 if (!p) 3927 if (!p)
3921 goto out_unlock; 3928 goto out_unlock;
3922 3929
3923 retval = 0; 3930 retval = 0;
3924 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 3931 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3925 3932
3926 out_unlock: 3933 out_unlock:
3927 read_unlock(&tasklist_lock); 3934 read_unlock(&tasklist_lock);
3928 unlock_cpu_hotplug(); 3935 unlock_cpu_hotplug();
3929 if (retval) 3936 if (retval)
3930 return retval; 3937 return retval;
3931 3938
3932 return 0; 3939 return 0;
3933 } 3940 }
3934 3941
3935 /** 3942 /**
3936 * sys_sched_getaffinity - get the cpu affinity of a process 3943 * sys_sched_getaffinity - get the cpu affinity of a process
3937 * @pid: pid of the process 3944 * @pid: pid of the process
3938 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3945 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3939 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3946 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3940 */ 3947 */
3941 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 3948 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
3942 unsigned long __user *user_mask_ptr) 3949 unsigned long __user *user_mask_ptr)
3943 { 3950 {
3944 int ret; 3951 int ret;
3945 cpumask_t mask; 3952 cpumask_t mask;
3946 3953
3947 if (len < sizeof(cpumask_t)) 3954 if (len < sizeof(cpumask_t))
3948 return -EINVAL; 3955 return -EINVAL;
3949 3956
3950 ret = sched_getaffinity(pid, &mask); 3957 ret = sched_getaffinity(pid, &mask);
3951 if (ret < 0) 3958 if (ret < 0)
3952 return ret; 3959 return ret;
3953 3960
3954 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 3961 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
3955 return -EFAULT; 3962 return -EFAULT;
3956 3963
3957 return sizeof(cpumask_t); 3964 return sizeof(cpumask_t);
3958 } 3965 }
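
Note that the syscall returns sizeof(cpumask_t) on success, while the glibc wrapper folds that into 0. A minimal user-space sketch, assuming the wrapper:

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

int main(void)
{
	cpu_set_t mask;
	int cpu;

	/* pid 0 reads the calling process's affinity mask. */
	if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			printf("allowed on CPU %d\n", cpu);
	return 0;
}
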
3959 3966
3960 /** 3967 /**
3961 * sys_sched_yield - yield the current processor to other threads. 3968 * sys_sched_yield - yield the current processor to other threads.
3962 * 3969 *
3963 * this function yields the current CPU by moving the calling thread 3970 * this function yields the current CPU by moving the calling thread
3964 * to the expired array. If there are no other threads running on this 3971 * to the expired array. If there are no other threads running on this
3965 * CPU then this function will return. 3972 * CPU then this function will return.
3966 */ 3973 */
3967 asmlinkage long sys_sched_yield(void) 3974 asmlinkage long sys_sched_yield(void)
3968 { 3975 {
3969 runqueue_t *rq = this_rq_lock(); 3976 runqueue_t *rq = this_rq_lock();
3970 prio_array_t *array = current->array; 3977 prio_array_t *array = current->array;
3971 prio_array_t *target = rq->expired; 3978 prio_array_t *target = rq->expired;
3972 3979
3973 schedstat_inc(rq, yld_cnt); 3980 schedstat_inc(rq, yld_cnt);
3974 /* 3981 /*
3975 * We implement yielding by moving the task into the expired 3982 * We implement yielding by moving the task into the expired
3976 * queue. 3983 * queue.
3977 * 3984 *
3978 * (special rule: RT tasks will just round-robin in the active 3985 * (special rule: RT tasks will just round-robin in the active
3979 * array.) 3986 * array.)
3980 */ 3987 */
3981 if (rt_task(current)) 3988 if (rt_task(current))
3982 target = rq->active; 3989 target = rq->active;
3983 3990
3984 if (array->nr_active == 1) { 3991 if (array->nr_active == 1) {
3985 schedstat_inc(rq, yld_act_empty); 3992 schedstat_inc(rq, yld_act_empty);
3986 if (!rq->expired->nr_active) 3993 if (!rq->expired->nr_active)
3987 schedstat_inc(rq, yld_both_empty); 3994 schedstat_inc(rq, yld_both_empty);
3988 } else if (!rq->expired->nr_active) 3995 } else if (!rq->expired->nr_active)
3989 schedstat_inc(rq, yld_exp_empty); 3996 schedstat_inc(rq, yld_exp_empty);
3990 3997
3991 if (array != target) { 3998 if (array != target) {
3992 dequeue_task(current, array); 3999 dequeue_task(current, array);
3993 enqueue_task(current, target); 4000 enqueue_task(current, target);
3994 } else 4001 } else
3995 /* 4002 /*
3996 * requeue_task is cheaper so perform that if possible. 4003 * requeue_task is cheaper so perform that if possible.
3997 */ 4004 */
3998 requeue_task(current, array); 4005 requeue_task(current, array);
3999 4006
4000 /* 4007 /*
4001 * Since we are going to call schedule() anyway, there's 4008 * Since we are going to call schedule() anyway, there's
4002 * no need to preempt or enable interrupts: 4009 * no need to preempt or enable interrupts:
4003 */ 4010 */
4004 __release(rq->lock); 4011 __release(rq->lock);
4005 _raw_spin_unlock(&rq->lock); 4012 _raw_spin_unlock(&rq->lock);
4006 preempt_enable_no_resched(); 4013 preempt_enable_no_resched();
4007 4014
4008 schedule(); 4015 schedule();
4009 4016
4010 return 0; 4017 return 0;
4011 } 4018 }
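
From user space this is simply sched_yield(); a trivial, illustrative sketch:

#include <stdio.h>
#include <sched.h>

int main(void)
{
	/* Give other runnable tasks on this CPU a chance to run. */
	if (sched_yield() != 0)
		perror("sched_yield");
	return 0;
}
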
4012 4019
4013 static inline void __cond_resched(void) 4020 static inline void __cond_resched(void)
4014 { 4021 {
4015 /* 4022 /*
4016 * The BKS might be reacquired before we have dropped 4023 * The BKS might be reacquired before we have dropped
4017 * PREEMPT_ACTIVE, which could trigger a second 4024 * PREEMPT_ACTIVE, which could trigger a second
4018 * cond_resched() call. 4025 * cond_resched() call.
4019 */ 4026 */
4020 if (unlikely(preempt_count())) 4027 if (unlikely(preempt_count()))
4021 return; 4028 return;
4022 if (unlikely(system_state != SYSTEM_RUNNING)) 4029 if (unlikely(system_state != SYSTEM_RUNNING))
4023 return; 4030 return;
4024 do { 4031 do {
4025 add_preempt_count(PREEMPT_ACTIVE); 4032 add_preempt_count(PREEMPT_ACTIVE);
4026 schedule(); 4033 schedule();
4027 sub_preempt_count(PREEMPT_ACTIVE); 4034 sub_preempt_count(PREEMPT_ACTIVE);
4028 } while (need_resched()); 4035 } while (need_resched());
4029 } 4036 }
4030 4037
4031 int __sched cond_resched(void) 4038 int __sched cond_resched(void)
4032 { 4039 {
4033 if (need_resched()) { 4040 if (need_resched()) {
4034 __cond_resched(); 4041 __cond_resched();
4035 return 1; 4042 return 1;
4036 } 4043 }
4037 return 0; 4044 return 0;
4038 } 4045 }
4039 4046
4040 EXPORT_SYMBOL(cond_resched); 4047 EXPORT_SYMBOL(cond_resched);
4041 4048
4042 /* 4049 /*
4043 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4050 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4044 * call schedule, and on return reacquire the lock. 4051 * call schedule, and on return reacquire the lock.
4045 * 4052 *
4046 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4053 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4047 * operations here to prevent schedule() from being called twice (once via 4054 * operations here to prevent schedule() from being called twice (once via
4048 * spin_unlock(), once by hand). 4055 * spin_unlock(), once by hand).
4049 */ 4056 */
4050 int cond_resched_lock(spinlock_t *lock) 4057 int cond_resched_lock(spinlock_t *lock)
4051 { 4058 {
4052 int ret = 0; 4059 int ret = 0;
4053 4060
4054 if (need_lockbreak(lock)) { 4061 if (need_lockbreak(lock)) {
4055 spin_unlock(lock); 4062 spin_unlock(lock);
4056 cpu_relax(); 4063 cpu_relax();
4057 ret = 1; 4064 ret = 1;
4058 spin_lock(lock); 4065 spin_lock(lock);
4059 } 4066 }
4060 if (need_resched()) { 4067 if (need_resched()) {
4061 _raw_spin_unlock(lock); 4068 _raw_spin_unlock(lock);
4062 preempt_enable_no_resched(); 4069 preempt_enable_no_resched();
4063 __cond_resched(); 4070 __cond_resched();
4064 ret = 1; 4071 ret = 1;
4065 spin_lock(lock); 4072 spin_lock(lock);
4066 } 4073 }
4067 return ret; 4074 return ret;
4068 } 4075 }
4069 4076
4070 EXPORT_SYMBOL(cond_resched_lock); 4077 EXPORT_SYMBOL(cond_resched_lock);
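
The comment above is easiest to see in the usual caller pattern: a long walk over a lock-protected list that periodically lets the CPU (or a lock waiter) in. A hedged kernel-style sketch; the item type, list and lock names are hypothetical, not taken from this file:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

struct my_item {
	struct list_head link;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void drain_items(void)
{
	spin_lock(&my_lock);
	while (!list_empty(&my_list)) {
		struct my_item *item;

		item = list_entry(my_list.next, struct my_item, link);
		list_del(&item->link);
		/* ... process item; freeing is omitted in this sketch ... */

		/*
		 * If a reschedule or a lock waiter is pending,
		 * cond_resched_lock() drops my_lock, lets them run,
		 * and reacquires the lock before the next iteration.
		 * Re-reading the list head afterwards keeps this safe.
		 */
		cond_resched_lock(&my_lock);
	}
	spin_unlock(&my_lock);
}
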
4071 4078
4072 int __sched cond_resched_softirq(void) 4079 int __sched cond_resched_softirq(void)
4073 { 4080 {
4074 BUG_ON(!in_softirq()); 4081 BUG_ON(!in_softirq());
4075 4082
4076 if (need_resched()) { 4083 if (need_resched()) {
4077 __local_bh_enable(); 4084 __local_bh_enable();
4078 __cond_resched(); 4085 __cond_resched();
4079 local_bh_disable(); 4086 local_bh_disable();
4080 return 1; 4087 return 1;
4081 } 4088 }
4082 return 0; 4089 return 0;
4083 } 4090 }
4084 4091
4085 EXPORT_SYMBOL(cond_resched_softirq); 4092 EXPORT_SYMBOL(cond_resched_softirq);
4086 4093
4087 4094
4088 /** 4095 /**
4089 * yield - yield the current processor to other threads. 4096 * yield - yield the current processor to other threads.
4090 * 4097 *
4091 * this is a shortcut for kernel-space yielding - it marks the 4098 * this is a shortcut for kernel-space yielding - it marks the
4092 * thread runnable and calls sys_sched_yield(). 4099 * thread runnable and calls sys_sched_yield().
4093 */ 4100 */
4094 void __sched yield(void) 4101 void __sched yield(void)
4095 { 4102 {
4096 set_current_state(TASK_RUNNING); 4103 set_current_state(TASK_RUNNING);
4097 sys_sched_yield(); 4104 sys_sched_yield();
4098 } 4105 }
4099 4106
4100 EXPORT_SYMBOL(yield); 4107 EXPORT_SYMBOL(yield);
4101 4108
4102 /* 4109 /*
4103 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4110 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4104 * that process accounting knows that this is a task in IO wait state. 4111 * that process accounting knows that this is a task in IO wait state.
4105 * 4112 *
4106 * But don't do that if it is a deliberate, throttling IO wait (this task 4113 * But don't do that if it is a deliberate, throttling IO wait (this task
4107 * has set its backing_dev_info: the queue against which it should throttle) 4114 * has set its backing_dev_info: the queue against which it should throttle)
4108 */ 4115 */
4109 void __sched io_schedule(void) 4116 void __sched io_schedule(void)
4110 { 4117 {
4111 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4118 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
4112 4119
4113 atomic_inc(&rq->nr_iowait); 4120 atomic_inc(&rq->nr_iowait);
4114 schedule(); 4121 schedule();
4115 atomic_dec(&rq->nr_iowait); 4122 atomic_dec(&rq->nr_iowait);
4116 } 4123 }
4117 4124
4118 EXPORT_SYMBOL(io_schedule); 4125 EXPORT_SYMBOL(io_schedule);
4119 4126
4120 long __sched io_schedule_timeout(long timeout) 4127 long __sched io_schedule_timeout(long timeout)
4121 { 4128 {
4122 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4129 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
4123 long ret; 4130 long ret;
4124 4131
4125 atomic_inc(&rq->nr_iowait); 4132 atomic_inc(&rq->nr_iowait);
4126 ret = schedule_timeout(timeout); 4133 ret = schedule_timeout(timeout);
4127 atomic_dec(&rq->nr_iowait); 4134 atomic_dec(&rq->nr_iowait);
4128 return ret; 4135 return ret;
4129 } 4136 }
4130 4137
4131 /** 4138 /**
4132 * sys_sched_get_priority_max - return maximum RT priority. 4139 * sys_sched_get_priority_max - return maximum RT priority.
4133 * @policy: scheduling class. 4140 * @policy: scheduling class.
4134 * 4141 *
4135 * this syscall returns the maximum rt_priority that can be used 4142 * this syscall returns the maximum rt_priority that can be used
4136 * by a given scheduling class. 4143 * by a given scheduling class.
4137 */ 4144 */
4138 asmlinkage long sys_sched_get_priority_max(int policy) 4145 asmlinkage long sys_sched_get_priority_max(int policy)
4139 { 4146 {
4140 int ret = -EINVAL; 4147 int ret = -EINVAL;
4141 4148
4142 switch (policy) { 4149 switch (policy) {
4143 case SCHED_FIFO: 4150 case SCHED_FIFO:
4144 case SCHED_RR: 4151 case SCHED_RR:
4145 ret = MAX_USER_RT_PRIO-1; 4152 ret = MAX_USER_RT_PRIO-1;
4146 break; 4153 break;
4147 case SCHED_NORMAL: 4154 case SCHED_NORMAL:
4148 case SCHED_BATCH: 4155 case SCHED_BATCH:
4149 ret = 0; 4156 ret = 0;
4150 break; 4157 break;
4151 } 4158 }
4152 return ret; 4159 return ret;
4153 } 4160 }
4154 4161
4155 /** 4162 /**
4156 * sys_sched_get_priority_min - return minimum RT priority. 4163 * sys_sched_get_priority_min - return minimum RT priority.
4157 * @policy: scheduling class. 4164 * @policy: scheduling class.
4158 * 4165 *
4159 * this syscall returns the minimum rt_priority that can be used 4166 * this syscall returns the minimum rt_priority that can be used
4160 * by a given scheduling class. 4167 * by a given scheduling class.
4161 */ 4168 */
4162 asmlinkage long sys_sched_get_priority_min(int policy) 4169 asmlinkage long sys_sched_get_priority_min(int policy)
4163 { 4170 {
4164 int ret = -EINVAL; 4171 int ret = -EINVAL;
4165 4172
4166 switch (policy) { 4173 switch (policy) {
4167 case SCHED_FIFO: 4174 case SCHED_FIFO:
4168 case SCHED_RR: 4175 case SCHED_RR:
4169 ret = 1; 4176 ret = 1;
4170 break; 4177 break;
4171 case SCHED_NORMAL: 4178 case SCHED_NORMAL:
4172 case SCHED_BATCH: 4179 case SCHED_BATCH:
4173 ret = 0; 4180 ret = 0;
4174 } 4181 }
4175 return ret; 4182 return ret;
4176 } 4183 }
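
Together these two syscalls let user space discover the valid priority range for a policy before calling sched_setscheduler(); a small sketch using the glibc wrappers:

#include <stdio.h>
#include <sched.h>

int main(void)
{
	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	printf("SCHED_RR   priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_RR),
	       sched_get_priority_max(SCHED_RR));
	return 0;
}
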
4177 4184
4178 /** 4185 /**
4179 * sys_sched_rr_get_interval - return the default timeslice of a process. 4186 * sys_sched_rr_get_interval - return the default timeslice of a process.
4180 * @pid: pid of the process. 4187 * @pid: pid of the process.
4181 * @interval: userspace pointer to the timeslice value. 4188 * @interval: userspace pointer to the timeslice value.
4182 * 4189 *
4183 * this syscall writes the default timeslice value of a given process 4190 * this syscall writes the default timeslice value of a given process
4184 * into the user-space timespec buffer. A value of '0' means infinity. 4191 * into the user-space timespec buffer. A value of '0' means infinity.
4185 */ 4192 */
4186 asmlinkage 4193 asmlinkage
4187 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4194 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4188 { 4195 {
4189 int retval = -EINVAL; 4196 int retval = -EINVAL;
4190 struct timespec t; 4197 struct timespec t;
4191 task_t *p; 4198 task_t *p;
4192 4199
4193 if (pid < 0) 4200 if (pid < 0)
4194 goto out_nounlock; 4201 goto out_nounlock;
4195 4202
4196 retval = -ESRCH; 4203 retval = -ESRCH;
4197 read_lock(&tasklist_lock); 4204 read_lock(&tasklist_lock);
4198 p = find_process_by_pid(pid); 4205 p = find_process_by_pid(pid);
4199 if (!p) 4206 if (!p)
4200 goto out_unlock; 4207 goto out_unlock;
4201 4208
4202 retval = security_task_getscheduler(p); 4209 retval = security_task_getscheduler(p);
4203 if (retval) 4210 if (retval)
4204 goto out_unlock; 4211 goto out_unlock;
4205 4212
4206 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4213 jiffies_to_timespec(p->policy & SCHED_FIFO ?
4207 0 : task_timeslice(p), &t); 4214 0 : task_timeslice(p), &t);
4208 read_unlock(&tasklist_lock); 4215 read_unlock(&tasklist_lock);
4209 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4216 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4210 out_nounlock: 4217 out_nounlock:
4211 return retval; 4218 return retval;
4212 out_unlock: 4219 out_unlock:
4213 read_unlock(&tasklist_lock); 4220 read_unlock(&tasklist_lock);
4214 return retval; 4221 return retval;
4215 } 4222 }
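
A minimal user-space sketch of the corresponding wrapper; pid 0 conventionally means the calling process, and per the comment above a zero timeslice means infinity (SCHED_FIFO):

#include <stdio.h>
#include <sched.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s (0 means infinite)\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
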
4216 4223
4217 static inline struct task_struct *eldest_child(struct task_struct *p) 4224 static inline struct task_struct *eldest_child(struct task_struct *p)
4218 { 4225 {
4219 if (list_empty(&p->children)) return NULL; 4226 if (list_empty(&p->children)) return NULL;
4220 return list_entry(p->children.next,struct task_struct,sibling); 4227 return list_entry(p->children.next,struct task_struct,sibling);
4221 } 4228 }
4222 4229
4223 static inline struct task_struct *older_sibling(struct task_struct *p) 4230 static inline struct task_struct *older_sibling(struct task_struct *p)
4224 { 4231 {
4225 if (p->sibling.prev==&p->parent->children) return NULL; 4232 if (p->sibling.prev==&p->parent->children) return NULL;
4226 return list_entry(p->sibling.prev,struct task_struct,sibling); 4233 return list_entry(p->sibling.prev,struct task_struct,sibling);
4227 } 4234 }
4228 4235
4229 static inline struct task_struct *younger_sibling(struct task_struct *p) 4236 static inline struct task_struct *younger_sibling(struct task_struct *p)
4230 { 4237 {
4231 if (p->sibling.next==&p->parent->children) return NULL; 4238 if (p->sibling.next==&p->parent->children) return NULL;
4232 return list_entry(p->sibling.next,struct task_struct,sibling); 4239 return list_entry(p->sibling.next,struct task_struct,sibling);
4233 } 4240 }
4234 4241
4235 static void show_task(task_t *p) 4242 static void show_task(task_t *p)
4236 { 4243 {
4237 task_t *relative; 4244 task_t *relative;
4238 unsigned state; 4245 unsigned state;
4239 unsigned long free = 0; 4246 unsigned long free = 0;
4240 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4247 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
4241 4248
4242 printk("%-13.13s ", p->comm); 4249 printk("%-13.13s ", p->comm);
4243 state = p->state ? __ffs(p->state) + 1 : 0; 4250 state = p->state ? __ffs(p->state) + 1 : 0;
4244 if (state < ARRAY_SIZE(stat_nam)) 4251 if (state < ARRAY_SIZE(stat_nam))
4245 printk(stat_nam[state]); 4252 printk(stat_nam[state]);
4246 else 4253 else
4247 printk("?"); 4254 printk("?");
4248 #if (BITS_PER_LONG == 32) 4255 #if (BITS_PER_LONG == 32)
4249 if (state == TASK_RUNNING) 4256 if (state == TASK_RUNNING)
4250 printk(" running "); 4257 printk(" running ");
4251 else 4258 else
4252 printk(" %08lX ", thread_saved_pc(p)); 4259 printk(" %08lX ", thread_saved_pc(p));
4253 #else 4260 #else
4254 if (state == TASK_RUNNING) 4261 if (state == TASK_RUNNING)
4255 printk(" running task "); 4262 printk(" running task ");
4256 else 4263 else
4257 printk(" %016lx ", thread_saved_pc(p)); 4264 printk(" %016lx ", thread_saved_pc(p));
4258 #endif 4265 #endif
4259 #ifdef CONFIG_DEBUG_STACK_USAGE 4266 #ifdef CONFIG_DEBUG_STACK_USAGE
4260 { 4267 {
4261 unsigned long *n = end_of_stack(p); 4268 unsigned long *n = end_of_stack(p);
4262 while (!*n) 4269 while (!*n)
4263 n++; 4270 n++;
4264 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4271 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4265 } 4272 }
4266 #endif 4273 #endif
4267 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 4274 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
4268 if ((relative = eldest_child(p))) 4275 if ((relative = eldest_child(p)))
4269 printk("%5d ", relative->pid); 4276 printk("%5d ", relative->pid);
4270 else 4277 else
4271 printk(" "); 4278 printk(" ");
4272 if ((relative = younger_sibling(p))) 4279 if ((relative = younger_sibling(p)))
4273 printk("%7d", relative->pid); 4280 printk("%7d", relative->pid);
4274 else 4281 else
4275 printk(" "); 4282 printk(" ");
4276 if ((relative = older_sibling(p))) 4283 if ((relative = older_sibling(p)))
4277 printk(" %5d", relative->pid); 4284 printk(" %5d", relative->pid);
4278 else 4285 else
4279 printk(" "); 4286 printk(" ");
4280 if (!p->mm) 4287 if (!p->mm)
4281 printk(" (L-TLB)\n"); 4288 printk(" (L-TLB)\n");
4282 else 4289 else
4283 printk(" (NOTLB)\n"); 4290 printk(" (NOTLB)\n");
4284 4291
4285 if (state != TASK_RUNNING) 4292 if (state != TASK_RUNNING)
4286 show_stack(p, NULL); 4293 show_stack(p, NULL);
4287 } 4294 }
4288 4295
4289 void show_state(void) 4296 void show_state(void)
4290 { 4297 {
4291 task_t *g, *p; 4298 task_t *g, *p;
4292 4299
4293 #if (BITS_PER_LONG == 32) 4300 #if (BITS_PER_LONG == 32)
4294 printk("\n" 4301 printk("\n"
4295 " sibling\n"); 4302 " sibling\n");
4296 printk(" task PC pid father child younger older\n"); 4303 printk(" task PC pid father child younger older\n");
4297 #else 4304 #else
4298 printk("\n" 4305 printk("\n"
4299 " sibling\n"); 4306 " sibling\n");
4300 printk(" task PC pid father child younger older\n"); 4307 printk(" task PC pid father child younger older\n");
4301 #endif 4308 #endif
4302 read_lock(&tasklist_lock); 4309 read_lock(&tasklist_lock);
4303 do_each_thread(g, p) { 4310 do_each_thread(g, p) {
4304 /* 4311 /*
4305 * reset the NMI-timeout, listing all tasks on a slow 4312 * reset the NMI-timeout, listing all tasks on a slow
4306 * console might take a lot of time: 4313 * console might take a lot of time:
4307 */ 4314 */
4308 touch_nmi_watchdog(); 4315 touch_nmi_watchdog();
4309 show_task(p); 4316 show_task(p);
4310 } while_each_thread(g, p); 4317 } while_each_thread(g, p);
4311 4318
4312 read_unlock(&tasklist_lock); 4319 read_unlock(&tasklist_lock);
4313 mutex_debug_show_all_locks(); 4320 mutex_debug_show_all_locks();
4314 } 4321 }
4315 4322
4316 /** 4323 /**
4317 * init_idle - set up an idle thread for a given CPU 4324 * init_idle - set up an idle thread for a given CPU
4318 * @idle: task in question 4325 * @idle: task in question
4319 * @cpu: cpu the idle task belongs to 4326 * @cpu: cpu the idle task belongs to
4320 * 4327 *
4321 * NOTE: this function does not set the idle thread's NEED_RESCHED 4328 * NOTE: this function does not set the idle thread's NEED_RESCHED
4322 * flag, to make booting more robust. 4329 * flag, to make booting more robust.
4323 */ 4330 */
4324 void __devinit init_idle(task_t *idle, int cpu) 4331 void __devinit init_idle(task_t *idle, int cpu)
4325 { 4332 {
4326 runqueue_t *rq = cpu_rq(cpu); 4333 runqueue_t *rq = cpu_rq(cpu);
4327 unsigned long flags; 4334 unsigned long flags;
4328 4335
4329 idle->timestamp = sched_clock(); 4336 idle->timestamp = sched_clock();
4330 idle->sleep_avg = 0; 4337 idle->sleep_avg = 0;
4331 idle->array = NULL; 4338 idle->array = NULL;
4332 idle->prio = MAX_PRIO; 4339 idle->prio = MAX_PRIO;
4333 idle->state = TASK_RUNNING; 4340 idle->state = TASK_RUNNING;
4334 idle->cpus_allowed = cpumask_of_cpu(cpu); 4341 idle->cpus_allowed = cpumask_of_cpu(cpu);
4335 set_task_cpu(idle, cpu); 4342 set_task_cpu(idle, cpu);
4336 4343
4337 spin_lock_irqsave(&rq->lock, flags); 4344 spin_lock_irqsave(&rq->lock, flags);
4338 rq->curr = rq->idle = idle; 4345 rq->curr = rq->idle = idle;
4339 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 4346 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4340 idle->oncpu = 1; 4347 idle->oncpu = 1;
4341 #endif 4348 #endif
4342 spin_unlock_irqrestore(&rq->lock, flags); 4349 spin_unlock_irqrestore(&rq->lock, flags);
4343 4350
4344 /* Set the preempt count _outside_ the spinlocks! */ 4351 /* Set the preempt count _outside_ the spinlocks! */
4345 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4352 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4346 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 4353 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4347 #else 4354 #else
4348 task_thread_info(idle)->preempt_count = 0; 4355 task_thread_info(idle)->preempt_count = 0;
4349 #endif 4356 #endif
4350 } 4357 }
4351 4358
4352 /* 4359 /*
4353 * In a system that switches off the HZ timer, nohz_cpu_mask 4360 * In a system that switches off the HZ timer, nohz_cpu_mask
4354 * indicates which CPUs entered this state. This is used 4361 * indicates which CPUs entered this state. This is used
4355 * in the rcu update to wait only for active CPUs. For systems 4362 * in the rcu update to wait only for active CPUs. For systems
4356 * that do not switch off the HZ timer, nohz_cpu_mask should 4363 * that do not switch off the HZ timer, nohz_cpu_mask should
4357 * always be CPU_MASK_NONE. 4364 * always be CPU_MASK_NONE.
4358 */ 4365 */
4359 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4366 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4360 4367
4361 #ifdef CONFIG_SMP 4368 #ifdef CONFIG_SMP
4362 /* 4369 /*
4363 * This is how migration works: 4370 * This is how migration works:
4364 * 4371 *
4365 * 1) we queue a migration_req_t structure in the source CPU's 4372 * 1) we queue a migration_req_t structure in the source CPU's
4366 * runqueue and wake up that CPU's migration thread. 4373 * runqueue and wake up that CPU's migration thread.
4367 * 2) we wait on the request's completion => thread blocks. 4374 * 2) we wait on the request's completion => thread blocks.
4368 * 3) migration thread wakes up (implicitly it forces the migrated 4375 * 3) migration thread wakes up (implicitly it forces the migrated
4369 * thread off the CPU) 4376 * thread off the CPU)
4370 * 4) it gets the migration request and checks whether the migrated 4377 * 4) it gets the migration request and checks whether the migrated
4371 * task is still in the wrong runqueue. 4378 * task is still in the wrong runqueue.
4372 * 5) if it's in the wrong runqueue then the migration thread removes 4379 * 5) if it's in the wrong runqueue then the migration thread removes
4373 * it and puts it into the right queue. 4380 * it and puts it into the right queue.
4374 * 6) the migration thread completes the request. 4381 * 6) the migration thread completes the request.
4375 * 7) we wake up and the migration is done. 4382 * 7) we wake up and the migration is done.
4376 */ 4383 */
4377 4384
4378 /* 4385 /*
4379 * Change a given task's CPU affinity. Migrate the thread to a 4386 * Change a given task's CPU affinity. Migrate the thread to a
4380 * proper CPU and schedule it away if the CPU it's executing on 4387 * proper CPU and schedule it away if the CPU it's executing on
4381 * is removed from the allowed bitmask. 4388 * is removed from the allowed bitmask.
4382 * 4389 *
4383 * NOTE: the caller must have a valid reference to the task, the 4390 * NOTE: the caller must have a valid reference to the task, the
4384 * task must not exit() & deallocate itself prematurely. The 4391 * task must not exit() & deallocate itself prematurely. The
4385 * call is not atomic; no spinlocks may be held. 4392 * call is not atomic; no spinlocks may be held.
4386 */ 4393 */
4387 int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4394 int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4388 { 4395 {
4389 unsigned long flags; 4396 unsigned long flags;
4390 int ret = 0; 4397 int ret = 0;
4391 migration_req_t req; 4398 migration_req_t req;
4392 runqueue_t *rq; 4399 runqueue_t *rq;
4393 4400
4394 rq = task_rq_lock(p, &flags); 4401 rq = task_rq_lock(p, &flags);
4395 if (!cpus_intersects(new_mask, cpu_online_map)) { 4402 if (!cpus_intersects(new_mask, cpu_online_map)) {
4396 ret = -EINVAL; 4403 ret = -EINVAL;
4397 goto out; 4404 goto out;
4398 } 4405 }
4399 4406
4400 p->cpus_allowed = new_mask; 4407 p->cpus_allowed = new_mask;
4401 /* Can the task run on the task's current CPU? If so, we're done */ 4408 /* Can the task run on the task's current CPU? If so, we're done */
4402 if (cpu_isset(task_cpu(p), new_mask)) 4409 if (cpu_isset(task_cpu(p), new_mask))
4403 goto out; 4410 goto out;
4404 4411
4405 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4412 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4406 /* Need help from migration thread: drop lock and wait. */ 4413 /* Need help from migration thread: drop lock and wait. */
4407 task_rq_unlock(rq, &flags); 4414 task_rq_unlock(rq, &flags);
4408 wake_up_process(rq->migration_thread); 4415 wake_up_process(rq->migration_thread);
4409 wait_for_completion(&req.done); 4416 wait_for_completion(&req.done);
4410 tlb_migrate_finish(p->mm); 4417 tlb_migrate_finish(p->mm);
4411 return 0; 4418 return 0;
4412 } 4419 }
4413 out: 4420 out:
4414 task_rq_unlock(rq, &flags); 4421 task_rq_unlock(rq, &flags);
4415 return ret; 4422 return ret;
4416 } 4423 }
4417 4424
4418 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4425 EXPORT_SYMBOL_GPL(set_cpus_allowed);
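
The NOTE above (hold a task reference, no spinlocks held) is the same pattern sched_setaffinity() follows earlier in this file. A hedged sketch of how another kernel-side caller might use this export; the helper name and the single-CPU pinning are hypothetical:

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

static int pin_pid_to_cpu(pid_t pid, int cpu)
{
	task_t *p;
	int ret;

	read_lock(&tasklist_lock);
	p = find_task_by_pid(pid);
	if (p)
		get_task_struct(p);	/* keep it alive after dropping the lock */
	read_unlock(&tasklist_lock);

	if (!p)
		return -ESRCH;

	/* No spinlocks may be held across set_cpus_allowed(). */
	ret = set_cpus_allowed(p, cpumask_of_cpu(cpu));
	put_task_struct(p);
	return ret;
}
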
4419 4426
4420 /* 4427 /*
4421 * Move (not current) task off this cpu, onto dest cpu. We're doing 4428 * Move (not current) task off this cpu, onto dest cpu. We're doing
4422 * this because either it can't run here any more (set_cpus_allowed() 4429 * this because either it can't run here any more (set_cpus_allowed()
4423 * away from this CPU, or CPU going down), or because we're 4430 * away from this CPU, or CPU going down), or because we're
4424 * attempting to rebalance this task on exec (sched_exec). 4431 * attempting to rebalance this task on exec (sched_exec).
4425 * 4432 *
4426 * So we race with normal scheduler movements, but that's OK, as long 4433 * So we race with normal scheduler movements, but that's OK, as long
4427 * as the task is no longer on this CPU. 4434 * as the task is no longer on this CPU.
4428 */ 4435 */
4429 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4436 static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4430 { 4437 {
4431 runqueue_t *rq_dest, *rq_src; 4438 runqueue_t *rq_dest, *rq_src;
4432 4439
4433 if (unlikely(cpu_is_offline(dest_cpu))) 4440 if (unlikely(cpu_is_offline(dest_cpu)))
4434 return; 4441 return;
4435 4442
4436 rq_src = cpu_rq(src_cpu); 4443 rq_src = cpu_rq(src_cpu);
4437 rq_dest = cpu_rq(dest_cpu); 4444 rq_dest = cpu_rq(dest_cpu);
4438 4445
4439 double_rq_lock(rq_src, rq_dest); 4446 double_rq_lock(rq_src, rq_dest);
4440 /* Already moved. */ 4447 /* Already moved. */
4441 if (task_cpu(p) != src_cpu) 4448 if (task_cpu(p) != src_cpu)
4442 goto out; 4449 goto out;
4443 /* Affinity changed (again). */ 4450 /* Affinity changed (again). */
4444 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4451 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4445 goto out; 4452 goto out;
4446 4453
4447 set_task_cpu(p, dest_cpu); 4454 set_task_cpu(p, dest_cpu);
4448 if (p->array) { 4455 if (p->array) {
4449 /* 4456 /*
4450 * Sync timestamp with rq_dest's before activating. 4457 * Sync timestamp with rq_dest's before activating.
4451 * The same thing could be achieved by doing this step 4458 * The same thing could be achieved by doing this step
4452 * afterwards, and pretending it was a local activate. 4459 * afterwards, and pretending it was a local activate.
4453 * This way is cleaner and logically correct. 4460 * This way is cleaner and logically correct.
4454 */ 4461 */
4455 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4462 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4456 + rq_dest->timestamp_last_tick; 4463 + rq_dest->timestamp_last_tick;
4457 deactivate_task(p, rq_src); 4464 deactivate_task(p, rq_src);
4458 activate_task(p, rq_dest, 0); 4465 activate_task(p, rq_dest, 0);
4459 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4466 if (TASK_PREEMPTS_CURR(p, rq_dest))
4460 resched_task(rq_dest->curr); 4467 resched_task(rq_dest->curr);
4461 } 4468 }
4462 4469
4463 out: 4470 out:
4464 double_rq_unlock(rq_src, rq_dest); 4471 double_rq_unlock(rq_src, rq_dest);
4465 } 4472 }
4466 4473
4467 /* 4474 /*
4468 * migration_thread - this is a highprio system thread that performs 4475 * migration_thread - this is a highprio system thread that performs
4469 * thread migration by bumping thread off CPU then 'pushing' onto 4476 * thread migration by bumping thread off CPU then 'pushing' onto
4470 * another runqueue. 4477 * another runqueue.
4471 */ 4478 */
4472 static int migration_thread(void *data) 4479 static int migration_thread(void *data)
4473 { 4480 {
4474 runqueue_t *rq; 4481 runqueue_t *rq;
4475 int cpu = (long)data; 4482 int cpu = (long)data;
4476 4483
4477 rq = cpu_rq(cpu); 4484 rq = cpu_rq(cpu);
4478 BUG_ON(rq->migration_thread != current); 4485 BUG_ON(rq->migration_thread != current);
4479 4486
4480 set_current_state(TASK_INTERRUPTIBLE); 4487 set_current_state(TASK_INTERRUPTIBLE);
4481 while (!kthread_should_stop()) { 4488 while (!kthread_should_stop()) {
4482 struct list_head *head; 4489 struct list_head *head;
4483 migration_req_t *req; 4490 migration_req_t *req;
4484 4491
4485 try_to_freeze(); 4492 try_to_freeze();
4486 4493
4487 spin_lock_irq(&rq->lock); 4494 spin_lock_irq(&rq->lock);
4488 4495
4489 if (cpu_is_offline(cpu)) { 4496 if (cpu_is_offline(cpu)) {
4490 spin_unlock_irq(&rq->lock); 4497 spin_unlock_irq(&rq->lock);
4491 goto wait_to_die; 4498 goto wait_to_die;
4492 } 4499 }
4493 4500
4494 if (rq->active_balance) { 4501 if (rq->active_balance) {
4495 active_load_balance(rq, cpu); 4502 active_load_balance(rq, cpu);
4496 rq->active_balance = 0; 4503 rq->active_balance = 0;
4497 } 4504 }
4498 4505
4499 head = &rq->migration_queue; 4506 head = &rq->migration_queue;
4500 4507
4501 if (list_empty(head)) { 4508 if (list_empty(head)) {
4502 spin_unlock_irq(&rq->lock); 4509 spin_unlock_irq(&rq->lock);
4503 schedule(); 4510 schedule();
4504 set_current_state(TASK_INTERRUPTIBLE); 4511 set_current_state(TASK_INTERRUPTIBLE);
4505 continue; 4512 continue;
4506 } 4513 }
4507 req = list_entry(head->next, migration_req_t, list); 4514 req = list_entry(head->next, migration_req_t, list);
4508 list_del_init(head->next); 4515 list_del_init(head->next);
4509 4516
4510 spin_unlock(&rq->lock); 4517 spin_unlock(&rq->lock);
4511 __migrate_task(req->task, cpu, req->dest_cpu); 4518 __migrate_task(req->task, cpu, req->dest_cpu);
4512 local_irq_enable(); 4519 local_irq_enable();
4513 4520
4514 complete(&req->done); 4521 complete(&req->done);
4515 } 4522 }
4516 __set_current_state(TASK_RUNNING); 4523 __set_current_state(TASK_RUNNING);
4517 return 0; 4524 return 0;
4518 4525
4519 wait_to_die: 4526 wait_to_die:
4520 /* Wait for kthread_stop */ 4527 /* Wait for kthread_stop */
4521 set_current_state(TASK_INTERRUPTIBLE); 4528 set_current_state(TASK_INTERRUPTIBLE);
4522 while (!kthread_should_stop()) { 4529 while (!kthread_should_stop()) {
4523 schedule(); 4530 schedule();
4524 set_current_state(TASK_INTERRUPTIBLE); 4531 set_current_state(TASK_INTERRUPTIBLE);
4525 } 4532 }
4526 __set_current_state(TASK_RUNNING); 4533 __set_current_state(TASK_RUNNING);
4527 return 0; 4534 return 0;
4528 } 4535 }
4529 4536
4530 #ifdef CONFIG_HOTPLUG_CPU 4537 #ifdef CONFIG_HOTPLUG_CPU
4531 /* Figure out where task on dead CPU should go, use force if necessary. */ 4538 /* Figure out where task on dead CPU should go, use force if necessary. */
4532 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4539 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4533 { 4540 {
4534 int dest_cpu; 4541 int dest_cpu;
4535 cpumask_t mask; 4542 cpumask_t mask;
4536 4543
4537 /* On same node? */ 4544 /* On same node? */
4538 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4545 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4539 cpus_and(mask, mask, tsk->cpus_allowed); 4546 cpus_and(mask, mask, tsk->cpus_allowed);
4540 dest_cpu = any_online_cpu(mask); 4547 dest_cpu = any_online_cpu(mask);
4541 4548
4542 /* On any allowed CPU? */ 4549 /* On any allowed CPU? */
4543 if (dest_cpu == NR_CPUS) 4550 if (dest_cpu == NR_CPUS)
4544 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4551 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4545 4552
4546 /* No more Mr. Nice Guy. */ 4553 /* No more Mr. Nice Guy. */
4547 if (dest_cpu == NR_CPUS) { 4554 if (dest_cpu == NR_CPUS) {
4548 cpus_setall(tsk->cpus_allowed); 4555 cpus_setall(tsk->cpus_allowed);
4549 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4556 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4550 4557
4551 /* 4558 /*
4552 * Don't tell them about moving exiting tasks or 4559 * Don't tell them about moving exiting tasks or
4553 * kernel threads (both mm NULL), since they never 4560 * kernel threads (both mm NULL), since they never
4554 * leave the kernel. 4561 * leave the kernel.
4555 */ 4562 */
4556 if (tsk->mm && printk_ratelimit()) 4563 if (tsk->mm && printk_ratelimit())
4557 printk(KERN_INFO "process %d (%s) no " 4564 printk(KERN_INFO "process %d (%s) no "
4558 "longer affine to cpu%d\n", 4565 "longer affine to cpu%d\n",
4559 tsk->pid, tsk->comm, dead_cpu); 4566 tsk->pid, tsk->comm, dead_cpu);
4560 } 4567 }
4561 __migrate_task(tsk, dead_cpu, dest_cpu); 4568 __migrate_task(tsk, dead_cpu, dest_cpu);
4562 } 4569 }
4563 4570
4564 /* 4571 /*
4565 * While a dead CPU has no uninterruptible tasks queued at this point, 4572 * While a dead CPU has no uninterruptible tasks queued at this point,
4566 * it might still have a nonzero ->nr_uninterruptible counter, because 4573 * it might still have a nonzero ->nr_uninterruptible counter, because
4567 * for performance reasons the counter is not strictly tracking tasks to 4574 * for performance reasons the counter is not strictly tracking tasks to
4568 * their home CPUs. So we just add the counter to another CPU's counter, 4575 * their home CPUs. So we just add the counter to another CPU's counter,
4569 * to keep the global sum constant after CPU-down: 4576 * to keep the global sum constant after CPU-down:
4570 */ 4577 */
4571 static void migrate_nr_uninterruptible(runqueue_t *rq_src) 4578 static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4572 { 4579 {
4573 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 4580 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4574 unsigned long flags; 4581 unsigned long flags;
4575 4582
4576 local_irq_save(flags); 4583 local_irq_save(flags);
4577 double_rq_lock(rq_src, rq_dest); 4584 double_rq_lock(rq_src, rq_dest);
4578 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 4585 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
4579 rq_src->nr_uninterruptible = 0; 4586 rq_src->nr_uninterruptible = 0;
4580 double_rq_unlock(rq_src, rq_dest); 4587 double_rq_unlock(rq_src, rq_dest);
4581 local_irq_restore(flags); 4588 local_irq_restore(flags);
4582 } 4589 }
4583 4590
4584 /* Run through task list and migrate tasks from the dead cpu. */ 4591 /* Run through task list and migrate tasks from the dead cpu. */
4585 static void migrate_live_tasks(int src_cpu) 4592 static void migrate_live_tasks(int src_cpu)
4586 { 4593 {
4587 struct task_struct *tsk, *t; 4594 struct task_struct *tsk, *t;
4588 4595
4589 write_lock_irq(&tasklist_lock); 4596 write_lock_irq(&tasklist_lock);
4590 4597
4591 do_each_thread(t, tsk) { 4598 do_each_thread(t, tsk) {
4592 if (tsk == current) 4599 if (tsk == current)
4593 continue; 4600 continue;
4594 4601
4595 if (task_cpu(tsk) == src_cpu) 4602 if (task_cpu(tsk) == src_cpu)
4596 move_task_off_dead_cpu(src_cpu, tsk); 4603 move_task_off_dead_cpu(src_cpu, tsk);
4597 } while_each_thread(t, tsk); 4604 } while_each_thread(t, tsk);
4598 4605
4599 write_unlock_irq(&tasklist_lock); 4606 write_unlock_irq(&tasklist_lock);
4600 } 4607 }
4601 4608
4602 /* Schedules idle task to be the next runnable task on current CPU. 4609 /* Schedules idle task to be the next runnable task on current CPU.
4603 * It does so by boosting its priority to highest possible and adding it to 4610 * It does so by boosting its priority to highest possible and adding it to
4604 * the _front_ of runqueue. Used by CPU offline code. 4611 * the _front_ of runqueue. Used by CPU offline code.
4605 */ 4612 */
4606 void sched_idle_next(void) 4613 void sched_idle_next(void)
4607 { 4614 {
4608 int cpu = smp_processor_id(); 4615 int cpu = smp_processor_id();
4609 runqueue_t *rq = this_rq(); 4616 runqueue_t *rq = this_rq();
4610 struct task_struct *p = rq->idle; 4617 struct task_struct *p = rq->idle;
4611 unsigned long flags; 4618 unsigned long flags;
4612 4619
4613 /* cpu has to be offline */ 4620 /* cpu has to be offline */
4614 BUG_ON(cpu_online(cpu)); 4621 BUG_ON(cpu_online(cpu));
4615 4622
4616 /* Strictly not necessary since rest of the CPUs are stopped by now 4623 /* Strictly not necessary since rest of the CPUs are stopped by now
4617 * and interrupts disabled on current cpu. 4624 * and interrupts disabled on current cpu.
4618 */ 4625 */
4619 spin_lock_irqsave(&rq->lock, flags); 4626 spin_lock_irqsave(&rq->lock, flags);
4620 4627
4621 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4628 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4622 /* Add idle task to _front_ of its priority queue */ 4629 /* Add idle task to _front_ of its priority queue */
4623 __activate_idle_task(p, rq); 4630 __activate_idle_task(p, rq);
4624 4631
4625 spin_unlock_irqrestore(&rq->lock, flags); 4632 spin_unlock_irqrestore(&rq->lock, flags);
4626 } 4633 }
4627 4634
4628 /* Ensures that the idle task is using init_mm right before its cpu goes 4635 /* Ensures that the idle task is using init_mm right before its cpu goes
4629 * offline. 4636 * offline.
4630 */ 4637 */
4631 void idle_task_exit(void) 4638 void idle_task_exit(void)
4632 { 4639 {
4633 struct mm_struct *mm = current->active_mm; 4640 struct mm_struct *mm = current->active_mm;
4634 4641
4635 BUG_ON(cpu_online(smp_processor_id())); 4642 BUG_ON(cpu_online(smp_processor_id()));
4636 4643
4637 if (mm != &init_mm) 4644 if (mm != &init_mm)
4638 switch_mm(mm, &init_mm, current); 4645 switch_mm(mm, &init_mm, current);
4639 mmdrop(mm); 4646 mmdrop(mm);
4640 } 4647 }
4641 4648
4642 static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 4649 static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4643 { 4650 {
4644 struct runqueue *rq = cpu_rq(dead_cpu); 4651 struct runqueue *rq = cpu_rq(dead_cpu);
4645 4652
4646 /* Must be exiting, otherwise would be on tasklist. */ 4653 /* Must be exiting, otherwise would be on tasklist. */
4647 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 4654 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
4648 4655
4649 /* Cannot have done final schedule yet: would have vanished. */ 4656 /* Cannot have done final schedule yet: would have vanished. */
4650 BUG_ON(tsk->flags & PF_DEAD); 4657 BUG_ON(tsk->flags & PF_DEAD);
4651 4658
4652 get_task_struct(tsk); 4659 get_task_struct(tsk);
4653 4660
4654 /* 4661 /*
4655 * Drop lock around migration; if someone else moves it, 4662 * Drop lock around migration; if someone else moves it,
4656 * that's OK. No task can be added to this CPU, so iteration is 4663 * that's OK. No task can be added to this CPU, so iteration is
4657 * fine. 4664 * fine.
4658 */ 4665 */
4659 spin_unlock_irq(&rq->lock); 4666 spin_unlock_irq(&rq->lock);
4660 move_task_off_dead_cpu(dead_cpu, tsk); 4667 move_task_off_dead_cpu(dead_cpu, tsk);
4661 spin_lock_irq(&rq->lock); 4668 spin_lock_irq(&rq->lock);
4662 4669
4663 put_task_struct(tsk); 4670 put_task_struct(tsk);
4664 } 4671 }
4665 4672
4666 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 4673 /* release_task() removes task from tasklist, so we won't find dead tasks. */
4667 static void migrate_dead_tasks(unsigned int dead_cpu) 4674 static void migrate_dead_tasks(unsigned int dead_cpu)
4668 { 4675 {
4669 unsigned arr, i; 4676 unsigned arr, i;
4670 struct runqueue *rq = cpu_rq(dead_cpu); 4677 struct runqueue *rq = cpu_rq(dead_cpu);
4671 4678
4672 for (arr = 0; arr < 2; arr++) { 4679 for (arr = 0; arr < 2; arr++) {
4673 for (i = 0; i < MAX_PRIO; i++) { 4680 for (i = 0; i < MAX_PRIO; i++) {
4674 struct list_head *list = &rq->arrays[arr].queue[i]; 4681 struct list_head *list = &rq->arrays[arr].queue[i];
4675 while (!list_empty(list)) 4682 while (!list_empty(list))
4676 migrate_dead(dead_cpu, 4683 migrate_dead(dead_cpu,
4677 list_entry(list->next, task_t, 4684 list_entry(list->next, task_t,
4678 run_list)); 4685 run_list));
4679 } 4686 }
4680 } 4687 }
4681 } 4688 }
4682 #endif /* CONFIG_HOTPLUG_CPU */ 4689 #endif /* CONFIG_HOTPLUG_CPU */
4683 4690
4684 /* 4691 /*
4685 * migration_call - callback that gets triggered when a CPU is added. 4692 * migration_call - callback that gets triggered when a CPU is added.
4686 * Here we can start up the necessary migration thread for the new CPU. 4693 * Here we can start up the necessary migration thread for the new CPU.
4687 */ 4694 */
4688 static int migration_call(struct notifier_block *nfb, unsigned long action, 4695 static int migration_call(struct notifier_block *nfb, unsigned long action,
4689 void *hcpu) 4696 void *hcpu)
4690 { 4697 {
4691 int cpu = (long)hcpu; 4698 int cpu = (long)hcpu;
4692 struct task_struct *p; 4699 struct task_struct *p;
4693 struct runqueue *rq; 4700 struct runqueue *rq;
4694 unsigned long flags; 4701 unsigned long flags;
4695 4702
4696 switch (action) { 4703 switch (action) {
4697 case CPU_UP_PREPARE: 4704 case CPU_UP_PREPARE:
4698 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 4705 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
4699 if (IS_ERR(p)) 4706 if (IS_ERR(p))
4700 return NOTIFY_BAD; 4707 return NOTIFY_BAD;
4701 p->flags |= PF_NOFREEZE; 4708 p->flags |= PF_NOFREEZE;
4702 kthread_bind(p, cpu); 4709 kthread_bind(p, cpu);
4703 /* Must be high prio: stop_machine expects to yield to it. */ 4710 /* Must be high prio: stop_machine expects to yield to it. */
4704 rq = task_rq_lock(p, &flags); 4711 rq = task_rq_lock(p, &flags);
4705 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 4712 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4706 task_rq_unlock(rq, &flags); 4713 task_rq_unlock(rq, &flags);
4707 cpu_rq(cpu)->migration_thread = p; 4714 cpu_rq(cpu)->migration_thread = p;
4708 break; 4715 break;
4709 case CPU_ONLINE: 4716 case CPU_ONLINE:
4710 /* Strictly unnecessary, as first user will wake it. */ 4717 /* Strictly unnecessary, as first user will wake it. */
4711 wake_up_process(cpu_rq(cpu)->migration_thread); 4718 wake_up_process(cpu_rq(cpu)->migration_thread);
4712 break; 4719 break;
4713 #ifdef CONFIG_HOTPLUG_CPU 4720 #ifdef CONFIG_HOTPLUG_CPU
4714 case CPU_UP_CANCELED: 4721 case CPU_UP_CANCELED:
4715 /* Unbind it from offline cpu so it can run. Fall thru. */ 4722 /* Unbind it from offline cpu so it can run. Fall thru. */
4716 kthread_bind(cpu_rq(cpu)->migration_thread, 4723 kthread_bind(cpu_rq(cpu)->migration_thread,
4717 any_online_cpu(cpu_online_map)); 4724 any_online_cpu(cpu_online_map));
4718 kthread_stop(cpu_rq(cpu)->migration_thread); 4725 kthread_stop(cpu_rq(cpu)->migration_thread);
4719 cpu_rq(cpu)->migration_thread = NULL; 4726 cpu_rq(cpu)->migration_thread = NULL;
4720 break; 4727 break;
4721 case CPU_DEAD: 4728 case CPU_DEAD:
4722 migrate_live_tasks(cpu); 4729 migrate_live_tasks(cpu);
4723 rq = cpu_rq(cpu); 4730 rq = cpu_rq(cpu);
4724 kthread_stop(rq->migration_thread); 4731 kthread_stop(rq->migration_thread);
4725 rq->migration_thread = NULL; 4732 rq->migration_thread = NULL;
4726 /* Idle task back to normal (off runqueue, low prio) */ 4733 /* Idle task back to normal (off runqueue, low prio) */
4727 rq = task_rq_lock(rq->idle, &flags); 4734 rq = task_rq_lock(rq->idle, &flags);
4728 deactivate_task(rq->idle, rq); 4735 deactivate_task(rq->idle, rq);
4729 rq->idle->static_prio = MAX_PRIO; 4736 rq->idle->static_prio = MAX_PRIO;
4730 __setscheduler(rq->idle, SCHED_NORMAL, 0); 4737 __setscheduler(rq->idle, SCHED_NORMAL, 0);
4731 migrate_dead_tasks(cpu); 4738 migrate_dead_tasks(cpu);
4732 task_rq_unlock(rq, &flags); 4739 task_rq_unlock(rq, &flags);
4733 migrate_nr_uninterruptible(rq); 4740 migrate_nr_uninterruptible(rq);
4734 BUG_ON(rq->nr_running != 0); 4741 BUG_ON(rq->nr_running != 0);
4735 4742
4736 /* No need to migrate the tasks: it was best-effort if 4743 /* No need to migrate the tasks: it was best-effort if
4737 * they didn't do lock_cpu_hotplug(). Just wake up 4744 * they didn't do lock_cpu_hotplug(). Just wake up
4738 * the requestors. */ 4745 * the requestors. */
4739 spin_lock_irq(&rq->lock); 4746 spin_lock_irq(&rq->lock);
4740 while (!list_empty(&rq->migration_queue)) { 4747 while (!list_empty(&rq->migration_queue)) {
4741 migration_req_t *req; 4748 migration_req_t *req;
4742 req = list_entry(rq->migration_queue.next, 4749 req = list_entry(rq->migration_queue.next,
4743 migration_req_t, list); 4750 migration_req_t, list);
4744 list_del_init(&req->list); 4751 list_del_init(&req->list);
4745 complete(&req->done); 4752 complete(&req->done);
4746 } 4753 }
4747 spin_unlock_irq(&rq->lock); 4754 spin_unlock_irq(&rq->lock);
4748 break; 4755 break;
4749 #endif 4756 #endif
4750 } 4757 }
4751 return NOTIFY_OK; 4758 return NOTIFY_OK;
4752 } 4759 }
4753 4760
4754 /* Register at highest priority so that task migration (migrate_all_tasks) 4761 /* Register at highest priority so that task migration (migrate_all_tasks)
4755 * happens before everything else. 4762 * happens before everything else.
4756 */ 4763 */
4757 static struct notifier_block __devinitdata migration_notifier = { 4764 static struct notifier_block __devinitdata migration_notifier = {
4758 .notifier_call = migration_call, 4765 .notifier_call = migration_call,
4759 .priority = 10 4766 .priority = 10
4760 }; 4767 };
4761 4768
4762 int __init migration_init(void) 4769 int __init migration_init(void)
4763 { 4770 {
4764 void *cpu = (void *)(long)smp_processor_id(); 4771 void *cpu = (void *)(long)smp_processor_id();
4765 /* Start one for boot CPU. */ 4772 /* Start one for boot CPU. */
4766 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 4773 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4767 migration_call(&migration_notifier, CPU_ONLINE, cpu); 4774 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4768 register_cpu_notifier(&migration_notifier); 4775 register_cpu_notifier(&migration_notifier);
4769 return 0; 4776 return 0;
4770 } 4777 }
4771 #endif 4778 #endif
4772 4779
4773 #ifdef CONFIG_SMP 4780 #ifdef CONFIG_SMP
4774 #undef SCHED_DOMAIN_DEBUG 4781 #undef SCHED_DOMAIN_DEBUG
4775 #ifdef SCHED_DOMAIN_DEBUG 4782 #ifdef SCHED_DOMAIN_DEBUG
4776 static void sched_domain_debug(struct sched_domain *sd, int cpu) 4783 static void sched_domain_debug(struct sched_domain *sd, int cpu)
4777 { 4784 {
4778 int level = 0; 4785 int level = 0;
4779 4786
4780 if (!sd) { 4787 if (!sd) {
4781 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 4788 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4782 return; 4789 return;
4783 } 4790 }
4784 4791
4785 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4792 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4786 4793
4787 do { 4794 do {
4788 int i; 4795 int i;
4789 char str[NR_CPUS]; 4796 char str[NR_CPUS];
4790 struct sched_group *group = sd->groups; 4797 struct sched_group *group = sd->groups;
4791 cpumask_t groupmask; 4798 cpumask_t groupmask;
4792 4799
4793 cpumask_scnprintf(str, NR_CPUS, sd->span); 4800 cpumask_scnprintf(str, NR_CPUS, sd->span);
4794 cpus_clear(groupmask); 4801 cpus_clear(groupmask);
4795 4802
4796 printk(KERN_DEBUG); 4803 printk(KERN_DEBUG);
4797 for (i = 0; i < level + 1; i++) 4804 for (i = 0; i < level + 1; i++)
4798 printk(" "); 4805 printk(" ");
4799 printk("domain %d: ", level); 4806 printk("domain %d: ", level);
4800 4807
4801 if (!(sd->flags & SD_LOAD_BALANCE)) { 4808 if (!(sd->flags & SD_LOAD_BALANCE)) {
4802 printk("does not load-balance\n"); 4809 printk("does not load-balance\n");
4803 if (sd->parent) 4810 if (sd->parent)
4804 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 4811 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
4805 break; 4812 break;
4806 } 4813 }
4807 4814
4808 printk("span %s\n", str); 4815 printk("span %s\n", str);
4809 4816
4810 if (!cpu_isset(cpu, sd->span)) 4817 if (!cpu_isset(cpu, sd->span))
4811 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 4818 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
4812 if (!cpu_isset(cpu, group->cpumask)) 4819 if (!cpu_isset(cpu, group->cpumask))
4813 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 4820 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
4814 4821
4815 printk(KERN_DEBUG); 4822 printk(KERN_DEBUG);
4816 for (i = 0; i < level + 2; i++) 4823 for (i = 0; i < level + 2; i++)
4817 printk(" "); 4824 printk(" ");
4818 printk("groups:"); 4825 printk("groups:");
4819 do { 4826 do {
4820 if (!group) { 4827 if (!group) {
4821 printk("\n"); 4828 printk("\n");
4822 printk(KERN_ERR "ERROR: group is NULL\n"); 4829 printk(KERN_ERR "ERROR: group is NULL\n");
4823 break; 4830 break;
4824 } 4831 }
4825 4832
4826 if (!group->cpu_power) { 4833 if (!group->cpu_power) {
4827 printk("\n"); 4834 printk("\n");
4828 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 4835 printk(KERN_ERR "ERROR: domain->cpu_power not set\n");
4829 } 4836 }
4830 4837
4831 if (!cpus_weight(group->cpumask)) { 4838 if (!cpus_weight(group->cpumask)) {
4832 printk("\n"); 4839 printk("\n");
4833 printk(KERN_ERR "ERROR: empty group\n"); 4840 printk(KERN_ERR "ERROR: empty group\n");
4834 } 4841 }
4835 4842
4836 if (cpus_intersects(groupmask, group->cpumask)) { 4843 if (cpus_intersects(groupmask, group->cpumask)) {
4837 printk("\n"); 4844 printk("\n");
4838 printk(KERN_ERR "ERROR: repeated CPUs\n"); 4845 printk(KERN_ERR "ERROR: repeated CPUs\n");
4839 } 4846 }
4840 4847
4841 cpus_or(groupmask, groupmask, group->cpumask); 4848 cpus_or(groupmask, groupmask, group->cpumask);
4842 4849
4843 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 4850 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
4844 printk(" %s", str); 4851 printk(" %s", str);
4845 4852
4846 group = group->next; 4853 group = group->next;
4847 } while (group != sd->groups); 4854 } while (group != sd->groups);
4848 printk("\n"); 4855 printk("\n");
4849 4856
4850 if (!cpus_equal(sd->span, groupmask)) 4857 if (!cpus_equal(sd->span, groupmask))
4851 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 4858 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
4852 4859
4853 level++; 4860 level++;
4854 sd = sd->parent; 4861 sd = sd->parent;
4855 4862
4856 if (sd) { 4863 if (sd) {
4857 if (!cpus_subset(groupmask, sd->span)) 4864 if (!cpus_subset(groupmask, sd->span))
4858 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 4865 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
4859 } 4866 }
4860 4867
4861 } while (sd); 4868 } while (sd);
4862 } 4869 }
4863 #else 4870 #else
4864 #define sched_domain_debug(sd, cpu) {} 4871 #define sched_domain_debug(sd, cpu) {}
4865 #endif 4872 #endif
4866 4873
4867 static int sd_degenerate(struct sched_domain *sd) 4874 static int sd_degenerate(struct sched_domain *sd)
4868 { 4875 {
4869 if (cpus_weight(sd->span) == 1) 4876 if (cpus_weight(sd->span) == 1)
4870 return 1; 4877 return 1;
4871 4878
4872 /* Following flags need at least 2 groups */ 4879 /* Following flags need at least 2 groups */
4873 if (sd->flags & (SD_LOAD_BALANCE | 4880 if (sd->flags & (SD_LOAD_BALANCE |
4874 SD_BALANCE_NEWIDLE | 4881 SD_BALANCE_NEWIDLE |
4875 SD_BALANCE_FORK | 4882 SD_BALANCE_FORK |
4876 SD_BALANCE_EXEC)) { 4883 SD_BALANCE_EXEC)) {
4877 if (sd->groups != sd->groups->next) 4884 if (sd->groups != sd->groups->next)
4878 return 0; 4885 return 0;
4879 } 4886 }
4880 4887
4881 /* Following flags don't use groups */ 4888 /* Following flags don't use groups */
4882 if (sd->flags & (SD_WAKE_IDLE | 4889 if (sd->flags & (SD_WAKE_IDLE |
4883 SD_WAKE_AFFINE | 4890 SD_WAKE_AFFINE |
4884 SD_WAKE_BALANCE)) 4891 SD_WAKE_BALANCE))
4885 return 0; 4892 return 0;
4886 4893
4887 return 1; 4894 return 1;
4888 } 4895 }
4889 4896
4890 static int sd_parent_degenerate(struct sched_domain *sd, 4897 static int sd_parent_degenerate(struct sched_domain *sd,
4891 struct sched_domain *parent) 4898 struct sched_domain *parent)
4892 { 4899 {
4893 unsigned long cflags = sd->flags, pflags = parent->flags; 4900 unsigned long cflags = sd->flags, pflags = parent->flags;
4894 4901
4895 if (sd_degenerate(parent)) 4902 if (sd_degenerate(parent))
4896 return 1; 4903 return 1;
4897 4904
4898 if (!cpus_equal(sd->span, parent->span)) 4905 if (!cpus_equal(sd->span, parent->span))
4899 return 0; 4906 return 0;
4900 4907
4901 /* Does parent contain flags not in child? */ 4908 /* Does parent contain flags not in child? */
4902 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 4909 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4903 if (cflags & SD_WAKE_AFFINE) 4910 if (cflags & SD_WAKE_AFFINE)
4904 pflags &= ~SD_WAKE_BALANCE; 4911 pflags &= ~SD_WAKE_BALANCE;
4905 /* Flags needing groups don't count if only 1 group in parent */ 4912 /* Flags needing groups don't count if only 1 group in parent */
4906 if (parent->groups == parent->groups->next) { 4913 if (parent->groups == parent->groups->next) {
4907 pflags &= ~(SD_LOAD_BALANCE | 4914 pflags &= ~(SD_LOAD_BALANCE |
4908 SD_BALANCE_NEWIDLE | 4915 SD_BALANCE_NEWIDLE |
4909 SD_BALANCE_FORK | 4916 SD_BALANCE_FORK |
4910 SD_BALANCE_EXEC); 4917 SD_BALANCE_EXEC);
4911 } 4918 }
4912 if (~cflags & pflags) 4919 if (~cflags & pflags)
4913 return 0; 4920 return 0;
4914 4921
4915 return 1; 4922 return 1;
4916 } 4923 }
4917 4924
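The ~cflags & pflags test in sd_parent_degenerate() is non-zero exactly when the parent domain carries a flag that the child lacks. A minimal stand-alone sketch of that subset check, using invented SD_FOO/SD_BAR flag values purely for illustration:

#include <stdio.h>

#define SD_FOO  0x01
#define SD_BAR  0x02

int main(void)
{
        unsigned long cflags = SD_FOO;                  /* child's flags */
        unsigned long pflags = SD_FOO | SD_BAR;         /* parent's flags */

        /* non-zero iff the parent has a flag the child does not */
        printf("extra parent flags: %#lx\n", ~cflags & pflags);
        return 0;
}

A zero result means the parent adds nothing over the child (given equal spans), which is why sd_parent_degenerate() then returns 1.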
4918 /* 4925 /*
4919 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4926 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4920 * hold the hotplug lock. 4927 * hold the hotplug lock.
4921 */ 4928 */
4922 static void cpu_attach_domain(struct sched_domain *sd, int cpu) 4929 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4923 { 4930 {
4924 runqueue_t *rq = cpu_rq(cpu); 4931 runqueue_t *rq = cpu_rq(cpu);
4925 struct sched_domain *tmp; 4932 struct sched_domain *tmp;
4926 4933
4927 /* Remove the sched domains which do not contribute to scheduling. */ 4934 /* Remove the sched domains which do not contribute to scheduling. */
4928 for (tmp = sd; tmp; tmp = tmp->parent) { 4935 for (tmp = sd; tmp; tmp = tmp->parent) {
4929 struct sched_domain *parent = tmp->parent; 4936 struct sched_domain *parent = tmp->parent;
4930 if (!parent) 4937 if (!parent)
4931 break; 4938 break;
4932 if (sd_parent_degenerate(tmp, parent)) 4939 if (sd_parent_degenerate(tmp, parent))
4933 tmp->parent = parent->parent; 4940 tmp->parent = parent->parent;
4934 } 4941 }
4935 4942
4936 if (sd && sd_degenerate(sd)) 4943 if (sd && sd_degenerate(sd))
4937 sd = sd->parent; 4944 sd = sd->parent;
4938 4945
4939 sched_domain_debug(sd, cpu); 4946 sched_domain_debug(sd, cpu);
4940 4947
4941 rcu_assign_pointer(rq->sd, sd); 4948 rcu_assign_pointer(rq->sd, sd);
4942 } 4949 }
4943 4950
4944 /* cpus with isolated domains */ 4951 /* cpus with isolated domains */
4945 static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4952 static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4946 4953
4947 /* Setup the mask of cpus configured for isolated domains */ 4954 /* Setup the mask of cpus configured for isolated domains */
4948 static int __init isolated_cpu_setup(char *str) 4955 static int __init isolated_cpu_setup(char *str)
4949 { 4956 {
4950 int ints[NR_CPUS], i; 4957 int ints[NR_CPUS], i;
4951 4958
4952 str = get_options(str, ARRAY_SIZE(ints), ints); 4959 str = get_options(str, ARRAY_SIZE(ints), ints);
4953 cpus_clear(cpu_isolated_map); 4960 cpus_clear(cpu_isolated_map);
4954 for (i = 1; i <= ints[0]; i++) 4961 for (i = 1; i <= ints[0]; i++)
4955 if (ints[i] < NR_CPUS) 4962 if (ints[i] < NR_CPUS)
4956 cpu_set(ints[i], cpu_isolated_map); 4963 cpu_set(ints[i], cpu_isolated_map);
4957 return 1; 4964 return 1;
4958 } 4965 }
4959 4966
4960 __setup ("isolcpus=", isolated_cpu_setup); 4967 __setup ("isolcpus=", isolated_cpu_setup);
4961 4968
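Usage note: booting with, say, isolcpus=2,3 puts CPUs 2 and 3 into cpu_isolated_map; arch_init_sched_domains() further down then drops them from the default map via cpus_andnot(), so they are left out of the domain setup built by build_sched_domains().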
4962 /* 4969 /*
4963 * init_sched_build_groups takes an array of groups, the cpumask we wish 4970 * init_sched_build_groups takes an array of groups, the cpumask we wish
4964 * to span, and a pointer to a function which identifies what group a CPU 4971 * to span, and a pointer to a function which identifies what group a CPU
4965 * belongs to. The return value of group_fn must be a valid index into the 4972 * belongs to. The return value of group_fn must be a valid index into the
4966 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 4973 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we
4967 * keep track of groups covered with a cpumask_t). 4974 * keep track of groups covered with a cpumask_t).
4968 * 4975 *
4969 * init_sched_build_groups will build a circular linked list of the groups 4976 * init_sched_build_groups will build a circular linked list of the groups
4970 * covered by the given span, and will set each group's ->cpumask correctly, 4977 * covered by the given span, and will set each group's ->cpumask correctly,
4971 * and ->cpu_power to 0. 4978 * and ->cpu_power to 0.
4972 */ 4979 */
4973 static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, 4980 static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4974 int (*group_fn)(int cpu)) 4981 int (*group_fn)(int cpu))
4975 { 4982 {
4976 struct sched_group *first = NULL, *last = NULL; 4983 struct sched_group *first = NULL, *last = NULL;
4977 cpumask_t covered = CPU_MASK_NONE; 4984 cpumask_t covered = CPU_MASK_NONE;
4978 int i; 4985 int i;
4979 4986
4980 for_each_cpu_mask(i, span) { 4987 for_each_cpu_mask(i, span) {
4981 int group = group_fn(i); 4988 int group = group_fn(i);
4982 struct sched_group *sg = &groups[group]; 4989 struct sched_group *sg = &groups[group];
4983 int j; 4990 int j;
4984 4991
4985 if (cpu_isset(i, covered)) 4992 if (cpu_isset(i, covered))
4986 continue; 4993 continue;
4987 4994
4988 sg->cpumask = CPU_MASK_NONE; 4995 sg->cpumask = CPU_MASK_NONE;
4989 sg->cpu_power = 0; 4996 sg->cpu_power = 0;
4990 4997
4991 for_each_cpu_mask(j, span) { 4998 for_each_cpu_mask(j, span) {
4992 if (group_fn(j) != group) 4999 if (group_fn(j) != group)
4993 continue; 5000 continue;
4994 5001
4995 cpu_set(j, covered); 5002 cpu_set(j, covered);
4996 cpu_set(j, sg->cpumask); 5003 cpu_set(j, sg->cpumask);
4997 } 5004 }
4998 if (!first) 5005 if (!first)
4999 first = sg; 5006 first = sg;
5000 if (last) 5007 if (last)
5001 last->next = sg; 5008 last->next = sg;
5002 last = sg; 5009 last = sg;
5003 } 5010 }
5004 last->next = first; 5011 last->next = first;
5005 } 5012 }
5006 5013
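init_sched_build_groups() leaves the grouping policy entirely to group_fn; the only contract, per the comment above, is that the returned index is a valid groups[] slot in the range 0..NR_CPUS-1. The real callers below use cpu_to_cpu_group(), cpu_to_phys_group() and cpu_to_allnodes_group(); a purely hypothetical mapping that packs consecutive CPUs pairwise would look like this:

/* hypothetical: two consecutive CPUs per group */
static int cpu_to_pair_group(int cpu)
{
        return cpu / 2;         /* always >= 0 and < NR_CPUS */
}

With that mapping, init_sched_build_groups(groups, span, &cpu_to_pair_group) would link groups {0,1}, {2,3}, ... into the circular list and clear each group's cpu_power.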
5007 #define SD_NODES_PER_DOMAIN 16 5014 #define SD_NODES_PER_DOMAIN 16
5008 5015
5009 /* 5016 /*
5010 * Self-tuning task migration cost measurement between source and target CPUs. 5017 * Self-tuning task migration cost measurement between source and target CPUs.
5011 * 5018 *
5012 * This is done by measuring the cost of manipulating buffers of varying 5019 * This is done by measuring the cost of manipulating buffers of varying
5013 * sizes. For a given buffer-size here are the steps that are taken: 5020 * sizes. For a given buffer-size here are the steps that are taken:
5014 * 5021 *
5015 * 1) the source CPU reads+dirties a shared buffer 5022 * 1) the source CPU reads+dirties a shared buffer
5016 * 2) the target CPU reads+dirties the same shared buffer 5023 * 2) the target CPU reads+dirties the same shared buffer
5017 * 5024 *
5018 * We measure how long they take, in the following 4 scenarios: 5025 * We measure how long they take, in the following 4 scenarios:
5019 * 5026 *
5020 * - source: CPU1, target: CPU2 | cost1 5027 * - source: CPU1, target: CPU2 | cost1
5021 * - source: CPU2, target: CPU1 | cost2 5028 * - source: CPU2, target: CPU1 | cost2
5022 * - source: CPU1, target: CPU1 | cost3 5029 * - source: CPU1, target: CPU1 | cost3
5023 * - source: CPU2, target: CPU2 | cost4 5030 * - source: CPU2, target: CPU2 | cost4
5024 * 5031 *
5025 * We then calculate the cost1+cost2-cost3-cost4 difference - this is 5032 * We then calculate the cost1+cost2-cost3-cost4 difference - this is
5026 * the cost of migration. 5033 * the cost of migration.
5027 * 5034 *
5028 * We then start off from a small buffer-size and iterate up to larger 5035 * We then start off from a small buffer-size and iterate up to larger
5029 * buffer sizes, in 5% steps - measuring each buffer-size separately, and 5036 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5030 * doing a maximum search for the cost. (The maximum cost for a migration 5037 * doing a maximum search for the cost. (The maximum cost for a migration
5031 * normally occurs when the working set size is around the effective cache 5038 * normally occurs when the working set size is around the effective cache
5032 * size.) 5039 * size.)
5033 */ 5040 */
5034 #define SEARCH_SCOPE 2 5041 #define SEARCH_SCOPE 2
5035 #define MIN_CACHE_SIZE (64*1024U) 5042 #define MIN_CACHE_SIZE (64*1024U)
5036 #define DEFAULT_CACHE_SIZE (5*1024*1024U) 5043 #define DEFAULT_CACHE_SIZE (5*1024*1024U)
5037 #define ITERATIONS 1 5044 #define ITERATIONS 1
5038 #define SIZE_THRESH 130 5045 #define SIZE_THRESH 130
5039 #define COST_THRESH 130 5046 #define COST_THRESH 130
5040 5047
5041 /* 5048 /*
5042 * The migration cost is a function of 'domain distance'. Domain 5049 * The migration cost is a function of 'domain distance'. Domain
5043 * distance is the number of steps a CPU has to iterate down its 5050 * distance is the number of steps a CPU has to iterate down its
5044 * domain tree to share a domain with the other CPU. The farther 5051 * domain tree to share a domain with the other CPU. The farther
5045 * two CPUs are from each other, the larger the distance gets. 5052 * two CPUs are from each other, the larger the distance gets.
5046 * 5053 *
5047 * Note that we use the distance only to cache measurement results, 5054 * Note that we use the distance only to cache measurement results,
5048 * the distance value is not used numerically otherwise. When two 5055 * the distance value is not used numerically otherwise. When two
5049 * CPUs have the same distance it is assumed that the migration 5056 * CPUs have the same distance it is assumed that the migration
5050 * cost is the same. (this is a simplification but quite practical) 5057 * cost is the same. (this is a simplification but quite practical)
5051 */ 5058 */
5052 #define MAX_DOMAIN_DISTANCE 32 5059 #define MAX_DOMAIN_DISTANCE 32
5053 5060
5054 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = 5061 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5055 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = 5062 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5056 /* 5063 /*
5057 * Architectures may override the migration cost and thus avoid 5064 * Architectures may override the migration cost and thus avoid
5058 * boot-time calibration. Unit is nanoseconds. Mostly useful for 5065 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5059 * virtualized hardware: 5066 * virtualized hardware:
5060 */ 5067 */
5061 #ifdef CONFIG_DEFAULT_MIGRATION_COST 5068 #ifdef CONFIG_DEFAULT_MIGRATION_COST
5062 CONFIG_DEFAULT_MIGRATION_COST 5069 CONFIG_DEFAULT_MIGRATION_COST
5063 #else 5070 #else
5064 -1LL 5071 -1LL
5065 #endif 5072 #endif
5066 }; 5073 };
5067 5074
5068 /* 5075 /*
5069 * Allow override of migration cost - in units of microseconds. 5076 * Allow override of migration cost - in units of microseconds.
5070 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost 5077 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5071 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: 5078 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
5072 */ 5079 */
5073 static int __init migration_cost_setup(char *str) 5080 static int __init migration_cost_setup(char *str)
5074 { 5081 {
5075 int ints[MAX_DOMAIN_DISTANCE+1], i; 5082 int ints[MAX_DOMAIN_DISTANCE+1], i;
5076 5083
5077 str = get_options(str, ARRAY_SIZE(ints), ints); 5084 str = get_options(str, ARRAY_SIZE(ints), ints);
5078 5085
5079 printk("#ints: %d\n", ints[0]); 5086 printk("#ints: %d\n", ints[0]);
5080 for (i = 1; i <= ints[0]; i++) { 5087 for (i = 1; i <= ints[0]; i++) {
5081 migration_cost[i-1] = (unsigned long long)ints[i]*1000; 5088 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5082 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); 5089 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5083 } 5090 }
5084 return 1; 5091 return 1;
5085 } 5092 }
5086 5093
5087 __setup ("migration_cost=", migration_cost_setup); 5094 __setup ("migration_cost=", migration_cost_setup);
5088 5095
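Both isolated_cpu_setup() and migration_cost_setup() rely on the get_options() convention that ints[0] holds the number of values parsed and the values themselves start at ints[1], which is why their loops run from 1 to ints[0] inclusive. A small stand-alone illustration of that indexing (the array literal stands in for what parsing "1000,2000,3000" is expected to produce):

#include <stdio.h>

int main(void)
{
        int ints[] = { 3, 1000, 2000, 3000 };   /* count, then the values */
        int i;

        for (i = 1; i <= ints[0]; i++)
                printf("migration_cost[%d] = %d usecs\n", i - 1, ints[i]);
        return 0;
}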
5089 /* 5096 /*
5090 * Global multiplier (divisor) for migration-cutoff values, 5097 * Global multiplier (divisor) for migration-cutoff values,
5091 * in percent. E.g. use a value of 150 to get 1.5 times 5098 * in percent. E.g. use a value of 150 to get 1.5 times
5092 * longer cache-hot cutoff times. 5099 * longer cache-hot cutoff times.
5093 * 5100 *
5094 * (We scale it from 100 to 128 to make long long handling easier.) 5101 * (We scale it from 100 to 128 to make long long handling easier.)
5095 */ 5102 */
5096 5103
5097 #define MIGRATION_FACTOR_SCALE 128 5104 #define MIGRATION_FACTOR_SCALE 128
5098 5105
5099 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; 5106 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5100 5107
5101 static int __init setup_migration_factor(char *str) 5108 static int __init setup_migration_factor(char *str)
5102 { 5109 {
5103 get_option(&str, &migration_factor); 5110 get_option(&str, &migration_factor);
5104 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; 5111 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5105 return 1; 5112 return 1;
5106 } 5113 }
5107 5114
5108 __setup("migration_factor=", setup_migration_factor); 5115 __setup("migration_factor=", setup_migration_factor);
5109 5116
5110 /* 5117 /*
5111 * Estimated distance of two CPUs, measured via the number of domains 5118 * Estimated distance of two CPUs, measured via the number of domains
5112 * we have to pass for the two CPUs to be in the same span: 5119 * we have to pass for the two CPUs to be in the same span:
5113 */ 5120 */
5114 static unsigned long domain_distance(int cpu1, int cpu2) 5121 static unsigned long domain_distance(int cpu1, int cpu2)
5115 { 5122 {
5116 unsigned long distance = 0; 5123 unsigned long distance = 0;
5117 struct sched_domain *sd; 5124 struct sched_domain *sd;
5118 5125
5119 for_each_domain(cpu1, sd) { 5126 for_each_domain(cpu1, sd) {
5120 WARN_ON(!cpu_isset(cpu1, sd->span)); 5127 WARN_ON(!cpu_isset(cpu1, sd->span));
5121 if (cpu_isset(cpu2, sd->span)) 5128 if (cpu_isset(cpu2, sd->span))
5122 return distance; 5129 return distance;
5123 distance++; 5130 distance++;
5124 } 5131 }
5125 if (distance >= MAX_DOMAIN_DISTANCE) { 5132 if (distance >= MAX_DOMAIN_DISTANCE) {
5126 WARN_ON(1); 5133 WARN_ON(1);
5127 distance = MAX_DOMAIN_DISTANCE-1; 5134 distance = MAX_DOMAIN_DISTANCE-1;
5128 } 5135 }
5129 5136
5130 return distance; 5137 return distance;
5131 } 5138 }
5132 5139
5133 static unsigned int migration_debug; 5140 static unsigned int migration_debug;
5134 5141
5135 static int __init setup_migration_debug(char *str) 5142 static int __init setup_migration_debug(char *str)
5136 { 5143 {
5137 get_option(&str, &migration_debug); 5144 get_option(&str, &migration_debug);
5138 return 1; 5145 return 1;
5139 } 5146 }
5140 5147
5141 __setup("migration_debug=", setup_migration_debug); 5148 __setup("migration_debug=", setup_migration_debug);
5142 5149
5143 /* 5150 /*
5144 * Maximum cache-size that the scheduler should try to measure. 5151 * Maximum cache-size that the scheduler should try to measure.
5145 * Architectures with larger caches should tune this up during 5152 * Architectures with larger caches should tune this up during
5146 * bootup. Gets used in the domain-setup code (i.e. during SMP 5153 * bootup. Gets used in the domain-setup code (i.e. during SMP
5147 * bootup). 5154 * bootup).
5148 */ 5155 */
5149 unsigned int max_cache_size; 5156 unsigned int max_cache_size;
5150 5157
5151 static int __init setup_max_cache_size(char *str) 5158 static int __init setup_max_cache_size(char *str)
5152 { 5159 {
5153 get_option(&str, &max_cache_size); 5160 get_option(&str, &max_cache_size);
5154 return 1; 5161 return 1;
5155 } 5162 }
5156 5163
5157 __setup("max_cache_size=", setup_max_cache_size); 5164 __setup("max_cache_size=", setup_max_cache_size);
5158 5165
5159 /* 5166 /*
5160 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This 5167 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5161 * is the operation that is timed, so we try to generate unpredictable 5168 * is the operation that is timed, so we try to generate unpredictable
5162 * cachemisses that still end up filling the L2 cache: 5169 * cachemisses that still end up filling the L2 cache:
5163 */ 5170 */
5164 static void touch_cache(void *__cache, unsigned long __size) 5171 static void touch_cache(void *__cache, unsigned long __size)
5165 { 5172 {
5166 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5173 unsigned long size = __size/sizeof(long), chunk1 = size/3,
5167 chunk2 = 2*size/3; 5174 chunk2 = 2*size/3;
5168 unsigned long *cache = __cache; 5175 unsigned long *cache = __cache;
5169 int i; 5176 int i;
5170 5177
5171 for (i = 0; i < size/6; i += 8) { 5178 for (i = 0; i < size/6; i += 8) {
5172 switch (i % 6) { 5179 switch (i % 6) {
5173 case 0: cache[i]++; 5180 case 0: cache[i]++;
5174 case 1: cache[size-1-i]++; 5181 case 1: cache[size-1-i]++;
5175 case 2: cache[chunk1-i]++; 5182 case 2: cache[chunk1-i]++;
5176 case 3: cache[chunk1+i]++; 5183 case 3: cache[chunk1+i]++;
5177 case 4: cache[chunk2-i]++; 5184 case 4: cache[chunk2-i]++;
5178 case 5: cache[chunk2+i]++; 5185 case 5: cache[chunk2+i]++;
5179 } 5186 }
5180 } 5187 }
5181 } 5188 }
5182 5189
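Note that the switch in touch_cache() has no break statements: for a given i, every case from (i % 6) onward executes, so each iteration increments several widely separated words. The fall-through appears deliberate, matching the comment's goal of generating unpredictable accesses that still fill the L2 cache.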
5183 /* 5190 /*
5184 * Measure the cache-cost of one task migration. Returns in units of nsec. 5191 * Measure the cache-cost of one task migration. Returns in units of nsec.
5185 */ 5192 */
5186 static unsigned long long measure_one(void *cache, unsigned long size, 5193 static unsigned long long measure_one(void *cache, unsigned long size,
5187 int source, int target) 5194 int source, int target)
5188 { 5195 {
5189 cpumask_t mask, saved_mask; 5196 cpumask_t mask, saved_mask;
5190 unsigned long long t0, t1, t2, t3, cost; 5197 unsigned long long t0, t1, t2, t3, cost;
5191 5198
5192 saved_mask = current->cpus_allowed; 5199 saved_mask = current->cpus_allowed;
5193 5200
5194 /* 5201 /*
5195 * Flush source caches to RAM and invalidate them: 5202 * Flush source caches to RAM and invalidate them:
5196 */ 5203 */
5197 sched_cacheflush(); 5204 sched_cacheflush();
5198 5205
5199 /* 5206 /*
5200 * Migrate to the source CPU: 5207 * Migrate to the source CPU:
5201 */ 5208 */
5202 mask = cpumask_of_cpu(source); 5209 mask = cpumask_of_cpu(source);
5203 set_cpus_allowed(current, mask); 5210 set_cpus_allowed(current, mask);
5204 WARN_ON(smp_processor_id() != source); 5211 WARN_ON(smp_processor_id() != source);
5205 5212
5206 /* 5213 /*
5207 * Dirty the working set: 5214 * Dirty the working set:
5208 */ 5215 */
5209 t0 = sched_clock(); 5216 t0 = sched_clock();
5210 touch_cache(cache, size); 5217 touch_cache(cache, size);
5211 t1 = sched_clock(); 5218 t1 = sched_clock();
5212 5219
5213 /* 5220 /*
5214 * Migrate to the target CPU, dirty the L2 cache and access 5221 * Migrate to the target CPU, dirty the L2 cache and access
5215 * the shared buffer. (which represents the working set 5222 * the shared buffer. (which represents the working set
5216 * of a migrated task.) 5223 * of a migrated task.)
5217 */ 5224 */
5218 mask = cpumask_of_cpu(target); 5225 mask = cpumask_of_cpu(target);
5219 set_cpus_allowed(current, mask); 5226 set_cpus_allowed(current, mask);
5220 WARN_ON(smp_processor_id() != target); 5227 WARN_ON(smp_processor_id() != target);
5221 5228
5222 t2 = sched_clock(); 5229 t2 = sched_clock();
5223 touch_cache(cache, size); 5230 touch_cache(cache, size);
5224 t3 = sched_clock(); 5231 t3 = sched_clock();
5225 5232
5226 cost = t1-t0 + t3-t2; 5233 cost = t1-t0 + t3-t2;
5227 5234
5228 if (migration_debug >= 2) 5235 if (migration_debug >= 2)
5229 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", 5236 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5230 source, target, t1-t0, t1-t0, t3-t2, cost); 5237 source, target, t1-t0, t1-t0, t3-t2, cost);
5231 /* 5238 /*
5232 * Flush target caches to RAM and invalidate them: 5239 * Flush target caches to RAM and invalidate them:
5233 */ 5240 */
5234 sched_cacheflush(); 5241 sched_cacheflush();
5235 5242
5236 set_cpus_allowed(current, saved_mask); 5243 set_cpus_allowed(current, saved_mask);
5237 5244
5238 return cost; 5245 return cost;
5239 } 5246 }
5240 5247
5241 /* 5248 /*
5242 * Measure a series of task migrations and return the average 5249 * Measure a series of task migrations and return the average
5243 * result. Since this code runs early during bootup the system 5250 * result. Since this code runs early during bootup the system
5244 * is 'undisturbed' and the average latency makes sense. 5251 * is 'undisturbed' and the average latency makes sense.
5245 * 5252 *
5246 * The algorithm in essence auto-detects the relevant cache-size, 5253 * The algorithm in essence auto-detects the relevant cache-size,
5247 * so it will properly detect different cachesizes for different 5254 * so it will properly detect different cachesizes for different
5248 * cache-hierarchies, depending on how the CPUs are connected. 5255 * cache-hierarchies, depending on how the CPUs are connected.
5249 * 5256 *
5250 * Architectures can prime the upper limit of the search range via 5257 * Architectures can prime the upper limit of the search range via
5251 * max_cache_size, otherwise the search range defaults to 20MB...64K. 5258 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5252 */ 5259 */
5253 static unsigned long long 5260 static unsigned long long
5254 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) 5261 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5255 { 5262 {
5256 unsigned long long cost1, cost2; 5263 unsigned long long cost1, cost2;
5257 int i; 5264 int i;
5258 5265
5259 /* 5266 /*
5260 * Measure the migration cost of 'size' bytes, over an 5267 * Measure the migration cost of 'size' bytes, over an
5261 * average of 10 runs: 5268 * average of 10 runs:
5262 * 5269 *
5263 * (We perturb the cache size by a small (0..4k) 5270 * (We perturb the cache size by a small (0..4k)
5264 * value to compensate for size/alignment-related artifacts. 5271 * value to compensate for size/alignment-related artifacts.
5265 * We also subtract the cost of the operation done on 5272 * We also subtract the cost of the operation done on
5266 * the same CPU.) 5273 * the same CPU.)
5267 */ 5274 */
5268 cost1 = 0; 5275 cost1 = 0;
5269 5276
5270 /* 5277 /*
5271 * dry run, to make sure we start off cache-cold on cpu1, 5278 * dry run, to make sure we start off cache-cold on cpu1,
5272 * and to get any vmalloc pagefaults in advance: 5279 * and to get any vmalloc pagefaults in advance:
5273 */ 5280 */
5274 measure_one(cache, size, cpu1, cpu2); 5281 measure_one(cache, size, cpu1, cpu2);
5275 for (i = 0; i < ITERATIONS; i++) 5282 for (i = 0; i < ITERATIONS; i++)
5276 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5283 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2);
5277 5284
5278 measure_one(cache, size, cpu2, cpu1); 5285 measure_one(cache, size, cpu2, cpu1);
5279 for (i = 0; i < ITERATIONS; i++) 5286 for (i = 0; i < ITERATIONS; i++)
5280 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5287 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1);
5281 5288
5282 /* 5289 /*
5283 * (We measure the non-migrating [cached] cost on both 5290 * (We measure the non-migrating [cached] cost on both
5284 * cpu1 and cpu2, to handle CPUs with different speeds) 5291 * cpu1 and cpu2, to handle CPUs with different speeds)
5285 */ 5292 */
5286 cost2 = 0; 5293 cost2 = 0;
5287 5294
5288 measure_one(cache, size, cpu1, cpu1); 5295 measure_one(cache, size, cpu1, cpu1);
5289 for (i = 0; i < ITERATIONS; i++) 5296 for (i = 0; i < ITERATIONS; i++)
5290 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5297 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1);
5291 5298
5292 measure_one(cache, size, cpu2, cpu2); 5299 measure_one(cache, size, cpu2, cpu2);
5293 for (i = 0; i < ITERATIONS; i++) 5300 for (i = 0; i < ITERATIONS; i++)
5294 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5301 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2);
5295 5302
5296 /* 5303 /*
5297 * Get the per-iteration migration cost: 5304 * Get the per-iteration migration cost:
5298 */ 5305 */
5299 do_div(cost1, 2*ITERATIONS); 5306 do_div(cost1, 2*ITERATIONS);
5300 do_div(cost2, 2*ITERATIONS); 5307 do_div(cost2, 2*ITERATIONS);
5301 5308
5302 return cost1 - cost2; 5309 return cost1 - cost2;
5303 } 5310 }
5304 5311
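To make the averaging concrete, a worked example with invented timings (ITERATIONS is 1, as defined above), showing the value measure_cost() would return:

#include <stdio.h>

int main(void)
{
        unsigned long long cost1 = 400 + 420;   /* the two cross-CPU runs, usecs */
        unsigned long long cost2 = 150 + 160;   /* the two same-CPU runs, usecs */

        cost1 /= 2;     /* stands in for do_div(cost1, 2*ITERATIONS) */
        cost2 /= 2;

        printf("per-iteration migration cost: %llu usecs\n", cost1 - cost2);
        return 0;
}

This prints 255 usecs: the cross-CPU average of 410 minus the same-CPU average of 155.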
5305 static unsigned long long measure_migration_cost(int cpu1, int cpu2) 5312 static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5306 { 5313 {
5307 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; 5314 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
5308 unsigned int max_size, size, size_found = 0; 5315 unsigned int max_size, size, size_found = 0;
5309 long long cost = 0, prev_cost; 5316 long long cost = 0, prev_cost;
5310 void *cache; 5317 void *cache;
5311 5318
5312 /* 5319 /*
5313 * Search from max_cache_size*5 down to 64K - the real relevant 5320 * Search from max_cache_size*5 down to 64K - the real relevant
5314 * cachesize has to lie somewhere in between. 5321 * cachesize has to lie somewhere in between.
5315 */ 5322 */
5316 if (max_cache_size) { 5323 if (max_cache_size) {
5317 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); 5324 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
5318 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); 5325 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
5319 } else { 5326 } else {
5320 /* 5327 /*
5321 * Since we have no estimation about the relevant 5328 * Since we have no estimation about the relevant
5322 * search range 5329 * search range
5323 */ 5330 */
5324 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; 5331 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
5325 size = MIN_CACHE_SIZE; 5332 size = MIN_CACHE_SIZE;
5326 } 5333 }
5327 5334
5328 if (!cpu_online(cpu1) || !cpu_online(cpu2)) { 5335 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
5329 printk("cpu %d and %d not both online!\n", cpu1, cpu2); 5336 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
5330 return 0; 5337 return 0;
5331 } 5338 }
5332 5339
5333 /* 5340 /*
5334 * Allocate the working set: 5341 * Allocate the working set:
5335 */ 5342 */
5336 cache = vmalloc(max_size); 5343 cache = vmalloc(max_size);
5337 if (!cache) { 5344 if (!cache) {
5338 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5345 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5339 return 1000000; // return 1 msec on very small boxen 5346 return 1000000; // return 1 msec on very small boxen
5340 } 5347 }
5341 5348
5342 while (size <= max_size) { 5349 while (size <= max_size) {
5343 prev_cost = cost; 5350 prev_cost = cost;
5344 cost = measure_cost(cpu1, cpu2, cache, size); 5351 cost = measure_cost(cpu1, cpu2, cache, size);
5345 5352
5346 /* 5353 /*
5347 * Update the max: 5354 * Update the max:
5348 */ 5355 */
5349 if (cost > 0) { 5356 if (cost > 0) {
5350 if (max_cost < cost) { 5357 if (max_cost < cost) {
5351 max_cost = cost; 5358 max_cost = cost;
5352 size_found = size; 5359 size_found = size;
5353 } 5360 }
5354 } 5361 }
5355 /* 5362 /*
5356 * Calculate average fluctuation, we use this to prevent 5363 * Calculate average fluctuation, we use this to prevent
5357 * noise from triggering an early break out of the loop: 5364 * noise from triggering an early break out of the loop:
5358 */ 5365 */
5359 fluct = abs(cost - prev_cost); 5366 fluct = abs(cost - prev_cost);
5360 avg_fluct = (avg_fluct + fluct)/2; 5367 avg_fluct = (avg_fluct + fluct)/2;
5361 5368
5362 if (migration_debug) 5369 if (migration_debug)
5363 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 5370 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n",
5364 cpu1, cpu2, size, 5371 cpu1, cpu2, size,
5365 (long)cost / 1000000, 5372 (long)cost / 1000000,
5366 ((long)cost / 100000) % 10, 5373 ((long)cost / 100000) % 10,
5367 (long)max_cost / 1000000, 5374 (long)max_cost / 1000000,
5368 ((long)max_cost / 100000) % 10, 5375 ((long)max_cost / 100000) % 10,
5369 domain_distance(cpu1, cpu2), 5376 domain_distance(cpu1, cpu2),
5370 cost, avg_fluct); 5377 cost, avg_fluct);
5371 5378
5372 /* 5379 /*
5373 * If we iterated at least 20% past the previous maximum, 5380 * If we iterated at least 20% past the previous maximum,
5374 * and the cost has dropped by more than 20% already, 5381 * and the cost has dropped by more than 20% already,
5375 * (taking fluctuations into account) then we assume to 5382 * (taking fluctuations into account) then we assume to
5376 * have found the maximum and break out of the loop early: 5383 * have found the maximum and break out of the loop early:
5377 */ 5384 */
5378 if (size_found && (size*100 > size_found*SIZE_THRESH)) 5385 if (size_found && (size*100 > size_found*SIZE_THRESH))
5379 if (cost+avg_fluct <= 0 || 5386 if (cost+avg_fluct <= 0 ||
5380 max_cost*100 > (cost+avg_fluct)*COST_THRESH) { 5387 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
5381 5388
5382 if (migration_debug) 5389 if (migration_debug)
5383 printk("-> found max.\n"); 5390 printk("-> found max.\n");
5384 break; 5391 break;
5385 } 5392 }
5386 /* 5393 /*
5387 * Increase the cachesize in 10% steps: 5394 * Increase the cachesize in 10% steps:
5388 */ 5395 */
5389 size = size * 10 / 9; 5396 size = size * 10 / 9;
5390 } 5397 }
5391 5398
5392 if (migration_debug) 5399 if (migration_debug)
5393 printk("[%d][%d] working set size found: %d, cost: %Ld\n", 5400 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
5394 cpu1, cpu2, size_found, max_cost); 5401 cpu1, cpu2, size_found, max_cost);
5395 5402
5396 vfree(cache); 5403 vfree(cache);
5397 5404
5398 /* 5405 /*
5399 * A task is considered 'cache cold' if at least 2 times 5406 * A task is considered 'cache cold' if at least 2 times
5400 * the worst-case cost of migration has passed. 5407 * the worst-case cost of migration has passed.
5401 * 5408 *
5402 * (this limit is only listened to if the load-balancing 5409 * (this limit is only listened to if the load-balancing
5403 * situation is 'nice' - if there is a large imbalance we 5410 * situation is 'nice' - if there is a large imbalance we
5404 * ignore it for the sake of CPU utilization and 5411 * ignore it for the sake of CPU utilization and
5405 * processing fairness.) 5412 * processing fairness.)
5406 */ 5413 */
5407 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; 5414 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
5408 } 5415 }
5409 5416
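Tying this back to setup_migration_factor() above: booting with migration_factor=150 stores 150 * 128 / 100 = 192, so the value returned here becomes 2 * max_cost * 192 / 128, i.e. a cache-hot cutoff of three times the measured worst-case cost instead of two.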
5410 static void calibrate_migration_costs(const cpumask_t *cpu_map) 5417 static void calibrate_migration_costs(const cpumask_t *cpu_map)
5411 { 5418 {
5412 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); 5419 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
5413 unsigned long j0, j1, distance, max_distance = 0; 5420 unsigned long j0, j1, distance, max_distance = 0;
5414 struct sched_domain *sd; 5421 struct sched_domain *sd;
5415 5422
5416 j0 = jiffies; 5423 j0 = jiffies;
5417 5424
5418 /* 5425 /*
5419 * First pass - calculate the cacheflush times: 5426 * First pass - calculate the cacheflush times:
5420 */ 5427 */
5421 for_each_cpu_mask(cpu1, *cpu_map) { 5428 for_each_cpu_mask(cpu1, *cpu_map) {
5422 for_each_cpu_mask(cpu2, *cpu_map) { 5429 for_each_cpu_mask(cpu2, *cpu_map) {
5423 if (cpu1 == cpu2) 5430 if (cpu1 == cpu2)
5424 continue; 5431 continue;
5425 distance = domain_distance(cpu1, cpu2); 5432 distance = domain_distance(cpu1, cpu2);
5426 max_distance = max(max_distance, distance); 5433 max_distance = max(max_distance, distance);
5427 /* 5434 /*
5428 * No result cached yet? 5435 * No result cached yet?
5429 */ 5436 */
5430 if (migration_cost[distance] == -1LL) 5437 if (migration_cost[distance] == -1LL)
5431 migration_cost[distance] = 5438 migration_cost[distance] =
5432 measure_migration_cost(cpu1, cpu2); 5439 measure_migration_cost(cpu1, cpu2);
5433 } 5440 }
5434 } 5441 }
5435 /* 5442 /*
5436 * Second pass - update the sched domain hierarchy with 5443 * Second pass - update the sched domain hierarchy with
5437 * the new cache-hot-time estimations: 5444 * the new cache-hot-time estimations:
5438 */ 5445 */
5439 for_each_cpu_mask(cpu, *cpu_map) { 5446 for_each_cpu_mask(cpu, *cpu_map) {
5440 distance = 0; 5447 distance = 0;
5441 for_each_domain(cpu, sd) { 5448 for_each_domain(cpu, sd) {
5442 sd->cache_hot_time = migration_cost[distance]; 5449 sd->cache_hot_time = migration_cost[distance];
5443 distance++; 5450 distance++;
5444 } 5451 }
5445 } 5452 }
5446 /* 5453 /*
5447 * Print the matrix: 5454 * Print the matrix:
5448 */ 5455 */
5449 if (migration_debug) 5456 if (migration_debug)
5450 printk("migration: max_cache_size: %d, cpu: %d MHz:\n", 5457 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
5451 max_cache_size, 5458 max_cache_size,
5452 #ifdef CONFIG_X86 5459 #ifdef CONFIG_X86
5453 cpu_khz/1000 5460 cpu_khz/1000
5454 #else 5461 #else
5455 -1 5462 -1
5456 #endif 5463 #endif
5457 ); 5464 );
5458 if (system_state == SYSTEM_BOOTING) { 5465 if (system_state == SYSTEM_BOOTING) {
5459 printk("migration_cost="); 5466 printk("migration_cost=");
5460 for (distance = 0; distance <= max_distance; distance++) { 5467 for (distance = 0; distance <= max_distance; distance++) {
5461 if (distance) 5468 if (distance)
5462 printk(","); 5469 printk(",");
5463 printk("%ld", (long)migration_cost[distance] / 1000); 5470 printk("%ld", (long)migration_cost[distance] / 1000);
5464 } 5471 }
5465 printk("\n"); 5472 printk("\n");
5466 } 5473 }
5467 j1 = jiffies; 5474 j1 = jiffies;
5468 if (migration_debug) 5475 if (migration_debug)
5469 printk("migration: %ld seconds\n", (j1-j0)/HZ); 5476 printk("migration: %ld seconds\n", (j1-j0)/HZ);
5470 5477
5471 /* 5478 /*
5472 * Move back to the original CPU. NUMA-Q gets confused 5479 * Move back to the original CPU. NUMA-Q gets confused
5473 * if we migrate to another quad during bootup. 5480 * if we migrate to another quad during bootup.
5474 */ 5481 */
5475 if (raw_smp_processor_id() != orig_cpu) { 5482 if (raw_smp_processor_id() != orig_cpu) {
5476 cpumask_t mask = cpumask_of_cpu(orig_cpu), 5483 cpumask_t mask = cpumask_of_cpu(orig_cpu),
5477 saved_mask = current->cpus_allowed; 5484 saved_mask = current->cpus_allowed;
5478 5485
5479 set_cpus_allowed(current, mask); 5486 set_cpus_allowed(current, mask);
5480 set_cpus_allowed(current, saved_mask); 5487 set_cpus_allowed(current, saved_mask);
5481 } 5488 }
5482 } 5489 }
5483 5490
5484 #ifdef CONFIG_NUMA 5491 #ifdef CONFIG_NUMA
5485 5492
5486 /** 5493 /**
5487 * find_next_best_node - find the next node to include in a sched_domain 5494 * find_next_best_node - find the next node to include in a sched_domain
5488 * @node: node whose sched_domain we're building 5495 * @node: node whose sched_domain we're building
5489 * @used_nodes: nodes already in the sched_domain 5496 * @used_nodes: nodes already in the sched_domain
5490 * 5497 *
5491 * Find the next node to include in a given scheduling domain. Simply 5498 * Find the next node to include in a given scheduling domain. Simply
5492 * finds the closest node not already in the @used_nodes map. 5499 * finds the closest node not already in the @used_nodes map.
5493 * 5500 *
5494 * Should use nodemask_t. 5501 * Should use nodemask_t.
5495 */ 5502 */
5496 static int find_next_best_node(int node, unsigned long *used_nodes) 5503 static int find_next_best_node(int node, unsigned long *used_nodes)
5497 { 5504 {
5498 int i, n, val, min_val, best_node = 0; 5505 int i, n, val, min_val, best_node = 0;
5499 5506
5500 min_val = INT_MAX; 5507 min_val = INT_MAX;
5501 5508
5502 for (i = 0; i < MAX_NUMNODES; i++) { 5509 for (i = 0; i < MAX_NUMNODES; i++) {
5503 /* Start at @node */ 5510 /* Start at @node */
5504 n = (node + i) % MAX_NUMNODES; 5511 n = (node + i) % MAX_NUMNODES;
5505 5512
5506 if (!nr_cpus_node(n)) 5513 if (!nr_cpus_node(n))
5507 continue; 5514 continue;
5508 5515
5509 /* Skip already used nodes */ 5516 /* Skip already used nodes */
5510 if (test_bit(n, used_nodes)) 5517 if (test_bit(n, used_nodes))
5511 continue; 5518 continue;
5512 5519
5513 /* Simple min distance search */ 5520 /* Simple min distance search */
5514 val = node_distance(node, n); 5521 val = node_distance(node, n);
5515 5522
5516 if (val < min_val) { 5523 if (val < min_val) {
5517 min_val = val; 5524 min_val = val;
5518 best_node = n; 5525 best_node = n;
5519 } 5526 }
5520 } 5527 }
5521 5528
5522 set_bit(best_node, used_nodes); 5529 set_bit(best_node, used_nodes);
5523 return best_node; 5530 return best_node;
5524 } 5531 }
5525 5532
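find_next_best_node() is a greedy nearest-neighbour step: each call claims the closest node not yet in the span, and sched_domain_node_span() below invokes it SD_NODES_PER_DOMAIN-1 times to grow the span. A simplified user-space sketch of the same greedy selection over an invented 4-node distance table (it omits the kernel's wrap-around scan order and the empty-node check):

#include <stdio.h>
#include <limits.h>

#define NODES 4

/* invented node_distance() values, symmetric, 10 on the diagonal */
static const int dist[NODES][NODES] = {
        { 10, 20, 40, 30 },
        { 20, 10, 20, 40 },
        { 40, 20, 10, 20 },
        { 30, 40, 20, 10 },
};

static int next_best(int node, int used[NODES])
{
        int n, best = -1, min = INT_MAX;

        for (n = 0; n < NODES; n++) {
                if (used[n] || dist[node][n] >= min)
                        continue;
                min = dist[node][n];
                best = n;
        }
        if (best >= 0)
                used[best] = 1;         /* claim it, like set_bit() above */
        return best;
}

int main(void)
{
        int used[NODES] = { 1, 0, 0, 0 };       /* node 0 starts the span */
        int i;

        for (i = 0; i < 2; i++)
                printf("next node: %d\n", next_best(0, used));
        return 0;
}

Starting from node 0 this picks node 1 (distance 20) and then node 3 (distance 30).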
5526 /** 5533 /**
5527 * sched_domain_node_span - get a cpumask for a node's sched_domain 5534 * sched_domain_node_span - get a cpumask for a node's sched_domain
5528 * @node: node whose cpumask we're constructing 5535 * @node: node whose cpumask we're constructing
5529 * @size: number of nodes to include in this span 5536 * @size: number of nodes to include in this span
5530 * 5537 *
5531 * Given a node, construct a good cpumask for its sched_domain to span. It 5538 * Given a node, construct a good cpumask for its sched_domain to span. It
5532 * should be one that prevents unnecessary balancing, but also spreads tasks 5539 * should be one that prevents unnecessary balancing, but also spreads tasks
5533 * out optimally. 5540 * out optimally.
5534 */ 5541 */
5535 static cpumask_t sched_domain_node_span(int node) 5542 static cpumask_t sched_domain_node_span(int node)
5536 { 5543 {
5537 int i; 5544 int i;
5538 cpumask_t span, nodemask; 5545 cpumask_t span, nodemask;
5539 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 5546 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5540 5547
5541 cpus_clear(span); 5548 cpus_clear(span);
5542 bitmap_zero(used_nodes, MAX_NUMNODES); 5549 bitmap_zero(used_nodes, MAX_NUMNODES);
5543 5550
5544 nodemask = node_to_cpumask(node); 5551 nodemask = node_to_cpumask(node);
5545 cpus_or(span, span, nodemask); 5552 cpus_or(span, span, nodemask);
5546 set_bit(node, used_nodes); 5553 set_bit(node, used_nodes);
5547 5554
5548 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5555 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5549 int next_node = find_next_best_node(node, used_nodes); 5556 int next_node = find_next_best_node(node, used_nodes);
5550 nodemask = node_to_cpumask(next_node); 5557 nodemask = node_to_cpumask(next_node);
5551 cpus_or(span, span, nodemask); 5558 cpus_or(span, span, nodemask);
5552 } 5559 }
5553 5560
5554 return span; 5561 return span;
5555 } 5562 }
5556 #endif 5563 #endif
5557 5564
5558 /* 5565 /*
5559 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 5566 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5560 * can switch it on easily if needed. 5567 * can switch it on easily if needed.
5561 */ 5568 */
5562 #ifdef CONFIG_SCHED_SMT 5569 #ifdef CONFIG_SCHED_SMT
5563 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5570 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5564 static struct sched_group sched_group_cpus[NR_CPUS]; 5571 static struct sched_group sched_group_cpus[NR_CPUS];
5565 static int cpu_to_cpu_group(int cpu) 5572 static int cpu_to_cpu_group(int cpu)
5566 { 5573 {
5567 return cpu; 5574 return cpu;
5568 } 5575 }
5569 #endif 5576 #endif
5570 5577
5571 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5578 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5572 static struct sched_group sched_group_phys[NR_CPUS]; 5579 static struct sched_group sched_group_phys[NR_CPUS];
5573 static int cpu_to_phys_group(int cpu) 5580 static int cpu_to_phys_group(int cpu)
5574 { 5581 {
5575 #ifdef CONFIG_SCHED_SMT 5582 #ifdef CONFIG_SCHED_SMT
5576 return first_cpu(cpu_sibling_map[cpu]); 5583 return first_cpu(cpu_sibling_map[cpu]);
5577 #else 5584 #else
5578 return cpu; 5585 return cpu;
5579 #endif 5586 #endif
5580 } 5587 }
5581 5588
5582 #ifdef CONFIG_NUMA 5589 #ifdef CONFIG_NUMA
5583 /* 5590 /*
5584 * The init_sched_build_groups can't handle what we want to do with node 5591 * The init_sched_build_groups can't handle what we want to do with node
5585 * groups, so roll our own. Now each node has its own list of groups which 5592 * groups, so roll our own. Now each node has its own list of groups which
5586 * gets dynamically allocated. 5593 * gets dynamically allocated.
5587 */ 5594 */
5588 static DEFINE_PER_CPU(struct sched_domain, node_domains); 5595 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5589 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 5596 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5590 5597
5591 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 5598 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5592 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 5599 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
5593 5600
5594 static int cpu_to_allnodes_group(int cpu) 5601 static int cpu_to_allnodes_group(int cpu)
5595 { 5602 {
5596 return cpu_to_node(cpu); 5603 return cpu_to_node(cpu);
5597 } 5604 }
5598 #endif 5605 #endif
5599 5606
5600 /* 5607 /*
5601 * Build sched domains for a given set of cpus and attach the sched domains 5608 * Build sched domains for a given set of cpus and attach the sched domains
5602 * to the individual cpus 5609 * to the individual cpus
5603 */ 5610 */
5604 void build_sched_domains(const cpumask_t *cpu_map) 5611 void build_sched_domains(const cpumask_t *cpu_map)
5605 { 5612 {
5606 int i; 5613 int i;
5607 #ifdef CONFIG_NUMA 5614 #ifdef CONFIG_NUMA
5608 struct sched_group **sched_group_nodes = NULL; 5615 struct sched_group **sched_group_nodes = NULL;
5609 struct sched_group *sched_group_allnodes = NULL; 5616 struct sched_group *sched_group_allnodes = NULL;
5610 5617
5611 /* 5618 /*
5612 * Allocate the per-node list of sched groups 5619 * Allocate the per-node list of sched groups
5613 */ 5620 */
5614 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5621 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5615 GFP_ATOMIC); 5622 GFP_ATOMIC);
5616 if (!sched_group_nodes) { 5623 if (!sched_group_nodes) {
5617 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5624 printk(KERN_WARNING "Can not alloc sched group node list\n");
5618 return; 5625 return;
5619 } 5626 }
5620 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 5627 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5621 #endif 5628 #endif
5622 5629
5623 /* 5630 /*
5624 * Set up domains for cpus specified by the cpu_map. 5631 * Set up domains for cpus specified by the cpu_map.
5625 */ 5632 */
5626 for_each_cpu_mask(i, *cpu_map) { 5633 for_each_cpu_mask(i, *cpu_map) {
5627 int group; 5634 int group;
5628 struct sched_domain *sd = NULL, *p; 5635 struct sched_domain *sd = NULL, *p;
5629 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 5636 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5630 5637
5631 cpus_and(nodemask, nodemask, *cpu_map); 5638 cpus_and(nodemask, nodemask, *cpu_map);
5632 5639
5633 #ifdef CONFIG_NUMA 5640 #ifdef CONFIG_NUMA
5634 if (cpus_weight(*cpu_map) 5641 if (cpus_weight(*cpu_map)
5635 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5642 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5636 if (!sched_group_allnodes) { 5643 if (!sched_group_allnodes) {
5637 sched_group_allnodes 5644 sched_group_allnodes
5638 = kmalloc(sizeof(struct sched_group) 5645 = kmalloc(sizeof(struct sched_group)
5639 * MAX_NUMNODES, 5646 * MAX_NUMNODES,
5640 GFP_KERNEL); 5647 GFP_KERNEL);
5641 if (!sched_group_allnodes) { 5648 if (!sched_group_allnodes) {
5642 printk(KERN_WARNING 5649 printk(KERN_WARNING
5643 "Can not alloc allnodes sched group\n"); 5650 "Can not alloc allnodes sched group\n");
5644 break; 5651 break;
5645 } 5652 }
5646 sched_group_allnodes_bycpu[i] 5653 sched_group_allnodes_bycpu[i]
5647 = sched_group_allnodes; 5654 = sched_group_allnodes;
5648 } 5655 }
5649 sd = &per_cpu(allnodes_domains, i); 5656 sd = &per_cpu(allnodes_domains, i);
5650 *sd = SD_ALLNODES_INIT; 5657 *sd = SD_ALLNODES_INIT;
5651 sd->span = *cpu_map; 5658 sd->span = *cpu_map;
5652 group = cpu_to_allnodes_group(i); 5659 group = cpu_to_allnodes_group(i);
5653 sd->groups = &sched_group_allnodes[group]; 5660 sd->groups = &sched_group_allnodes[group];
5654 p = sd; 5661 p = sd;
5655 } else 5662 } else
5656 p = NULL; 5663 p = NULL;
5657 5664
5658 sd = &per_cpu(node_domains, i); 5665 sd = &per_cpu(node_domains, i);
5659 *sd = SD_NODE_INIT; 5666 *sd = SD_NODE_INIT;
5660 sd->span = sched_domain_node_span(cpu_to_node(i)); 5667 sd->span = sched_domain_node_span(cpu_to_node(i));
5661 sd->parent = p; 5668 sd->parent = p;
5662 cpus_and(sd->span, sd->span, *cpu_map); 5669 cpus_and(sd->span, sd->span, *cpu_map);
5663 #endif 5670 #endif
5664 5671
5665 p = sd; 5672 p = sd;
5666 sd = &per_cpu(phys_domains, i); 5673 sd = &per_cpu(phys_domains, i);
5667 group = cpu_to_phys_group(i); 5674 group = cpu_to_phys_group(i);
5668 *sd = SD_CPU_INIT; 5675 *sd = SD_CPU_INIT;
5669 sd->span = nodemask; 5676 sd->span = nodemask;
5670 sd->parent = p; 5677 sd->parent = p;
5671 sd->groups = &sched_group_phys[group]; 5678 sd->groups = &sched_group_phys[group];
5672 5679
5673 #ifdef CONFIG_SCHED_SMT 5680 #ifdef CONFIG_SCHED_SMT
5674 p = sd; 5681 p = sd;
5675 sd = &per_cpu(cpu_domains, i); 5682 sd = &per_cpu(cpu_domains, i);
5676 group = cpu_to_cpu_group(i); 5683 group = cpu_to_cpu_group(i);
5677 *sd = SD_SIBLING_INIT; 5684 *sd = SD_SIBLING_INIT;
5678 sd->span = cpu_sibling_map[i]; 5685 sd->span = cpu_sibling_map[i];
5679 cpus_and(sd->span, sd->span, *cpu_map); 5686 cpus_and(sd->span, sd->span, *cpu_map);
5680 sd->parent = p; 5687 sd->parent = p;
5681 sd->groups = &sched_group_cpus[group]; 5688 sd->groups = &sched_group_cpus[group];
5682 #endif 5689 #endif
5683 } 5690 }
5684 5691
5685 #ifdef CONFIG_SCHED_SMT 5692 #ifdef CONFIG_SCHED_SMT
5686 /* Set up CPU (sibling) groups */ 5693 /* Set up CPU (sibling) groups */
5687 for_each_cpu_mask(i, *cpu_map) { 5694 for_each_cpu_mask(i, *cpu_map) {
5688 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5695 cpumask_t this_sibling_map = cpu_sibling_map[i];
5689 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5696 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5690 if (i != first_cpu(this_sibling_map)) 5697 if (i != first_cpu(this_sibling_map))
5691 continue; 5698 continue;
5692 5699
5693 init_sched_build_groups(sched_group_cpus, this_sibling_map, 5700 init_sched_build_groups(sched_group_cpus, this_sibling_map,
5694 &cpu_to_cpu_group); 5701 &cpu_to_cpu_group);
5695 } 5702 }
5696 #endif 5703 #endif
5697 5704
5698 /* Set up physical groups */ 5705 /* Set up physical groups */
5699 for (i = 0; i < MAX_NUMNODES; i++) { 5706 for (i = 0; i < MAX_NUMNODES; i++) {
5700 cpumask_t nodemask = node_to_cpumask(i); 5707 cpumask_t nodemask = node_to_cpumask(i);
5701 5708
5702 cpus_and(nodemask, nodemask, *cpu_map); 5709 cpus_and(nodemask, nodemask, *cpu_map);
5703 if (cpus_empty(nodemask)) 5710 if (cpus_empty(nodemask))
5704 continue; 5711 continue;
5705 5712
5706 init_sched_build_groups(sched_group_phys, nodemask, 5713 init_sched_build_groups(sched_group_phys, nodemask,
5707 &cpu_to_phys_group); 5714 &cpu_to_phys_group);
5708 } 5715 }
5709 5716
5710 #ifdef CONFIG_NUMA 5717 #ifdef CONFIG_NUMA
5711 /* Set up node groups */ 5718 /* Set up node groups */
5712 if (sched_group_allnodes) 5719 if (sched_group_allnodes)
5713 init_sched_build_groups(sched_group_allnodes, *cpu_map, 5720 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5714 &cpu_to_allnodes_group); 5721 &cpu_to_allnodes_group);
5715 5722
5716 for (i = 0; i < MAX_NUMNODES; i++) { 5723 for (i = 0; i < MAX_NUMNODES; i++) {
5717 /* Set up node groups */ 5724 /* Set up node groups */
5718 struct sched_group *sg, *prev; 5725 struct sched_group *sg, *prev;
5719 cpumask_t nodemask = node_to_cpumask(i); 5726 cpumask_t nodemask = node_to_cpumask(i);
5720 cpumask_t domainspan; 5727 cpumask_t domainspan;
5721 cpumask_t covered = CPU_MASK_NONE; 5728 cpumask_t covered = CPU_MASK_NONE;
5722 int j; 5729 int j;
5723 5730
5724 cpus_and(nodemask, nodemask, *cpu_map); 5731 cpus_and(nodemask, nodemask, *cpu_map);
5725 if (cpus_empty(nodemask)) { 5732 if (cpus_empty(nodemask)) {
5726 sched_group_nodes[i] = NULL; 5733 sched_group_nodes[i] = NULL;
5727 continue; 5734 continue;
5728 } 5735 }
5729 5736
5730 domainspan = sched_domain_node_span(i); 5737 domainspan = sched_domain_node_span(i);
5731 cpus_and(domainspan, domainspan, *cpu_map); 5738 cpus_and(domainspan, domainspan, *cpu_map);
5732 5739
5733 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 5740 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5734 sched_group_nodes[i] = sg; 5741 sched_group_nodes[i] = sg;
5735 for_each_cpu_mask(j, nodemask) { 5742 for_each_cpu_mask(j, nodemask) {
5736 struct sched_domain *sd; 5743 struct sched_domain *sd;
5737 sd = &per_cpu(node_domains, j); 5744 sd = &per_cpu(node_domains, j);
5738 sd->groups = sg; 5745 sd->groups = sg;
5739 if (sd->groups == NULL) { 5746 if (sd->groups == NULL) {
5740 /* Turn off balancing if we have no groups */ 5747 /* Turn off balancing if we have no groups */
5741 sd->flags = 0; 5748 sd->flags = 0;
5742 } 5749 }
5743 } 5750 }
5744 if (!sg) { 5751 if (!sg) {
5745 printk(KERN_WARNING 5752 printk(KERN_WARNING
5746 "Can not alloc domain group for node %d\n", i); 5753 "Can not alloc domain group for node %d\n", i);
5747 continue; 5754 continue;
5748 } 5755 }
5749 sg->cpu_power = 0; 5756 sg->cpu_power = 0;
5750 sg->cpumask = nodemask; 5757 sg->cpumask = nodemask;
5751 cpus_or(covered, covered, nodemask); 5758 cpus_or(covered, covered, nodemask);
5752 prev = sg; 5759 prev = sg;
5753 5760
5754 for (j = 0; j < MAX_NUMNODES; j++) { 5761 for (j = 0; j < MAX_NUMNODES; j++) {
5755 cpumask_t tmp, notcovered; 5762 cpumask_t tmp, notcovered;
5756 int n = (i + j) % MAX_NUMNODES; 5763 int n = (i + j) % MAX_NUMNODES;
5757 5764
5758 cpus_complement(notcovered, covered); 5765 cpus_complement(notcovered, covered);
5759 cpus_and(tmp, notcovered, *cpu_map); 5766 cpus_and(tmp, notcovered, *cpu_map);
5760 cpus_and(tmp, tmp, domainspan); 5767 cpus_and(tmp, tmp, domainspan);
5761 if (cpus_empty(tmp)) 5768 if (cpus_empty(tmp))
5762 break; 5769 break;
5763 5770
5764 nodemask = node_to_cpumask(n); 5771 nodemask = node_to_cpumask(n);
5765 cpus_and(tmp, tmp, nodemask); 5772 cpus_and(tmp, tmp, nodemask);
5766 if (cpus_empty(tmp)) 5773 if (cpus_empty(tmp))
5767 continue; 5774 continue;
5768 5775
5769 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 5776 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5770 if (!sg) { 5777 if (!sg) {
5771 printk(KERN_WARNING 5778 printk(KERN_WARNING
5772 "Can not alloc domain group for node %d\n", j); 5779 "Can not alloc domain group for node %d\n", j);
5773 break; 5780 break;
5774 } 5781 }
5775 sg->cpu_power = 0; 5782 sg->cpu_power = 0;
5776 sg->cpumask = tmp; 5783 sg->cpumask = tmp;
5777 cpus_or(covered, covered, tmp); 5784 cpus_or(covered, covered, tmp);
5778 prev->next = sg; 5785 prev->next = sg;
5779 prev = sg; 5786 prev = sg;
5780 } 5787 }
5781 prev->next = sched_group_nodes[i]; 5788 prev->next = sched_group_nodes[i];
5782 } 5789 }
5783 #endif 5790 #endif
5784 5791
5785 /* Calculate CPU power for physical packages and nodes */ 5792 /* Calculate CPU power for physical packages and nodes */
5786 for_each_cpu_mask(i, *cpu_map) { 5793 for_each_cpu_mask(i, *cpu_map) {
5787 int power; 5794 int power;
5788 struct sched_domain *sd; 5795 struct sched_domain *sd;
5789 #ifdef CONFIG_SCHED_SMT 5796 #ifdef CONFIG_SCHED_SMT
5790 sd = &per_cpu(cpu_domains, i); 5797 sd = &per_cpu(cpu_domains, i);
5791 power = SCHED_LOAD_SCALE; 5798 power = SCHED_LOAD_SCALE;
5792 sd->groups->cpu_power = power; 5799 sd->groups->cpu_power = power;
5793 #endif 5800 #endif
5794 5801
5795 sd = &per_cpu(phys_domains, i); 5802 sd = &per_cpu(phys_domains, i);
5796 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5797 (cpus_weight(sd->groups->cpumask)-1) / 10; 5804 (cpus_weight(sd->groups->cpumask)-1) / 10;
5798 sd->groups->cpu_power = power; 5805 sd->groups->cpu_power = power;
5799 5806
5800 #ifdef CONFIG_NUMA 5807 #ifdef CONFIG_NUMA
5801 sd = &per_cpu(allnodes_domains, i); 5808 sd = &per_cpu(allnodes_domains, i);
5802 if (sd->groups) { 5809 if (sd->groups) {
5803 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5810 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5804 (cpus_weight(sd->groups->cpumask)-1) / 10; 5811 (cpus_weight(sd->groups->cpumask)-1) / 10;
5805 sd->groups->cpu_power = power; 5812 sd->groups->cpu_power = power;
5806 } 5813 }
5807 #endif 5814 #endif
5808 } 5815 }
5809 5816
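As a worked example of the cpu_power formula just applied (assuming SCHED_LOAD_SCALE is 128, its usual value in this kernel): a physical group spanning four CPUs gets 128 + 128 * (4 - 1) / 10 = 166, while each SMT sibling group stays at a flat 128.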
5810 #ifdef CONFIG_NUMA 5817 #ifdef CONFIG_NUMA
5811 for (i = 0; i < MAX_NUMNODES; i++) { 5818 for (i = 0; i < MAX_NUMNODES; i++) {
5812 struct sched_group *sg = sched_group_nodes[i]; 5819 struct sched_group *sg = sched_group_nodes[i];
5813 int j; 5820 int j;
5814 5821
5815 if (sg == NULL) 5822 if (sg == NULL)
5816 continue; 5823 continue;
5817 next_sg: 5824 next_sg:
5818 for_each_cpu_mask(j, sg->cpumask) { 5825 for_each_cpu_mask(j, sg->cpumask) {
5819 struct sched_domain *sd; 5826 struct sched_domain *sd;
5820 int power; 5827 int power;
5821 5828
5822 sd = &per_cpu(phys_domains, j); 5829 sd = &per_cpu(phys_domains, j);
5823 if (j != first_cpu(sd->groups->cpumask)) { 5830 if (j != first_cpu(sd->groups->cpumask)) {
5824 /* 5831 /*
5825 * Only add "power" once for each 5832 * Only add "power" once for each
5826 * physical package. 5833 * physical package.
5827 */ 5834 */
5828 continue; 5835 continue;
5829 } 5836 }
5830 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5837 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5831 (cpus_weight(sd->groups->cpumask)-1) / 10; 5838 (cpus_weight(sd->groups->cpumask)-1) / 10;
5832 5839
5833 sg->cpu_power += power; 5840 sg->cpu_power += power;
5834 } 5841 }
5835 sg = sg->next; 5842 sg = sg->next;
5836 if (sg != sched_group_nodes[i]) 5843 if (sg != sched_group_nodes[i])
5837 goto next_sg; 5844 goto next_sg;
5838 } 5845 }
5839 #endif 5846 #endif
5840 5847
5841 /* Attach the domains */ 5848 /* Attach the domains */
5842 for_each_cpu_mask(i, *cpu_map) { 5849 for_each_cpu_mask(i, *cpu_map) {
5843 struct sched_domain *sd; 5850 struct sched_domain *sd;
5844 #ifdef CONFIG_SCHED_SMT 5851 #ifdef CONFIG_SCHED_SMT
5845 sd = &per_cpu(cpu_domains, i); 5852 sd = &per_cpu(cpu_domains, i);
5846 #else 5853 #else
5847 sd = &per_cpu(phys_domains, i); 5854 sd = &per_cpu(phys_domains, i);
5848 #endif 5855 #endif
5849 cpu_attach_domain(sd, i); 5856 cpu_attach_domain(sd, i);
5850 } 5857 }
5851 /* 5858 /*
5852 * Tune cache-hot values: 5859 * Tune cache-hot values:
5853 */ 5860 */
5854 calibrate_migration_costs(cpu_map); 5861 calibrate_migration_costs(cpu_map);
5855 } 5862 }
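
Editor's aside (not part of the patch): the cpu_power assignments above give an SMT sibling domain exactly SCHED_LOAD_SCALE, while physical-package and allnodes groups get SCHED_LOAD_SCALE plus a 10% bonus for every additional CPU in the group. A minimal standalone C sketch of that arithmetic, taking SCHED_LOAD_SCALE as 128 purely for illustration (the real constant is defined in include/linux/sched.h):

#include <stdio.h>

/* Illustrative value only; the kernel defines the real constant. */
#define SCHED_LOAD_SCALE 128UL

/* Same arithmetic as the phys/allnodes branches above:
 * base scale plus a 10% bonus for every CPU beyond the first. */
static unsigned long group_cpu_power(unsigned int nr_cpus_in_group)
{
        return SCHED_LOAD_SCALE +
               SCHED_LOAD_SCALE * (nr_cpus_in_group - 1) / 10;
}

int main(void)
{
        unsigned int n;

        for (n = 1; n <= 4; n++)
                printf("%u CPUs in group -> cpu_power %lu\n",
                       n, group_cpu_power(n));
        return 0;
}

A two-CPU package thus ends up with about 110% of the base scale rather than 200%, which reflects the idea that two HT siblings are not worth two full CPUs to the load balancer.
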
5856 /* 5863 /*
5857 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5864 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5858 */ 5865 */
5859 static void arch_init_sched_domains(const cpumask_t *cpu_map) 5866 static void arch_init_sched_domains(const cpumask_t *cpu_map)
5860 { 5867 {
5861 cpumask_t cpu_default_map; 5868 cpumask_t cpu_default_map;
5862 5869
5863 /* 5870 /*
5864  * Set up mask for cpus without special case scheduling requirements. 5871  * Set up mask for cpus without special case scheduling requirements.
5865 * For now this just excludes isolated cpus, but could be used to 5872 * For now this just excludes isolated cpus, but could be used to
5866 * exclude other special cases in the future. 5873 * exclude other special cases in the future.
5867 */ 5874 */
5868 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 5875 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5869 5876
5870 build_sched_domains(&cpu_default_map); 5877 build_sched_domains(&cpu_default_map);
5871 } 5878 }
5872 5879
5873 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5880 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5874 { 5881 {
5875 #ifdef CONFIG_NUMA 5882 #ifdef CONFIG_NUMA
5876 int i; 5883 int i;
5877 int cpu; 5884 int cpu;
5878 5885
5879 for_each_cpu_mask(cpu, *cpu_map) { 5886 for_each_cpu_mask(cpu, *cpu_map) {
5880 struct sched_group *sched_group_allnodes 5887 struct sched_group *sched_group_allnodes
5881 = sched_group_allnodes_bycpu[cpu]; 5888 = sched_group_allnodes_bycpu[cpu];
5882 struct sched_group **sched_group_nodes 5889 struct sched_group **sched_group_nodes
5883 = sched_group_nodes_bycpu[cpu]; 5890 = sched_group_nodes_bycpu[cpu];
5884 5891
5885 if (sched_group_allnodes) { 5892 if (sched_group_allnodes) {
5886 kfree(sched_group_allnodes); 5893 kfree(sched_group_allnodes);
5887 sched_group_allnodes_bycpu[cpu] = NULL; 5894 sched_group_allnodes_bycpu[cpu] = NULL;
5888 } 5895 }
5889 5896
5890 if (!sched_group_nodes) 5897 if (!sched_group_nodes)
5891 continue; 5898 continue;
5892 5899
5893 for (i = 0; i < MAX_NUMNODES; i++) { 5900 for (i = 0; i < MAX_NUMNODES; i++) {
5894 cpumask_t nodemask = node_to_cpumask(i); 5901 cpumask_t nodemask = node_to_cpumask(i);
5895 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 5902 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5896 5903
5897 cpus_and(nodemask, nodemask, *cpu_map); 5904 cpus_and(nodemask, nodemask, *cpu_map);
5898 if (cpus_empty(nodemask)) 5905 if (cpus_empty(nodemask))
5899 continue; 5906 continue;
5900 5907
5901 if (sg == NULL) 5908 if (sg == NULL)
5902 continue; 5909 continue;
5903 sg = sg->next; 5910 sg = sg->next;
5904 next_sg: 5911 next_sg:
5905 oldsg = sg; 5912 oldsg = sg;
5906 sg = sg->next; 5913 sg = sg->next;
5907 kfree(oldsg); 5914 kfree(oldsg);
5908 if (oldsg != sched_group_nodes[i]) 5915 if (oldsg != sched_group_nodes[i])
5909 goto next_sg; 5916 goto next_sg;
5910 } 5917 }
5911 kfree(sched_group_nodes); 5918 kfree(sched_group_nodes);
5912 sched_group_nodes_bycpu[cpu] = NULL; 5919 sched_group_nodes_bycpu[cpu] = NULL;
5913 } 5920 }
5914 #endif 5921 #endif
5915 } 5922 }
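
Aside: the next_sg loop above tears down a circular, singly linked list of sched_group structures, freeing every node and stopping once the node just freed was the list head. A standalone C sketch of the same traversal, using a hypothetical node type in place of struct sched_group:

#include <stdlib.h>

/* Hypothetical stand-in for the kernel's struct sched_group. */
struct node {
        struct node *next;      /* circular: the last node points back to the head */
};

/* Free every node of a circular singly linked list, head included.
 * Mirrors the next_sg loop above: start at head->next, advance before
 * freeing, and stop once the node just freed was the head itself. */
static void free_circular(struct node *head)
{
        struct node *sg, *oldsg;

        if (head == NULL)
                return;
        sg = head->next;
        do {
                oldsg = sg;
                sg = sg->next;
                free(oldsg);
        } while (oldsg != head);
}

int main(void)
{
        /* Build a three-node circular list and tear it down again. */
        struct node *a = malloc(sizeof(*a));
        struct node *b = malloc(sizeof(*b));
        struct node *c = malloc(sizeof(*c));

        a->next = b;
        b->next = c;
        c->next = a;
        free_circular(a);
        return 0;
}
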
5916 5923
5917 /* 5924 /*
5918 * Detach sched domains from a group of cpus specified in cpu_map 5925 * Detach sched domains from a group of cpus specified in cpu_map
5919 * These cpus will now be attached to the NULL domain 5926 * These cpus will now be attached to the NULL domain
5920 */ 5927 */
5921 static void detach_destroy_domains(const cpumask_t *cpu_map) 5928 static void detach_destroy_domains(const cpumask_t *cpu_map)
5922 { 5929 {
5923 int i; 5930 int i;
5924 5931
5925 for_each_cpu_mask(i, *cpu_map) 5932 for_each_cpu_mask(i, *cpu_map)
5926 cpu_attach_domain(NULL, i); 5933 cpu_attach_domain(NULL, i);
5927 synchronize_sched(); 5934 synchronize_sched();
5928 arch_destroy_sched_domains(cpu_map); 5935 arch_destroy_sched_domains(cpu_map);
5929 } 5936 }
5930 5937
5931 /* 5938 /*
5932 * Partition sched domains as specified by the cpumasks below. 5939 * Partition sched domains as specified by the cpumasks below.
5933 * This attaches all cpus from the cpumasks to the NULL domain, 5940 * This attaches all cpus from the cpumasks to the NULL domain,
5934  * waits for an RCU quiescent period, recalculates sched 5941  * waits for an RCU quiescent period, recalculates sched
5935  * domain information and then attaches them back to the 5942  * domain information and then attaches them back to the
5936  * correct sched domains. 5943  * correct sched domains.
5937  * Call with hotplug lock held. 5944  * Call with hotplug lock held.
5938 */ 5945 */
5939 void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 5946 void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5940 { 5947 {
5941 cpumask_t change_map; 5948 cpumask_t change_map;
5942 5949
5943 cpus_and(*partition1, *partition1, cpu_online_map); 5950 cpus_and(*partition1, *partition1, cpu_online_map);
5944 cpus_and(*partition2, *partition2, cpu_online_map); 5951 cpus_and(*partition2, *partition2, cpu_online_map);
5945 cpus_or(change_map, *partition1, *partition2); 5952 cpus_or(change_map, *partition1, *partition2);
5946 5953
5947 /* Detach sched domains from all of the affected cpus */ 5954 /* Detach sched domains from all of the affected cpus */
5948 detach_destroy_domains(&change_map); 5955 detach_destroy_domains(&change_map);
5949 if (!cpus_empty(*partition1)) 5956 if (!cpus_empty(*partition1))
5950 build_sched_domains(partition1); 5957 build_sched_domains(partition1);
5951 if (!cpus_empty(*partition2)) 5958 if (!cpus_empty(*partition2))
5952 build_sched_domains(partition2); 5959 build_sched_domains(partition2);
5953 } 5960 }
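
Aside: partition_sched_domains() is essentially cpumask set algebra followed by rebuilds: clip both partitions to the online map, take their union as the set of CPUs to detach, then rebuild each non-empty partition. The sketch below mirrors that arithmetic (and the cpus_andnot() isolation filter used by arch_init_sched_domains()) on a plain 64-bit mask; it is an illustration, not the kernel cpumask API:

#include <stdint.h>
#include <stdio.h>

/* Toy 64-CPU masks standing in for cpumask_t; bit i means CPU i is a member. */
static uint64_t mask_and(uint64_t a, uint64_t b)    { return a & b; }
static uint64_t mask_or(uint64_t a, uint64_t b)     { return a | b; }
static uint64_t mask_andnot(uint64_t a, uint64_t b) { return a & ~b; }

int main(void)
{
        uint64_t cpu_online_map   = 0x0F;    /* CPUs 0-3 online */
        uint64_t cpu_isolated_map = 0x08;    /* CPU 3 isolated */
        uint64_t partition1       = 0x03;    /* CPUs 0-1 */
        uint64_t partition2       = 0x3C;    /* CPUs 2-5; 4 and 5 are offline */
        uint64_t change_map, default_map;

        /* Same clipping partition_sched_domains() applies against the online map. */
        partition1 = mask_and(partition1, cpu_online_map);
        partition2 = mask_and(partition2, cpu_online_map);
        change_map = mask_or(partition1, partition2);

        /* And the isolation filter applied by arch_init_sched_domains(). */
        default_map = mask_andnot(cpu_online_map, cpu_isolated_map);

        printf("change_map  = 0x%llx\n", (unsigned long long)change_map);
        printf("default_map = 0x%llx\n", (unsigned long long)default_map);
        return 0;
}
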
5954 5961
5955 #ifdef CONFIG_HOTPLUG_CPU 5962 #ifdef CONFIG_HOTPLUG_CPU
5956 /* 5963 /*
5957 * Force a reinitialization of the sched domains hierarchy. The domains 5964 * Force a reinitialization of the sched domains hierarchy. The domains
5958 * and groups cannot be updated in place without racing with the balancing 5965 * and groups cannot be updated in place without racing with the balancing
5959 * code, so we temporarily attach all running cpus to the NULL domain 5966 * code, so we temporarily attach all running cpus to the NULL domain
5960 * which will prevent rebalancing while the sched domains are recalculated. 5967 * which will prevent rebalancing while the sched domains are recalculated.
5961 */ 5968 */
5962 static int update_sched_domains(struct notifier_block *nfb, 5969 static int update_sched_domains(struct notifier_block *nfb,
5963 unsigned long action, void *hcpu) 5970 unsigned long action, void *hcpu)
5964 { 5971 {
5965 switch (action) { 5972 switch (action) {
5966 case CPU_UP_PREPARE: 5973 case CPU_UP_PREPARE:
5967 case CPU_DOWN_PREPARE: 5974 case CPU_DOWN_PREPARE:
5968 detach_destroy_domains(&cpu_online_map); 5975 detach_destroy_domains(&cpu_online_map);
5969 return NOTIFY_OK; 5976 return NOTIFY_OK;
5970 5977
5971 case CPU_UP_CANCELED: 5978 case CPU_UP_CANCELED:
5972 case CPU_DOWN_FAILED: 5979 case CPU_DOWN_FAILED:
5973 case CPU_ONLINE: 5980 case CPU_ONLINE:
5974 case CPU_DEAD: 5981 case CPU_DEAD:
5975 /* 5982 /*
5976 * Fall through and re-initialise the domains. 5983 * Fall through and re-initialise the domains.
5977 */ 5984 */
5978 break; 5985 break;
5979 default: 5986 default:
5980 return NOTIFY_DONE; 5987 return NOTIFY_DONE;
5981 } 5988 }
5982 5989
5983 /* The hotplug lock is already held by cpu_up/cpu_down */ 5990 /* The hotplug lock is already held by cpu_up/cpu_down */
5984 arch_init_sched_domains(&cpu_online_map); 5991 arch_init_sched_domains(&cpu_online_map);
5985 5992
5986 return NOTIFY_OK; 5993 return NOTIFY_OK;
5987 } 5994 }
5988 #endif 5995 #endif
5989 5996
5990 void __init sched_init_smp(void) 5997 void __init sched_init_smp(void)
5991 { 5998 {
5992 lock_cpu_hotplug(); 5999 lock_cpu_hotplug();
5993 arch_init_sched_domains(&cpu_online_map); 6000 arch_init_sched_domains(&cpu_online_map);
5994 unlock_cpu_hotplug(); 6001 unlock_cpu_hotplug();
5995 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6002 /* XXX: Theoretical race here - CPU may be hotplugged now */
5996 hotcpu_notifier(update_sched_domains, 0); 6003 hotcpu_notifier(update_sched_domains, 0);
5997 } 6004 }
5998 #else 6005 #else
5999 void __init sched_init_smp(void) 6006 void __init sched_init_smp(void)
6000 { 6007 {
6001 } 6008 }
6002 #endif /* CONFIG_SMP */ 6009 #endif /* CONFIG_SMP */
6003 6010
6004 int in_sched_functions(unsigned long addr) 6011 int in_sched_functions(unsigned long addr)
6005 { 6012 {
6006 /* Linker adds these: start and end of __sched functions */ 6013 /* Linker adds these: start and end of __sched functions */
6007 extern char __sched_text_start[], __sched_text_end[]; 6014 extern char __sched_text_start[], __sched_text_end[];
6008 return in_lock_functions(addr) || 6015 return in_lock_functions(addr) ||
6009 (addr >= (unsigned long)__sched_text_start 6016 (addr >= (unsigned long)__sched_text_start
6010 && addr < (unsigned long)__sched_text_end); 6017 && addr < (unsigned long)__sched_text_end);
6011 } 6018 }
6012 6019
6013 void __init sched_init(void) 6020 void __init sched_init(void)
6014 { 6021 {
6015 runqueue_t *rq; 6022 runqueue_t *rq;
6016 int i, j, k; 6023 int i, j, k;
6017 6024
6018 for_each_cpu(i) { 6025 for_each_cpu(i) {
6019 prio_array_t *array; 6026 prio_array_t *array;
6020 6027
6021 rq = cpu_rq(i); 6028 rq = cpu_rq(i);
6022 spin_lock_init(&rq->lock); 6029 spin_lock_init(&rq->lock);
6023 rq->nr_running = 0; 6030 rq->nr_running = 0;
6024 rq->active = rq->arrays; 6031 rq->active = rq->arrays;
6025 rq->expired = rq->arrays + 1; 6032 rq->expired = rq->arrays + 1;
6026 rq->best_expired_prio = MAX_PRIO; 6033 rq->best_expired_prio = MAX_PRIO;
6027 6034
6028 #ifdef CONFIG_SMP 6035 #ifdef CONFIG_SMP
6029 rq->sd = NULL; 6036 rq->sd = NULL;
6030 for (j = 1; j < 3; j++) 6037 for (j = 1; j < 3; j++)
6031 rq->cpu_load[j] = 0; 6038 rq->cpu_load[j] = 0;
6032 rq->active_balance = 0; 6039 rq->active_balance = 0;
6033 rq->push_cpu = 0; 6040 rq->push_cpu = 0;
6034 rq->migration_thread = NULL; 6041 rq->migration_thread = NULL;
6035 INIT_LIST_HEAD(&rq->migration_queue); 6042 INIT_LIST_HEAD(&rq->migration_queue);
6036 rq->cpu = i; 6043 rq->cpu = i;
6037 #endif 6044 #endif
6038 atomic_set(&rq->nr_iowait, 0); 6045 atomic_set(&rq->nr_iowait, 0);
6039 6046
6040 for (j = 0; j < 2; j++) { 6047 for (j = 0; j < 2; j++) {
6041 array = rq->arrays + j; 6048 array = rq->arrays + j;
6042 for (k = 0; k < MAX_PRIO; k++) { 6049 for (k = 0; k < MAX_PRIO; k++) {
6043 INIT_LIST_HEAD(array->queue + k); 6050 INIT_LIST_HEAD(array->queue + k);
6044 __clear_bit(k, array->bitmap); 6051 __clear_bit(k, array->bitmap);
6045 } 6052 }
6046 // delimiter for bitsearch 6053 // delimiter for bitsearch
6047 __set_bit(MAX_PRIO, array->bitmap); 6054 __set_bit(MAX_PRIO, array->bitmap);
6048 } 6055 }
6049 } 6056 }
6050 6057
6051 /* 6058 /*
6052 * The boot idle thread does lazy MMU switching as well: 6059 * The boot idle thread does lazy MMU switching as well:
6053 */ 6060 */
6054 atomic_inc(&init_mm.mm_count); 6061 atomic_inc(&init_mm.mm_count);
6055 enter_lazy_tlb(&init_mm, current); 6062 enter_lazy_tlb(&init_mm, current);
6056 6063
6057 /* 6064 /*
6058 * Make us the idle thread. Technically, schedule() should not be 6065 * Make us the idle thread. Technically, schedule() should not be
6059  * called from this thread; however, somewhere below it might be, 6066  * called from this thread; however, somewhere below it might be,
6060 * but because we are the idle thread, we just pick up running again 6067 * but because we are the idle thread, we just pick up running again
6061 * when this runqueue becomes "idle". 6068 * when this runqueue becomes "idle".
6062 */ 6069 */
6063 init_idle(current, smp_processor_id()); 6070 init_idle(current, smp_processor_id());
6064 } 6071 }
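
Aside: the "delimiter for bitsearch" bit set above is a sentinel at index MAX_PRIO, so a find-first-bit scan over a runqueue's priority bitmap always terminates and an empty queue simply reports MAX_PRIO. A standalone C sketch of that trick, assuming MAX_PRIO is 140 as in the O(1) scheduler and using a naive scan where the kernel uses sched_find_first_bit():

#include <stdio.h>

#define MAX_PRIO 140                    /* assumed: MAX_RT_PRIO + 40 */
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS ((MAX_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void set_bit_(int nr, unsigned long *map)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* Naive find-first-set over 'size' bits; returns 'size' if none is set. */
static int find_first_bit_(const unsigned long *map, int size)
{
        int i;

        for (i = 0; i < size; i++)
                if (map[i / BITS_PER_LONG] & (1UL << (i % BITS_PER_LONG)))
                        return i;
        return size;
}

int main(void)
{
        unsigned long bitmap[BITMAP_LONGS] = { 0 };

        /* Delimiter for the bit search, exactly as sched_init() sets it:
         * with no runnable tasks the scan stops at MAX_PRIO instead of
         * running off the end of the bitmap. */
        set_bit_(MAX_PRIO, bitmap);
        printf("empty runqueue   -> %d\n", find_first_bit_(bitmap, MAX_PRIO + 1));

        set_bit_(120, bitmap);  /* pretend a priority-120 task was enqueued */
        printf("task at prio 120 -> %d\n", find_first_bit_(bitmap, MAX_PRIO + 1));
        return 0;
}
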
6065 6072
6066 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6073 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6067 void __might_sleep(char *file, int line) 6074 void __might_sleep(char *file, int line)
6068 { 6075 {
6069 #if defined(in_atomic) 6076 #if defined(in_atomic)
6070 static unsigned long prev_jiffy; /* ratelimiting */ 6077 static unsigned long prev_jiffy; /* ratelimiting */
6071 6078
6072 if ((in_atomic() || irqs_disabled()) && 6079 if ((in_atomic() || irqs_disabled()) &&
6073 system_state == SYSTEM_RUNNING && !oops_in_progress) { 6080 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6074 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6081 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6075 return; 6082 return;
6076 prev_jiffy = jiffies; 6083 prev_jiffy = jiffies;
6077 printk(KERN_ERR "BUG: sleeping function called from invalid" 6084 printk(KERN_ERR "BUG: sleeping function called from invalid"
6078 " context at %s:%d\n", file, line); 6085 " context at %s:%d\n", file, line);
6079 printk("in_atomic():%d, irqs_disabled():%d\n", 6086 printk("in_atomic():%d, irqs_disabled():%d\n",
6080 in_atomic(), irqs_disabled()); 6087 in_atomic(), irqs_disabled());
6081 dump_stack(); 6088 dump_stack();
6082 } 6089 }
6083 #endif 6090 #endif
6084 } 6091 }
6085 EXPORT_SYMBOL(__might_sleep); 6092 EXPORT_SYMBOL(__might_sleep);
6086 #endif 6093 #endif
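
Aside: __might_sleep() throttles its output with a single static timestamp: if the previous warning was issued less than HZ jiffies ago it returns early, otherwise it records the current time and prints. A userspace sketch of the same one-report-per-second pattern, with time() standing in for jiffies/HZ:

#include <stdio.h>
#include <time.h>

/* Print a warning at most once per second, in the style of the
 * prev_jiffy check above (time() stands in for jiffies and HZ). */
static void warn_ratelimited(const char *msg)
{
        static time_t prev;             /* 0 until the first warning */
        time_t now = time(NULL);

        if (prev && now - prev < 1)
                return;                 /* too soon since the last report */
        prev = now;
        fprintf(stderr, "BUG: %s\n", msg);
}

int main(void)
{
        int i;

        /* Only the first call in each one-second window actually prints. */
        for (i = 0; i < 5; i++)
                warn_ratelimited("sleeping function called from invalid context");
        return 0;
}
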
6087 6094
6088 #ifdef CONFIG_MAGIC_SYSRQ 6095 #ifdef CONFIG_MAGIC_SYSRQ
6089 void normalize_rt_tasks(void) 6096 void normalize_rt_tasks(void)
6090 { 6097 {
6091 struct task_struct *p; 6098 struct task_struct *p;
6092 prio_array_t *array; 6099 prio_array_t *array;
6093 unsigned long flags; 6100 unsigned long flags;
6094 runqueue_t *rq; 6101 runqueue_t *rq;
6095 6102
6096 read_lock_irq(&tasklist_lock); 6103 read_lock_irq(&tasklist_lock);
6097 for_each_process (p) { 6104 for_each_process (p) {
6098 if (!rt_task(p)) 6105 if (!rt_task(p))
6099 continue; 6106 continue;
6100 6107
6101 rq = task_rq_lock(p, &flags); 6108 rq = task_rq_lock(p, &flags);
6102 6109
6103 array = p->array; 6110 array = p->array;
6104 if (array) 6111 if (array)
6105 deactivate_task(p, task_rq(p)); 6112 deactivate_task(p, task_rq(p));
6106 __setscheduler(p, SCHED_NORMAL, 0); 6113 __setscheduler(p, SCHED_NORMAL, 0);
6107 if (array) { 6114 if (array) {
6108 __activate_task(p, task_rq(p)); 6115 __activate_task(p, task_rq(p));
6109 resched_task(rq->curr); 6116 resched_task(rq->curr);
6110 } 6117 }
6111 6118
6112 task_rq_unlock(rq, &flags); 6119 task_rq_unlock(rq, &flags);
6113 } 6120 }
6114 read_unlock_irq(&tasklist_lock); 6121 read_unlock_irq(&tasklist_lock);
6115 } 6122 }
6116 6123
6117 #endif /* CONFIG_MAGIC_SYSRQ */ 6124 #endif /* CONFIG_MAGIC_SYSRQ */
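
Aside: normalize_rt_tasks() deactivates a task before __setscheduler() and reactivates it afterwards because an enqueued task's runqueue position depends on its priority; changing the priority in place would leave it on the wrong list. A standalone C sketch of that dequeue/modify/requeue discipline on a priority-sorted list (hypothetical types, not the kernel's):

#include <stdio.h>

/* Hypothetical task entry kept on a list sorted by ascending priority. */
struct entry {
        int prio;
        struct entry *next;
};

static void insert_sorted(struct entry **head, struct entry *e)
{
        while (*head && (*head)->prio <= e->prio)
                head = &(*head)->next;
        e->next = *head;
        *head = e;
}

static void remove_entry(struct entry **head, struct entry *e)
{
        while (*head && *head != e)
                head = &(*head)->next;
        if (*head)
                *head = e->next;
}

/* Change a priority the way normalize_rt_tasks() does: dequeue first,
 * update the key, then requeue so the list stays correctly ordered. */
static void set_prio(struct entry **head, struct entry *e, int prio)
{
        remove_entry(head, e);
        e->prio = prio;
        insert_sorted(head, e);
}

int main(void)
{
        struct entry a = { .prio = 10 }, b = { .prio = 50 }, *head = NULL, *p;

        insert_sorted(&head, &a);
        insert_sorted(&head, &b);
        set_prio(&head, &b, 0);         /* "normalize" b down to priority 0 */
        for (p = head; p; p = p->next)
                printf("prio %d\n", p->prio);
        return 0;
}
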
6118 6125
6119 #ifdef CONFIG_IA64 6126 #ifdef CONFIG_IA64
6120 /* 6127 /*
6121 * These functions are only useful for the IA64 MCA handling. 6128 * These functions are only useful for the IA64 MCA handling.
6122 * 6129 *
6123 * They can only be called when the whole system has been 6130 * They can only be called when the whole system has been
6124 * stopped - every CPU needs to be quiescent, and no scheduling 6131 * stopped - every CPU needs to be quiescent, and no scheduling
6125 * activity can take place. Using them for anything else would 6132 * activity can take place. Using them for anything else would
6126 * be a serious bug, and as a result, they aren't even visible 6133 * be a serious bug, and as a result, they aren't even visible
6127 * under any other configuration. 6134 * under any other configuration.
6128 */ 6135 */
6129 6136
6130 /** 6137 /**
6131 * curr_task - return the current task for a given cpu. 6138 * curr_task - return the current task for a given cpu.
6132 * @cpu: the processor in question. 6139 * @cpu: the processor in question.
6133 * 6140 *
6134 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6141 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6135 */ 6142 */
6136 task_t *curr_task(int cpu) 6143 task_t *curr_task(int cpu)
6137 { 6144 {
6138 return cpu_curr(cpu); 6145 return cpu_curr(cpu);
6139 } 6146 }
6140 6147
6141 /** 6148 /**
6142 * set_curr_task - set the current task for a given cpu. 6149 * set_curr_task - set the current task for a given cpu.
6143 * @cpu: the processor in question. 6150 * @cpu: the processor in question.
6144 * @p: the task pointer to set. 6151 * @p: the task pointer to set.
6145 * 6152 *
6146 * Description: This function must only be used when non-maskable interrupts 6153 * Description: This function must only be used when non-maskable interrupts
6147 * are serviced on a separate stack. It allows the architecture to switch the 6154 * are serviced on a separate stack. It allows the architecture to switch the
6148 * notion of the current task on a cpu in a non-blocking manner. This function 6155 * notion of the current task on a cpu in a non-blocking manner. This function
6149  * must be called with all CPUs synchronized and interrupts disabled; the 6156  * must be called with all CPUs synchronized and interrupts disabled; the
6150  * caller must save the original value of the current task (see 6157  * caller must save the original value of the current task (see
6151 * curr_task() above) and restore that value before reenabling interrupts and 6158 * curr_task() above) and restore that value before reenabling interrupts and
6152 * re-starting the system. 6159 * re-starting the system.
6153 * 6160 *
6154 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6161 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6155 */ 6162 */
6156 void set_curr_task(int cpu, task_t *p) 6163 void set_curr_task(int cpu, task_t *p)
6157 { 6164 {
6158 cpu_curr(cpu) = p; 6165 cpu_curr(cpu) = p;
6159 } 6166 }
6160 6167
6161 #endif 6168 #endif
6162 6169