Eric Lee / smarc-ti-linux-kernel

1

/*

1

/*

2

* kernel/sched.c

2

* kernel/sched.c

3

*

3

*

4

* Kernel scheduler and related syscalls

4

* Kernel scheduler and related syscalls

5

*

5

*

6

7

*

7

*

8

* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and

8

* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and

9

* make semaphores SMP safe

9

* make semaphores SMP safe

10

* 1998-11-19 Implemented schedule_timeout() and related stuff

10

* 1998-11-19 Implemented schedule_timeout() and related stuff

11

* by Andrea Arcangeli

11

* by Andrea Arcangeli

12

* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:

12

* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:

13

* hybrid priority-list and round-robin design with

13

* hybrid priority-list and round-robin design with

14

* an array-switch method of distributing timeslices

14

* an array-switch method of distributing timeslices

15

* and per-CPU runqueues. Cleanups and useful suggestions

15

* and per-CPU runqueues. Cleanups and useful suggestions

16

* by Davide Libenzi, preemptible kernel bits by Robert Love.

16

* by Davide Libenzi, preemptible kernel bits by Robert Love.

17

* 2003-09-03 Interactivity tuning by Con Kolivas.

17

* 2003-09-03 Interactivity tuning by Con Kolivas.

18

* 2004-04-02 Scheduler domains code by Nick Piggin

18

* 2004-04-02 Scheduler domains code by Nick Piggin

19

* 2007-04-15 Work begun on replacing all interactivity tuning with a

19

* 2007-04-15 Work begun on replacing all interactivity tuning with a

20

* fair scheduling design by Con Kolivas.

20

* fair scheduling design by Con Kolivas.

21

* 2007-05-05 Load balancing (smp-nice) and other improvements

21

* 2007-05-05 Load balancing (smp-nice) and other improvements

22

* by Peter Williams

22

* by Peter Williams

23

* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith

23

* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith

24

* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri

24

* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri

25

* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,

25

* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,

26

* Thomas Gleixner, Mike Kravetz

26

* Thomas Gleixner, Mike Kravetz

27

*/

27

*/

28

29

#include <linux/mm.h>

29

#include <linux/mm.h>

30

#include <linux/module.h>

30

#include <linux/module.h>

31

#include <linux/nmi.h>

31

#include <linux/nmi.h>

32

#include <linux/init.h>

32

#include <linux/init.h>

33

#include <linux/uaccess.h>

33

#include <linux/uaccess.h>

34

#include <linux/highmem.h>

34

#include <linux/highmem.h>

35

#include <linux/smp_lock.h>

35

#include <linux/smp_lock.h>

36

#include <asm/mmu_context.h>

36

#include <asm/mmu_context.h>

37

#include <linux/interrupt.h>

37

#include <linux/interrupt.h>

38

#include <linux/capability.h>

38

#include <linux/capability.h>

39

#include <linux/completion.h>

39

#include <linux/completion.h>

40

#include <linux/kernel_stat.h>

40

#include <linux/kernel_stat.h>

41

#include <linux/debug_locks.h>

41

#include <linux/debug_locks.h>

42

#include <linux/perf_counter.h>

42

#include <linux/perf_counter.h>

43

#include <linux/security.h>

43

#include <linux/security.h>

44

#include <linux/notifier.h>

44

#include <linux/notifier.h>

45

#include <linux/profile.h>

45

#include <linux/profile.h>

46

#include <linux/freezer.h>

46

#include <linux/freezer.h>

47

#include <linux/vmalloc.h>

47

#include <linux/vmalloc.h>

48

#include <linux/blkdev.h>

48

#include <linux/blkdev.h>

49

#include <linux/delay.h>

49

#include <linux/delay.h>

50

#include <linux/pid_namespace.h>

50

#include <linux/pid_namespace.h>

51

#include <linux/smp.h>

51

#include <linux/smp.h>

52

#include <linux/threads.h>

52

#include <linux/threads.h>

53

#include <linux/timer.h>

53

#include <linux/timer.h>

54

#include <linux/rcupdate.h>

54

#include <linux/rcupdate.h>

55

#include <linux/cpu.h>

55

#include <linux/cpu.h>

56

#include <linux/cpuset.h>

56

#include <linux/cpuset.h>

57

#include <linux/percpu.h>

57

#include <linux/percpu.h>

58

#include <linux/kthread.h>

58

#include <linux/kthread.h>

59

#include <linux/proc_fs.h>

59

#include <linux/proc_fs.h>

60

#include <linux/seq_file.h>

60

#include <linux/seq_file.h>

61

#include <linux/sysctl.h>

61

#include <linux/sysctl.h>

62

#include <linux/syscalls.h>

62

#include <linux/syscalls.h>

63

#include <linux/times.h>

63

#include <linux/times.h>

64

#include <linux/tsacct_kern.h>

64

#include <linux/tsacct_kern.h>

65

#include <linux/kprobes.h>

65

#include <linux/kprobes.h>

66

#include <linux/delayacct.h>

66

#include <linux/delayacct.h>

67

#include <linux/reciprocal_div.h>

67

#include <linux/reciprocal_div.h>

68

#include <linux/unistd.h>

68

#include <linux/unistd.h>

69

#include <linux/pagemap.h>

69

#include <linux/pagemap.h>

70

#include <linux/hrtimer.h>

70

#include <linux/hrtimer.h>

71

#include <linux/tick.h>

71

#include <linux/tick.h>

72

#include <linux/debugfs.h>

72

#include <linux/debugfs.h>

73

#include <linux/ctype.h>

73

#include <linux/ctype.h>

74

#include <linux/ftrace.h>

74

#include <linux/ftrace.h>

75

76

#include <asm/tlb.h>

76

#include <asm/tlb.h>

77

#include <asm/irq_regs.h>

77

#include <asm/irq_regs.h>

78

79

#include "sched_cpupri.h"

79

#include "sched_cpupri.h"

80

81

#define CREATE_TRACE_POINTS

81

#define CREATE_TRACE_POINTS

82

#include <trace/events/sched.h>

82

#include <trace/events/sched.h>

83

84

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 *
 * TASK_NICE() applies the reverse conversion to a task's ->static_prio.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

92

93

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

101

102

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 *
 * Note: TIME is cast to unsigned long before the division, so the
 * division is done in unsigned long arithmetic.
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

106

107

/* Aliases for the load scale and its shift, under nice-0 names. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

109

110

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 *
 * The value is expressed in jiffies (100 * HZ / 1000).
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)

117

118

/*
 * single value that denotes runtime == period, ie unlimited time.
 * (All 64 bits set.)
 */
#define RUNTIME_INF	((u64)~0ULL)

122

123

#ifdef CONFIG_SMP

123

#ifdef CONFIG_SMP

124

125

/* Forward declaration; the definition is not in this part of the file. */
static void double_rq_lock(struct rq *rq1, struct rq *rq2);

126

127

/*

127

/*

128

* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)

128

* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)

129

* Since cpu_power is a 'constant', we can use a reciprocal divide.

129

* Since cpu_power is a 'constant', we can use a reciprocal divide.

130

*/

130

*/

131

static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)

131

static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)

132

{

132

{

133

return reciprocal_divide(load, sg->reciprocal_cpu_power);

133

return reciprocal_divide(load, sg->reciprocal_cpu_power);

134

}

134

}

135

136

/*

136

/*

137

* Each time a sched group cpu_power is changed,

137

* Each time a sched group cpu_power is changed,

138

* we must compute its reciprocal value

138

* we must compute its reciprocal value

139

*/

139

*/

140

static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)

140

static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)

141

{

141

{

142

sg->__cpu_power += val;

142

sg->__cpu_power += val;

143

sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);

143

sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);

144

}

144

}

145

#endif

145

#endif

146

147

static inline int rt_policy(int policy)

147

static inline int rt_policy(int policy)

148

{

148

{

149

if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))

149

if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))

150

return 1;

150

return 1;

151

return 0;

151

return 0;

152

}

152

}

153

154

static inline int task_has_rt_policy(struct task_struct *p)

154

static inline int task_has_rt_policy(struct task_struct *p)

155

{

155

{

156

return rt_policy(p->policy);

156

return rt_policy(p->policy);

157

}

157

}

158

159

/*

159

/*

160

* This is the priority-queue data structure of the RT scheduling class:

160

* This is the priority-queue data structure of the RT scheduling class:

161

*/

161

*/

162

struct rt_prio_array {

162

struct rt_prio_array {

163

DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */

163

DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */

164

struct list_head queue[MAX_RT_PRIO];

164

struct list_head queue[MAX_RT_PRIO];

165

};

165

};

166

167

struct rt_bandwidth {

167

struct rt_bandwidth {

168

/* nests inside the rq lock: */

168

/* nests inside the rq lock: */

169

spinlock_t rt_runtime_lock;

169

spinlock_t rt_runtime_lock;

170

ktime_t rt_period;

170

ktime_t rt_period;

171

u64 rt_runtime;

171

u64 rt_runtime;

172

struct hrtimer rt_period_timer;

172

struct hrtimer rt_period_timer;

173

};

173

};

174

175

/* File-scope rt_bandwidth instance; NOTE(review): its users are outside this view. */
static struct rt_bandwidth def_rt_bandwidth;

176

177

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

177

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

178

179

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)

179

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)

180

{

180

{

181

struct rt_bandwidth *rt_b =

181

struct rt_bandwidth *rt_b =

182

container_of(timer, struct rt_bandwidth, rt_period_timer);

182

container_of(timer, struct rt_bandwidth, rt_period_timer);

183

ktime_t now;

183

ktime_t now;

184

int overrun;

184

int overrun;

185

int idle = 0;

185

int idle = 0;

186

187

for (;;) {

187

for (;;) {

188

now = hrtimer_cb_get_time(timer);

188

now = hrtimer_cb_get_time(timer);

189

overrun = hrtimer_forward(timer, now, rt_b->rt_period);

189

overrun = hrtimer_forward(timer, now, rt_b->rt_period);

190

191

if (!overrun)

191

if (!overrun)

192

break;

192

break;

193

194

idle = do_sched_rt_period_timer(rt_b, overrun);

194

idle = do_sched_rt_period_timer(rt_b, overrun);

195

}

195

}

196

197

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

197

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

198

}

198

}

199

200

static

200

static

201

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)

201

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)

202

{

202

{

203

rt_b->rt_period = ns_to_ktime(period);

203

rt_b->rt_period = ns_to_ktime(period);

204

rt_b->rt_runtime = runtime;

204

rt_b->rt_runtime = runtime;

205

206

spin_lock_init(&rt_b->rt_runtime_lock);

206

spin_lock_init(&rt_b->rt_runtime_lock);

207

208

hrtimer_init(&rt_b->rt_period_timer,

208

hrtimer_init(&rt_b->rt_period_timer,

209

CLOCK_MONOTONIC, HRTIMER_MODE_REL);

209

CLOCK_MONOTONIC, HRTIMER_MODE_REL);

210

rt_b->rt_period_timer.function = sched_rt_period_timer;

210

rt_b->rt_period_timer.function = sched_rt_period_timer;

211

}

211

}

212

213

static inline int rt_bandwidth_enabled(void)

213

static inline int rt_bandwidth_enabled(void)

214

{

214

{

215

return sysctl_sched_rt_runtime >= 0;

215

return sysctl_sched_rt_runtime >= 0;

216

}

216

}

217

218

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)

218

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)

219

{

219

{

220

ktime_t now;

220

ktime_t now;

221

222

if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)

222

if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)

223

return;

223

return;

224

225

if (hrtimer_active(&rt_b->rt_period_timer))

225

if (hrtimer_active(&rt_b->rt_period_timer))

226

return;

226

return;

227

228

spin_lock(&rt_b->rt_runtime_lock);

228

spin_lock(&rt_b->rt_runtime_lock);

229

for (;;) {

229

for (;;) {

230

unsigned long delta;

230

unsigned long delta;

231

ktime_t soft, hard;

231

ktime_t soft, hard;

232

233

if (hrtimer_active(&rt_b->rt_period_timer))

233

if (hrtimer_active(&rt_b->rt_period_timer))

234

break;

234

break;

235

236

now = hrtimer_cb_get_time(&rt_b->rt_period_timer);

236

now = hrtimer_cb_get_time(&rt_b->rt_period_timer);

237

hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

237

hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

238

239

soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);

239

soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);

240

hard = hrtimer_get_expires(&rt_b->rt_period_timer);

240

hard = hrtimer_get_expires(&rt_b->rt_period_timer);

241

delta = ktime_to_ns(ktime_sub(hard, soft));

241

delta = ktime_to_ns(ktime_sub(hard, soft));

242

__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,

242

__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,

243

HRTIMER_MODE_ABS_PINNED, 0);

243

HRTIMER_MODE_ABS_PINNED, 0);

244

}

244

}

245

spin_unlock(&rt_b->rt_runtime_lock);

245

spin_unlock(&rt_b->rt_runtime_lock);

246

}

246

}

247

248

#ifdef CONFIG_RT_GROUP_SCHED

248

#ifdef CONFIG_RT_GROUP_SCHED

249

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)

249

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)

250

{

250

{

251

hrtimer_cancel(&rt_b->rt_period_timer);

251

hrtimer_cancel(&rt_b->rt_period_timer);

252

}

252

}

253

#endif

253

#endif

254

255

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

260

261

#ifdef CONFIG_GROUP_SCHED

261

#ifdef CONFIG_GROUP_SCHED

262

263

#include <linux/cgroup.h>

263

#include <linux/cgroup.h>

264

265

/* Forward declaration; struct task_group below holds pointers to it. */
struct cfs_rq;

/* List head for task groups (presumably linked via task_group::list - confirm). */
static LIST_HEAD(task_groups);

268

269

/* task group related information */

269

/* task group related information */

270

struct task_group {

270

struct task_group {

271

#ifdef CONFIG_CGROUP_SCHED

271

#ifdef CONFIG_CGROUP_SCHED

272

struct cgroup_subsys_state css;

272

struct cgroup_subsys_state css;

273

#endif

273

#endif

274

275

#ifdef CONFIG_USER_SCHED

275

#ifdef CONFIG_USER_SCHED

276

uid_t uid;

276

uid_t uid;

277

#endif

277

#endif

278

279

#ifdef CONFIG_FAIR_GROUP_SCHED

279

#ifdef CONFIG_FAIR_GROUP_SCHED

280

/* schedulable entities of this group on each cpu */

280

/* schedulable entities of this group on each cpu */

281

struct sched_entity **se;

281

struct sched_entity **se;

282

/* runqueue "owned" by this group on each cpu */

282

/* runqueue "owned" by this group on each cpu */

283

struct cfs_rq **cfs_rq;

283

struct cfs_rq **cfs_rq;

284

unsigned long shares;

284

unsigned long shares;

285

#endif

285

#endif

286

287

#ifdef CONFIG_RT_GROUP_SCHED

287

#ifdef CONFIG_RT_GROUP_SCHED

288

struct sched_rt_entity **rt_se;

288

struct sched_rt_entity **rt_se;

289

struct rt_rq **rt_rq;

289

struct rt_rq **rt_rq;

290

291

struct rt_bandwidth rt_bandwidth;

291

struct rt_bandwidth rt_bandwidth;

292

#endif

292

#endif

293

294

struct rcu_head rcu;

294

struct rcu_head rcu;

295

struct list_head list;

295

struct list_head list;

296

297

struct task_group *parent;

297

struct task_group *parent;

298

struct list_head siblings;

298

struct list_head siblings;

299

struct list_head children;

299

struct list_head children;

300

};

300

};

301

302

#ifdef CONFIG_USER_SCHED

302

#ifdef CONFIG_USER_SCHED

303

304

/* Helper function to pass uid information to create_sched_user() */

304

/* Helper function to pass uid information to create_sched_user() */

305

void set_tg_uid(struct user_struct *user)

305

void set_tg_uid(struct user_struct *user)

306

{

306

{

307

user->tg->uid = user->uid;

307

user->tg->uid = user->uid;

308

}

308

}

309

310

/*

310

/*

311

* Root task group.

311

* Root task group.

312

* Every UID task group (including init_task_group aka UID-0) will

312

* Every UID task group (including init_task_group aka UID-0) will

313

* be a child to this group.

313

* be a child to this group.

314

*/

314

*/

315

struct task_group root_task_group;

315

struct task_group root_task_group;

316

317

#ifdef CONFIG_FAIR_GROUP_SCHED

317

#ifdef CONFIG_FAIR_GROUP_SCHED

318

/* Default task group's sched entity on each cpu */

318

/* Default task group's sched entity on each cpu */

319

static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);

319

static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);

320

/* Default task group's cfs_rq on each cpu */

320

/* Default task group's cfs_rq on each cpu */

321

static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;

321

static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;

322

#endif /* CONFIG_FAIR_GROUP_SCHED */

322

#endif /* CONFIG_FAIR_GROUP_SCHED */

323

324

#ifdef CONFIG_RT_GROUP_SCHED

324

#ifdef CONFIG_RT_GROUP_SCHED

325

static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);

325

static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);

326

static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;

326

static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;

327

#endif /* CONFIG_RT_GROUP_SCHED */

327

#endif /* CONFIG_RT_GROUP_SCHED */

328

#else /* !CONFIG_USER_SCHED */

328

#else /* !CONFIG_USER_SCHED */

329

/* Without CONFIG_USER_SCHED the root group is simply init_task_group. */
#define root_task_group init_task_group

330

#endif /* CONFIG_USER_SCHED */

330

#endif /* CONFIG_USER_SCHED */

331

332

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

336

337

#ifdef CONFIG_SMP

337

#ifdef CONFIG_SMP

338

static int root_task_group_empty(void)

338

static int root_task_group_empty(void)

339

{

339

{

340

return list_empty(&root_task_group.children);

340

return list_empty(&root_task_group.children);

341

}

341

}

342

#endif

342

#endif

343

344

#ifdef CONFIG_FAIR_GROUP_SCHED

344

#ifdef CONFIG_FAIR_GROUP_SCHED

345

#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
#endif /* CONFIG_USER_SCHED */

/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on it, so the weight of an entity should not be too large,
 * and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES	2
#define MAX_SHARES	(1UL << 18)

361

362

/* Initialised to INIT_TASK_GROUP_LOAD. */
static int init_task_group_load = INIT_TASK_GROUP_LOAD;

363

#endif

363

#endif

364

365

/* Default task group.

365

/* Default task group.

366

* Every task in system belong to this group at bootup.

366

* Every task in system belong to this group at bootup.

367

*/

367

*/

368

struct task_group init_task_group;

368

struct task_group init_task_group;

369

370

/* return group to which a task belongs */

370

/* return group to which a task belongs */

371

static inline struct task_group *task_group(struct task_struct *p)

371

static inline struct task_group *task_group(struct task_struct *p)

372

{

372

{

373

struct task_group *tg;

373

struct task_group *tg;

374

375

#ifdef CONFIG_USER_SCHED

375

#ifdef CONFIG_USER_SCHED

376

rcu_read_lock();

376

rcu_read_lock();

377

tg = __task_cred(p)->user->tg;

377

tg = __task_cred(p)->user->tg;

378

rcu_read_unlock();

378

rcu_read_unlock();

379

#elif defined(CONFIG_CGROUP_SCHED)

379

#elif defined(CONFIG_CGROUP_SCHED)

380

tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),

380

tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),

381

struct task_group, css);

381

struct task_group, css);

382

#else

382

#else

383

tg = &init_task_group;

383

tg = &init_task_group;

384

#endif

384

#endif

385

return tg;

385

return tg;

386

}

386

}

387

388

/*
 * Change a task's cfs_rq and parent entity if it moves across CPUs/groups
 *
 * @p:   task to update
 * @cpu: index into the group's per-cpu arrays
 *
 * Points p->se (and, with CONFIG_RT_GROUP_SCHED, p->rt) at the runqueue
 * and parent entity that the task's group owns on @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	/*
	 * Look the group up once: every pointer set below then comes from
	 * the same task_group, and the lookup is not repeated per field.
	 */
	struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = tg->cfs_rq[cpu];
	p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = tg->rt_rq[cpu];
	p->rt.parent = tg->rt_se[cpu];
#endif
}

401

402

#else

402

#else

403

404

#ifdef CONFIG_SMP

404

#ifdef CONFIG_SMP

405

/* Stub for !CONFIG_GROUP_SCHED: always reports the root group as empty. */
static int root_task_group_empty(void)
{
	return 1;
}

409

#endif

409

#endif

410

411

/* Stub for !CONFIG_GROUP_SCHED: there is nothing to update. */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }

412

/* Stub for !CONFIG_GROUP_SCHED: tasks have no group, always returns NULL. */
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

416

417

#endif /* CONFIG_GROUP_SCHED */

417

#endif /* CONFIG_GROUP_SCHED */

418

419

/* CFS-related fields in a runqueue */

419

/* CFS-related fields in a runqueue */

420

struct cfs_rq {

420

struct cfs_rq {

421

struct load_weight load;

421

struct load_weight load;

422

unsigned long nr_running;

422

unsigned long nr_running;

423

424

u64 exec_clock;

424

u64 exec_clock;

425

u64 min_vruntime;

425

u64 min_vruntime;

426

427

struct rb_root tasks_timeline;

427

struct rb_root tasks_timeline;

428

struct rb_node *rb_leftmost;

428

struct rb_node *rb_leftmost;

429

430

struct list_head tasks;

430

struct list_head tasks;

431

struct list_head *balance_iterator;

431

struct list_head *balance_iterator;

432

433

/*

433

/*

434

* 'curr' points to currently running entity on this cfs_rq.

434

* 'curr' points to currently running entity on this cfs_rq.

435

* It is set to NULL otherwise (i.e when none are currently running).

435

* It is set to NULL otherwise (i.e when none are currently running).

436

*/

436

*/

437

struct sched_entity *curr, *next, *last;

437

struct sched_entity *curr, *next, *last;

438

439

unsigned int nr_spread_over;

439

unsigned int nr_spread_over;

440

441

#ifdef CONFIG_FAIR_GROUP_SCHED

441

#ifdef CONFIG_FAIR_GROUP_SCHED

442

struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

442

struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

443

444

/*

444

/*

445

* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in

445

* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in

446

* a hierarchy). Non-leaf lrqs hold other higher schedulable entities

446

* a hierarchy). Non-leaf lrqs hold other higher schedulable entities

447

* (like users, containers etc.)

447

* (like users, containers etc.)

448

*

448

*

449

* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This

449

* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This

450

* list is used during load balance.

450

* list is used during load balance.

451

*/

451

*/

452

struct list_head leaf_cfs_rq_list;

452

struct list_head leaf_cfs_rq_list;

453

struct task_group *tg; /* group that "owns" this runqueue */

453

struct task_group *tg; /* group that "owns" this runqueue */

454

455

#ifdef CONFIG_SMP

455

#ifdef CONFIG_SMP

456

/*

456

/*

457

* the part of load.weight contributed by tasks

457

* the part of load.weight contributed by tasks

458

*/

458

*/

459

unsigned long task_weight;

459

unsigned long task_weight;

460

461

/*

461

/*

462

* h_load = weight * f(tg)

462

* h_load = weight * f(tg)

463

*

463

*

464

* Where f(tg) is the recursive weight fraction assigned to

464

* Where f(tg) is the recursive weight fraction assigned to

465

* this group.

465

* this group.

466

*/

466

*/

467

unsigned long h_load;

467

unsigned long h_load;

468

469

/*

469

/*

470

* this cpu's part of tg->shares

470

* this cpu's part of tg->shares

471

*/

471

*/

472

unsigned long shares;

472

unsigned long shares;

473

474

/*

474

/*

475

* load.weight at the time we set shares

475

* load.weight at the time we set shares

476

*/

476

*/

477

unsigned long rq_weight;

477

unsigned long rq_weight;

478

#endif

478

#endif

479

#endif

479

#endif

480

};

480

};

481

482

/* Real-Time classes' related field in a runqueue: */

482

/* Real-Time classes' related field in a runqueue: */

483

struct rt_rq {

483

struct rt_rq {

484

struct rt_prio_array active;

484

struct rt_prio_array active;

485

unsigned long rt_nr_running;

485

unsigned long rt_nr_running;

486

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

486

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

487

struct {

487

struct {

488

int curr; /* highest queued rt task prio */

488

int curr; /* highest queued rt task prio */

489

#ifdef CONFIG_SMP

489

#ifdef CONFIG_SMP

490

int next; /* next highest */

490

int next; /* next highest */

491

#endif

491

#endif

492

} highest_prio;

492

} highest_prio;

493

#endif

493

#endif

494

#ifdef CONFIG_SMP

494

#ifdef CONFIG_SMP

495

unsigned long rt_nr_migratory;

495

unsigned long rt_nr_migratory;

496

int overloaded;

496

int overloaded;

497

struct plist_head pushable_tasks;

497

struct plist_head pushable_tasks;

498

#endif

498

#endif

499

int rt_throttled;

499

int rt_throttled;

500

u64 rt_time;

500

u64 rt_time;

501

u64 rt_runtime;

501

u64 rt_runtime;

502

/* Nests inside the rq lock: */

502

/* Nests inside the rq lock: */

503

spinlock_t rt_runtime_lock;

503

spinlock_t rt_runtime_lock;

504

505

#ifdef CONFIG_RT_GROUP_SCHED

505

#ifdef CONFIG_RT_GROUP_SCHED

506

unsigned long rt_nr_boosted;

506

unsigned long rt_nr_boosted;

507

508

struct rq *rq;

508

struct rq *rq;

509

struct list_head leaf_rt_rq_list;

509

struct list_head leaf_rt_rq_list;

510

struct task_group *tg;

510

struct task_group *tg;

511

struct sched_rt_entity *rt_se;

511

struct sched_rt_entity *rt_se;

512

#endif

512

#endif

513

};

513

};

514

515

#ifdef CONFIG_SMP

515

#ifdef CONFIG_SMP

516

517

/*

517

/*

518

* We add the notion of a root-domain which will be used to define per-domain

518

* We add the notion of a root-domain which will be used to define per-domain

519

* variables. Each exclusive cpuset essentially defines an island domain by

519

* variables. Each exclusive cpuset essentially defines an island domain by

520

* fully partitioning the member cpus from any other cpuset. Whenever a new

520

* fully partitioning the member cpus from any other cpuset. Whenever a new

521

* exclusive cpuset is created, we also create and attach a new root-domain

521

* exclusive cpuset is created, we also create and attach a new root-domain

522

* object.

522

* object.

523

*

523

*

524

*/

524

*/

525

struct root_domain {

525

struct root_domain {

526

atomic_t refcount;

526

atomic_t refcount;

527

cpumask_var_t span;

527

cpumask_var_t span;

528

cpumask_var_t online;

528

cpumask_var_t online;

529

530

/*

530

/*

531

* The "RT overload" flag: it gets set if a CPU has more than

531

* The "RT overload" flag: it gets set if a CPU has more than

532

* one runnable RT task.

532

* one runnable RT task.

533

*/

533

*/

534

cpumask_var_t rto_mask;

534

cpumask_var_t rto_mask;

535

atomic_t rto_count;

535

atomic_t rto_count;

536

#ifdef CONFIG_SMP

536

#ifdef CONFIG_SMP

537

struct cpupri cpupri;

537

struct cpupri cpupri;

538

#endif

538

#endif

539

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

539

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

540

/*

540

/*

541

* Preferred wake up cpu nominated by sched_mc balance that will be

541

* Preferred wake up cpu nominated by sched_mc balance that will be

542

* used when most cpus are idle in the system indicating overall very

542

* used when most cpus are idle in the system indicating overall very

543

* low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)

543

* low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)

544

*/

544

*/

545

unsigned int sched_mc_preferred_wakeup_cpu;

545

unsigned int sched_mc_preferred_wakeup_cpu;

546

#endif

546

#endif

547

};

547

};

548

549

/*

549

/*

550

* By default the system creates a single root-domain with all cpus as

550

* By default the system creates a single root-domain with all cpus as

551

* members (mimicking the global state we have today).

551

* members (mimicking the global state we have today).

552

*/

552

*/

553

static struct root_domain def_root_domain;

553

static struct root_domain def_root_domain;

554

555

#endif

555

#endif

556

557

/*

557

/*

558

* This is the main, per-CPU runqueue data structure.

558

* This is the main, per-CPU runqueue data structure.

559

*

559

*

560

* Locking rule: those places that want to lock multiple runqueues

560

* Locking rule: those places that want to lock multiple runqueues

561

* (such as the load balancing or the thread migration code), lock

561

* (such as the load balancing or the thread migration code), lock

562

* acquire operations must be ordered by ascending &runqueue.

562

* acquire operations must be ordered by ascending &runqueue.

563

*/

563

*/

564

struct rq {

564

struct rq {

565

/* runqueue lock: */

565

/* runqueue lock: */

566

spinlock_t lock;

566

spinlock_t lock;

567

568

/*

568

/*

569

* nr_running and cpu_load should be in the same cacheline because

569

* nr_running and cpu_load should be in the same cacheline because

570

* remote CPUs use both these fields when doing load calculation.

570

* remote CPUs use both these fields when doing load calculation.

571

*/

571

*/

572

unsigned long nr_running;

572

unsigned long nr_running;

573

#define CPU_LOAD_IDX_MAX 5

573

#define CPU_LOAD_IDX_MAX 5

574

unsigned long cpu_load[CPU_LOAD_IDX_MAX];

574

unsigned long cpu_load[CPU_LOAD_IDX_MAX];

575

#ifdef CONFIG_NO_HZ

575

#ifdef CONFIG_NO_HZ

576

unsigned long last_tick_seen;

576

unsigned long last_tick_seen;

577

unsigned char in_nohz_recently;

577

unsigned char in_nohz_recently;

578

#endif

578

#endif

579

/* capture load from *all* tasks on this cpu: */

579

/* capture load from *all* tasks on this cpu: */

580

struct load_weight load;

580

struct load_weight load;

581

unsigned long nr_load_updates;

581

unsigned long nr_load_updates;

582

u64 nr_switches;

582

u64 nr_switches;

583

u64 nr_migrations_in;

583

u64 nr_migrations_in;

584

585

struct cfs_rq cfs;

585

struct cfs_rq cfs;

586

struct rt_rq rt;

586

struct rt_rq rt;

587

588

#ifdef CONFIG_FAIR_GROUP_SCHED

588

#ifdef CONFIG_FAIR_GROUP_SCHED

589

/* list of leaf cfs_rq on this cpu: */

589

/* list of leaf cfs_rq on this cpu: */

590

struct list_head leaf_cfs_rq_list;

590

struct list_head leaf_cfs_rq_list;

591

#endif

591

#endif

592

#ifdef CONFIG_RT_GROUP_SCHED

592

#ifdef CONFIG_RT_GROUP_SCHED

593

struct list_head leaf_rt_rq_list;

593

struct list_head leaf_rt_rq_list;

594

#endif

594

#endif

595

596

/*

596

/*

597

* This is part of a global counter where only the total sum

597

* This is part of a global counter where only the total sum

598

* over all CPUs matters. A task can increase this counter on

598

* over all CPUs matters. A task can increase this counter on

599

* one CPU and if it got migrated afterwards it may decrease

599

* one CPU and if it got migrated afterwards it may decrease

600

* it on another CPU. Always updated under the runqueue lock:

600

* it on another CPU. Always updated under the runqueue lock:

601

*/

601

*/

602

unsigned long nr_uninterruptible;

602

unsigned long nr_uninterruptible;

603

604

struct task_struct *curr, *idle;

604

struct task_struct *curr, *idle;

605

unsigned long next_balance;

605

unsigned long next_balance;

606

struct mm_struct *prev_mm;

606

struct mm_struct *prev_mm;

607

608

u64 clock;

608

u64 clock;

609

610

atomic_t nr_iowait;

610

atomic_t nr_iowait;

611

612

#ifdef CONFIG_SMP

612

#ifdef CONFIG_SMP

613

struct root_domain *rd;

613

struct root_domain *rd;

614

struct sched_domain *sd;

614

struct sched_domain *sd;

615

616

unsigned char idle_at_tick;

616

unsigned char idle_at_tick;

617

/* For active balancing */

617

/* For active balancing */

618

int active_balance;

618

int active_balance;

619

int push_cpu;

619

int push_cpu;

620

/* cpu of this runqueue: */

620

/* cpu of this runqueue: */

621

int cpu;

621

int cpu;

622

int online;

622

int online;

623

624

unsigned long avg_load_per_task;

624

unsigned long avg_load_per_task;

625

626

struct task_struct *migration_thread;

626

struct task_struct *migration_thread;

627

struct list_head migration_queue;

627

struct list_head migration_queue;

628

#endif

628

#endif

629

630

/* calc_load related fields */

630

/* calc_load related fields */

631

unsigned long calc_load_update;

631

unsigned long calc_load_update;

632

long calc_load_active;

632

long calc_load_active;

633

634

#ifdef CONFIG_SCHED_HRTICK

634

#ifdef CONFIG_SCHED_HRTICK

635

#ifdef CONFIG_SMP

635

#ifdef CONFIG_SMP

636

int hrtick_csd_pending;

636

int hrtick_csd_pending;

637

struct call_single_data hrtick_csd;

637

struct call_single_data hrtick_csd;

638

#endif

638

#endif

639

struct hrtimer hrtick_timer;

639

struct hrtimer hrtick_timer;

640

#endif

640

#endif

641

642

#ifdef CONFIG_SCHEDSTATS

642

#ifdef CONFIG_SCHEDSTATS

643

/* latency stats */

643

/* latency stats */

644

struct sched_info rq_sched_info;

644

struct sched_info rq_sched_info;

645

unsigned long long rq_cpu_time;

645

unsigned long long rq_cpu_time;

646

/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

646

/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

647

648

/* sys_sched_yield() stats */

648

/* sys_sched_yield() stats */

649

unsigned int yld_count;

649

unsigned int yld_count;

650

651

/* schedule() stats */

651

/* schedule() stats */

652

unsigned int sched_switch;

652

unsigned int sched_switch;

653

unsigned int sched_count;

653

unsigned int sched_count;

654

unsigned int sched_goidle;

654

unsigned int sched_goidle;

655

656

/* try_to_wake_up() stats */

656

/* try_to_wake_up() stats */

657

unsigned int ttwu_count;

657

unsigned int ttwu_count;

658

unsigned int ttwu_local;

658

unsigned int ttwu_local;

659

660

/* BKL stats */

660

/* BKL stats */

661

unsigned int bkl_count;

661

unsigned int bkl_count;

662

#endif

662

#endif

663

};

663

};

664

665

/* One struct rq per CPU; accessed through cpu_rq() and this_rq(). */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

666

667

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)

667

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)

668

{

668

{

669

rq->curr->sched_class->check_preempt_curr(rq, p, sync);

669

rq->curr->sched_class->check_preempt_curr(rq, p, sync);

670

}

670

}

671

672

/*
 * Return the cpu number of @rq. Without CONFIG_SMP the result is
 * always 0.
 */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}

680

681

/*
 * RCU's quiescent state transition protects the domain tree (rq->sd).
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * Accessing the domain tree of any CPU is only allowed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

/* Accessors for the per-CPU runqueues and the tasks running on them. */
#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

695

696

inline void update_rq_clock(struct rq *rq)

696

inline void update_rq_clock(struct rq *rq)

697

{

697

{

698

rq->clock = sched_clock_cpu(cpu_of(rq));

698

rq->clock = sched_clock_cpu(cpu_of(rq));

699

}

699

}

700

701

/*
 * Tunables declared const_debug are real constants unless
 * CONFIG_SCHED_DEBUG is enabled, in which case they stay writable:
 */
#ifndef CONFIG_SCHED_DEBUG
# define const_debug static const
#else
# define const_debug __read_mostly
#endif

709

710

/**

710

/**

711

* runqueue_is_locked

711

* runqueue_is_locked

712

*

712

*

713

* Returns true if the current cpu runqueue is locked.

713

* Returns true if the current cpu runqueue is locked.

714

* This interface allows printk to be called with the runqueue lock

714

* This interface allows printk to be called with the runqueue lock

715

* held and know whether or not it is OK to wake up the klogd.

715

* held and know whether or not it is OK to wake up the klogd.

716

*/

716

*/

717

int runqueue_is_locked(void)

717

int runqueue_is_locked(void)

718

{

718

{

719

int cpu = get_cpu();

719

int cpu = get_cpu();

720

struct rq *rq = cpu_rq(cpu);

720

struct rq *rq = cpu_rq(cpu);

721

int ret;

721

int ret;

722

723

ret = spin_is_locked(&rq->lock);

723

ret = spin_is_locked(&rq->lock);

724

put_cpu();

724

put_cpu();

725

return ret;

725

return ret;

726

}

726

}

727

728

/*
 * Debugging: various feature bits
 *
 * "sched_features.h" is included twice with different SCHED_FEAT()
 * definitions: first to generate one __SCHED_FEAT_<name> enumerator per
 * feature, then to build the initial feature bitmask from the 'enabled'
 * value of each feature.
 */

#define SCHED_FEAT(name, enabled) __SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

749

750

#ifdef CONFIG_SCHED_DEBUG

750

#ifdef CONFIG_SCHED_DEBUG

751

#define SCHED_FEAT(name, enabled) \

751

#define SCHED_FEAT(name, enabled) \

752

#name ,

752

#name ,

753

754

static __read_mostly char *sched_feat_names[] = {

754

static __read_mostly char *sched_feat_names[] = {

755

#include "sched_features.h"

755

#include "sched_features.h"

756

NULL

756

NULL

757

};

757

};

758

759

#undef SCHED_FEAT

759

#undef SCHED_FEAT

760

761

static int sched_feat_show(struct seq_file *m, void *v)

761

static int sched_feat_show(struct seq_file *m, void *v)

762

{

762

{

763

int i;

763

int i;

764

765

for (i = 0; sched_feat_names[i]; i++) {

765

for (i = 0; sched_feat_names[i]; i++) {

766

if (!(sysctl_sched_features & (1UL << i)))

766

if (!(sysctl_sched_features & (1UL << i)))

767

seq_puts(m, "NO_");

767

seq_puts(m, "NO_");

768

seq_printf(m, "%s ", sched_feat_names[i]);

768

seq_printf(m, "%s ", sched_feat_names[i]);

769

}

769

}

770

seq_puts(m, "\n");

770

seq_puts(m, "\n");

771

772

return 0;

772

return 0;

773

}

773

}

774

775

static ssize_t

775

static ssize_t

776

sched_feat_write(struct file *filp, const char __user *ubuf,

776

sched_feat_write(struct file *filp, const char __user *ubuf,

777

size_t cnt, loff_t *ppos)

777

size_t cnt, loff_t *ppos)

778

{

778

{

779

char buf[64];

779

char buf[64];

780

char *cmp = buf;

780

char *cmp = buf;

781

int neg = 0;

781

int neg = 0;

782

int i;

782

int i;

783

784

if (cnt > 63)

784

if (cnt > 63)

785

cnt = 63;

785

cnt = 63;

786

787

if (copy_from_user(&buf, ubuf, cnt))

787

if (copy_from_user(&buf, ubuf, cnt))

788

return -EFAULT;

788

return -EFAULT;

789

790

buf[cnt] = 0;

790

buf[cnt] = 0;

791

792

if (strncmp(buf, "NO_", 3) == 0) {

792

if (strncmp(buf, "NO_", 3) == 0) {

793

neg = 1;

793

neg = 1;

794

cmp += 3;

794

cmp += 3;

795

}

795

}

796

797

for (i = 0; sched_feat_names[i]; i++) {

797

for (i = 0; sched_feat_names[i]; i++) {

798

int len = strlen(sched_feat_names[i]);

798

int len = strlen(sched_feat_names[i]);

799

800

if (strncmp(cmp, sched_feat_names[i], len) == 0) {

800

if (strncmp(cmp, sched_feat_names[i], len) == 0) {

801

if (neg)

801

if (neg)

802

sysctl_sched_features &= ~(1UL << i);

802

sysctl_sched_features &= ~(1UL << i);

803

else

803

else

804

sysctl_sched_features |= (1UL << i);

804

sysctl_sched_features |= (1UL << i);

805

break;

805

break;

806

}

806

}

807

}

807

}

808

809

if (!sched_feat_names[i])

809

if (!sched_feat_names[i])

810

return -EINVAL;

810

return -EINVAL;

811

812

filp->f_pos += cnt;

812

filp->f_pos += cnt;

813

814

return cnt;

814

return cnt;

815

}

815

}

816

817

static int sched_feat_open(struct inode *inode, struct file *filp)

817

static int sched_feat_open(struct inode *inode, struct file *filp)

818

{

818

{

819

return single_open(filp, sched_feat_show, NULL);

819

return single_open(filp, sched_feat_show, NULL);

820

}

820

}

821

822

static struct file_operations sched_feat_fops = {

822

static struct file_operations sched_feat_fops = {

823

.open = sched_feat_open,

823

.open = sched_feat_open,

824

.write = sched_feat_write,

824

.write = sched_feat_write,

825

.read = seq_read,

825

.read = seq_read,

826

.llseek = seq_lseek,

826

.llseek = seq_lseek,

827

.release = single_release,

827

.release = single_release,

828

};

828

};

829

830

static __init int sched_init_debug(void)

830

static __init int sched_init_debug(void)

831

{

831

{

832

debugfs_create_file("sched_features", 0644, NULL, NULL,

832

debugfs_create_file("sched_features", 0644, NULL, NULL,

833

&sched_feat_fops);

833

&sched_feat_fops);

834

835

return 0;

835

return 0;

836

}

836

}

837

late_initcall(sched_init_debug);

837

late_initcall(sched_init_debug);

838

839

#endif

839

#endif

840

841

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

841

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

842

843

/*

843

/*

844

* Number of tasks to iterate in a single balance run.

844

* Number of tasks to iterate in a single balance run.

845

* Limited because this is done with IRQs disabled.

845

* Limited because this is done with IRQs disabled.

846

*/

846

*/

847

const_debug unsigned int sysctl_sched_nr_migrate = 32;

847

const_debug unsigned int sysctl_sched_nr_migrate = 32;

848

849

/*

849

/*

850

* ratelimit for updating the group shares.

850

* ratelimit for updating the group shares.

851

* default: 0.25ms

851

* default: 0.25ms

852

*/

852

*/

853

unsigned int sysctl_sched_shares_ratelimit = 250000;

853

unsigned int sysctl_sched_shares_ratelimit = 250000;

854

855

/*

855

/*

856

* Inject some fuzzyness into changing the per-cpu group shares

856

* Inject some fuzzyness into changing the per-cpu group shares

857

* this avoids remote rq-locks at the expense of fairness.

857

* this avoids remote rq-locks at the expense of fairness.

858

* default: 4

858

* default: 4

859

*/

859

*/

860

unsigned int sysctl_sched_shares_thresh = 4;

860

unsigned int sysctl_sched_shares_thresh = 4;

861

862

/*

862

/*

863

* period over which we measure -rt task cpu usage in us.

863

* period over which we measure -rt task cpu usage in us.

864

* default: 1s

864

* default: 1s

865

*/

865

*/

866

unsigned int sysctl_sched_rt_period = 1000000;

866

unsigned int sysctl_sched_rt_period = 1000000;

867

868

static __read_mostly int scheduler_running;

868

static __read_mostly int scheduler_running;

869

870

/*

870

/*

871

* part of the period that we allow rt tasks to run in us.

871

* part of the period that we allow rt tasks to run in us.

872

* default: 0.95s

872

* default: 0.95s

873

*/

873

*/

874

int sysctl_sched_rt_runtime = 950000;

874

int sysctl_sched_rt_runtime = 950000;

875

876

static inline u64 global_rt_period(void)

876

static inline u64 global_rt_period(void)

877

{

877

{

878

return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;

878

return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;

879

}

879

}

880

881

static inline u64 global_rt_runtime(void)

881

static inline u64 global_rt_runtime(void)

882

{

882

{

883

if (sysctl_sched_rt_runtime < 0)

883

if (sysctl_sched_rt_runtime < 0)

884

return RUNTIME_INF;

884

return RUNTIME_INF;

885

886

return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;

886

return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;

887

}

887

}

888

889

/*
 * Empty fallbacks for the context-switch hooks; used only when no
 * definition of the respective macro is in effect at this point.
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

895

896

static inline int task_current(struct rq *rq, struct task_struct *p)

896

static inline int task_current(struct rq *rq, struct task_struct *p)

897

{

897

{

898

return rq->curr == p;

898

return rq->curr == p;

899

}

899

}

900

901

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

901

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

902

/*
 * Variant used without __ARCH_WANT_UNLOCKED_CTXSW: a task counts as
 * running exactly when it is the current task of @rq.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	int is_curr = task_current(rq, p);

	return is_curr;
}

906

907

/* Variant used without __ARCH_WANT_UNLOCKED_CTXSW: nothing to do. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

910

911

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)

911

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)

912

{

912

{

913

#ifdef CONFIG_DEBUG_SPINLOCK

913

#ifdef CONFIG_DEBUG_SPINLOCK

914

/* this is a valid case when another task releases the spinlock */

914

/* this is a valid case when another task releases the spinlock */

915

rq->lock.owner = current;

915

rq->lock.owner = current;

916

#endif

916

#endif

917

/*

917

/*

918

* If we are tracking spinlock dependencies then we have to

918

* If we are tracking spinlock dependencies then we have to

919

* fix up the runqueue lock - which gets 'carried over' from

919

* fix up the runqueue lock - which gets 'carried over' from

920

* prev into current:

920

* prev into current:

921

*/

921

*/

922

spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

922

spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

923

924

spin_unlock_irq(&rq->lock);

924

spin_unlock_irq(&rq->lock);

925

}

925

}

926

927

#else /* __ARCH_WANT_UNLOCKED_CTXSW */

927

#else /* __ARCH_WANT_UNLOCKED_CTXSW */

928

/*
 * Variant used with __ARCH_WANT_UNLOCKED_CTXSW: on SMP the answer comes
 * from p->oncpu, otherwise from comparing @p with the current task of @rq.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}

936

937

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)

937

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)

938

{

938

{

939

#ifdef CONFIG_SMP

939

#ifdef CONFIG_SMP

940

/*

940

/*

941

* We can optimise this out completely for !SMP, because the

941

* We can optimise this out completely for !SMP, because the

942

* SMP rebalancing from interrupt is the only thing that cares

942

* SMP rebalancing from interrupt is the only thing that cares

943

* here.

943

* here.

944

*/

944

*/

945

next->oncpu = 1;

945

next->oncpu = 1;

946

#endif

946

#endif

947

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

947

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

948

spin_unlock_irq(&rq->lock);

948

spin_unlock_irq(&rq->lock);

949

#else

949

#else

950

spin_unlock(&rq->lock);

950

spin_unlock(&rq->lock);

951

#endif

951

#endif

952

}

952

}

953

954

/*
 * Variant used with __ARCH_WANT_UNLOCKED_CTXSW: clear prev->oncpu (SMP
 * only) and, unless __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined, enable
 * interrupts.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * Once ->oncpu is cleared, the task can be moved to a different
	 * CPU. That must not happen before the switch is completely
	 * finished, hence the write barrier in front of the store.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}

969

#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

969

#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

970

971

/*

971

/*

972

* __task_rq_lock - lock the runqueue a given task resides on.

972

* __task_rq_lock - lock the runqueue a given task resides on.

973

* Must be called interrupts disabled.

973

* Must be called interrupts disabled.

974

*/

974

*/

975

static inline struct rq *__task_rq_lock(struct task_struct *p)

975

static inline struct rq *__task_rq_lock(struct task_struct *p)

976

__acquires(rq->lock)

976

__acquires(rq->lock)

977

{

977

{

978

for (;;) {

978

for (;;) {

979

struct rq *rq = task_rq(p);

979

struct rq *rq = task_rq(p);

980

spin_lock(&rq->lock);

980

spin_lock(&rq->lock);

981

if (likely(rq == task_rq(p)))

981

if (likely(rq == task_rq(p)))

982

return rq;

982

return rq;

983

spin_unlock(&rq->lock);

983

spin_unlock(&rq->lock);

984

}

984

}

985

}

985

}

986

987

/*

987

/*

988

* task_rq_lock - lock the runqueue a given task resides on and disable

988

* task_rq_lock - lock the runqueue a given task resides on and disable

989

* interrupts. Note the ordering: we can safely lookup the task_rq without

989

* interrupts. Note the ordering: we can safely lookup the task_rq without

990

* explicitly disabling preemption.

990

* explicitly disabling preemption.

991

*/

991

*/

992

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)

992

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)

993

__acquires(rq->lock)

993

__acquires(rq->lock)

994

{

994

{

995

struct rq *rq;

995

struct rq *rq;

996

997

for (;;) {

997

for (;;) {

998

local_irq_save(*flags);

998

local_irq_save(*flags);

999

rq = task_rq(p);

999

rq = task_rq(p);

1000

spin_lock(&rq->lock);

1000

spin_lock(&rq->lock);

1001

if (likely(rq == task_rq(p)))

1001

if (likely(rq == task_rq(p)))

1002

return rq;

1002

return rq;

1003

spin_unlock_irqrestore(&rq->lock, *flags);

1003

spin_unlock_irqrestore(&rq->lock, *flags);

1004

}

1004

}

1005

}

1005

}

1006

1007

void task_rq_unlock_wait(struct task_struct *p)

1007

void task_rq_unlock_wait(struct task_struct *p)

1008

{

1008

{

1009

struct rq *rq = task_rq(p);

1009

struct rq *rq = task_rq(p);

1010

1011

smp_mb(); /* spin-unlock-wait is not a full memory barrier */

1011

smp_mb(); /* spin-unlock-wait is not a full memory barrier */

1012

spin_unlock_wait(&rq->lock);

1012

spin_unlock_wait(&rq->lock);

1013

}

1013

}

1014

1015

static void __task_rq_unlock(struct rq *rq)

1015

static void __task_rq_unlock(struct rq *rq)

1016

__releases(rq->lock)

1016

__releases(rq->lock)

1017

{

1017

{

1018

spin_unlock(&rq->lock);

1018

spin_unlock(&rq->lock);

1019

}

1019

}

1020

1021

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)

1021

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)

1022

__releases(rq->lock)

1022

__releases(rq->lock)

1023

{

1023

{

1024

spin_unlock_irqrestore(&rq->lock, *flags);

1024

spin_unlock_irqrestore(&rq->lock, *flags);

1025

}

1025

}

1026

1027

/*

1027

/*

1028

* this_rq_lock - lock this runqueue and disable interrupts.

1028

* this_rq_lock - lock this runqueue and disable interrupts.

1029

*/

1029

*/

1030

static struct rq *this_rq_lock(void)

1030

static struct rq *this_rq_lock(void)

1031

__acquires(rq->lock)

1031

__acquires(rq->lock)

1032

{

1032

{

1033

struct rq *rq;

1033

struct rq *rq;

1034

1035

local_irq_disable();

1035

local_irq_disable();

1036

rq = this_rq();

1036

rq = this_rq();

1037

spin_lock(&rq->lock);

1037

spin_lock(&rq->lock);

1038

1039

return rq;

1039

return rq;

1040

}

1040

}

1041

1042

#ifdef CONFIG_SCHED_HRTICK

1042

#ifdef CONFIG_SCHED_HRTICK

1043

/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * It's all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

1053

1054

/*

1054

/*

1055

* Use hrtick when:

1055

* Use hrtick when:

1056

* - enabled by features

1056

* - enabled by features

1057

* - hrtimer is actually high res

1057

* - hrtimer is actually high res

1058

*/

1058

*/

1059

static inline int hrtick_enabled(struct rq *rq)

1059

static inline int hrtick_enabled(struct rq *rq)

1060

{

1060

{

1061

if (!sched_feat(HRTICK))

1061

if (!sched_feat(HRTICK))

1062

return 0;

1062

return 0;

1063

if (!cpu_active(cpu_of(rq)))

1063

if (!cpu_active(cpu_of(rq)))

1064

return 0;

1064

return 0;

1065

return hrtimer_is_hres_active(&rq->hrtick_timer);

1065

return hrtimer_is_hres_active(&rq->hrtick_timer);

1066

}

1066

}

1067

1068

static void hrtick_clear(struct rq *rq)

1068

static void hrtick_clear(struct rq *rq)

1069

{

1069

{

1070

if (hrtimer_active(&rq->hrtick_timer))

1070

if (hrtimer_active(&rq->hrtick_timer))

1071

hrtimer_cancel(&rq->hrtick_timer);

1071

hrtimer_cancel(&rq->hrtick_timer);

1072

}

1072

}

1073

1074

/*

1074

/*

1075

* High-resolution timer tick.

1075

* High-resolution timer tick.

1076

* Runs from hardirq context with interrupts disabled.

1076

* Runs from hardirq context with interrupts disabled.

1077

*/

1077

*/

1078

static enum hrtimer_restart hrtick(struct hrtimer *timer)

1078

static enum hrtimer_restart hrtick(struct hrtimer *timer)

1079

{

1079

{

1080

struct rq *rq = container_of(timer, struct rq, hrtick_timer);

1080

struct rq *rq = container_of(timer, struct rq, hrtick_timer);

1081

1082

WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

1082

WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

1083

1084

spin_lock(&rq->lock);

1084

spin_lock(&rq->lock);

1085

update_rq_clock(rq);

1085

update_rq_clock(rq);

1086

rq->curr->sched_class->task_tick(rq, rq->curr, 1);

1086

rq->curr->sched_class->task_tick(rq, rq->curr, 1);

1087

spin_unlock(&rq->lock);

1087

spin_unlock(&rq->lock);

1088

1089

return HRTIMER_NORESTART;

1089

return HRTIMER_NORESTART;

1090

}

1090

}

1091

1092

#ifdef CONFIG_SMP

1092

#ifdef CONFIG_SMP

1093

/*

1093

/*

1094

* called from hardirq (IPI) context

1094

* called from hardirq (IPI) context

1095

*/

1095

*/

1096

static void __hrtick_start(void *arg)

1096

static void __hrtick_start(void *arg)

1097

{

1097

{

1098

struct rq *rq = arg;

1098

struct rq *rq = arg;

1099

1100

spin_lock(&rq->lock);

1100

spin_lock(&rq->lock);

1101

hrtimer_restart(&rq->hrtick_timer);

1101

hrtimer_restart(&rq->hrtick_timer);

1102

rq->hrtick_csd_pending = 0;

1102

rq->hrtick_csd_pending = 0;

1103

spin_unlock(&rq->lock);

1103

spin_unlock(&rq->lock);

1104

}

1104

}

1105

1106

/*

1106

/*

1107

* Called to set the hrtick timer state.

1107

* Called to set the hrtick timer state.

1108

*

1108

*

1109

* called with rq->lock held and irqs disabled

1109

* called with rq->lock held and irqs disabled

1110

*/

1110

*/

1111

static void hrtick_start(struct rq *rq, u64 delay)

1111

static void hrtick_start(struct rq *rq, u64 delay)

1112

{

1112

{

1113

struct hrtimer *timer = &rq->hrtick_timer;

1113

struct hrtimer *timer = &rq->hrtick_timer;

1114

ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

1114

ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

1115

1116

hrtimer_set_expires(timer, time);

1116

hrtimer_set_expires(timer, time);

1117

1118

if (rq == this_rq()) {

1118

if (rq == this_rq()) {

1119

hrtimer_restart(timer);

1119

hrtimer_restart(timer);

1120

} else if (!rq->hrtick_csd_pending) {

1120

} else if (!rq->hrtick_csd_pending) {

1121

__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);

1121

__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);

1122

rq->hrtick_csd_pending = 1;

1122

rq->hrtick_csd_pending = 1;

1123

}

1123

}

1124

}

1124

}

1125

1126

static int

1126

static int

1127

hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)

1127

hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)

1128

{

1128

{

1129

int cpu = (int)(long)hcpu;

1129

int cpu = (int)(long)hcpu;

1130

1131

switch (action) {

1131

switch (action) {

1132

case CPU_UP_CANCELED:

1132

case CPU_UP_CANCELED:

1133

case CPU_UP_CANCELED_FROZEN:

1133

case CPU_UP_CANCELED_FROZEN:

1134

case CPU_DOWN_PREPARE:

1134

case CPU_DOWN_PREPARE:

1135

case CPU_DOWN_PREPARE_FROZEN:

1135

case CPU_DOWN_PREPARE_FROZEN:

1136

case CPU_DEAD:

1136

case CPU_DEAD:

1137

case CPU_DEAD_FROZEN:

1137

case CPU_DEAD_FROZEN:

1138

hrtick_clear(cpu_rq(cpu));

1138

hrtick_clear(cpu_rq(cpu));

1139

return NOTIFY_OK;

1139

return NOTIFY_OK;

1140

}

1140

}

1141

1142

return NOTIFY_DONE;

1142

return NOTIFY_DONE;

1143

}

1143

}

1144

1145

static __init void init_hrtick(void)

1145

static __init void init_hrtick(void)

1146

{

1146

{

1147

hotcpu_notifier(hotplug_hrtick, 0);

1147

hotcpu_notifier(hotplug_hrtick, 0);

1148

}

1148

}

1149

#else

1149

#else

1150

/*

1150

/*

1151

* Called to set the hrtick timer state.

1151

* Called to set the hrtick timer state.

1152

*

1152

*

1153

* called with rq->lock held and irqs disabled

1153

* called with rq->lock held and irqs disabled

1154

*/

1154

*/

1155

static void hrtick_start(struct rq *rq, u64 delay)

1155

static void hrtick_start(struct rq *rq, u64 delay)

1156

{

1156

{

1157

__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,

1157

__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,

1158

HRTIMER_MODE_REL_PINNED, 0);

1158

HRTIMER_MODE_REL_PINNED, 0);

1159

}

1159

}

1160

1161

/* !CONFIG_SMP: there is no hotplug notifier to register, so do nothing. */
static inline void init_hrtick(void)
{
}

1164

#endif /* CONFIG_SMP */

1164

#endif /* CONFIG_SMP */

1165

1166

static void init_rq_hrtick(struct rq *rq)

1166

static void init_rq_hrtick(struct rq *rq)

1167

{

1167

{

1168

#ifdef CONFIG_SMP

1168

#ifdef CONFIG_SMP

1169

rq->hrtick_csd_pending = 0;

1169

rq->hrtick_csd_pending = 0;

1170

1171

rq->hrtick_csd.flags = 0;

1171

rq->hrtick_csd.flags = 0;

1172

rq->hrtick_csd.func = __hrtick_start;

1172

rq->hrtick_csd.func = __hrtick_start;

1173

rq->hrtick_csd.info = rq;

1173

rq->hrtick_csd.info = rq;

1174

#endif

1174

#endif

1175

1176

hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

1176

hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

1177

rq->hrtick_timer.function = hrtick;

1177

rq->hrtick_timer.function = hrtick;

1178

}

1178

}

1179

#else /* CONFIG_SCHED_HRTICK */

1179

#else /* CONFIG_SCHED_HRTICK */

1180

/* !CONFIG_SCHED_HRTICK: no hrtick timer exists, so clearing is a no-op. */
static inline void hrtick_clear(struct rq *rq)
{
}

1183

1184

/* !CONFIG_SCHED_HRTICK: no per-runqueue hrtick state to initialise. */
static inline void init_rq_hrtick(struct rq *rq)
{
}

1187

1188

/* !CONFIG_SCHED_HRTICK: nothing to set up at boot. */
static inline void init_hrtick(void)
{
}

1191

#endif /* CONFIG_SCHED_HRTICK */

1191

#endif /* CONFIG_SCHED_HRTICK */

1192

1193

/*

1193

/*

1194

* resched_task - mark a task 'to be rescheduled now'.

1194

* resched_task - mark a task 'to be rescheduled now'.

1195

*

1195

*

1196

* On UP this means the setting of the need_resched flag, on SMP it

1196

* On UP this means the setting of the need_resched flag, on SMP it

1197

* might also involve a cross-CPU call to trigger the scheduler on

1197

* might also involve a cross-CPU call to trigger the scheduler on

1198

* the target CPU.

1198

* the target CPU.

1199

*/

1199

*/

1200

#ifdef CONFIG_SMP

1200

#ifdef CONFIG_SMP

1201

1202

#ifndef tsk_is_polling

1202

#ifndef tsk_is_polling

1203

#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)

1203

#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)

1204

#endif

1204

#endif

1205

1206

static void resched_task(struct task_struct *p)

1206

static void resched_task(struct task_struct *p)

1207

{

1207

{

1208

int cpu;

1208

int cpu;

1209

1210

assert_spin_locked(&task_rq(p)->lock);

1210

assert_spin_locked(&task_rq(p)->lock);

1211

1212

if (test_tsk_need_resched(p))

1212

if (test_tsk_need_resched(p))

1213

return;

1213

return;

1214

1215

set_tsk_need_resched(p);

1215

set_tsk_need_resched(p);

1216

1217

cpu = task_cpu(p);

1217

cpu = task_cpu(p);

1218

if (cpu == smp_processor_id())

1218

if (cpu == smp_processor_id())

1219

return;

1219

return;

1220

1221

/* NEED_RESCHED must be visible before we test polling */

1221

/* NEED_RESCHED must be visible before we test polling */

1222

smp_mb();

1222

smp_mb();

1223

if (!tsk_is_polling(p))

1223

if (!tsk_is_polling(p))

1224

smp_send_reschedule(cpu);

1224

smp_send_reschedule(cpu);

1225

}

1225

}

1226

1227

static void resched_cpu(int cpu)

1227

static void resched_cpu(int cpu)

1228

{

1228

{

1229

struct rq *rq = cpu_rq(cpu);

1229

struct rq *rq = cpu_rq(cpu);

1230

unsigned long flags;

1230

unsigned long flags;

1231

1232

if (!spin_trylock_irqsave(&rq->lock, flags))

1232

if (!spin_trylock_irqsave(&rq->lock, flags))

1233

return;

1233

return;

1234

resched_task(cpu_curr(cpu));

1234

resched_task(cpu_curr(cpu));

1235

spin_unlock_irqrestore(&rq->lock, flags);

1235

spin_unlock_irqrestore(&rq->lock, flags);

1236

}

1236

}

1237

1238

#ifdef CONFIG_NO_HZ

1238

#ifdef CONFIG_NO_HZ

1239

/*

1239

/*

1240

* When add_timer_on() enqueues a timer into the timer wheel of an

1240

* When add_timer_on() enqueues a timer into the timer wheel of an

1241

* idle CPU then this timer might expire before the next timer event

1241

* idle CPU then this timer might expire before the next timer event

1242

* which is scheduled to wake up that CPU. In case of a completely

1242

* which is scheduled to wake up that CPU. In case of a completely

1243

* idle system the next event might even be infinite time into the

1243

* idle system the next event might even be infinite time into the

1244

* future. wake_up_idle_cpu() ensures that the CPU is woken up and

1244

* future. wake_up_idle_cpu() ensures that the CPU is woken up and

1245

* leaves the inner idle loop so the newly added timer is taken into

1245

* leaves the inner idle loop so the newly added timer is taken into

1246

* account when the CPU goes back to idle and evaluates the timer

1246

* account when the CPU goes back to idle and evaluates the timer

1247

* wheel for the next timer event.

1247

* wheel for the next timer event.

1248

*/

1248

*/

1249

void wake_up_idle_cpu(int cpu)

1249

void wake_up_idle_cpu(int cpu)

1250

{

1250

{

1251

struct rq *rq = cpu_rq(cpu);

1251

struct rq *rq = cpu_rq(cpu);

1252

1253

if (cpu == smp_processor_id())

1253

if (cpu == smp_processor_id())

1254

return;

1254

return;

1255

1256

/*

1256

/*

1257

* This is safe, as this function is called with the timer

1257

* This is safe, as this function is called with the timer

1258

* wheel base lock of (cpu) held. When the CPU is on the way

1258

* wheel base lock of (cpu) held. When the CPU is on the way

1259

* to idle and has not yet set rq->curr to idle then it will

1259

* to idle and has not yet set rq->curr to idle then it will

1260

* be serialized on the timer wheel base lock and take the new

1260

* be serialized on the timer wheel base lock and take the new

1261

* timer into account automatically.

1261

* timer into account automatically.

1262

*/

1262

*/

1263

if (rq->curr != rq->idle)

1263

if (rq->curr != rq->idle)

1264

return;

1264

return;

1265

1266

/*

1266

/*

1267

* We can set TIF_RESCHED on the idle task of the other CPU

1267

* We can set TIF_RESCHED on the idle task of the other CPU

1268

* lockless. The worst case is that the other CPU runs the

1268

* lockless. The worst case is that the other CPU runs the

1269

* idle task through an additional NOOP schedule()

1269

* idle task through an additional NOOP schedule()

1270

*/

1270

*/

1271

set_tsk_need_resched(rq->idle);

1271

set_tsk_need_resched(rq->idle);

1272

1273

/* NEED_RESCHED must be visible before we test polling */

1273

/* NEED_RESCHED must be visible before we test polling */

1274

smp_mb();

1274

smp_mb();

1275

if (!tsk_is_polling(rq->idle))

1275

if (!tsk_is_polling(rq->idle))

1276

smp_send_reschedule(cpu);

1276

smp_send_reschedule(cpu);

1277

}

1277

}

1278

#endif /* CONFIG_NO_HZ */

1278

#endif /* CONFIG_NO_HZ */

1279

1280

#else /* !CONFIG_SMP */

1280

#else /* !CONFIG_SMP */

1281

static void resched_task(struct task_struct *p)

1281

static void resched_task(struct task_struct *p)

1282

{

1282

{

1283

assert_spin_locked(&task_rq(p)->lock);

1283

assert_spin_locked(&task_rq(p)->lock);

1284

set_tsk_need_resched(p);

1284

set_tsk_need_resched(p);

1285

}

1285

}

1286

#endif /* CONFIG_SMP */

1286

#endif /* CONFIG_SMP */

1287

1288

#if BITS_PER_LONG == 32

1288

#if BITS_PER_LONG == 32

1289

# define WMULT_CONST (~0UL)

1289

# define WMULT_CONST (~0UL)

1290

#else

1290

#else

1291

# define WMULT_CONST (1UL << 32)

1291

# define WMULT_CONST (1UL << 32)

1292

#endif

1292

#endif

1293

1294

#define WMULT_SHIFT 32

1294

#define WMULT_SHIFT 32

1295

1296

/*

1296

/*

1297

* Shift right and round:

1297

* Shift right and round:

1298

*/

1298

*/

1299

#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

1299

#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

1300

1301

/*

1301

/*

1302

* delta *= weight / lw

1302

* delta *= weight / lw

1303

*/

1303

*/

1304

static unsigned long

1304

static unsigned long

1305

calc_delta_mine(unsigned long delta_exec, unsigned long weight,

1305

calc_delta_mine(unsigned long delta_exec, unsigned long weight,

1306

struct load_weight *lw)

1306

struct load_weight *lw)

1307

{

1307

{

1308

u64 tmp;

1308

u64 tmp;

1309

1310

if (!lw->inv_weight) {

1310

if (!lw->inv_weight) {

1311

if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))

1311

if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))

1312

lw->inv_weight = 1;

1312

lw->inv_weight = 1;

1313

else

1313

else

1314

lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)

1314

lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)

1315

/ (lw->weight+1);

1315

/ (lw->weight+1);

1316

}

1316

}

1317

1318

tmp = (u64)delta_exec * weight;

1318

tmp = (u64)delta_exec * weight;

1319

/*

1319

/*

1320

* Check whether we'd overflow the 64-bit multiplication:

1320

* Check whether we'd overflow the 64-bit multiplication:

1321

*/

1321

*/

1322

if (unlikely(tmp > WMULT_CONST))

1322

if (unlikely(tmp > WMULT_CONST))

1323

tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,

1323

tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,

1324

WMULT_SHIFT/2);

1324

WMULT_SHIFT/2);

1325

else

1325

else

1326

tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

1326

tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

1327

1328

return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);

1328

return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);

1329

}

1329

}

1330

1331

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

1331

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

1332

{

1332

{

1333

lw->weight += inc;

1333

lw->weight += inc;

1334

lw->inv_weight = 0;

1334

lw->inv_weight = 0;

1335

}

1335

}

1336

1337

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

1337

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

1338

{

1338

{

1339

lw->weight -= dec;

1339

lw->weight -= dec;

1340

lw->inv_weight = 0;

1340

lw->inv_weight = 0;

1341

}

1341

}

1342

1343

/*

1343

/*

1344

* To aid in avoiding the subversion of "niceness" due to uneven distribution

1344

* To aid in avoiding the subversion of "niceness" due to uneven distribution

1345

* of tasks with abnormal "nice" values across CPUs the contribution that

1345

* of tasks with abnormal "nice" values across CPUs the contribution that

1346

* each task makes to its run queue's load is weighted according to its

1346

* each task makes to its run queue's load is weighted according to its

1347

* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a

1347

* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a

1348

* scaled version of the new time slice allocation that they receive on time

1348

* scaled version of the new time slice allocation that they receive on time

1349

* slice expiry etc.

1349

* slice expiry etc.

1350

*/

1350

*/

1351

1352

#define WEIGHT_IDLEPRIO 3

1352

#define WEIGHT_IDLEPRIO 3

1353

#define WMULT_IDLEPRIO 1431655765

1353

#define WMULT_IDLEPRIO 1431655765

1354

1355

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * going up 1 level means -10% CPU usage and going down 1 level means
 * +10% CPU usage. (To achieve that we use a multiplier of 1.25: if one
 * task goes up by ~10% and another task goes down by ~10% then the
 * relative distance between them is ~25%.)
 *
 * The table has one entry per nice level, from -20 (index 0) to 19
 * (index 39); nice 0 sits at index 20 with weight 1024.
 */
static const int prio_to_weight[40] = {
	/* nice -20 .. -16 */	88761,	71755,	56483,	46273,	36291,
	/* nice -15 .. -11 */	29154,	23254,	18705,	14949,	11916,
	/* nice -10 ..  -6 */	 9548,	 7620,	 6100,	 4904,	 3906,
	/* nice  -5 ..  -1 */	 3121,	 2501,	 1991,	 1586,	 1277,
	/* nice   0 ..   4 */	 1024,	  820,	  655,	  526,	  423,
	/* nice   5 ..   9 */	  335,	  272,	  215,	  172,	  137,
	/* nice  10 ..  14 */	  110,	   87,	   70,	   56,	   45,
	/* nice  15 ..  19 */	   36,	   29,	   23,	   18,	   15,
};

1377

1378

/*

1378

/*

1379

* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.

1379

* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.

1380

*

1380

*

1381

* In cases where the weight does not change often, we can use the

1381

* In cases where the weight does not change often, we can use the

1382

* precalculated inverse to speed up arithmetics by turning divisions

1382

* precalculated inverse to speed up arithmetics by turning divisions

1383

* into multiplications:

1383

* into multiplications:

1384

*/

1384

*/

1385

static const u32 prio_to_wmult[40] = {

1385

static const u32 prio_to_wmult[40] = {

1386

/* -20 */ 48388, 59856, 76040, 92818, 118348,

1386

/* -20 */ 48388, 59856, 76040, 92818, 118348,

1387

/* -15 */ 147320, 184698, 229616, 287308, 360437,

1387

/* -15 */ 147320, 184698, 229616, 287308, 360437,

1388

/* -10 */ 449829, 563644, 704093, 875809, 1099582,

1388

/* -10 */ 449829, 563644, 704093, 875809, 1099582,

1389

/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,

1389

/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,

1390

/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,

1390

/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,

1391

/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,

1391

/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,

1392

/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,

1392

/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,

1393

/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,

1393

/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,

1394

};

1394

};

1395

1396

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);

1396

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);

1397

1398

/*
 * Runqueue iterator: lets SMP load-balancing walk the tasks of different
 * scheduling classes without those classes having to expose their
 * internal data structures to the load-balancing code proper.
 */
struct rq_iterator {
	/* opaque cookie; presumably handed to start()/next() - see callers */
	void *arg;
	/* begin an iteration and return a task */
	struct task_struct *(*start)(void *);
	/* return the following task of the iteration */
	struct task_struct *(*next)(void *);
};

1408

1409

#ifdef CONFIG_SMP

1409

#ifdef CONFIG_SMP

1410

static unsigned long

1410

static unsigned long

1411

balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,

1411

balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,

1412

unsigned long max_load_move, struct sched_domain *sd,

1412

unsigned long max_load_move, struct sched_domain *sd,

1413

enum cpu_idle_type idle, int *all_pinned,

1413

enum cpu_idle_type idle, int *all_pinned,

1414

int *this_best_prio, struct rq_iterator *iterator);

1414

int *this_best_prio, struct rq_iterator *iterator);

1415

1416

static int

1416

static int

1417

iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,

1417

iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,

1418

struct sched_domain *sd, enum cpu_idle_type idle,

1418

struct sched_domain *sd, enum cpu_idle_type idle,

1419

struct rq_iterator *iterator);

1419

struct rq_iterator *iterator);

1420

#endif

1420

#endif

1421

1422

/*
 * Indices of the per-group cpu accounting statistics: time spent by the
 * tasks of the cpu accounting group executing in ...
 */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* ... user mode */
	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

	CPUACCT_STAT_NSTATS,	/* number of statistics; keep last */
};

1429

1430

#ifdef CONFIG_CGROUP_CPUACCT

1430

#ifdef CONFIG_CGROUP_CPUACCT

1431

static void cpuacct_charge(struct task_struct *tsk, u64 cputime);

1431

static void cpuacct_charge(struct task_struct *tsk, u64 cputime);

1432

static void cpuacct_update_stats(struct task_struct *tsk,

1432

static void cpuacct_update_stats(struct task_struct *tsk,

1433

enum cpuacct_stat_index idx, cputime_t val);

1433

enum cpuacct_stat_index idx, cputime_t val);

1434

#else

1434

#else

1435

/* !CONFIG_CGROUP_CPUACCT: charging cpu time to a group is a no-op. */
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
}

1436

/* !CONFIG_CGROUP_CPUACCT: updating group statistics is a no-op. */
static inline void cpuacct_update_stats(struct task_struct *tsk,
					enum cpuacct_stat_index idx,
					cputime_t val)
{
}

1438

#endif

1438

#endif

1439

1440

static inline void inc_cpu_load(struct rq *rq, unsigned long load)

1440

static inline void inc_cpu_load(struct rq *rq, unsigned long load)

1441

{

1441

{

1442

update_load_add(&rq->load, load);

1442

update_load_add(&rq->load, load);

1443

}

1443

}

1444

1445

static inline void dec_cpu_load(struct rq *rq, unsigned long load)

1445

static inline void dec_cpu_load(struct rq *rq, unsigned long load)

1446

{

1446

{

1447

update_load_sub(&rq->load, load);

1447

update_load_sub(&rq->load, load);

1448

}

1448

}

1449

1450

#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)

1450

#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)

1451

typedef int (*tg_visitor)(struct task_group *, void *);

1451

typedef int (*tg_visitor)(struct task_group *, void *);

1452

1453

/*

1453

/*

1454

* Iterate the full tree, calling @down when first entering a node and @up when

1454

* Iterate the full tree, calling @down when first entering a node and @up when

1455

* leaving it for the final time.

1455

* leaving it for the final time.

1456

*/

1456

*/

1457

static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)

1457

static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)

1458

{

1458

{

1459

struct task_group *parent, *child;

1459

struct task_group *parent, *child;

1460

int ret;

1460

int ret;

1461

1462

rcu_read_lock();

1462

rcu_read_lock();

1463

parent = &root_task_group;

1463

parent = &root_task_group;

1464

down:

1464

down:

1465

ret = (*down)(parent, data);

1465

ret = (*down)(parent, data);

1466

if (ret)

1466

if (ret)

1467

goto out_unlock;

1467

goto out_unlock;

1468

list_for_each_entry_rcu(child, &parent->children, siblings) {

1468

list_for_each_entry_rcu(child, &parent->children, siblings) {

1469

parent = child;

1469

parent = child;

1470

goto down;

1470

goto down;

1471

1472

up:

1472

up:

1473

continue;

1473

continue;

1474

}

1474

}

1475

ret = (*up)(parent, data);

1475

ret = (*up)(parent, data);

1476

if (ret)

1476

if (ret)

1477

goto out_unlock;

1477

goto out_unlock;

1478

1479

child = parent;

1479

child = parent;

1480

parent = parent->parent;

1480

parent = parent->parent;

1481

if (parent)

1481

if (parent)

1482

goto up;

1482

goto up;

1483

out_unlock:

1483

out_unlock:

1484

rcu_read_unlock();

1484

rcu_read_unlock();

1485

1486

return ret;

1486

return ret;

1487

}

1487

}

1488

1489

/*
 * Do-nothing tg_visitor; always returns 0. Used for the direction of a
 * walk_tg_tree() traversal in which no work is needed.
 */
static int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}

1493

#endif

1493

#endif

1494

1495

#ifdef CONFIG_SMP

1495

#ifdef CONFIG_SMP

1496

static unsigned long source_load(int cpu, int type);

1496

static unsigned long source_load(int cpu, int type);

1497

static unsigned long target_load(int cpu, int type);

1497

static unsigned long target_load(int cpu, int type);

1498

static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);

1498

static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);

1499

1500

static unsigned long cpu_avg_load_per_task(int cpu)

1500

static unsigned long cpu_avg_load_per_task(int cpu)

1501

{

1501

{

1502

struct rq *rq = cpu_rq(cpu);

1502

struct rq *rq = cpu_rq(cpu);

1503

unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

1503

unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

1504

1505

if (nr_running)

1505

if (nr_running)

1506

rq->avg_load_per_task = rq->load.weight / nr_running;

1506

rq->avg_load_per_task = rq->load.weight / nr_running;

1507

else

1507

else

1508

rq->avg_load_per_task = 0;

1508

rq->avg_load_per_task = 0;

1509

1510

return rq->avg_load_per_task;

1510

return rq->avg_load_per_task;

1511

}

1511

}

1512

1513

#ifdef CONFIG_FAIR_GROUP_SCHED

1513

#ifdef CONFIG_FAIR_GROUP_SCHED

1514

1515

static void __set_se_shares(struct sched_entity *se, unsigned long shares);

1515

static void __set_se_shares(struct sched_entity *se, unsigned long shares);

1516

1517

/*

1517

/*

1518

* Calculate and set the cpu's group shares.

1518

* Calculate and set the cpu's group shares.

1519

*/

1519

*/

1520

static void

1520

static void

1521

update_group_shares_cpu(struct task_group *tg, int cpu,

1521

update_group_shares_cpu(struct task_group *tg, int cpu,

1522

unsigned long sd_shares, unsigned long sd_rq_weight)

1522

unsigned long sd_shares, unsigned long sd_rq_weight)

1523

{

1523

{

1524

unsigned long shares;

1524

unsigned long shares;

1525

unsigned long rq_weight;

1525

unsigned long rq_weight;

1526

1527

if (!tg->se[cpu])

1527

if (!tg->se[cpu])

1528

return;

1528

return;

1529

1530

rq_weight = tg->cfs_rq[cpu]->rq_weight;

1530

rq_weight = tg->cfs_rq[cpu]->rq_weight;

1531

1532

/*

1532

/*

1533

* \Sum shares * rq_weight

1533

* \Sum shares * rq_weight

1534

* shares = -----------------------

1534

* shares = -----------------------

1535

* \Sum rq_weight

1535

* \Sum rq_weight

1536

*

1536

*

1537

*/

1537

*/

1538

shares = (sd_shares * rq_weight) / sd_rq_weight;

1538

shares = (sd_shares * rq_weight) / sd_rq_weight;

1539

shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);

1539

shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);

1540

1541

if (abs(shares - tg->se[cpu]->load.weight) >

1541

if (abs(shares - tg->se[cpu]->load.weight) >

1542

sysctl_sched_shares_thresh) {

1542

sysctl_sched_shares_thresh) {

1543

struct rq *rq = cpu_rq(cpu);

1543

struct rq *rq = cpu_rq(cpu);

1544

unsigned long flags;

1544

unsigned long flags;

1545

1546

spin_lock_irqsave(&rq->lock, flags);

1546

spin_lock_irqsave(&rq->lock, flags);

1547

tg->cfs_rq[cpu]->shares = shares;

1547

tg->cfs_rq[cpu]->shares = shares;

1548

1549

__set_se_shares(tg->se[cpu], shares);

1549

__set_se_shares(tg->se[cpu], shares);

1550

spin_unlock_irqrestore(&rq->lock, flags);

1550

spin_unlock_irqrestore(&rq->lock, flags);

1551

}

1551

}

1552

}

1552

}

1553

1554

/*

1554

/*

1555

* Re-compute the task group their per cpu shares over the given domain.

1555

* Re-compute the task group their per cpu shares over the given domain.

1556

* This needs to be done in a bottom-up fashion because the rq weight of a

1556

* This needs to be done in a bottom-up fashion because the rq weight of a

1557

* parent group depends on the shares of its child groups.

1557

* parent group depends on the shares of its child groups.

1558

*/

1558

*/

1559

static int tg_shares_up(struct task_group *tg, void *data)

1559

static int tg_shares_up(struct task_group *tg, void *data)

1560

{

1560

{

1561

unsigned long weight, rq_weight = 0;

1561

unsigned long weight, rq_weight = 0;

1562

unsigned long shares = 0;

1562

unsigned long shares = 0;

1563

struct sched_domain *sd = data;

1563

struct sched_domain *sd = data;

1564

int i;

1564

int i;

1565

1566

for_each_cpu(i, sched_domain_span(sd)) {

1566

for_each_cpu(i, sched_domain_span(sd)) {

1567

/*

1567

/*

1568

* If there are currently no tasks on the cpu pretend there

1568

* If there are currently no tasks on the cpu pretend there

1569

* is one of average load so that when a new task gets to

1569

* is one of average load so that when a new task gets to

1570

* run here it will not get delayed by group starvation.

1570

* run here it will not get delayed by group starvation.

1571

*/

1571

*/

1572

weight = tg->cfs_rq[i]->load.weight;

1572

weight = tg->cfs_rq[i]->load.weight;

1573

if (!weight)

1573

if (!weight)

1574

weight = NICE_0_LOAD;

1574

weight = NICE_0_LOAD;

1575

1576

tg->cfs_rq[i]->rq_weight = weight;

1576

tg->cfs_rq[i]->rq_weight = weight;

1577

rq_weight += weight;

1577

rq_weight += weight;

1578

shares += tg->cfs_rq[i]->shares;

1578

shares += tg->cfs_rq[i]->shares;

1579

}

1579

}

1580

1581

if ((!shares && rq_weight) || shares > tg->shares)

1581

if ((!shares && rq_weight) || shares > tg->shares)

1582

shares = tg->shares;

1582

shares = tg->shares;

1583

1584

if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))

1584

if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))

1585

shares = tg->shares;

1585

shares = tg->shares;

1586

1587

for_each_cpu(i, sched_domain_span(sd))

1587

for_each_cpu(i, sched_domain_span(sd))

1588

update_group_shares_cpu(tg, i, shares, rq_weight);

1588

update_group_shares_cpu(tg, i, shares, rq_weight);

1589

1590

return 0;

1590

return 0;

1591

}

1591

}

1592

1593

/*

1593

/*

1594

* Compute the cpu's hierarchical load factor for each task group.

1594

* Compute the cpu's hierarchical load factor for each task group.

1595

* This needs to be done in a top-down fashion because the load of a child

1595

* This needs to be done in a top-down fashion because the load of a child

1596

* group is a fraction of its parents load.

1596

* group is a fraction of its parents load.

1597

*/

1597

*/

1598

static int tg_load_down(struct task_group *tg, void *data)

1598

static int tg_load_down(struct task_group *tg, void *data)

1599

{

1599

{

1600

unsigned long load;

1600

unsigned long load;

1601

long cpu = (long)data;

1601

long cpu = (long)data;

1602

1603

if (!tg->parent) {

1603

if (!tg->parent) {

1604

load = cpu_rq(cpu)->load.weight;

1604

load = cpu_rq(cpu)->load.weight;

1605

} else {

1605

} else {

1606

load = tg->parent->cfs_rq[cpu]->h_load;

1606

load = tg->parent->cfs_rq[cpu]->h_load;

1607

load *= tg->cfs_rq[cpu]->shares;

1607

load *= tg->cfs_rq[cpu]->shares;

1608

load /= tg->parent->cfs_rq[cpu]->load.weight + 1;

1608

load /= tg->parent->cfs_rq[cpu]->load.weight + 1;

1609

}

1609

}

1610

1611

tg->cfs_rq[cpu]->h_load = load;

1611

tg->cfs_rq[cpu]->h_load = load;

1612

1613

return 0;

1613

return 0;

1614

}

1614

}

1615

1616

static void update_shares(struct sched_domain *sd)

1616

static void update_shares(struct sched_domain *sd)

1617

{

1617

{

1618

u64 now = cpu_clock(raw_smp_processor_id());

1618

u64 now = cpu_clock(raw_smp_processor_id());

1619

s64 elapsed = now - sd->last_update;

1619

s64 elapsed = now - sd->last_update;

1620

1621

if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {

1621

if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {

1622

sd->last_update = now;

1622

sd->last_update = now;

1623

walk_tg_tree(tg_nop, tg_shares_up, sd);

1623

walk_tg_tree(tg_nop, tg_shares_up, sd);

1624

}

1624

}

1625

}

1625

}

1626

1627

static void update_shares_locked(struct rq *rq, struct sched_domain *sd)

1627

static void update_shares_locked(struct rq *rq, struct sched_domain *sd)

1628

{

1628

{

1629

spin_unlock(&rq->lock);

1629

spin_unlock(&rq->lock);

1630

update_shares(sd);

1630

update_shares(sd);

1631

spin_lock(&rq->lock);

1631

spin_lock(&rq->lock);

1632

}

1632

}

1633

1634

static void update_h_load(long cpu)

1634

static void update_h_load(long cpu)

1635

{

1635

{

1636

walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);

1636

walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);

1637

}

1637

}

1638

1639

#else

1639

#else

1640

1641

/* !CONFIG_FAIR_GROUP_SCHED: there are no group shares to update. */
static inline void update_shares(struct sched_domain *sd)
{
}

1644

1645

/* !CONFIG_FAIR_GROUP_SCHED: no group shares to update; rq->lock is untouched. */
static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
}

1648

1649

#endif

1649

#endif

1650

1651

#ifdef CONFIG_PREEMPT

1651

#ifdef CONFIG_PREEMPT

1652

1653

/*

1653

/*

1654

* fair double_lock_balance: Safely acquires both rq->locks in a fair

1654

* fair double_lock_balance: Safely acquires both rq->locks in a fair

1655

* way at the expense of forcing extra atomic operations in all

1655

* way at the expense of forcing extra atomic operations in all

1656

* invocations. This assures that the double_lock is acquired using the

1656

* invocations. This assures that the double_lock is acquired using the

1657

* same underlying policy as the spinlock_t on this architecture, which

1657

* same underlying policy as the spinlock_t on this architecture, which

1658

* reduces latency compared to the unfair variant below. However, it

1658

* reduces latency compared to the unfair variant below. However, it

1659

* also adds more overhead and therefore may reduce throughput.

1659

* also adds more overhead and therefore may reduce throughput.

1660

*/

1660

*/

1661

static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1661

static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1662

__releases(this_rq->lock)

1662

__releases(this_rq->lock)

1663

__acquires(busiest->lock)

1663

__acquires(busiest->lock)

1664

__acquires(this_rq->lock)

1664

__acquires(this_rq->lock)

1665

{

1665

{

1666

spin_unlock(&this_rq->lock);

1666

spin_unlock(&this_rq->lock);

1667

double_rq_lock(this_rq, busiest);

1667

double_rq_lock(this_rq, busiest);

1668

1669

return 1;

1669

return 1;

1670

}

1670

}

1671

1672

#else

1672

#else

1673

/*

1673

/*

1674

* Unfair double_lock_balance: Optimizes throughput at the expense of

1674

* Unfair double_lock_balance: Optimizes throughput at the expense of

1675

* latency by eliminating extra atomic operations when the locks are

1675

* latency by eliminating extra atomic operations when the locks are

1676

* already in proper order on entry. This favors lower cpu-ids and will

1676

* already in proper order on entry. This favors lower cpu-ids and will

1677

* grant the double lock to lower cpus over higher ids under contention,

1677

* grant the double lock to lower cpus over higher ids under contention,

1678

* regardless of entry order into the function.

1678

* regardless of entry order into the function.

1679

*/

1679

*/

1680

static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1680

static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1681

__releases(this_rq->lock)

1681

__releases(this_rq->lock)

1682

__acquires(busiest->lock)

1682

__acquires(busiest->lock)

1683

__acquires(this_rq->lock)

1683

__acquires(this_rq->lock)

1684

{

1684

{

1685

int ret = 0;

1685

int ret = 0;

1686

1687

if (unlikely(!spin_trylock(&busiest->lock))) {

1687

if (unlikely(!spin_trylock(&busiest->lock))) {

1688

if (busiest < this_rq) {

1688

if (busiest < this_rq) {

1689

spin_unlock(&this_rq->lock);

1689

spin_unlock(&this_rq->lock);

1690

spin_lock(&busiest->lock);

1690

spin_lock(&busiest->lock);

1691

spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);

1691

spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);

1692

ret = 1;

1692

ret = 1;

1693

} else

1693

} else

1694

spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);

1694

spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);

1695

}

1695

}

1696

return ret;

1696

return ret;

1697

}

1697

}

1698

1699

#endif /* CONFIG_PREEMPT */

1699

#endif /* CONFIG_PREEMPT */

1700

1701

/*

1701

/*

1702

* double_lock_balance - lock the busiest runqueue, this_rq is locked already.

1702

* double_lock_balance - lock the busiest runqueue, this_rq is locked already.

1703

*/

1703

*/

1704

static int double_lock_balance(struct rq *this_rq, struct rq *busiest)

1704

static int double_lock_balance(struct rq *this_rq, struct rq *busiest)

1705

{

1705

{

1706

if (unlikely(!irqs_disabled())) {

1706

if (unlikely(!irqs_disabled())) {

1707

/* printk() doesn't work good under rq->lock */

1707

/* printk() doesn't work good under rq->lock */

1708

spin_unlock(&this_rq->lock);

1708

spin_unlock(&this_rq->lock);

1709

BUG_ON(1);

1709

BUG_ON(1);

1710

}

1710

}

1711

1712

return _double_lock_balance(this_rq, busiest);

1712

return _double_lock_balance(this_rq, busiest);

1713

}

1713

}

1714

1715

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)

1715

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)

1716

__releases(busiest->lock)

1716

__releases(busiest->lock)

1717

{

1717

{

1718

spin_unlock(&busiest->lock);

1718

spin_unlock(&busiest->lock);

1719

lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);

1719

lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);

1720

}

1720

}

1721

#endif

1721

#endif

1722

1723

#ifdef CONFIG_FAIR_GROUP_SCHED

1723

#ifdef CONFIG_FAIR_GROUP_SCHED

1724

/*
 * Record @shares in @cfs_rq. The store is compiled in only with
 * CONFIG_SMP; without it the function does nothing.
 */
static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
{
#ifdef CONFIG_SMP
	cfs_rq->shares = shares;
#endif
}

1730

#endif

1730

#endif

1731

1732

static void calc_load_account_active(struct rq *this_rq);

1732

static void calc_load_account_active(struct rq *this_rq);

1733

1734

#include "sched_stats.h"

1734

#include "sched_stats.h"

1735

#include "sched_idletask.c"

1735

#include "sched_idletask.c"

1736

#include "sched_fair.c"

1736

#include "sched_fair.c"

1737

#include "sched_rt.c"

1737

#include "sched_rt.c"

1738

#ifdef CONFIG_SCHED_DEBUG

1738

#ifdef CONFIG_SCHED_DEBUG

1739

# include "sched_debug.c"

1739

# include "sched_debug.c"

1740

#endif

1740

#endif

1741

1742

#define sched_class_highest (&rt_sched_class)

1742

#define sched_class_highest (&rt_sched_class)

1743

#define for_each_class(class) \

1743

#define for_each_class(class) \

1744

for (class = sched_class_highest; class; class = class->next)

1744

for (class = sched_class_highest; class; class = class->next)

1745

1746

static void inc_nr_running(struct rq *rq)

1746

static void inc_nr_running(struct rq *rq)

1747

{

1747

{

1748

rq->nr_running++;

1748

rq->nr_running++;

1749

}

1749

}

1750

1751

static void dec_nr_running(struct rq *rq)

1751

static void dec_nr_running(struct rq *rq)

1752

{

1752

{

1753

rq->nr_running--;

1753

rq->nr_running--;

1754

}

1754

}

1755

1756

static void set_load_weight(struct task_struct *p)

1756

static void set_load_weight(struct task_struct *p)

1757

{

1757

{

1758

if (task_has_rt_policy(p)) {

1758

if (task_has_rt_policy(p)) {

1759

p->se.load.weight = prio_to_weight[0] * 2;

1759

p->se.load.weight = prio_to_weight[0] * 2;

1760

p->se.load.inv_weight = prio_to_wmult[0] >> 1;

1760

p->se.load.inv_weight = prio_to_wmult[0] >> 1;

1761

return;

1761

return;

1762

}

1762

}

1763

1764

/*

1764

/*

1765

* SCHED_IDLE tasks get minimal weight:

1765

* SCHED_IDLE tasks get minimal weight:

1766

*/

1766

*/

1767

if (p->policy == SCHED_IDLE) {

1767

if (p->policy == SCHED_IDLE) {

1768

p->se.load.weight = WEIGHT_IDLEPRIO;

1768

p->se.load.weight = WEIGHT_IDLEPRIO;

1769

p->se.load.inv_weight = WMULT_IDLEPRIO;

1769

p->se.load.inv_weight = WMULT_IDLEPRIO;

1770

return;

1770

return;

1771

}

1771

}

1772

1773

p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];

1773

p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];

1774

p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];

1774

p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];

1775

}

1775

}

1776

1777

/*
 * Move the running average *@avg one eighth of the way towards @sample.
 * The difference is taken as a signed value so the average can also move
 * downwards; this relies on >> of a negative s64 being an arithmetic shift.
 */
static void update_avg(u64 *avg, u64 sample)
{
	s64 delta = sample - *avg;

	*avg = *avg + (delta >> 3);
}

1782

1783

static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)

1783

static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)

1784

{

1784

{

1785

if (wakeup)

1785

if (wakeup)

1786

p->se.start_runtime = p->se.sum_exec_runtime;

1786

p->se.start_runtime = p->se.sum_exec_runtime;

1787

1788

sched_info_queued(p);

1788

sched_info_queued(p);

1789

p->sched_class->enqueue_task(rq, p, wakeup);

1789

p->sched_class->enqueue_task(rq, p, wakeup);

1790

p->se.on_rq = 1;

1790

p->se.on_rq = 1;

1791

}

1791

}

1792

1793

static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)

1793

static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)

1794

{

1794

{

1795

if (sleep) {

1795

if (sleep) {

1796

if (p->se.last_wakeup) {

1796

if (p->se.last_wakeup) {

1797

update_avg(&p->se.avg_overlap,

1797

update_avg(&p->se.avg_overlap,

1798

p->se.sum_exec_runtime - p->se.last_wakeup);

1798

p->se.sum_exec_runtime - p->se.last_wakeup);

1799

p->se.last_wakeup = 0;

1799

p->se.last_wakeup = 0;

1800

} else {

1800

} else {

1801

update_avg(&p->se.avg_wakeup,

1801

update_avg(&p->se.avg_wakeup,

1802

sysctl_sched_wakeup_granularity);

1802

sysctl_sched_wakeup_granularity);

1803

}

1803

}

1804

}

1804

}

1805

1806

sched_info_dequeued(p);

1806

sched_info_dequeued(p);

1807

p->sched_class->dequeue_task(rq, p, sleep);

1807

p->sched_class->dequeue_task(rq, p, sleep);

1808

p->se.on_rq = 0;

1808

p->se.on_rq = 0;

1809

}

1809

}

1810

1811

/*

1811

/*

1812

* __normal_prio - return the priority that is based on the static prio

1812

* __normal_prio - return the priority that is based on the static prio

1813

*/

1813

*/

1814

static inline int __normal_prio(struct task_struct *p)

1814

static inline int __normal_prio(struct task_struct *p)

1815

{

1815

{

1816

return p->static_prio;

1816

return p->static_prio;

1817

}

1817

}

1818

1819

/*

1819

/*

1820

* Calculate the expected normal priority: i.e. priority

1820

* Calculate the expected normal priority: i.e. priority

1821

* without taking RT-inheritance into account. Might be

1821

* without taking RT-inheritance into account. Might be

1822

* boosted by interactivity modifiers. Changes upon fork,

1822

* boosted by interactivity modifiers. Changes upon fork,

1823

* setprio syscalls, and whenever the interactivity

1823

* setprio syscalls, and whenever the interactivity

1824

* estimator recalculates.

1824

* estimator recalculates.

1825

*/

1825

*/

1826

static inline int normal_prio(struct task_struct *p)

1826

static inline int normal_prio(struct task_struct *p)

1827

{

1827

{

1828

int prio;

1828

int prio;

1829

1830

if (task_has_rt_policy(p))

1830

if (task_has_rt_policy(p))

1831

prio = MAX_RT_PRIO-1 - p->rt_priority;

1831

prio = MAX_RT_PRIO-1 - p->rt_priority;

1832

else

1832

else

1833

prio = __normal_prio(p);

1833

prio = __normal_prio(p);

1834

return prio;

1834

return prio;

1835

}

1835

}

1836

1837

/*

1837

/*

1838

* Calculate the current priority, i.e. the priority

1838

* Calculate the current priority, i.e. the priority

1839

* taken into account by the scheduler. This value might

1839

* taken into account by the scheduler. This value might

1840

* be boosted by RT tasks, or might be boosted by

1840

* be boosted by RT tasks, or might be boosted by

1841

* interactivity modifiers. Will be RT if the task got

1841

* interactivity modifiers. Will be RT if the task got

1842

* RT-boosted. If not then it returns p->normal_prio.

1842

* RT-boosted. If not then it returns p->normal_prio.

1843

*/

1843

*/

1844

static int effective_prio(struct task_struct *p)

1844

static int effective_prio(struct task_struct *p)

1845

{

1845

{

1846

p->normal_prio = normal_prio(p);

1846

p->normal_prio = normal_prio(p);

1847

/*

1847

/*

1848

* If we are RT tasks or we were boosted to RT priority,

1848

* If we are RT tasks or we were boosted to RT priority,

1849

* keep the priority unchanged. Otherwise, update priority

1849

* keep the priority unchanged. Otherwise, update priority

1850

* to the normal priority:

1850

* to the normal priority:

1851

*/

1851

*/

1852

if (!rt_prio(p->prio))

1852

if (!rt_prio(p->prio))

1853

return p->normal_prio;

1853

return p->normal_prio;

1854

return p->prio;

1854

return p->prio;

1855

}

1855

}

1856

1857

/*

1857

/*

1858

* activate_task - move a task to the runqueue.

1858

* activate_task - move a task to the runqueue.

1859

*/

1859

*/

1860

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)

1860

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)

1861

{

1861

{

1862

if (task_contributes_to_load(p))

1862

if (task_contributes_to_load(p))

1863

rq->nr_uninterruptible--;

1863

rq->nr_uninterruptible--;

1864

1865

enqueue_task(rq, p, wakeup);

1865

enqueue_task(rq, p, wakeup);

1866

inc_nr_running(rq);

1866

inc_nr_running(rq);

1867

}

1867

}

1868

1869

/*

1869

/*

1870

* deactivate_task - remove a task from the runqueue.

1870

* deactivate_task - remove a task from the runqueue.

1871

*/

1871

*/

1872

static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)

1872

static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)

1873

{

1873

{

1874

if (task_contributes_to_load(p))

1874

if (task_contributes_to_load(p))

1875

rq->nr_uninterruptible++;

1875

rq->nr_uninterruptible++;

1876

1877

dequeue_task(rq, p, sleep);

1877

dequeue_task(rq, p, sleep);

1878

dec_nr_running(rq);

1878

dec_nr_running(rq);

1879

}

1879

}

1880

1881

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Returns nonzero when @p is the current task of the cpu it is
 * assigned to, zero otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}

1889

1890

/*
 * Attach @p to @cpu: set_task_rq() first, then (CONFIG_SMP only) publish
 * the new cpu number in the task's thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Once ->cpu holds the new value, task_rq_lock(p, ...) can succeed
	 * on another CPU. All updates of per-task data must therefore be
	 * complete before the store below becomes visible, hence the write
	 * barrier ahead of it.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}

1903

1904

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

1904

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

1905

const struct sched_class *prev_class,

1905

const struct sched_class *prev_class,

1906

int oldprio, int running)

1906

int oldprio, int running)

1907

{

1907

{

1908

if (prev_class != p->sched_class) {

1908

if (prev_class != p->sched_class) {

1909

if (prev_class->switched_from)

1909

if (prev_class->switched_from)

1910

prev_class->switched_from(rq, p, running);

1910

prev_class->switched_from(rq, p, running);

1911

p->sched_class->switched_to(rq, p, running);

1911

p->sched_class->switched_to(rq, p, running);

1912

} else

1912

} else

1913

p->sched_class->prio_changed(rq, p, oldprio, running);

1913

p->sched_class->prio_changed(rq, p, oldprio, running);

1914

}

1914

}

1915

1916

#ifdef CONFIG_SMP

1916

#ifdef CONFIG_SMP

1917

1918

/* Used instead of source_load when we know the type == 0 */

1918

/* Used instead of source_load when we know the type == 0 */

1919

static unsigned long weighted_cpuload(const int cpu)

1919

static unsigned long weighted_cpuload(const int cpu)

1920

{

1920

{

1921

return cpu_rq(cpu)->load.weight;

1921

return cpu_rq(cpu)->load.weight;

1922

}

1922

}

1923

1924

/*

1924

/*

1925

* Is this task likely cache-hot:

1925

* Is this task likely cache-hot:

1926

*/

1926

*/

1927

static int

1927

static int

1928

task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)

1928

task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)

1929

{

1929

{

1930

s64 delta;

1930

s64 delta;

1931

1932

/*

1932

/*

1933

* Buddy candidates are cache hot:

1933

* Buddy candidates are cache hot:

1934

*/

1934

*/

1935

if (sched_feat(CACHE_HOT_BUDDY) &&

1935

if (sched_feat(CACHE_HOT_BUDDY) &&

1936

(&p->se == cfs_rq_of(&p->se)->next ||

1936

(&p->se == cfs_rq_of(&p->se)->next ||

1937

&p->se == cfs_rq_of(&p->se)->last))

1937

&p->se == cfs_rq_of(&p->se)->last))

1938

return 1;

1938

return 1;

1939

1940

if (p->sched_class != &fair_sched_class)

1940

if (p->sched_class != &fair_sched_class)

1941

return 0;

1941

return 0;

1942

1943

if (sysctl_sched_migration_cost == -1)

1943

if (sysctl_sched_migration_cost == -1)

1944

return 1;

1944

return 1;

1945

if (sysctl_sched_migration_cost == 0)

1945

if (sysctl_sched_migration_cost == 0)

1946

return 0;

1946

return 0;

1947

1948

delta = now - p->se.exec_start;

1948

delta = now - p->se.exec_start;

1949

1950

return delta < (s64)sysctl_sched_migration_cost;

1950

return delta < (s64)sysctl_sched_migration_cost;

1951

}

1951

}

1952

1953

1954

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

1954

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

1955

{

1955

{

1956

int old_cpu = task_cpu(p);

1956

int old_cpu = task_cpu(p);

1957

struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);

1957

struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);

1958

struct cfs_rq *old_cfsrq = task_cfs_rq(p),

1958

struct cfs_rq *old_cfsrq = task_cfs_rq(p),

1959

*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);

1959

*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);

1960

u64 clock_offset;

1960

u64 clock_offset;

1961

1962

clock_offset = old_rq->clock - new_rq->clock;

1962

clock_offset = old_rq->clock - new_rq->clock;

1963

1964

trace_sched_migrate_task(p, new_cpu);

1964

trace_sched_migrate_task(p, new_cpu);

1965

1966

#ifdef CONFIG_SCHEDSTATS

1966

#ifdef CONFIG_SCHEDSTATS

1967

if (p->se.wait_start)

1967

if (p->se.wait_start)

1968

p->se.wait_start -= clock_offset;

1968

p->se.wait_start -= clock_offset;

1969

if (p->se.sleep_start)

1969

if (p->se.sleep_start)

1970

p->se.sleep_start -= clock_offset;

1970

p->se.sleep_start -= clock_offset;

1971

if (p->se.block_start)

1971

if (p->se.block_start)

1972

p->se.block_start -= clock_offset;

1972

p->se.block_start -= clock_offset;

1973

#endif

1973

#endif

1974

if (old_cpu != new_cpu) {

1974

if (old_cpu != new_cpu) {

1975

p->se.nr_migrations++;

1975

p->se.nr_migrations++;

1976

new_rq->nr_migrations_in++;

1976

new_rq->nr_migrations_in++;

1977

#ifdef CONFIG_SCHEDSTATS

1977

#ifdef CONFIG_SCHEDSTATS

1978

if (task_hot(p, old_rq->clock, NULL))

1978

if (task_hot(p, old_rq->clock, NULL))

1979

schedstat_inc(p, se.nr_forced2_migrations);

1979

schedstat_inc(p, se.nr_forced2_migrations);

1980

#endif

1980

#endif

1981

perf_counter_task_migration(p, new_cpu);

1981

perf_counter_task_migration(p, new_cpu);

1982

}

1982

}

1983

p->se.vruntime -= old_cfsrq->min_vruntime -

1983

p->se.vruntime -= old_cfsrq->min_vruntime -

1984

new_cfsrq->min_vruntime;

1984

new_cfsrq->min_vruntime;

1985

1986

__set_task_cpu(p, new_cpu);

1986

__set_task_cpu(p, new_cpu);

1987

}

1987

}

1988

1989

struct migration_req {

1989

struct migration_req {

1990

struct list_head list;

1990

struct list_head list;

1991

1992

struct task_struct *task;

1992

struct task_struct *task;

1993

int dest_cpu;

1993

int dest_cpu;

1994

1995

struct completion done;

1995

struct completion done;

1996

};

1996

};

1997

1998

/*

1998

/*

1999

* The task's runqueue lock must be held.

1999

* The task's runqueue lock must be held.

2000

* Returns true if you have to wait for migration thread.

2000

* Returns true if you have to wait for migration thread.

2001

*/

2001

*/

2002

static int

2002

static int

2003

migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)

2003

migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)

2004

{

2004

{

2005

struct rq *rq = task_rq(p);

2005

struct rq *rq = task_rq(p);

2006

2007

/*

2007

/*

2008

* If the task is not on a runqueue (and not running), then

2008

* If the task is not on a runqueue (and not running), then

2009

* it is sufficient to simply update the task's cpu field.

2009

* it is sufficient to simply update the task's cpu field.

2010

*/

2010

*/

2011

if (!p->se.on_rq && !task_running(rq, p)) {

2011

if (!p->se.on_rq && !task_running(rq, p)) {

2012

set_task_cpu(p, dest_cpu);

2012

set_task_cpu(p, dest_cpu);

2013

return 0;

2013

return 0;

2014

}

2014

}

2015

2016

init_completion(&req->done);

2016

init_completion(&req->done);

2017

req->task = p;

2017

req->task = p;

2018

req->dest_cpu = dest_cpu;

2018

req->dest_cpu = dest_cpu;

2019

list_add(&req->list, &rq->migration_queue);

2019

list_add(&req->list, &rq->migration_queue);

2020

2021

return 1;

2021

return 1;

2022

}

2022

}

2023

2024

/*

2024

/*

2025

* wait_task_context_switch - wait for a thread to complete at least one

2025

* wait_task_context_switch - wait for a thread to complete at least one

2026

* context switch.

2026

* context switch.

2027

*

2027

*

2028

* @p must not be current.

2028

* @p must not be current.

2029

*/

2029

*/

2030

void wait_task_context_switch(struct task_struct *p)

2030

void wait_task_context_switch(struct task_struct *p)

2031

{

2031

{

2032

unsigned long nvcsw, nivcsw, flags;

2032

unsigned long nvcsw, nivcsw, flags;

2033

int running;

2033

int running;

2034

struct rq *rq;

2034

struct rq *rq;

2035

2036

nvcsw = p->nvcsw;

2036

nvcsw = p->nvcsw;

2037

nivcsw = p->nivcsw;

2037

nivcsw = p->nivcsw;

2038

for (;;) {

2038

for (;;) {

2039

/*

2039

/*

2040

* The runqueue is assigned before the actual context

2040

* The runqueue is assigned before the actual context

2041

* switch. We need to take the runqueue lock.

2041

* switch. We need to take the runqueue lock.

2042

*

2042

*

2043

* We could check initially without the lock but it is

2043

* We could check initially without the lock but it is

2044

* very likely that we need to take the lock in every

2044

* very likely that we need to take the lock in every

2045

* iteration.

2045

* iteration.

2046

*/

2046

*/

2047

rq = task_rq_lock(p, &flags);

2047

rq = task_rq_lock(p, &flags);

2048

running = task_running(rq, p);

2048

running = task_running(rq, p);

2049

task_rq_unlock(rq, &flags);

2049

task_rq_unlock(rq, &flags);

2050

2051

if (likely(!running))

2051

if (likely(!running))

2052

break;

2052

break;

2053

/*

2053

/*

2054

* The switch count is incremented before the actual

2054

* The switch count is incremented before the actual

2055

* context switch. We thus wait for two switches to be

2055

* context switch. We thus wait for two switches to be

2056

* sure at least one completed.

2056

* sure at least one completed.

2057

*/

2057

*/

2058

if ((p->nvcsw - nvcsw) > 1)

2058

if ((p->nvcsw - nvcsw) > 1)

2059

break;

2059

break;

2060

if ((p->nivcsw - nivcsw) > 1)

2060

if ((p->nivcsw - nivcsw) > 1)

2061

break;

2061

break;

2062

2063

cpu_relax();

2063

cpu_relax();

2064

}

2064

}

2065

}

2065

}

2066

2067

/*

2067

/*

2068

* wait_task_inactive - wait for a thread to unschedule.

2068

* wait_task_inactive - wait for a thread to unschedule.

2069

*

2069

*

2070

* If @match_state is nonzero, it's the @p->state value just checked and

2070

* If @match_state is nonzero, it's the @p->state value just checked and

2071

* not expected to change. If it changes, i.e. @p might have woken up,

2071

* not expected to change. If it changes, i.e. @p might have woken up,

2072

* then return zero. When we succeed in waiting for @p to be off its CPU,

2072

* then return zero. When we succeed in waiting for @p to be off its CPU,

2073

* we return a positive number (its total switch count). If a second call

2073

* we return a positive number (its total switch count). If a second call

2074

* a short while later returns the same number, the caller can be sure that

2074

* a short while later returns the same number, the caller can be sure that

2075

* @p has remained unscheduled the whole time.

2075

* @p has remained unscheduled the whole time.

2076

*

2076

*

2077

* The caller must ensure that the task *will* unschedule sometime soon,

2077

* The caller must ensure that the task *will* unschedule sometime soon,

2078

* else this function might spin for a *long* time. This function can't

2078

* else this function might spin for a *long* time. This function can't

2079

* be called with interrupts off, or it may introduce deadlock with

2079

* be called with interrupts off, or it may introduce deadlock with

2080

* smp_call_function() if an IPI is sent by the same process we are

2080

* smp_call_function() if an IPI is sent by the same process we are

2081

* waiting to become inactive.

2081

* waiting to become inactive.

2082

*/

2082

*/

2083

unsigned long wait_task_inactive(struct task_struct *p, long match_state)

2083

unsigned long wait_task_inactive(struct task_struct *p, long match_state)

2084

{

2084

{

2085

unsigned long flags;

2085

unsigned long flags;

2086

int running, on_rq;

2086

int running, on_rq;

2087

unsigned long ncsw;

2087

unsigned long ncsw;

2088

struct rq *rq;

2088

struct rq *rq;

2089

2090

for (;;) {

2090

for (;;) {

2091

/*

2091

/*

2092

* We do the initial early heuristics without holding

2092

* We do the initial early heuristics without holding

2093

* any task-queue locks at all. We'll only try to get

2093

* any task-queue locks at all. We'll only try to get

2094

* the runqueue lock when things look like they will

2094

* the runqueue lock when things look like they will

2095

* work out!

2095

* work out!

2096

*/

2096

*/

2097

rq = task_rq(p);

2097

rq = task_rq(p);

2098

2099

/*

2099

/*

2100

* If the task is actively running on another CPU

2100

* If the task is actively running on another CPU

2101

* still, just relax and busy-wait without holding

2101

* still, just relax and busy-wait without holding

2102

* any locks.

2102

* any locks.

2103

*

2103

*

2104

* NOTE! Since we don't hold any locks, it's not

2104

* NOTE! Since we don't hold any locks, it's not

2105

* even sure that "rq" stays as the right runqueue!

2105

* even sure that "rq" stays as the right runqueue!

2106

* But we don't care, since "task_running()" will

2106

* But we don't care, since "task_running()" will

2107

* return false if the runqueue has changed and p

2107

* return false if the runqueue has changed and p

2108

* is actually now running somewhere else!

2108

* is actually now running somewhere else!

2109

*/

2109

*/

2110

while (task_running(rq, p)) {

2110

while (task_running(rq, p)) {

2111

if (match_state && unlikely(p->state != match_state))

2111

if (match_state && unlikely(p->state != match_state))

2112

return 0;

2112

return 0;

2113

cpu_relax();

2113

cpu_relax();

2114

}

2114

}

2115

2116

/*

2116

/*

2117

* Ok, time to look more closely! We need the rq

2117

* Ok, time to look more closely! We need the rq

2118

* lock now, to be *sure*. If we're wrong, we'll

2118

* lock now, to be *sure*. If we're wrong, we'll

2119

* just go back and repeat.

2119

* just go back and repeat.

2120

*/

2120

*/

2121

rq = task_rq_lock(p, &flags);

2121

rq = task_rq_lock(p, &flags);

2122

trace_sched_wait_task(rq, p);

2122

trace_sched_wait_task(rq, p);

2123

running = task_running(rq, p);

2123

running = task_running(rq, p);

2124

on_rq = p->se.on_rq;

2124

on_rq = p->se.on_rq;

2125

ncsw = 0;

2125

ncsw = 0;

2126

if (!match_state || p->state == match_state)

2126

if (!match_state || p->state == match_state)

2127

ncsw = p->nvcsw | LONG_MIN; /* sets MSB */

2127

ncsw = p->nvcsw | LONG_MIN; /* sets MSB */

2128

task_rq_unlock(rq, &flags);

2128

task_rq_unlock(rq, &flags);

2129

2130

/*

2130

/*

2131

* If it changed from the expected state, bail out now.

2131

* If it changed from the expected state, bail out now.

2132

*/

2132

*/

2133

if (unlikely(!ncsw))

2133

if (unlikely(!ncsw))

2134

break;

2134

break;

2135

2136

/*

2136

/*

2137

* Was it really running after all now that we

2137

* Was it really running after all now that we

2138

* checked with the proper locks actually held?

2138

* checked with the proper locks actually held?

2139

*

2139

*

2140

* Oops. Go back and try again..

2140

* Oops. Go back and try again..

2141

*/

2141

*/

2142

if (unlikely(running)) {

2142

if (unlikely(running)) {

2143

cpu_relax();

2143

cpu_relax();

2144

continue;

2144

continue;

2145

}

2145

}

2146

2147

/*

2147

/*

2148

* It's not enough that it's not actively running,

2148

* It's not enough that it's not actively running,

2149

* it must be off the runqueue _entirely_, and not

2149

* it must be off the runqueue _entirely_, and not

2150

* preempted!

2150

* preempted!

2151

*

2151

*

2152

* So if it was still runnable (but just not actively

2152

* So if it was still runnable (but just not actively

2153

* running right now), it's preempted, and we should

2153

* running right now), it's preempted, and we should

2154

* yield - it could be a while.

2154

* yield - it could be a while.

2155

*/

2155

*/

2156

if (unlikely(on_rq)) {

2156

if (unlikely(on_rq)) {

2157

schedule_timeout_uninterruptible(1);

2157

schedule_timeout_uninterruptible(1);

2158

continue;

2158

continue;

2159

}

2159

}

2160

2161

/*

2161

/*

2162

* Ahh, all good. It wasn't running, and it wasn't

2162

* Ahh, all good. It wasn't running, and it wasn't

2163

* runnable, which means that it will never become

2163

* runnable, which means that it will never become

2164

* running in the future either. We're all done!

2164

* running in the future either. We're all done!

2165

*/

2165

*/

2166

break;

2166

break;

2167

}

2167

}

2168

2169

return ncsw;

2169

return ncsw;

2170

}

2170

}

2171

2172

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	/* Preemption stays off so smp_processor_id() remains meaningful. */
	preempt_disable();

	cpu = task_cpu(p);
	/* Only a task that is current on some *other* cpu needs the IPI. */
	if (cpu != smp_processor_id() && task_curr(p))
		smp_send_reschedule(cpu);

	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

2196

2197

/*

2197

/*

2198

* Return a low guess at the load of a migration-source cpu weighted

2198

* Return a low guess at the load of a migration-source cpu weighted

2199

* according to the scheduling class and "nice" value.

2199

* according to the scheduling class and "nice" value.

2200

*

2200

*

2201

* We want to under-estimate the load of migration sources, to

2201

* We want to under-estimate the load of migration sources, to

2202

* balance conservatively.

2202

* balance conservatively.

2203

*/

2203

*/

2204

static unsigned long source_load(int cpu, int type)

2204

static unsigned long source_load(int cpu, int type)

2205

{

2205

{

2206

struct rq *rq = cpu_rq(cpu);

2206

struct rq *rq = cpu_rq(cpu);

2207

unsigned long total = weighted_cpuload(cpu);

2207

unsigned long total = weighted_cpuload(cpu);

2208

2209

if (type == 0 || !sched_feat(LB_BIAS))

2209

if (type == 0 || !sched_feat(LB_BIAS))

2210

return total;

2210

return total;

2211

2212

return min(rq->cpu_load[type-1], total);

2212

return min(rq->cpu_load[type-1], total);

2213

}

2213

}

2214

2215

/*

2215

/*

2216

* Return a high guess at the load of a migration-target cpu weighted

2216

* Return a high guess at the load of a migration-target cpu weighted

2217

* according to the scheduling class and "nice" value.

2217

* according to the scheduling class and "nice" value.

2218

*/

2218

*/

2219

static unsigned long target_load(int cpu, int type)

2219

static unsigned long target_load(int cpu, int type)

2220

{

2220

{

2221

struct rq *rq = cpu_rq(cpu);

2221

struct rq *rq = cpu_rq(cpu);

2222

unsigned long total = weighted_cpuload(cpu);

2222

unsigned long total = weighted_cpuload(cpu);

2223

2224

if (type == 0 || !sched_feat(LB_BIAS))

2224

if (type == 0 || !sched_feat(LB_BIAS))

2225

return total;

2225

return total;

2226

2227

return max(rq->cpu_load[type-1], total);

2227

return max(rq->cpu_load[type-1], total);

2228

}

2228

}

2229

2230

/*

2230

/*

2231

* find_idlest_group finds and returns the least busy CPU group within the

2231

* find_idlest_group finds and returns the least busy CPU group within the

2232

* domain.

2232

* domain.

2233

*/

2233

*/

2234

static struct sched_group *

2234

static struct sched_group *

2235

find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)

2235

find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)

2236

{

2236

{

2237

struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;

2237

struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;

2238

unsigned long min_load = ULONG_MAX, this_load = 0;

2238

unsigned long min_load = ULONG_MAX, this_load = 0;

2239

int load_idx = sd->forkexec_idx;

2239

int load_idx = sd->forkexec_idx;

2240

int imbalance = 100 + (sd->imbalance_pct-100)/2;

2240

int imbalance = 100 + (sd->imbalance_pct-100)/2;

2241

2242

do {

2242

do {

2243

unsigned long load, avg_load;

2243

unsigned long load, avg_load;

2244

int local_group;

2244

int local_group;

2245

int i;

2245

int i;

2246

2247

/* Skip over this group if it has no CPUs allowed */

2247

/* Skip over this group if it has no CPUs allowed */

2248

if (!cpumask_intersects(sched_group_cpus(group),

2248

if (!cpumask_intersects(sched_group_cpus(group),

2249

&p->cpus_allowed))

2249

&p->cpus_allowed))

2250

continue;

2250

continue;

2251

2252

local_group = cpumask_test_cpu(this_cpu,

2252

local_group = cpumask_test_cpu(this_cpu,

2253

sched_group_cpus(group));

2253

sched_group_cpus(group));

2254

2255

/* Tally up the load of all CPUs in the group */

2255

/* Tally up the load of all CPUs in the group */

2256

avg_load = 0;

2256

avg_load = 0;

2257

2258

for_each_cpu(i, sched_group_cpus(group)) {

2258

for_each_cpu(i, sched_group_cpus(group)) {

2259

/* Bias balancing toward cpus of our domain */

2259

/* Bias balancing toward cpus of our domain */

2260

if (local_group)

2260

if (local_group)

2261

load = source_load(i, load_idx);

2261

load = source_load(i, load_idx);

2262

else

2262

else

2263

load = target_load(i, load_idx);

2263

load = target_load(i, load_idx);

2264

2265

avg_load += load;

2265

avg_load += load;

2266

}

2266

}

2267

2268

/* Adjust by relative CPU power of the group */

2268

/* Adjust by relative CPU power of the group */

2269

avg_load = sg_div_cpu_power(group,

2269

avg_load = sg_div_cpu_power(group,

2270

avg_load * SCHED_LOAD_SCALE);

2270

avg_load * SCHED_LOAD_SCALE);

2271

2272

if (local_group) {

2272

if (local_group) {

2273

this_load = avg_load;

2273

this_load = avg_load;

2274

this = group;

2274

this = group;

2275

} else if (avg_load < min_load) {

2275

} else if (avg_load < min_load) {

2276

min_load = avg_load;

2276

min_load = avg_load;

2277

idlest = group;

2277

idlest = group;

2278

}

2278

}

2279

} while (group = group->next, group != sd->groups);

2279

} while (group = group->next, group != sd->groups);

2280

2281

if (!idlest || 100*this_load < imbalance*min_load)

2281

if (!idlest || 100*this_load < imbalance*min_load)

2282

return NULL;

2282

return NULL;

2283

return idlest;

2283

return idlest;

2284

}

2284

}

2285

2286

/*

2286

/*

2287

* find_idlest_cpu - find the idlest cpu among the cpus in group.

2287

* find_idlest_cpu - find the idlest cpu among the cpus in group.

2288

*/

2288

*/

2289

static int

2289

static int

2290

find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)

2290

find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)

2291

{

2291

{

2292

unsigned long load, min_load = ULONG_MAX;

2292

unsigned long load, min_load = ULONG_MAX;

2293

int idlest = -1;

2293

int idlest = -1;

2294

int i;

2294

int i;

2295

2296

/* Traverse only the allowed CPUs */

2296

/* Traverse only the allowed CPUs */

2297

for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {

2297

for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {

2298

load = weighted_cpuload(i);

2298

load = weighted_cpuload(i);

2299

2300

if (load < min_load || (load == min_load && i == this_cpu)) {

2300

if (load < min_load || (load == min_load && i == this_cpu)) {

2301

min_load = load;

2301

min_load = load;

2302

idlest = i;

2302

idlest = i;

2303

}

2303

}

2304

}

2304

}

2305

2306

return idlest;

2306

return idlest;

2307

}

2307

}

2308

2309

/*

2309

/*

2310

* sched_balance_self: balance the current task (running on cpu) in domains

2310

* sched_balance_self: balance the current task (running on cpu) in domains

2311

* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and

2311

* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and

2312

* SD_BALANCE_EXEC.

2312

* SD_BALANCE_EXEC.

2313

*

2313

*

2314

* Balance, ie. select the least loaded group.

2314

* Balance, ie. select the least loaded group.

2315

*

2315

*

2316

* Returns the target CPU number, or the same CPU if no balancing is needed.

2316

* Returns the target CPU number, or the same CPU if no balancing is needed.

2317

*

2317

*

2318

* preempt must be disabled.

2318

* preempt must be disabled.

2319

*/

2319

*/

2320

static int sched_balance_self(int cpu, int flag)

2320

static int sched_balance_self(int cpu, int flag)

2321

{

2321

{

2322

struct task_struct *t = current;

2322

struct task_struct *t = current;

2323

struct sched_domain *tmp, *sd = NULL;

2323

struct sched_domain *tmp, *sd = NULL;

2324

2325

for_each_domain(cpu, tmp) {

2325

for_each_domain(cpu, tmp) {

2326

/*

2326

/*

2327

* If power savings logic is enabled for a domain, stop there.

2327

* If power savings logic is enabled for a domain, stop there.

2328

*/

2328

*/

2329

if (tmp->flags & SD_POWERSAVINGS_BALANCE)

2329

if (tmp->flags & SD_POWERSAVINGS_BALANCE)

2330

break;

2330

break;

2331

if (tmp->flags & flag)

2331

if (tmp->flags & flag)

2332

sd = tmp;

2332

sd = tmp;

2333

}

2333

}

2334

2335

if (sd)

2335

if (sd)

2336

update_shares(sd);

2336

update_shares(sd);

2337

2338

while (sd) {

2338

while (sd) {

2339

struct sched_group *group;

2339

struct sched_group *group;

2340

int new_cpu, weight;

2340

int new_cpu, weight;

2341

2342

if (!(sd->flags & flag)) {

2342

if (!(sd->flags & flag)) {

2343

sd = sd->child;

2343

sd = sd->child;

2344

continue;

2344

continue;

2345

}

2345

}

2346

2347

group = find_idlest_group(sd, t, cpu);

2347

group = find_idlest_group(sd, t, cpu);

2348

if (!group) {

2348

if (!group) {

2349

sd = sd->child;

2349

sd = sd->child;

2350

continue;

2350

continue;

2351

}

2351

}

2352

2353

new_cpu = find_idlest_cpu(group, t, cpu);

2353

new_cpu = find_idlest_cpu(group, t, cpu);

2354

if (new_cpu == -1 || new_cpu == cpu) {

2354

if (new_cpu == -1 || new_cpu == cpu) {

2355

/* Now try balancing at a lower domain level of cpu */

2355

/* Now try balancing at a lower domain level of cpu */

2356

sd = sd->child;

2356

sd = sd->child;

2357

continue;

2357

continue;

2358

}

2358

}

2359

2360

/* Now try balancing at a lower domain level of new_cpu */

2360

/* Now try balancing at a lower domain level of new_cpu */

2361

cpu = new_cpu;

2361

cpu = new_cpu;

2362

weight = cpumask_weight(sched_domain_span(sd));

2362

weight = cpumask_weight(sched_domain_span(sd));

2363

sd = NULL;

2363

sd = NULL;

2364

for_each_domain(cpu, tmp) {

2364

for_each_domain(cpu, tmp) {

2365

if (weight <= cpumask_weight(sched_domain_span(tmp)))

2365

if (weight <= cpumask_weight(sched_domain_span(tmp)))

2366

break;

2366

break;

2367

if (tmp->flags & flag)

2367

if (tmp->flags & flag)

2368

sd = tmp;

2368

sd = tmp;

2369

}

2369

}

2370

/* while loop will break here if sd == NULL */

2370

/* while loop will break here if sd == NULL */

2371

}

2371

}

2372

2373

return cpu;

2373

return cpu;

2374

}

2374

}

2375

2376

#endif /* CONFIG_SMP */

2376

#endif /* CONFIG_SMP */

2377

2378

/**
 * task_oncpu_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly
 *
 * Nothing is called when task_curr(@p) is false.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(cpu, func, info, 1);
	preempt_enable();
}

2398

2399

/***

2399

/***

2400

* try_to_wake_up - wake up a thread

2400

* try_to_wake_up - wake up a thread

2401

* @p: the to-be-woken-up thread

2401

* @p: the to-be-woken-up thread

2402

* @state: the mask of task states that can be woken

2402

* @state: the mask of task states that can be woken

2403

* @sync: do a synchronous wakeup?

2403

* @sync: do a synchronous wakeup?

2404

*

2404

*

2405

* Put it on the run-queue if it's not already there. The "current"

2405

* Put it on the run-queue if it's not already there. The "current"

2406

* thread is always on the run-queue (except when the actual

2406

* thread is always on the run-queue (except when the actual

2407

* re-schedule is in progress), and as such you're allowed to do

2407

* re-schedule is in progress), and as such you're allowed to do

2408

* the simpler "current->state = TASK_RUNNING" to mark yourself

2408

* the simpler "current->state = TASK_RUNNING" to mark yourself

2409

* runnable without the overhead of this.

2409

* runnable without the overhead of this.

2410

*

2410

*

2411

* returns failure only if the task is already active.

2411

* returns failure only if the task is already active.

2412

*/

2412

*/

2413

static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)

2413

static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)

2414

{

2414

{

2415

int cpu, orig_cpu, this_cpu, success = 0;

2415

int cpu, orig_cpu, this_cpu, success = 0;

2416

unsigned long flags;

2416

unsigned long flags;

2417

long old_state;

2417

long old_state;

2418

struct rq *rq;

2418

struct rq *rq;

2419

2420

if (!sched_feat(SYNC_WAKEUPS))

2420

if (!sched_feat(SYNC_WAKEUPS))

2421

sync = 0;

2421

sync = 0;

2422

2423

#ifdef CONFIG_SMP

2423

#ifdef CONFIG_SMP

2424

if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {

2424

if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {

2425

struct sched_domain *sd;

2425

struct sched_domain *sd;

2426

2427

this_cpu = raw_smp_processor_id();

2427

this_cpu = raw_smp_processor_id();

2428

cpu = task_cpu(p);

2428

cpu = task_cpu(p);

2429

2430

for_each_domain(this_cpu, sd) {

2430

for_each_domain(this_cpu, sd) {

2431

if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {

2431

if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {

2432

update_shares(sd);

2432

update_shares(sd);

2433

break;

2433

break;

2434

}

2434

}

2435

}

2435

}

2436

}

2436

}

2437

#endif

2437

#endif

2438

2439

smp_wmb();

2439

smp_wmb();

2440

rq = task_rq_lock(p, &flags);

2440

rq = task_rq_lock(p, &flags);

2441

update_rq_clock(rq);

2441

update_rq_clock(rq);

2442

old_state = p->state;

2442

old_state = p->state;

2443

if (!(old_state & state))

2443

if (!(old_state & state))

2444

goto out;

2444

goto out;

2445

2446

if (p->se.on_rq)

2446

if (p->se.on_rq)

2447

goto out_running;

2447

goto out_running;

2448

2449

cpu = task_cpu(p);

2449

cpu = task_cpu(p);

2450

orig_cpu = cpu;

2450

orig_cpu = cpu;

2451

this_cpu = smp_processor_id();

2451

this_cpu = smp_processor_id();

2452

2453

#ifdef CONFIG_SMP

2453

#ifdef CONFIG_SMP

2454

if (unlikely(task_running(rq, p)))

2454

if (unlikely(task_running(rq, p)))

2455

goto out_activate;

2455

goto out_activate;

2456

2457

cpu = p->sched_class->select_task_rq(p, sync);

2457

cpu = p->sched_class->select_task_rq(p, sync);

2458

if (cpu != orig_cpu) {

2458

if (cpu != orig_cpu) {

2459

set_task_cpu(p, cpu);

2459

set_task_cpu(p, cpu);

2460

task_rq_unlock(rq, &flags);

2460

task_rq_unlock(rq, &flags);

2461

/* might preempt at this point */

2461

/* might preempt at this point */

2462

rq = task_rq_lock(p, &flags);

2462

rq = task_rq_lock(p, &flags);

2463

old_state = p->state;

2463

old_state = p->state;

2464

if (!(old_state & state))

2464

if (!(old_state & state))

2465

goto out;

2465

goto out;

2466

if (p->se.on_rq)

2466

if (p->se.on_rq)

2467

goto out_running;

2467

goto out_running;

2468

2469

this_cpu = smp_processor_id();

2469

this_cpu = smp_processor_id();

2470

cpu = task_cpu(p);

2470

cpu = task_cpu(p);

2471

}

2471

}

2472

2473

#ifdef CONFIG_SCHEDSTATS

2473

#ifdef CONFIG_SCHEDSTATS

2474

schedstat_inc(rq, ttwu_count);

2474

schedstat_inc(rq, ttwu_count);

2475

if (cpu == this_cpu)

2475

if (cpu == this_cpu)

2476

schedstat_inc(rq, ttwu_local);

2476

schedstat_inc(rq, ttwu_local);

2477

else {

2477

else {

2478

struct sched_domain *sd;

2478

struct sched_domain *sd;

2479

for_each_domain(this_cpu, sd) {

2479

for_each_domain(this_cpu, sd) {

2480

if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {

2480

if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {

2481

schedstat_inc(sd, ttwu_wake_remote);

2481

schedstat_inc(sd, ttwu_wake_remote);

2482

break;

2482

break;

2483

}

2483

}

2484

}

2484

}

2485

}

2485

}

2486

#endif /* CONFIG_SCHEDSTATS */

2486

#endif /* CONFIG_SCHEDSTATS */

2487

2488

out_activate:

2488

out_activate:

2489

#endif /* CONFIG_SMP */

2489

#endif /* CONFIG_SMP */

2490

schedstat_inc(p, se.nr_wakeups);

2490

schedstat_inc(p, se.nr_wakeups);

2491

if (sync)

2491

if (sync)

2492

schedstat_inc(p, se.nr_wakeups_sync);

2492

schedstat_inc(p, se.nr_wakeups_sync);

2493

if (orig_cpu != cpu)

2493

if (orig_cpu != cpu)

2494

schedstat_inc(p, se.nr_wakeups_migrate);

2494

schedstat_inc(p, se.nr_wakeups_migrate);

2495

if (cpu == this_cpu)

2495

if (cpu == this_cpu)

2496

schedstat_inc(p, se.nr_wakeups_local);

2496

schedstat_inc(p, se.nr_wakeups_local);

2497

else

2497

else

2498

schedstat_inc(p, se.nr_wakeups_remote);

2498

schedstat_inc(p, se.nr_wakeups_remote);

2499

activate_task(rq, p, 1);

2499

activate_task(rq, p, 1);

2500

success = 1;

2500

success = 1;

2501

2502

/*

2502

/*

2503

* Only attribute actual wakeups done by this task.

2503

* Only attribute actual wakeups done by this task.

2504

*/

2504

*/

2505

if (!in_interrupt()) {

2505

if (!in_interrupt()) {

2506

struct sched_entity *se = &current->se;

2506

struct sched_entity *se = &current->se;

2507

u64 sample = se->sum_exec_runtime;

2507

u64 sample = se->sum_exec_runtime;

2508

2509

if (se->last_wakeup)

2509

if (se->last_wakeup)

2510

sample -= se->last_wakeup;

2510

sample -= se->last_wakeup;

2511

else

2511

else

2512

sample -= se->start_runtime;

2512

sample -= se->start_runtime;

2513

update_avg(&se->avg_wakeup, sample);

2513

update_avg(&se->avg_wakeup, sample);

2514

2515

se->last_wakeup = se->sum_exec_runtime;

2515

se->last_wakeup = se->sum_exec_runtime;

2516

}

2516

}

2517

2518

out_running:

2518

out_running:

2519

trace_sched_wakeup(rq, p, success);

2519

trace_sched_wakeup(rq, p, success);

2520

check_preempt_curr(rq, p, sync);

2520

check_preempt_curr(rq, p, sync);

2521

2522

p->state = TASK_RUNNING;

2522

p->state = TASK_RUNNING;

2523

#ifdef CONFIG_SMP

2523

#ifdef CONFIG_SMP

2524

if (p->sched_class->task_wake_up)

2524

if (p->sched_class->task_wake_up)

2525

p->sched_class->task_wake_up(rq, p);

2525

p->sched_class->task_wake_up(rq, p);

2526

#endif

2526

#endif

2527

out:

2527

out:

2528

task_rq_unlock(rq, &flags);

2528

task_rq_unlock(rq, &flags);

2529

2530

return success;

2530

return success;

2531

}

2531

}

2532

2533

/**

2533

/**

2534

* wake_up_process - Wake up a specific process

2534

* wake_up_process - Wake up a specific process

2535

* @p: The process to be woken up.

2535

* @p: The process to be woken up.

2536

*

2536

*

2537

* Attempt to wake up the nominated process and move it to the set of runnable

2537

* Attempt to wake up the nominated process and move it to the set of runnable

2538

* processes. Returns 1 if the process was woken up, 0 if it was already

2538

* processes. Returns 1 if the process was woken up, 0 if it was already

2539

* running.

2539

* running.

2540

*

2540

*

2541

* It may be assumed that this function implies a write memory barrier before

2541

* It may be assumed that this function implies a write memory barrier before

2542

* changing the task state if and only if any tasks are woken up.

2542

* changing the task state if and only if any tasks are woken up.

2543

*/

2543

*/

2544

int wake_up_process(struct task_struct *p)

2544

int wake_up_process(struct task_struct *p)

2545

{

2545

{

2546

return try_to_wake_up(p, TASK_ALL, 0);

2546

return try_to_wake_up(p, TASK_ALL, 0);

2547

}

2547

}

2548

EXPORT_SYMBOL(wake_up_process);

2548

EXPORT_SYMBOL(wake_up_process);

2549

2550

/*
 * wake_up_state - wake up @p if its state matches the @state mask.
 *
 * Forwards to try_to_wake_up() with a non-synchronous wakeup and
 * returns its result.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

2554

2555

/*

2555

/*

2556

* Perform scheduler related setup for a newly forked process p.

2556

* Perform scheduler related setup for a newly forked process p.

2557

* p is forked by current.

2557

* p is forked by current.

2558

*

2558

*

2559

* __sched_fork() is basic setup used by init_idle() too:

2559

* __sched_fork() is basic setup used by init_idle() too:

2560

*/

2560

*/

2561

static void __sched_fork(struct task_struct *p)

2561

static void __sched_fork(struct task_struct *p)

2562

{

2562

{

2563

p->se.exec_start = 0;

2563

p->se.exec_start = 0;

2564

p->se.sum_exec_runtime = 0;

2564

p->se.sum_exec_runtime = 0;

2565

p->se.prev_sum_exec_runtime = 0;

2565

p->se.prev_sum_exec_runtime = 0;

2566

p->se.nr_migrations = 0;

2566

p->se.nr_migrations = 0;

2567

p->se.last_wakeup = 0;

2567

p->se.last_wakeup = 0;

2568

p->se.avg_overlap = 0;

2568

p->se.avg_overlap = 0;

2569

p->se.start_runtime = 0;

2569

p->se.start_runtime = 0;

2570

p->se.avg_wakeup = sysctl_sched_wakeup_granularity;

2570

p->se.avg_wakeup = sysctl_sched_wakeup_granularity;

2571

2572

#ifdef CONFIG_SCHEDSTATS

2572

#ifdef CONFIG_SCHEDSTATS

2573

p->se.wait_start = 0;

2573

p->se.wait_start = 0;

2574

p->se.sum_sleep_runtime = 0;

2574

p->se.sum_sleep_runtime = 0;

2575

p->se.sleep_start = 0;

2575

p->se.sleep_start = 0;

2576

p->se.block_start = 0;

2576

p->se.block_start = 0;

2577

p->se.sleep_max = 0;

2577

p->se.sleep_max = 0;

2578

p->se.block_max = 0;

2578

p->se.block_max = 0;

2579

p->se.exec_max = 0;

2579

p->se.exec_max = 0;

2580

p->se.slice_max = 0;

2580

p->se.slice_max = 0;

2581

p->se.wait_max = 0;

2581

p->se.wait_max = 0;

2582

#endif

2582

#endif

2583

2584

INIT_LIST_HEAD(&p->rt.run_list);

2584

INIT_LIST_HEAD(&p->rt.run_list);

2585

p->se.on_rq = 0;

2585

p->se.on_rq = 0;

2586

INIT_LIST_HEAD(&p->se.group_node);

2586

INIT_LIST_HEAD(&p->se.group_node);

2587

2588

#ifdef CONFIG_PREEMPT_NOTIFIERS

2588

#ifdef CONFIG_PREEMPT_NOTIFIERS

2589

INIT_HLIST_HEAD(&p->preempt_notifiers);

2589

INIT_HLIST_HEAD(&p->preempt_notifiers);

2590

#endif

2590

#endif

2591

2592

/*

2592

/*

2593

* We mark the process as running here, but have not actually

2593

* We mark the process as running here, but have not actually

2594

* inserted it onto the runqueue yet. This guarantees that

2594

* inserted it onto the runqueue yet. This guarantees that

2595

* nobody will actually run it, and a signal or other external

2595

* nobody will actually run it, and a signal or other external

2596

* event cannot wake it up and insert it on the runqueue either.

2596

* event cannot wake it up and insert it on the runqueue either.

2597

*/

2597

*/

2598

p->state = TASK_RUNNING;

2598

p->state = TASK_RUNNING;

2599

}

2599

}

2600

2601

/*

2601

/*

2602

* fork()/clone()-time setup:

2602

* fork()/clone()-time setup:

2603

*/

2603

*/

2604

void sched_fork(struct task_struct *p, int clone_flags)

2604

void sched_fork(struct task_struct *p, int clone_flags)

2605

{

2605

{

2606

int cpu = get_cpu();

2606

int cpu = get_cpu();

2607

2608

__sched_fork(p);

2608

__sched_fork(p);

2609

2610

#ifdef CONFIG_SMP

2610

#ifdef CONFIG_SMP

2611

cpu = sched_balance_self(cpu, SD_BALANCE_FORK);

2611

cpu = sched_balance_self(cpu, SD_BALANCE_FORK);

2612

#endif

2612

#endif

2613

set_task_cpu(p, cpu);

2613

set_task_cpu(p, cpu);

2614

2615

/*

2615

/*

2616

* Make sure we do not leak PI boosting priority to the child:

2616

* Make sure we do not leak PI boosting priority to the child:

2617

*/

2617

*/

2618

p->prio = current->normal_prio;

2618

p->prio = current->normal_prio;

2619

if (!rt_prio(p->prio))

2619

if (!rt_prio(p->prio))

2620

p->sched_class = &fair_sched_class;

2620

p->sched_class = &fair_sched_class;

2621

2622

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

2622

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

2623

if (likely(sched_info_on()))

2623

if (likely(sched_info_on()))

2624

memset(&p->sched_info, 0, sizeof(p->sched_info));

2624

memset(&p->sched_info, 0, sizeof(p->sched_info));

2625

#endif

2625

#endif

2626

#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)

2626

#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)

2627

p->oncpu = 0;

2627

p->oncpu = 0;

2628

#endif

2628

#endif

2629

#ifdef CONFIG_PREEMPT

2629

#ifdef CONFIG_PREEMPT

2630

/* Want to start with kernel preemption disabled. */

2630

/* Want to start with kernel preemption disabled. */

2631

task_thread_info(p)->preempt_count = 1;

2631

task_thread_info(p)->preempt_count = 1;

2632

#endif

2632

#endif

2633

plist_node_init(&p->pushable_tasks, MAX_PRIO);

2633

plist_node_init(&p->pushable_tasks, MAX_PRIO);

2634

2635

put_cpu();

2635

put_cpu();

2636

}

2636

}

2637

2638

/*

2638

/*

2639

* wake_up_new_task - wake up a newly created task for the first time.

2639

* wake_up_new_task - wake up a newly created task for the first time.

2640

*

2640

*

2641

* This function will do some initial scheduler statistics housekeeping

2641

* This function will do some initial scheduler statistics housekeeping

2642

* that must be done for every newly created context, then puts the task

2642

* that must be done for every newly created context, then puts the task

2643

* on the runqueue and wakes it.

2643

* on the runqueue and wakes it.

2644

*/

2644

*/

2645

void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)

2645

void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)

2646

{

2646

{

2647

unsigned long flags;

2647

unsigned long flags;

2648

struct rq *rq;

2648

struct rq *rq;

2649

2650

rq = task_rq_lock(p, &flags);

2650

rq = task_rq_lock(p, &flags);

2651

BUG_ON(p->state != TASK_RUNNING);

2651

BUG_ON(p->state != TASK_RUNNING);

2652

update_rq_clock(rq);

2652

update_rq_clock(rq);

2653

2654

p->prio = effective_prio(p);

2654

p->prio = effective_prio(p);

2655

2656

if (!p->sched_class->task_new || !current->se.on_rq) {

2656

if (!p->sched_class->task_new || !current->se.on_rq) {

2657

activate_task(rq, p, 0);

2657

activate_task(rq, p, 0);

2658

} else {

2658

} else {

2659

/*

2659

/*

2660

* Let the scheduling class do new task startup

2660

* Let the scheduling class do new task startup

2661

* management (if any):

2661

* management (if any):

2662

*/

2662

*/

2663

p->sched_class->task_new(rq, p);

2663

p->sched_class->task_new(rq, p);

2664

inc_nr_running(rq);

2664

inc_nr_running(rq);

2665

}

2665

}

2666

trace_sched_wakeup_new(rq, p, 1);

2666

trace_sched_wakeup_new(rq, p, 1);

2667

check_preempt_curr(rq, p, 0);

2667

check_preempt_curr(rq, p, 0);

2668

#ifdef CONFIG_SMP

2668

#ifdef CONFIG_SMP

2669

if (p->sched_class->task_wake_up)

2669

if (p->sched_class->task_wake_up)

2670

p->sched_class->task_wake_up(rq, p);

2670

p->sched_class->task_wake_up(rq, p);

2671

#endif

2671

#endif

2672

task_rq_unlock(rq, &flags);

2672

task_rq_unlock(rq, &flags);

2673

}

2673

}

2674

2675

#ifdef CONFIG_PREEMPT_NOTIFIERS

2675

#ifdef CONFIG_PREEMPT_NOTIFIERS

2676

2677

/**

2677

/**

2678

* preempt_notifier_register - tell me when current is being preempted & rescheduled

2678

* preempt_notifier_register - tell me when current is being preempted & rescheduled

2679

* @notifier: notifier struct to register

2679

* @notifier: notifier struct to register

2680

*/

2680

*/

2681

void preempt_notifier_register(struct preempt_notifier *notifier)

2681

void preempt_notifier_register(struct preempt_notifier *notifier)

2682

{

2682

{

2683

hlist_add_head(&notifier->link, &current->preempt_notifiers);

2683

hlist_add_head(&notifier->link, &current->preempt_notifiers);

2684

}

2684

}

2685

EXPORT_SYMBOL_GPL(preempt_notifier_register);

2685

EXPORT_SYMBOL_GPL(preempt_notifier_register);

2686

2687

/**

2687

/**

2688

* preempt_notifier_unregister - no longer interested in preemption notifications

2688

* preempt_notifier_unregister - no longer interested in preemption notifications

2689

* @notifier: notifier struct to unregister

2689

* @notifier: notifier struct to unregister

2690

*

2690

*

2691

* This is safe to call from within a preemption notifier.

2691

* This is safe to call from within a preemption notifier.

2692

*/

2692

*/

2693

void preempt_notifier_unregister(struct preempt_notifier *notifier)

2693

void preempt_notifier_unregister(struct preempt_notifier *notifier)

2694

{

2694

{

2695

hlist_del(&notifier->link);

2695

hlist_del(&notifier->link);

2696

}

2696

}

2697

EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

2697

EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

2698

2699

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)

2699

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)

2700

{

2700

{

2701

struct preempt_notifier *notifier;

2701

struct preempt_notifier *notifier;

2702

struct hlist_node *node;

2702

struct hlist_node *node;

2703

2704

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2704

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2705

notifier->ops->sched_in(notifier, raw_smp_processor_id());

2705

notifier->ops->sched_in(notifier, raw_smp_processor_id());

2706

}

2706

}

2707

2708

static void

2708

static void

2709

fire_sched_out_preempt_notifiers(struct task_struct *curr,

2709

fire_sched_out_preempt_notifiers(struct task_struct *curr,

2710

struct task_struct *next)

2710

struct task_struct *next)

2711

{

2711

{

2712

struct preempt_notifier *notifier;

2712

struct preempt_notifier *notifier;

2713

struct hlist_node *node;

2713

struct hlist_node *node;

2714

2715

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2715

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2716

notifier->ops->sched_out(notifier, next);

2716

notifier->ops->sched_out(notifier, next);

2717

}

2717

}

2718

2719

#else /* !CONFIG_PREEMPT_NOTIFIERS */

2719

#else /* !CONFIG_PREEMPT_NOTIFIERS */

2720

2721

/* Empty stand-in used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

2724

2725

/* Empty stand-in used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

2730

2731

#endif /* CONFIG_PREEMPT_NOTIFIERS */

2731

#endif /* CONFIG_PREEMPT_NOTIFIERS */

2732

2733

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Notify @prev's preempt notifiers before the lock and arch hooks. */
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}

2754

2755

/**

2755

/**

2756

* finish_task_switch - clean up after a task-switch

2756

* finish_task_switch - clean up after a task-switch

2757

* @rq: runqueue associated with task-switch

2757

* @rq: runqueue associated with task-switch

2758

* @prev: the thread we just switched away from.

2758

* @prev: the thread we just switched away from.

2759

*

2759

*

2760

* finish_task_switch must be called after the context switch, paired

2760

* finish_task_switch must be called after the context switch, paired

2761

* with a prepare_task_switch call before the context switch.

2761

* with a prepare_task_switch call before the context switch.

2762

* finish_task_switch will reconcile locking set up by prepare_task_switch,

2762

* finish_task_switch will reconcile locking set up by prepare_task_switch,

2763

* and do any other architecture-specific cleanup actions.

2763

* and do any other architecture-specific cleanup actions.

2764

*

2764

*

2765

* Note that we may have delayed dropping an mm in context_switch(). If

2765

* Note that we may have delayed dropping an mm in context_switch(). If

2766

* so, we finish that here outside of the runqueue lock. (Doing it

2766

* so, we finish that here outside of the runqueue lock. (Doing it

2767

* with the lock held can cause deadlocks; see schedule() for

2767

* with the lock held can cause deadlocks; see schedule() for

2768

* details.)

2768

* details.)

2769

*/

2769

*/

2770

static void finish_task_switch(struct rq *rq, struct task_struct *prev)

2770

static void finish_task_switch(struct rq *rq, struct task_struct *prev)

2771

__releases(rq->lock)

2771

__releases(rq->lock)

2772

{

2772

{

2773

struct mm_struct *mm = rq->prev_mm;

2773

struct mm_struct *mm = rq->prev_mm;

2774

long prev_state;

2774

long prev_state;

2775

#ifdef CONFIG_SMP

2775

#ifdef CONFIG_SMP

2776

int post_schedule = 0;

2776

int post_schedule = 0;

2777

2778

if (current->sched_class->needs_post_schedule)

2778

if (current->sched_class->needs_post_schedule)

2779

post_schedule = current->sched_class->needs_post_schedule(rq);

2779

post_schedule = current->sched_class->needs_post_schedule(rq);

2780

#endif

2780

#endif

2781

2782

rq->prev_mm = NULL;

2782

rq->prev_mm = NULL;

2783

2784

/*

2784

/*

2785

* A task struct has one reference for the use as "current".

2785

* A task struct has one reference for the use as "current".

2786

* If a task dies, then it sets TASK_DEAD in tsk->state and calls

2786

* If a task dies, then it sets TASK_DEAD in tsk->state and calls

2787

* schedule one last time. The schedule call will never return, and

2787

* schedule one last time. The schedule call will never return, and

2788

* the scheduled task must drop that reference.

2788

* the scheduled task must drop that reference.

2789

* The test for TASK_DEAD must occur while the runqueue locks are

2789

* The test for TASK_DEAD must occur while the runqueue locks are

2790

* still held, otherwise prev could be scheduled on another cpu, die

2790

* still held, otherwise prev could be scheduled on another cpu, die

2791

* there before we look at prev->state, and then the reference would

2791

* there before we look at prev->state, and then the reference would

2792

* be dropped twice.

2792

* be dropped twice.

2793

* Manfred Spraul <manfred@colorfullife.com>

2793

* Manfred Spraul <manfred@colorfullife.com>

2794

*/

2794

*/

2795

prev_state = prev->state;

2795

prev_state = prev->state;

2796

finish_arch_switch(prev);

2796

finish_arch_switch(prev);

2797

perf_counter_task_sched_in(current, cpu_of(rq));

2797

perf_counter_task_sched_in(current, cpu_of(rq));

2798

finish_lock_switch(rq, prev);

2798

finish_lock_switch(rq, prev);

2799

#ifdef CONFIG_SMP

2799

#ifdef CONFIG_SMP

2800

if (post_schedule)

2800

if (post_schedule)

2801

current->sched_class->post_schedule(rq);

2801

current->sched_class->post_schedule(rq);

2802

#endif

2802

#endif

2803

2804

fire_sched_in_preempt_notifiers(current);

2804

fire_sched_in_preempt_notifiers(current);

2805

if (mm)

2805

if (mm)

2806

mmdrop(mm);

2806

mmdrop(mm);

2807

if (unlikely(prev_state == TASK_DEAD)) {

2807

if (unlikely(prev_state == TASK_DEAD)) {

2808

/*

2808

/*

2809

* Remove function-return probe instances associated with this

2809

* Remove function-return probe instances associated with this

2810

* task and put them back on the free list.

2810

* task and put them back on the free list.

2811

*/

2811

*/

2812

kprobe_flush_task(prev);

2812

kprobe_flush_task(prev);

2813

put_task_struct(prev);

2813

put_task_struct(prev);

2814

}

2814

}

2815

}

2815

}

2816

2817

/**

2817

/**

2818

* schedule_tail - first thing a freshly forked thread must call.

2818

* schedule_tail - first thing a freshly forked thread must call.

2819

* @prev: the thread we just switched away from.

2819

* @prev: the thread we just switched away from.

2820

*/

2820

*/

2821

asmlinkage void schedule_tail(struct task_struct *prev)

2821

asmlinkage void schedule_tail(struct task_struct *prev)

2822

__releases(rq->lock)

2822

__releases(rq->lock)

2823

{

2823

{

2824

struct rq *rq = this_rq();

2824

struct rq *rq = this_rq();

2825

2826

finish_task_switch(rq, prev);

2826

finish_task_switch(rq, prev);

2827

#ifdef __ARCH_WANT_UNLOCKED_CTXSW

2827

#ifdef __ARCH_WANT_UNLOCKED_CTXSW

2828

/* In this case, finish_task_switch does not reenable preemption */

2828

/* In this case, finish_task_switch does not reenable preemption */

2829

preempt_enable();

2829

preempt_enable();

2830

#endif

2830

#endif

2831

if (current->set_child_tid)

2831

if (current->set_child_tid)

2832

put_user(task_pid_vnr(current), current->set_child_tid);

2832

put_user(task_pid_vnr(current), current->set_child_tid);

2833

}

2833

}

2834

2835

/*

2835

/*

2836

* context_switch - switch to the new MM and the new

2836

* context_switch - switch to the new MM and the new

2837

* thread's register state.

2837

* thread's register state.

2838

*/

2838

*/

2839

static inline void

2839

static inline void

2840

context_switch(struct rq *rq, struct task_struct *prev,

2840

context_switch(struct rq *rq, struct task_struct *prev,

2841

struct task_struct *next)

2841

struct task_struct *next)

2842

{

2842

{

2843

struct mm_struct *mm, *oldmm;

2843

struct mm_struct *mm, *oldmm;

2844

2845

prepare_task_switch(rq, prev, next);

2845

prepare_task_switch(rq, prev, next);

2846

trace_sched_switch(rq, prev, next);

2846

trace_sched_switch(rq, prev, next);

2847

mm = next->mm;

2847

mm = next->mm;

2848

oldmm = prev->active_mm;

2848

oldmm = prev->active_mm;

2849

/*

2849

/*

2850

* For paravirt, this is coupled with an exit in switch_to to

2850

* For paravirt, this is coupled with an exit in switch_to to

2851

* combine the page table reload and the switch backend into

2851

* combine the page table reload and the switch backend into

2852

* one hypercall.

2852

* one hypercall.

2853

*/

2853

*/

2854

arch_start_context_switch(prev);

2854

arch_start_context_switch(prev);

2855

2856

if (unlikely(!mm)) {

2856

if (unlikely(!mm)) {

2857

next->active_mm = oldmm;

2857

next->active_mm = oldmm;

2858

atomic_inc(&oldmm->mm_count);

2858

atomic_inc(&oldmm->mm_count);

2859

enter_lazy_tlb(oldmm, next);

2859

enter_lazy_tlb(oldmm, next);

2860

} else

2860

} else

2861

switch_mm(oldmm, mm, next);

2861

switch_mm(oldmm, mm, next);

2862

2863

if (unlikely(!prev->mm)) {

2863

if (unlikely(!prev->mm)) {

2864

prev->active_mm = NULL;

2864

prev->active_mm = NULL;

2865

rq->prev_mm = oldmm;

2865

rq->prev_mm = oldmm;

2866

}

2866

}

2867

/*

2867

/*

2868

* Since the runqueue lock will be released by the next

2868

* Since the runqueue lock will be released by the next

2869

* task (which is an invalid locking op but in the case

2869

* task (which is an invalid locking op but in the case

2870

* of the scheduler it's an obvious special-case), so we

2870

* of the scheduler it's an obvious special-case), so we

2871

* do an early lockdep release here:

2871

* do an early lockdep release here:

2872

*/

2872

*/

2873

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

2873

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

2874

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

2874

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

2875

#endif

2875

#endif

2876

2877

/* Here we just switch the register state and the stack. */

2877

/* Here we just switch the register state and the stack. */

2878

switch_to(prev, next, prev);

2878

switch_to(prev, next, prev);

2879

2880

barrier();

2880

barrier();

2881

/*

2881

/*

2882

* this_rq must be evaluated again because prev may have moved

2882

* this_rq must be evaluated again because prev may have moved

2883

* CPUs since it called schedule(), thus the 'rq' on its stack

2883

* CPUs since it called schedule(), thus the 'rq' on its stack

2884

* frame will be invalid.

2884

* frame will be invalid.

2885

*/

2885

*/

2886

finish_task_switch(this_rq(), prev);

2886

finish_task_switch(this_rq(), prev);

2887

}

2887

}

2888

2889

/*

2889

/*

2890

* nr_running, nr_uninterruptible and nr_context_switches:

2890

* nr_running, nr_uninterruptible and nr_context_switches:

2891

*

2891

*

2892

* externally visible scheduler statistics: current number of runnable

2892

* externally visible scheduler statistics: current number of runnable

2893

* threads, current number of uninterruptible-sleeping threads, total

2893

* threads, current number of uninterruptible-sleeping threads, total

2894

* number of context switches performed since bootup.

2894

* number of context switches performed since bootup.

2895

*/

2895

*/

2896

unsigned long nr_running(void)

2896

unsigned long nr_running(void)

2897

{

2897

{

2898

unsigned long i, sum = 0;

2898

unsigned long i, sum = 0;

2899

2900

for_each_online_cpu(i)

2900

for_each_online_cpu(i)

2901

sum += cpu_rq(i)->nr_running;

2901

sum += cpu_rq(i)->nr_running;

2902

2903

return sum;

2903

return sum;

2904

}

2904

}

2905

2906

unsigned long nr_uninterruptible(void)

2906

unsigned long nr_uninterruptible(void)

2907

{

2907

{

2908

unsigned long i, sum = 0;

2908

unsigned long i, sum = 0;

2909

2910

for_each_possible_cpu(i)

2910

for_each_possible_cpu(i)

2911

sum += cpu_rq(i)->nr_uninterruptible;

2911

sum += cpu_rq(i)->nr_uninterruptible;

2912

2913

/*

2913

/*

2914

* Since we read the counters lockless, it might be slightly

2914

* Since we read the counters lockless, it might be slightly

2915

* inaccurate. Do not allow it to go below zero though:

2915

* inaccurate. Do not allow it to go below zero though:

2916

*/

2916

*/

2917

if (unlikely((long)sum < 0))

2917

if (unlikely((long)sum < 0))

2918

sum = 0;

2918

sum = 0;

2919

2920

return sum;

2920

return sum;

2921

}

2921

}

2922

2923

unsigned long long nr_context_switches(void)

2923

unsigned long long nr_context_switches(void)

2924

{

2924

{

2925

int i;

2925

int i;

2926

unsigned long long sum = 0;

2926

unsigned long long sum = 0;

2927

2928

for_each_possible_cpu(i)

2928

for_each_possible_cpu(i)

2929

sum += cpu_rq(i)->nr_switches;

2929

sum += cpu_rq(i)->nr_switches;

2930

2931

return sum;

2931

return sum;

2932

}

2932

}

2933

2934

unsigned long nr_iowait(void)

2934

unsigned long nr_iowait(void)

2935

{

2935

{

2936

unsigned long i, sum = 0;

2936

unsigned long i, sum = 0;

2937

2938

for_each_possible_cpu(i)

2938

for_each_possible_cpu(i)

2939

sum += atomic_read(&cpu_rq(i)->nr_iowait);

2939

sum += atomic_read(&cpu_rq(i)->nr_iowait);

2940

2941

return sum;

2941

return sum;

2942

}

2942

}

2943

2944

/* Variables and functions for calc_load */

2944

/* Variables and functions for calc_load */

2945

static atomic_long_t calc_load_tasks;

2945

static atomic_long_t calc_load_tasks;

2946

static unsigned long calc_load_update;

2946

static unsigned long calc_load_update;

2947

unsigned long avenrun[3];

2947

unsigned long avenrun[3];

2948

EXPORT_SYMBOL(avenrun);

2948

EXPORT_SYMBOL(avenrun);

2949

2950

/**

2950

/**

2951

* get_avenrun - get the load average array

2951

* get_avenrun - get the load average array

2952

* @loads: pointer to dest load array

2952

* @loads: pointer to dest load array

2953

* @offset: offset to add

2953

* @offset: offset to add

2954

* @shift: shift count to shift the result left

2954

* @shift: shift count to shift the result left

2955

*

2955

*

2956

* These values are estimates at best, so no need for locking.

2956

* These values are estimates at best, so no need for locking.

2957

*/

2957

*/

2958

void get_avenrun(unsigned long *loads, unsigned long offset, int shift)

2958

void get_avenrun(unsigned long *loads, unsigned long offset, int shift)

2959

{

2959

{

2960

loads[0] = (avenrun[0] + offset) << shift;

2960

loads[0] = (avenrun[0] + offset) << shift;

2961

loads[1] = (avenrun[1] + offset) << shift;

2961

loads[1] = (avenrun[1] + offset) << shift;

2962

loads[2] = (avenrun[2] + offset) << shift;

2962

loads[2] = (avenrun[2] + offset) << shift;

2963

}

2963

}

2964

2965

static unsigned long

2965

static unsigned long

2966

calc_load(unsigned long load, unsigned long exp, unsigned long active)

2966

calc_load(unsigned long load, unsigned long exp, unsigned long active)

2967

{

2967

{

2968

load *= exp;

2968

load *= exp;

2969

load += active * (FIXED_1 - exp);

2969

load += active * (FIXED_1 - exp);

2970

return load >> FSHIFT;

2970

return load >> FSHIFT;

2971

}

2971

}

2972

2973

/*

2973

/*

2974

* calc_load - update the avenrun load estimates 10 ticks after the

2974

* calc_load - update the avenrun load estimates 10 ticks after the

2975

* CPUs have updated calc_load_tasks.

2975

* CPUs have updated calc_load_tasks.

2976

*/

2976

*/

2977

void calc_global_load(void)

2977

void calc_global_load(void)

2978

{

2978

{

2979

unsigned long upd = calc_load_update + 10;

2979

unsigned long upd = calc_load_update + 10;

2980

long active;

2980

long active;

2981

2982

if (time_before(jiffies, upd))

2982

if (time_before(jiffies, upd))

2983

return;

2983

return;

2984

2985

active = atomic_long_read(&calc_load_tasks);

2985

active = atomic_long_read(&calc_load_tasks);

2986

active = active > 0 ? active * FIXED_1 : 0;

2986

active = active > 0 ? active * FIXED_1 : 0;

2987

2988

avenrun[0] = calc_load(avenrun[0], EXP_1, active);

2988

avenrun[0] = calc_load(avenrun[0], EXP_1, active);

2989

avenrun[1] = calc_load(avenrun[1], EXP_5, active);

2989

avenrun[1] = calc_load(avenrun[1], EXP_5, active);

2990

avenrun[2] = calc_load(avenrun[2], EXP_15, active);

2990

avenrun[2] = calc_load(avenrun[2], EXP_15, active);

2991

2992

calc_load_update += LOAD_FREQ;

2992

calc_load_update += LOAD_FREQ;

2993

}

2993

}

2994

2995

/*

2995

/*

2996

* Either called from update_cpu_load() or from a cpu going idle

2996

* Either called from update_cpu_load() or from a cpu going idle

2997

*/

2997

*/

2998

static void calc_load_account_active(struct rq *this_rq)

2998

static void calc_load_account_active(struct rq *this_rq)

2999

{

2999

{

3000

long nr_active, delta;

3000

long nr_active, delta;

3001

3002

nr_active = this_rq->nr_running;

3002

nr_active = this_rq->nr_running;

3003

nr_active += (long) this_rq->nr_uninterruptible;

3003

nr_active += (long) this_rq->nr_uninterruptible;

3004

3005

if (nr_active != this_rq->calc_load_active) {

3005

if (nr_active != this_rq->calc_load_active) {

3006

delta = nr_active - this_rq->calc_load_active;

3006

delta = nr_active - this_rq->calc_load_active;

3007

this_rq->calc_load_active = nr_active;

3007

this_rq->calc_load_active = nr_active;

3008

atomic_long_add(delta, &calc_load_tasks);

3008

atomic_long_add(delta, &calc_load_tasks);

3009

}

3009

}

3010

}

3010

}

3011

3012

/*

3012

/*

3013

* Externally visible per-cpu scheduler statistics:

3013

* Externally visible per-cpu scheduler statistics:

3014

* cpu_nr_migrations(cpu) - number of migrations into that cpu

3014

* cpu_nr_migrations(cpu) - number of migrations into that cpu

3015

*/

3015

*/

3016

u64 cpu_nr_migrations(int cpu)

3016

u64 cpu_nr_migrations(int cpu)

3017

{

3017

{

3018

return cpu_rq(cpu)->nr_migrations_in;

3018

return cpu_rq(cpu)->nr_migrations_in;

3019

}

3019

}

3020

3021

/*

3021

/*

3022

* Update rq->cpu_load[] statistics. This function is usually called every

3022

* Update rq->cpu_load[] statistics. This function is usually called every

3023

* scheduler tick (TICK_NSEC).

3023

* scheduler tick (TICK_NSEC).

3024

*/

3024

*/

3025

static void update_cpu_load(struct rq *this_rq)

3025

static void update_cpu_load(struct rq *this_rq)

3026

{

3026

{

3027

unsigned long this_load = this_rq->load.weight;

3027

unsigned long this_load = this_rq->load.weight;

3028

int i, scale;

3028

int i, scale;

3029

3030

this_rq->nr_load_updates++;

3030

this_rq->nr_load_updates++;

3031

3032

/* Update our load: */

3032

/* Update our load: */

3033

for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {

3033

for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {

3034

unsigned long old_load, new_load;

3034

unsigned long old_load, new_load;

3035

3036

/* scale is effectively 1 << i now, and >> i divides by scale */

3036

/* scale is effectively 1 << i now, and >> i divides by scale */

3037

3038

old_load = this_rq->cpu_load[i];

3038

old_load = this_rq->cpu_load[i];

3039

new_load = this_load;

3039

new_load = this_load;

3040

/*

3040

/*

3041

* Round up the averaging division if load is increasing. This

3041

* Round up the averaging division if load is increasing. This

3042

* prevents us from getting stuck on 9 if the load is 10, for

3042

* prevents us from getting stuck on 9 if the load is 10, for

3043

* example.

3043

* example.

3044

*/

3044

*/

3045

if (new_load > old_load)

3045

if (new_load > old_load)

3046

new_load += scale-1;

3046

new_load += scale-1;

3047

this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;

3047

this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;

3048

}

3048

}

3049

3050

if (time_after_eq(jiffies, this_rq->calc_load_update)) {

3050

if (time_after_eq(jiffies, this_rq->calc_load_update)) {

3051

this_rq->calc_load_update += LOAD_FREQ;

3051

this_rq->calc_load_update += LOAD_FREQ;

3052

calc_load_account_active(this_rq);

3052

calc_load_account_active(this_rq);

3053

}

3053

}

3054

}

3054

}

3055

3056

#ifdef CONFIG_SMP

3056

#ifdef CONFIG_SMP

3057

3058

/*

3058

/*

3059

* double_rq_lock - safely lock two runqueues

3059

* double_rq_lock - safely lock two runqueues

3060

*

3060

*

3061

* Note this does not disable interrupts like task_rq_lock,

3061

* Note this does not disable interrupts like task_rq_lock,

3062

* you need to do so manually before calling.

3062

* you need to do so manually before calling.

3063

*/

3063

*/

3064

static void double_rq_lock(struct rq *rq1, struct rq *rq2)

3064

static void double_rq_lock(struct rq *rq1, struct rq *rq2)

3065

__acquires(rq1->lock)

3065

__acquires(rq1->lock)

3066

__acquires(rq2->lock)

3066

__acquires(rq2->lock)

3067

{

3067

{

3068

BUG_ON(!irqs_disabled());

3068

BUG_ON(!irqs_disabled());

3069

if (rq1 == rq2) {

3069

if (rq1 == rq2) {

3070

spin_lock(&rq1->lock);

3070

spin_lock(&rq1->lock);

3071

__acquire(rq2->lock); /* Fake it out ;) */

3071

__acquire(rq2->lock); /* Fake it out ;) */

3072

} else {

3072

} else {

3073

if (rq1 < rq2) {

3073

if (rq1 < rq2) {

3074

spin_lock(&rq1->lock);

3074

spin_lock(&rq1->lock);

3075

spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);

3075

spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);

3076

} else {

3076

} else {

3077

spin_lock(&rq2->lock);

3077

spin_lock(&rq2->lock);

3078

spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);

3078

spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);

3079

}

3079

}

3080

}

3080

}

3081

update_rq_clock(rq1);

3081

update_rq_clock(rq1);

3082

update_rq_clock(rq2);

3082

update_rq_clock(rq2);

3083

}

3083

}

3084

3085

/*

3085

/*

3086

* double_rq_unlock - safely unlock two runqueues

3086

* double_rq_unlock - safely unlock two runqueues

3087

*

3087

*

3088

* Note this does not restore interrupts like task_rq_unlock,

3088

* Note this does not restore interrupts like task_rq_unlock,

3089

* you need to do so manually after calling.

3089

* you need to do so manually after calling.

3090

*/

3090

*/

3091

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)

3091

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)

3092

__releases(rq1->lock)

3092

__releases(rq1->lock)

3093

__releases(rq2->lock)

3093

__releases(rq2->lock)

3094

{

3094

{

3095

spin_unlock(&rq1->lock);

3095

spin_unlock(&rq1->lock);

3096

if (rq1 != rq2)

3096

if (rq1 != rq2)

3097

spin_unlock(&rq2->lock);

3097

spin_unlock(&rq2->lock);

3098

else

3098

else

3099

__release(rq2->lock);

3099

__release(rq2->lock);

3100

}

3100

}

3101

3102

/*

3102

/*

3103

* If dest_cpu is allowed for this process, migrate the task to it.

3103

* If dest_cpu is allowed for this process, migrate the task to it.

3104

* This is accomplished by forcing the cpu_allowed mask to only

3104

* This is accomplished by forcing the cpu_allowed mask to only

3105

* allow dest_cpu, which will force the cpu onto dest_cpu. Then

3105

* allow dest_cpu, which will force the cpu onto dest_cpu. Then

3106

* the cpu_allowed mask is restored.

3106

* the cpu_allowed mask is restored.

3107

*/

3107

*/

3108

static void sched_migrate_task(struct task_struct *p, int dest_cpu)

3108

static void sched_migrate_task(struct task_struct *p, int dest_cpu)

3109

{

3109

{

3110

struct migration_req req;

3110

struct migration_req req;

3111

unsigned long flags;

3111

unsigned long flags;

3112

struct rq *rq;

3112

struct rq *rq;

3113

3114

rq = task_rq_lock(p, &flags);

3114

rq = task_rq_lock(p, &flags);

3115

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)

3115

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)

3116

|| unlikely(!cpu_active(dest_cpu)))

3116

|| unlikely(!cpu_active(dest_cpu)))

3117

goto out;

3117

goto out;

3118

3119

/* force the process onto the specified CPU */

3119

/* force the process onto the specified CPU */

3120

if (migrate_task(p, dest_cpu, &req)) {

3120

if (migrate_task(p, dest_cpu, &req)) {

3121

/* Need to wait for migration thread (might exit: take ref). */

3121

/* Need to wait for migration thread (might exit: take ref). */

3122

struct task_struct *mt = rq->migration_thread;

3122

struct task_struct *mt = rq->migration_thread;

3123

3124

get_task_struct(mt);

3124

get_task_struct(mt);

3125

task_rq_unlock(rq, &flags);

3125

task_rq_unlock(rq, &flags);

3126

wake_up_process(mt);

3126

wake_up_process(mt);

3127

put_task_struct(mt);

3127

put_task_struct(mt);

3128

wait_for_completion(&req.done);

3128

wait_for_completion(&req.done);

3129

3130

return;

3130

return;

3131

}

3131

}

3132

out:

3132

out:

3133

task_rq_unlock(rq, &flags);

3133

task_rq_unlock(rq, &flags);

3134

}

3134

}

3135

3136

/*

3136

/*

3137

* sched_exec - execve() is a valuable balancing opportunity, because at

3137

* sched_exec - execve() is a valuable balancing opportunity, because at

3138

* this point the task has the smallest effective memory and cache footprint.

3138

* this point the task has the smallest effective memory and cache footprint.

3139

*/

3139

*/

3140

void sched_exec(void)

3140

void sched_exec(void)

3141

{

3141

{

3142

int new_cpu, this_cpu = get_cpu();

3142

int new_cpu, this_cpu = get_cpu();

3143

new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);

3143

new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);

3144

put_cpu();

3144

put_cpu();

3145

if (new_cpu != this_cpu)

3145

if (new_cpu != this_cpu)

3146

sched_migrate_task(current, new_cpu);

3146

sched_migrate_task(current, new_cpu);

3147

}

3147

}

3148

3149

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 *
 * @src_rq:   runqueue @p is currently queued on
 * @p:        task to move
 * @this_rq:  destination runqueue
 * @this_cpu: CPU that @this_rq belongs to
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	/*
	 * Note that idle threads have a prio of MAX_PRIO, for this test
	 * to be always true for them.
	 */
	check_preempt_curr(this_rq, p, 0);
}

3165

3166

/*

3166

/*

3167

* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?

3167

* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?

3168

*/

3168

*/

3169

static

3169

static

3170

int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,

3170

int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,

3171

struct sched_domain *sd, enum cpu_idle_type idle,

3171

struct sched_domain *sd, enum cpu_idle_type idle,

3172

int *all_pinned)

3172

int *all_pinned)

3173

{

3173

{

3174

int tsk_cache_hot = 0;

3174

int tsk_cache_hot = 0;

3175

/*

3175

/*

3176

* We do not migrate tasks that are:

3176

* We do not migrate tasks that are:

3177

* 1) running (obviously), or

3177

* 1) running (obviously), or

3178

* 2) cannot be migrated to this CPU due to cpus_allowed, or

3178

* 2) cannot be migrated to this CPU due to cpus_allowed, or

3179

* 3) are cache-hot on their current CPU.

3179

* 3) are cache-hot on their current CPU.

3180

*/

3180

*/

3181

if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {

3181

if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {

3182

schedstat_inc(p, se.nr_failed_migrations_affine);

3182

schedstat_inc(p, se.nr_failed_migrations_affine);

3183

return 0;

3183

return 0;

3184

}

3184

}

3185

*all_pinned = 0;

3185

*all_pinned = 0;

3186

3187

if (task_running(rq, p)) {

3187

if (task_running(rq, p)) {

3188

schedstat_inc(p, se.nr_failed_migrations_running);

3188

schedstat_inc(p, se.nr_failed_migrations_running);

3189

return 0;

3189

return 0;

3190

}

3190

}

3191

3192

/*

3192

/*

3193

* Aggressive migration if:

3193

* Aggressive migration if:

3194

* 1) task is cache cold, or

3194

* 1) task is cache cold, or

3195

* 2) too many balance attempts have failed.

3195

* 2) too many balance attempts have failed.

3196

*/

3196

*/

3197

3198

tsk_cache_hot = task_hot(p, rq->clock, sd);

3198

tsk_cache_hot = task_hot(p, rq->clock, sd);

3199

if (!tsk_cache_hot ||

3199

if (!tsk_cache_hot ||

3200

sd->nr_balance_failed > sd->cache_nice_tries) {

3200

sd->nr_balance_failed > sd->cache_nice_tries) {

3201

#ifdef CONFIG_SCHEDSTATS

3201

#ifdef CONFIG_SCHEDSTATS

3202

if (tsk_cache_hot) {

3202

if (tsk_cache_hot) {

3203

schedstat_inc(sd, lb_hot_gained[idle]);

3203

schedstat_inc(sd, lb_hot_gained[idle]);

3204

schedstat_inc(p, se.nr_forced_migrations);

3204

schedstat_inc(p, se.nr_forced_migrations);

3205

}

3205

}

3206

#endif

3206

#endif

3207

return 1;

3207

return 1;

3208

}

3208

}

3209

3210

if (tsk_cache_hot) {

3210

if (tsk_cache_hot) {

3211

schedstat_inc(p, se.nr_failed_migrations_hot);

3211

schedstat_inc(p, se.nr_failed_migrations_hot);

3212

return 0;

3212

return 0;

3213

}

3213

}

3214

return 1;

3214

return 1;

3215

}

3215

}

3216

3217

static unsigned long

3217

static unsigned long

3218

balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,

3218

balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,

3219

unsigned long max_load_move, struct sched_domain *sd,

3219

unsigned long max_load_move, struct sched_domain *sd,

3220

enum cpu_idle_type idle, int *all_pinned,

3220

enum cpu_idle_type idle, int *all_pinned,

3221

int *this_best_prio, struct rq_iterator *iterator)

3221

int *this_best_prio, struct rq_iterator *iterator)

3222

{

3222

{

3223

int loops = 0, pulled = 0, pinned = 0;

3223

int loops = 0, pulled = 0, pinned = 0;

3224

struct task_struct *p;

3224

struct task_struct *p;

3225

long rem_load_move = max_load_move;

3225

long rem_load_move = max_load_move;

3226

3227

if (max_load_move == 0)

3227

if (max_load_move == 0)

3228

goto out;

3228

goto out;

3229

3230

pinned = 1;

3230

pinned = 1;

3231

3232

/*

3232

/*

3233

* Start the load-balancing iterator:

3233

* Start the load-balancing iterator:

3234

*/

3234

*/

3235

p = iterator->start(iterator->arg);

3235

p = iterator->start(iterator->arg);

3236

if (!p || loops++ > sysctl_sched_nr_migrate)

3237

if (!p || loops++ > sysctl_sched_nr_migrate)

3238

goto out;

3238

goto out;

3239

3240

if ((p->se.load.weight >> 1) > rem_load_move ||

3240

if ((p->se.load.weight >> 1) > rem_load_move ||

3241

!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

3241

!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

3242

p = iterator->next(iterator->arg);

3242

p = iterator->next(iterator->arg);

3243

goto next;

3243

goto next;

3244

}

3244

}

3245

3246

pull_task(busiest, p, this_rq, this_cpu);

3246

pull_task(busiest, p, this_rq, this_cpu);

3247

pulled++;

3247

pulled++;

3248

rem_load_move -= p->se.load.weight;

3248

rem_load_move -= p->se.load.weight;

3249

3250

#ifdef CONFIG_PREEMPT

3250

#ifdef CONFIG_PREEMPT

3251

/*

3251

/*

3252

* NEWIDLE balancing is a source of latency, so preemptible kernels

3252

* NEWIDLE balancing is a source of latency, so preemptible kernels

3253

* will stop after the first task is pulled to minimize the critical

3253

* will stop after the first task is pulled to minimize the critical

3254

* section.

3254

* section.

3255

*/

3255

*/

3256

if (idle == CPU_NEWLY_IDLE)

3256

if (idle == CPU_NEWLY_IDLE)

3257

goto out;

3257

goto out;

3258

#endif

3258

#endif

3259

3260

/*

3260

/*

3261

* We only want to steal up to the prescribed amount of weighted load.

3261

* We only want to steal up to the prescribed amount of weighted load.

3262

*/

3262

*/

3263

if (rem_load_move > 0) {

3263

if (rem_load_move > 0) {

3264

if (p->prio < *this_best_prio)

3264

if (p->prio < *this_best_prio)

3265

*this_best_prio = p->prio;

3265

*this_best_prio = p->prio;

3266

p = iterator->next(iterator->arg);

3266

p = iterator->next(iterator->arg);

3267

goto next;

3267

goto next;

3268

}

3268

}

3269

out:

3269

out:

3270

/*

3270

/*

3271

* Right now, this is one of only two places pull_task() is called,

3271

* Right now, this is one of only two places pull_task() is called,

3272

* so we can safely collect pull_task() stats here rather than

3272

* so we can safely collect pull_task() stats here rather than

3273

* inside pull_task().

3273

* inside pull_task().

3274

*/

3274

*/

3275

schedstat_add(sd, lb_gained[idle], pulled);

3275

schedstat_add(sd, lb_gained[idle], pulled);

3276

3277

if (all_pinned)

3277

if (all_pinned)

3278

*all_pinned = pinned;

3278

*all_pinned = pinned;

3279

3280

return max_load_move - rem_load_move;

3280

return max_load_move - rem_load_move;

3281

}

3281

}

3282

3283

/*

3283

/*

3284

* move_tasks tries to move up to max_load_move weighted load from busiest to

3284

* move_tasks tries to move up to max_load_move weighted load from busiest to

3285

* this_rq, as part of a balancing operation within domain "sd".

3285

* this_rq, as part of a balancing operation within domain "sd".

3286

* Returns 1 if successful and 0 otherwise.

3286

* Returns 1 if successful and 0 otherwise.

3287

*

3287

*

3288

* Called with both runqueues locked.

3288

* Called with both runqueues locked.

3289

*/

3289

*/

3290

static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,

3290

static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,

3291

unsigned long max_load_move,

3291

unsigned long max_load_move,

3292

struct sched_domain *sd, enum cpu_idle_type idle,

3292

struct sched_domain *sd, enum cpu_idle_type idle,

3293

int *all_pinned)

3293

int *all_pinned)

3294

{

3294

{

3295

const struct sched_class *class = sched_class_highest;

3295

const struct sched_class *class = sched_class_highest;

3296

unsigned long total_load_moved = 0;

3296

unsigned long total_load_moved = 0;

3297

int this_best_prio = this_rq->curr->prio;

3297

int this_best_prio = this_rq->curr->prio;

3298

3299

do {

3299

do {

3300

total_load_moved +=

3300

total_load_moved +=

3301

class->load_balance(this_rq, this_cpu, busiest,

3301

class->load_balance(this_rq, this_cpu, busiest,

3302

max_load_move - total_load_moved,

3302

max_load_move - total_load_moved,

3303

sd, idle, all_pinned, &this_best_prio);

3303

sd, idle, all_pinned, &this_best_prio);

3304

class = class->next;

3304

class = class->next;

3305

3306

#ifdef CONFIG_PREEMPT

3306

#ifdef CONFIG_PREEMPT

3307

/*

3307

/*

3308

* NEWIDLE balancing is a source of latency, so preemptible

3308

* NEWIDLE balancing is a source of latency, so preemptible

3309

* kernels will stop after the first task is pulled to minimize

3309

* kernels will stop after the first task is pulled to minimize

3310

* the critical section.

3310

* the critical section.

3311

*/

3311

*/

3312

if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)

3312

if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)

3313

break;

3313

break;

3314

#endif

3314

#endif

3315

} while (class && max_load_move > total_load_moved);

3315

} while (class && max_load_move > total_load_moved);

3316

3317

return total_load_moved > 0;

3317

return total_load_moved > 0;

3318

}

3318

}

3319

3320

static int

3320

static int

3321

iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,

3321

iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,

3322

struct sched_domain *sd, enum cpu_idle_type idle,

3322

struct sched_domain *sd, enum cpu_idle_type idle,

3323

struct rq_iterator *iterator)

3323

struct rq_iterator *iterator)

3324

{

3324

{

3325

struct task_struct *p = iterator->start(iterator->arg);

3325

struct task_struct *p = iterator->start(iterator->arg);

3326

int pinned = 0;

3326

int pinned = 0;

3327

3328

while (p) {

3328

while (p) {

3329

if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

3329

if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

3330

pull_task(busiest, p, this_rq, this_cpu);

3330

pull_task(busiest, p, this_rq, this_cpu);

3331

/*

3331

/*

3332

* Right now, this is only the second place pull_task()

3332

* Right now, this is only the second place pull_task()

3333

* is called, so we can safely collect pull_task()

3333

* is called, so we can safely collect pull_task()

3334

* stats here rather than inside pull_task().

3334

* stats here rather than inside pull_task().

3335

*/

3335

*/

3336

schedstat_inc(sd, lb_gained[idle]);

3336

schedstat_inc(sd, lb_gained[idle]);

3337

3338

return 1;

3338

return 1;

3339

}

3339

}

3340

p = iterator->next(iterator->arg);

3340

p = iterator->next(iterator->arg);

3341

}

3341

}

3342

3343

return 0;

3343

return 0;

3344

}

3344

}

3345

3346

/*

3346

/*

3347

* move_one_task tries to move exactly one task from busiest to this_rq, as

3347

* move_one_task tries to move exactly one task from busiest to this_rq, as

3348

* part of active balancing operations within "domain".

3348

* part of active balancing operations within "domain".

3349

* Returns 1 if successful and 0 otherwise.

3349

* Returns 1 if successful and 0 otherwise.

3350

*

3350

*

3351

* Called with both runqueues locked.

3351

* Called with both runqueues locked.

3352

*/

3352

*/

3353

static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,

3353

static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,

3354

struct sched_domain *sd, enum cpu_idle_type idle)

3354

struct sched_domain *sd, enum cpu_idle_type idle)

3355

{

3355

{

3356

const struct sched_class *class;

3356

const struct sched_class *class;

3357

3358

for (class = sched_class_highest; class; class = class->next)

3358

for (class = sched_class_highest; class; class = class->next)

3359

if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))

3359

if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))

3360

return 1;

3360

return 1;

3361

3362

return 0;

3362

return 0;

3363

}

3363

}

3364

/********** Helpers for find_busiest_group ************************/

3364

/********** Helpers for find_busiest_group ************************/

3365

/*
 * sd_lb_stats - statistics of a sched_domain gathered while load balancing.
 */
struct sd_lb_stats {
	/* Domain-wide results */
	struct sched_group *busiest;	/* Busiest group in this sd */
	struct sched_group *this;	/* Local group in this sd */
	unsigned long total_load;	/* Total load of all groups in sd */
	unsigned long total_pwr;	/* Total power of all groups in sd */
	unsigned long avg_load;		/* Average load across all groups in sd */

	/* Statistics of the local ("this") group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;

	/* Statistics of the busiest group */
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;

	int group_imb;			/* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	/* Power-savings balancing state */
	int power_savings_balance;	/* Is powersave balance needed for this sd */
	struct sched_group *group_min;	/* Least loaded group in sd */
	struct sched_group *group_leader; /* Group which relieves group_min */
	unsigned long min_load_per_task; /* load_per_task in group_min */
	unsigned long leader_nr_running; /* Nr running of group_leader */
	unsigned long min_nr_running;	/* Nr running of group_min */
#endif
};

3396

3397

/*
 * sg_lb_stats - per-sched_group statistics required for load balancing.
 */
struct sg_lb_stats {
	unsigned long avg_load;		/* Avg load across the CPUs of the group */
	unsigned long group_load;	/* Total load over the CPUs of the group */
	unsigned long sum_nr_running;	/* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long group_capacity;	/* group->__cpu_power / SCHED_LOAD_SCALE */
	int group_imb;			/* Is there an imbalance in the group ? */
};

3408

3409

/**
 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 * @group: The group whose first cpu is to be returned.
 *
 * This is cpumask_first() applied to the mask that sched_group_cpus()
 * yields for @group.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *mask = sched_group_cpus(group);

	return cpumask_first(mask);
}

3417

3418

/**

3418

/**

3419

* get_sd_load_idx - Obtain the load index for a given sched domain.

3419

* get_sd_load_idx - Obtain the load index for a given sched domain.

3420

* @sd: The sched_domain whose load_idx is to be obtained.

3420

* @sd: The sched_domain whose load_idx is to be obtained.

3421

* @idle: The Idle status of the CPU for whose sd load_icx is obtained.

3421

* @idle: The Idle status of the CPU for whose sd load_icx is obtained.

3422

*/

3422

*/

3423

static inline int get_sd_load_idx(struct sched_domain *sd,

3423

static inline int get_sd_load_idx(struct sched_domain *sd,

3424

enum cpu_idle_type idle)

3424

enum cpu_idle_type idle)

3425

{

3425

{

3426

int load_idx;

3426

int load_idx;

3427

3428

switch (idle) {

3428

switch (idle) {

3429

case CPU_NOT_IDLE:

3429

case CPU_NOT_IDLE:

3430

load_idx = sd->busy_idx;

3430

load_idx = sd->busy_idx;

3431

break;

3431

break;

3432

3433

case CPU_NEWLY_IDLE:

3433

case CPU_NEWLY_IDLE:

3434

load_idx = sd->newidle_idx;

3434

load_idx = sd->newidle_idx;

3435

break;

3435

break;

3436

default:

3436

default:

3437

load_idx = sd->idle_idx;

3437

load_idx = sd->idle_idx;

3438

break;

3438

break;

3439

}

3439

}

3440

3441

return load_idx;

3441

return load_idx;

3442

}

3442

}

3443

3444

3445

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

3445

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

3446

/**

3446

/**

3447

* init_sd_power_savings_stats - Initialize power savings statistics for

3447

* init_sd_power_savings_stats - Initialize power savings statistics for

3448

* the given sched_domain, during load balancing.

3448

* the given sched_domain, during load balancing.

3449

*

3449

*

3450

* @sd: Sched domain whose power-savings statistics are to be initialized.

3450

* @sd: Sched domain whose power-savings statistics are to be initialized.

3451

* @sds: Variable containing the statistics for sd.

3451

* @sds: Variable containing the statistics for sd.

3452

* @idle: Idle status of the CPU at which we're performing load-balancing.

3452

* @idle: Idle status of the CPU at which we're performing load-balancing.

3453

*/

3453

*/

3454

static inline void init_sd_power_savings_stats(struct sched_domain *sd,

3454

static inline void init_sd_power_savings_stats(struct sched_domain *sd,

3455

struct sd_lb_stats *sds, enum cpu_idle_type idle)

3455

struct sd_lb_stats *sds, enum cpu_idle_type idle)

3456

{

3456

{

3457

/*

3457

/*

3458

* Busy processors will not participate in power savings

3458

* Busy processors will not participate in power savings

3459

* balance.

3459

* balance.

3460

*/

3460

*/

3461

if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))

3461

if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))

3462

sds->power_savings_balance = 0;

3462

sds->power_savings_balance = 0;

3463

else {

3463

else {

3464

sds->power_savings_balance = 1;

3464

sds->power_savings_balance = 1;

3465

sds->min_nr_running = ULONG_MAX;

3465

sds->min_nr_running = ULONG_MAX;

3466

sds->leader_nr_running = 0;

3466

sds->leader_nr_running = 0;

3467

}

3467

}

3468

}

3468

}

3469

3470

/**

3470

/**

3471

* update_sd_power_savings_stats - Update the power saving stats for a

3471

* update_sd_power_savings_stats - Update the power saving stats for a

3472

* sched_domain while performing load balancing.

3472

* sched_domain while performing load balancing.

3473

*

3473

*

3474

* @group: sched_group belonging to the sched_domain under consideration.

3474

* @group: sched_group belonging to the sched_domain under consideration.

3475

* @sds: Variable containing the statistics of the sched_domain

3475

* @sds: Variable containing the statistics of the sched_domain

3476

* @local_group: Does group contain the CPU for which we're performing

3476

* @local_group: Does group contain the CPU for which we're performing

3477

* load balancing ?

3477

* load balancing ?

3478

* @sgs: Variable containing the statistics of the group.

3478

* @sgs: Variable containing the statistics of the group.

3479

*/

3479

*/

3480

static inline void update_sd_power_savings_stats(struct sched_group *group,

3480

static inline void update_sd_power_savings_stats(struct sched_group *group,

3481

struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)

3481

struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)

3482

{

3482

{

3483

3484

if (!sds->power_savings_balance)

3484

if (!sds->power_savings_balance)

3485

return;

3485

return;

3486

3487

/*

3487

/*

3488

* If the local group is idle or completely loaded

3488

* If the local group is idle or completely loaded

3489

* no need to do power savings balance at this domain

3489

* no need to do power savings balance at this domain

3490

*/

3490

*/

3491

if (local_group && (sds->this_nr_running >= sgs->group_capacity ||

3491

if (local_group && (sds->this_nr_running >= sgs->group_capacity ||

3492

!sds->this_nr_running))

3492

!sds->this_nr_running))

3493

sds->power_savings_balance = 0;

3493

sds->power_savings_balance = 0;

3494

3495

/*

3495

/*

3496

* If a group is already running at full capacity or idle,

3496

* If a group is already running at full capacity or idle,

3497

* don't include that group in power savings calculations

3497

* don't include that group in power savings calculations

3498

*/

3498

*/

3499

if (!sds->power_savings_balance ||

3499

if (!sds->power_savings_balance ||

3500

sgs->sum_nr_running >= sgs->group_capacity ||

3500

sgs->sum_nr_running >= sgs->group_capacity ||

3501

!sgs->sum_nr_running)

3501

!sgs->sum_nr_running)

3502

return;

3502

return;

3503

3504

/*

3504

/*

3505

* Calculate the group which has the least non-idle load.

3505

* Calculate the group which has the least non-idle load.

3506

* This is the group from where we need to pick up the load

3506

* This is the group from where we need to pick up the load

3507

* for saving power

3507

* for saving power

3508

*/

3508

*/

3509

if ((sgs->sum_nr_running < sds->min_nr_running) ||

3509

if ((sgs->sum_nr_running < sds->min_nr_running) ||

3510

(sgs->sum_nr_running == sds->min_nr_running &&

3510

(sgs->sum_nr_running == sds->min_nr_running &&

3511

group_first_cpu(group) > group_first_cpu(sds->group_min))) {

3511

group_first_cpu(group) > group_first_cpu(sds->group_min))) {

3512

sds->group_min = group;

3512

sds->group_min = group;

3513

sds->min_nr_running = sgs->sum_nr_running;

3513

sds->min_nr_running = sgs->sum_nr_running;

3514

sds->min_load_per_task = sgs->sum_weighted_load /

3514

sds->min_load_per_task = sgs->sum_weighted_load /

3515

sgs->sum_nr_running;

3515

sgs->sum_nr_running;

3516

}

3516

}

3517

3518

/*

3518

/*

3519

* Calculate the group which is almost near its

3519

* Calculate the group which is almost near its

3520

* capacity but still has some space to pick up some load

3520

* capacity but still has some space to pick up some load

3521

* from other group and save more power

3521

* from other group and save more power

3522

*/

3522

*/

3523

if (sgs->sum_nr_running > sgs->group_capacity - 1)

3523

if (sgs->sum_nr_running > sgs->group_capacity - 1)

3524

return;

3524

return;

3525

3526

if (sgs->sum_nr_running > sds->leader_nr_running ||

3526

if (sgs->sum_nr_running > sds->leader_nr_running ||

3527

(sgs->sum_nr_running == sds->leader_nr_running &&

3527

(sgs->sum_nr_running == sds->leader_nr_running &&

3528

group_first_cpu(group) < group_first_cpu(sds->group_leader))) {

3528

group_first_cpu(group) < group_first_cpu(sds->group_leader))) {

3529

sds->group_leader = group;

3529

sds->group_leader = group;

3530

sds->leader_nr_running = sgs->sum_nr_running;

3530

sds->leader_nr_running = sgs->sum_nr_running;

3531

}

3531

}

3532

}

3532

}

3533

3534

/**

3534

/**

3535

* check_power_save_busiest_group - see if there is potential for some power-savings balance

3535

* check_power_save_busiest_group - see if there is potential for some power-savings balance

3536

* @sds: Variable containing the statistics of the sched_domain

3536

* @sds: Variable containing the statistics of the sched_domain

3537

* under consideration.

3537

* under consideration.

3538

* @this_cpu: Cpu at which we're currently performing load-balancing.

3538

* @this_cpu: Cpu at which we're currently performing load-balancing.

3539

* @imbalance: Variable to store the imbalance.

3539

* @imbalance: Variable to store the imbalance.

3540

*

3540

*

3541

* Description:

3541

* Description:

3542

* Check if we have potential to perform some power-savings balance.

3542

* Check if we have potential to perform some power-savings balance.

3543

* If yes, set the busiest group to be the least loaded group in the

3543

* If yes, set the busiest group to be the least loaded group in the

3544

* sched_domain, so that it's CPUs can be put to idle.

3544

* sched_domain, so that it's CPUs can be put to idle.

3545

*

3545

*

3546

* Returns 1 if there is potential to perform power-savings balance.

3546

* Returns 1 if there is potential to perform power-savings balance.

3547

* Else returns 0.

3547

* Else returns 0.

3548

*/

3548

*/

3549

static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,

3549

static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,

3550

int this_cpu, unsigned long *imbalance)

3550

int this_cpu, unsigned long *imbalance)

3551

{

3551

{

3552

if (!sds->power_savings_balance)

3552

if (!sds->power_savings_balance)

3553

return 0;

3553

return 0;

3554

3555

if (sds->this != sds->group_leader ||

3555

if (sds->this != sds->group_leader ||

3556

sds->group_leader == sds->group_min)

3556

sds->group_leader == sds->group_min)

3557

return 0;

3557

return 0;

3558

3559

*imbalance = sds->min_load_per_task;

3559

*imbalance = sds->min_load_per_task;

3560

sds->busiest = sds->group_min;

3560

sds->busiest = sds->group_min;

3561

3562

if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {

3562

if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {

3563

cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =

3563

cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =

3564

group_first_cpu(sds->group_leader);

3564

group_first_cpu(sds->group_leader);

3565

}

3565

}

3566

3567

return 1;

3567

return 1;

3568

3569

}

3569

}

3570

#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

3570

#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

3571

static inline void init_sd_power_savings_stats(struct sched_domain *sd,

3571

static inline void init_sd_power_savings_stats(struct sched_domain *sd,

3572

struct sd_lb_stats *sds, enum cpu_idle_type idle)

3572

struct sd_lb_stats *sds, enum cpu_idle_type idle)

3573

{

3573

{

3574

return;

3574

return;

3575

}

3575

}

3576

3577

/* Stub used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}

3582

3583

/*
 * Stub used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
 * always reports that no power-savings balance is possible.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}

3588

#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

3588

#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

3589

3590

3591

/**

3591

/**

3592

* update_sg_lb_stats - Update sched_group's statistics for load balancing.

3592

* update_sg_lb_stats - Update sched_group's statistics for load balancing.

3593

* @group: sched_group whose statistics are to be updated.

3593

* @group: sched_group whose statistics are to be updated.

3594

* @this_cpu: Cpu for which load balance is currently performed.

3594

* @this_cpu: Cpu for which load balance is currently performed.

3595

* @idle: Idle status of this_cpu

3595

* @idle: Idle status of this_cpu

3596

* @load_idx: Load index of sched_domain of this_cpu for load calc.

3596

* @load_idx: Load index of sched_domain of this_cpu for load calc.

3597

* @sd_idle: Idle status of the sched_domain containing group.

3597

* @sd_idle: Idle status of the sched_domain containing group.

3598

* @local_group: Does group contain this_cpu.

3598

* @local_group: Does group contain this_cpu.

3599

* @cpus: Set of cpus considered for load balancing.

3599

* @cpus: Set of cpus considered for load balancing.

3600

* @balance: Should we balance.

3600

* @balance: Should we balance.

3601

* @sgs: variable to hold the statistics for this group.

3601

* @sgs: variable to hold the statistics for this group.

3602

*/

3602

*/

3603

static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,

3603

static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,

3604

enum cpu_idle_type idle, int load_idx, int *sd_idle,

3604

enum cpu_idle_type idle, int load_idx, int *sd_idle,

3605

int local_group, const struct cpumask *cpus,

3605

int local_group, const struct cpumask *cpus,

3606

int *balance, struct sg_lb_stats *sgs)

3606

int *balance, struct sg_lb_stats *sgs)

3607

{

3607

{

3608

unsigned long load, max_cpu_load, min_cpu_load;

3608

unsigned long load, max_cpu_load, min_cpu_load;

3609

int i;

3609

int i;

3610

unsigned int balance_cpu = -1, first_idle_cpu = 0;

3610

unsigned int balance_cpu = -1, first_idle_cpu = 0;

3611

unsigned long sum_avg_load_per_task;

3611

unsigned long sum_avg_load_per_task;

3612

unsigned long avg_load_per_task;

3612

unsigned long avg_load_per_task;

3613

3614

if (local_group)

3614

if (local_group)

3615

balance_cpu = group_first_cpu(group);

3615

balance_cpu = group_first_cpu(group);

3616

3617

/* Tally up the load of all CPUs in the group */

3617

/* Tally up the load of all CPUs in the group */

3618

sum_avg_load_per_task = avg_load_per_task = 0;

3618

sum_avg_load_per_task = avg_load_per_task = 0;

3619

max_cpu_load = 0;

3619

max_cpu_load = 0;

3620

min_cpu_load = ~0UL;

3620

min_cpu_load = ~0UL;

3621

3622

for_each_cpu_and(i, sched_group_cpus(group), cpus) {

3622

for_each_cpu_and(i, sched_group_cpus(group), cpus) {

3623

struct rq *rq = cpu_rq(i);

3623

struct rq *rq = cpu_rq(i);

3624

3625

if (*sd_idle && rq->nr_running)

3625

if (*sd_idle && rq->nr_running)

3626

*sd_idle = 0;

3626

*sd_idle = 0;

3627

3628

/* Bias balancing toward cpus of our domain */

3628

/* Bias balancing toward cpus of our domain */

3629

if (local_group) {

3629

if (local_group) {

3630

if (idle_cpu(i) && !first_idle_cpu) {

3630

if (idle_cpu(i) && !first_idle_cpu) {

3631

first_idle_cpu = 1;

3631

first_idle_cpu = 1;

3632

balance_cpu = i;

3632

balance_cpu = i;

3633

}

3633

}

3634

3635

load = target_load(i, load_idx);

3635

load = target_load(i, load_idx);

3636

} else {

3636

} else {

3637

load = source_load(i, load_idx);

3637

load = source_load(i, load_idx);

3638

if (load > max_cpu_load)

3638

if (load > max_cpu_load)

3639

max_cpu_load = load;

3639

max_cpu_load = load;

3640

if (min_cpu_load > load)

3640

if (min_cpu_load > load)

3641

min_cpu_load = load;

3641

min_cpu_load = load;

3642

}

3642

}

3643

3644

sgs->group_load += load;

3644

sgs->group_load += load;

3645

sgs->sum_nr_running += rq->nr_running;

3645

sgs->sum_nr_running += rq->nr_running;

3646

sgs->sum_weighted_load += weighted_cpuload(i);

3646

sgs->sum_weighted_load += weighted_cpuload(i);

3647

3648

sum_avg_load_per_task += cpu_avg_load_per_task(i);

3648

sum_avg_load_per_task += cpu_avg_load_per_task(i);

3649

}

3649

}

3650

3651

/*

3651

/*

3652

* First idle cpu or the first cpu(busiest) in this sched group

3652

* First idle cpu or the first cpu(busiest) in this sched group

3653

* is eligible for doing load balancing at this and above

3653

* is eligible for doing load balancing at this and above

3654

* domains. In the newly idle case, we will allow all the cpu's

3654

* domains. In the newly idle case, we will allow all the cpu's

3655

* to do the newly idle load balance.

3655

* to do the newly idle load balance.

3656

*/

3656

*/

3657

if (idle != CPU_NEWLY_IDLE && local_group &&

3657

if (idle != CPU_NEWLY_IDLE && local_group &&

3658

balance_cpu != this_cpu && balance) {

3658

balance_cpu != this_cpu && balance) {

3659

*balance = 0;

3659

*balance = 0;

3660

return;

3660

return;

3661

}

3661

}

3662

3663

/* Adjust by relative CPU power of the group */

3663

/* Adjust by relative CPU power of the group */

3664

sgs->avg_load = sg_div_cpu_power(group,

3664

sgs->avg_load = sg_div_cpu_power(group,

3665

sgs->group_load * SCHED_LOAD_SCALE);

3665

sgs->group_load * SCHED_LOAD_SCALE);

3666

3667

3668

/*

3668

/*

3669

* Consider the group unbalanced when the imbalance is larger

3669

* Consider the group unbalanced when the imbalance is larger

3670

* than the average weight of two tasks.

3670

* than the average weight of two tasks.

3671

*

3671

*

3672

* APZ: with cgroup the avg task weight can vary wildly and

3672

* APZ: with cgroup the avg task weight can vary wildly and

3673

* might not be a suitable number - should we keep a

3673

* might not be a suitable number - should we keep a

3674

* normalized nr_running number somewhere that negates

3674

* normalized nr_running number somewhere that negates

3675

* the hierarchy?

3675

* the hierarchy?

3676

*/

3676

*/

3677

avg_load_per_task = sg_div_cpu_power(group,

3677

avg_load_per_task = sg_div_cpu_power(group,

3678

sum_avg_load_per_task * SCHED_LOAD_SCALE);

3678

sum_avg_load_per_task * SCHED_LOAD_SCALE);

3679

3680

if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)

3680

if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)

3681

sgs->group_imb = 1;

3681

sgs->group_imb = 1;

3682

3683

sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

3683

sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

3684

3685

}

3685

}

3686

3687

/**

3687

/**

3688

* update_sd_lb_stats - Update sched_domain's statistics for load balancing.

3688

* update_sd_lb_stats - Update sched_domain's statistics for load balancing.

3689

* @sd: sched_domain whose statistics are to be updated.

3689

* @sd: sched_domain whose statistics are to be updated.

3690

* @this_cpu: Cpu for which load balance is currently performed.

3690

* @this_cpu: Cpu for which load balance is currently performed.

3691

* @idle: Idle status of this_cpu

3691

* @idle: Idle status of this_cpu

3692

* @sd_idle: Idle status of the sched_domain containing group.

3692

* @sd_idle: Idle status of the sched_domain containing group.

3693

* @cpus: Set of cpus considered for load balancing.

3693

* @cpus: Set of cpus considered for load balancing.

3694

* @balance: Should we balance.

3694

* @balance: Should we balance.

3695

* @sds: variable to hold the statistics for this sched_domain.

3695

* @sds: variable to hold the statistics for this sched_domain.

3696

*/

3696

*/

3697

static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,

3697

static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,

3698

enum cpu_idle_type idle, int *sd_idle,

3698

enum cpu_idle_type idle, int *sd_idle,

3699

const struct cpumask *cpus, int *balance,

3699

const struct cpumask *cpus, int *balance,

3700

struct sd_lb_stats *sds)

3700

struct sd_lb_stats *sds)

3701

{

3701

{

3702

struct sched_group *group = sd->groups;

3702

struct sched_group *group = sd->groups;

3703

struct sg_lb_stats sgs;

3703

struct sg_lb_stats sgs;

3704

int load_idx;

3704

int load_idx;

3705

3706

init_sd_power_savings_stats(sd, sds, idle);

3706

init_sd_power_savings_stats(sd, sds, idle);

3707

load_idx = get_sd_load_idx(sd, idle);

3707

load_idx = get_sd_load_idx(sd, idle);

3708

3709

do {

3709

do {

3710

int local_group;

3710

int local_group;

3711

3712

local_group = cpumask_test_cpu(this_cpu,

3712

local_group = cpumask_test_cpu(this_cpu,

3713

sched_group_cpus(group));

3713

sched_group_cpus(group));

3714

memset(&sgs, 0, sizeof(sgs));

3714

memset(&sgs, 0, sizeof(sgs));

3715

update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,

3715

update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,

3716

local_group, cpus, balance, &sgs);

3716

local_group, cpus, balance, &sgs);

3717

3718

if (local_group && balance && !(*balance))

3718

if (local_group && balance && !(*balance))

3719

return;

3719

return;

3720

3721

sds->total_load += sgs.group_load;

3721

sds->total_load += sgs.group_load;

3722

sds->total_pwr += group->__cpu_power;

3722

sds->total_pwr += group->__cpu_power;

3723

3724

if (local_group) {

3724

if (local_group) {

3725

sds->this_load = sgs.avg_load;

3725

sds->this_load = sgs.avg_load;

3726

sds->this = group;

3726

sds->this = group;

3727

sds->this_nr_running = sgs.sum_nr_running;

3727

sds->this_nr_running = sgs.sum_nr_running;

3728

sds->this_load_per_task = sgs.sum_weighted_load;

3728

sds->this_load_per_task = sgs.sum_weighted_load;

3729

} else if (sgs.avg_load > sds->max_load &&

3729

} else if (sgs.avg_load > sds->max_load &&

3730

(sgs.sum_nr_running > sgs.group_capacity ||

3730

(sgs.sum_nr_running > sgs.group_capacity ||

3731

sgs.group_imb)) {

3731

sgs.group_imb)) {

3732

sds->max_load = sgs.avg_load;

3732

sds->max_load = sgs.avg_load;

3733

sds->busiest = group;

3733

sds->busiest = group;

3734

sds->busiest_nr_running = sgs.sum_nr_running;

3734

sds->busiest_nr_running = sgs.sum_nr_running;

3735

sds->busiest_load_per_task = sgs.sum_weighted_load;

3735

sds->busiest_load_per_task = sgs.sum_weighted_load;

3736

sds->group_imb = sgs.group_imb;

3736

sds->group_imb = sgs.group_imb;

3737

}

3737

}

3738

3739

update_sd_power_savings_stats(group, sds, local_group, &sgs);

3739

update_sd_power_savings_stats(group, sds, local_group, &sgs);

3740

group = group->next;

3740

group = group->next;

3741

} while (group != sd->groups);

3741

} while (group != sd->groups);

3742

3743

}

3743

}

3744

3745

/**

3745

/**

3746

* fix_small_imbalance - Calculate the minor imbalance that exists

3746

* fix_small_imbalance - Calculate the minor imbalance that exists

3747

* amongst the groups of a sched_domain, during

3747

* amongst the groups of a sched_domain, during

3748

* load balancing.

3748

* load balancing.

3749

* @sds: Statistics of the sched_domain whose imbalance is to be calculated.

3749

* @sds: Statistics of the sched_domain whose imbalance is to be calculated.

3750

* @this_cpu: The cpu at whose sched_domain we're performing load-balance.

3750

* @this_cpu: The cpu at whose sched_domain we're performing load-balance.

3751

* @imbalance: Variable to store the imbalance.

3751

* @imbalance: Variable to store the imbalance.

3752

*/

3752

*/

3753

static inline void fix_small_imbalance(struct sd_lb_stats *sds,

3753

static inline void fix_small_imbalance(struct sd_lb_stats *sds,

3754

int this_cpu, unsigned long *imbalance)

3754

int this_cpu, unsigned long *imbalance)

3755

{

3755

{

3756

unsigned long tmp, pwr_now = 0, pwr_move = 0;

3756

unsigned long tmp, pwr_now = 0, pwr_move = 0;

3757

unsigned int imbn = 2;

3757

unsigned int imbn = 2;

3758

3759

if (sds->this_nr_running) {

3759

if (sds->this_nr_running) {

3760

sds->this_load_per_task /= sds->this_nr_running;

3760

sds->this_load_per_task /= sds->this_nr_running;

3761

if (sds->busiest_load_per_task >

3761

if (sds->busiest_load_per_task >

3762

sds->this_load_per_task)

3762

sds->this_load_per_task)

3763

imbn = 1;

3763

imbn = 1;

3764

} else

3764

} else

3765

sds->this_load_per_task =

3765

sds->this_load_per_task =

3766

cpu_avg_load_per_task(this_cpu);

3766

cpu_avg_load_per_task(this_cpu);

3767

3768

if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=

3768

if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=

3769

sds->busiest_load_per_task * imbn) {

3769

sds->busiest_load_per_task * imbn) {

3770

*imbalance = sds->busiest_load_per_task;

3770

*imbalance = sds->busiest_load_per_task;

3771

return;

3771

return;

3772

}

3772

}

3773

3774

/*

3774

/*

3775

* OK, we don't have enough imbalance to justify moving tasks,

3775

* OK, we don't have enough imbalance to justify moving tasks,

3776

* however we may be able to increase total CPU power used by

3776

* however we may be able to increase total CPU power used by

3777

* moving them.

3777

* moving them.

3778

*/

3778

*/

3779

3780

pwr_now += sds->busiest->__cpu_power *

3780

pwr_now += sds->busiest->__cpu_power *

3781

min(sds->busiest_load_per_task, sds->max_load);

3781

min(sds->busiest_load_per_task, sds->max_load);

3782

pwr_now += sds->this->__cpu_power *

3782

pwr_now += sds->this->__cpu_power *

3783

min(sds->this_load_per_task, sds->this_load);

3783

min(sds->this_load_per_task, sds->this_load);

3784

pwr_now /= SCHED_LOAD_SCALE;

3784

pwr_now /= SCHED_LOAD_SCALE;

3785

3786

/* Amount of load we'd subtract */

3786

/* Amount of load we'd subtract */

3787

tmp = sg_div_cpu_power(sds->busiest,

3787

tmp = sg_div_cpu_power(sds->busiest,

3788

sds->busiest_load_per_task * SCHED_LOAD_SCALE);

3788

sds->busiest_load_per_task * SCHED_LOAD_SCALE);

3789

if (sds->max_load > tmp)

3789

if (sds->max_load > tmp)

3790

pwr_move += sds->busiest->__cpu_power *

3790

pwr_move += sds->busiest->__cpu_power *

3791

min(sds->busiest_load_per_task, sds->max_load - tmp);

3791

min(sds->busiest_load_per_task, sds->max_load - tmp);

3792

3793

/* Amount of load we'd add */

3793

/* Amount of load we'd add */

3794

if (sds->max_load * sds->busiest->__cpu_power <

3794

if (sds->max_load * sds->busiest->__cpu_power <

3795

sds->busiest_load_per_task * SCHED_LOAD_SCALE)

3795

sds->busiest_load_per_task * SCHED_LOAD_SCALE)

3796

tmp = sg_div_cpu_power(sds->this,

3796

tmp = sg_div_cpu_power(sds->this,

3797

sds->max_load * sds->busiest->__cpu_power);

3797

sds->max_load * sds->busiest->__cpu_power);

3798

else

3798

else

3799

tmp = sg_div_cpu_power(sds->this,

3799

tmp = sg_div_cpu_power(sds->this,

3800

sds->busiest_load_per_task * SCHED_LOAD_SCALE);

3800

sds->busiest_load_per_task * SCHED_LOAD_SCALE);

3801

pwr_move += sds->this->__cpu_power *

3801

pwr_move += sds->this->__cpu_power *

3802

min(sds->this_load_per_task, sds->this_load + tmp);

3802

min(sds->this_load_per_task, sds->this_load + tmp);

3803

pwr_move /= SCHED_LOAD_SCALE;

3803

pwr_move /= SCHED_LOAD_SCALE;

3804

3805

/* Move if we gain throughput */

3805

/* Move if we gain throughput */

3806

if (pwr_move > pwr_now)

3806

if (pwr_move > pwr_now)

3807

*imbalance = sds->busiest_load_per_task;

3807

*imbalance = sds->busiest_load_per_task;

3808

}

3808

}

3809

3810

/**

3810

/**

3811

* calculate_imbalance - Calculate the amount of imbalance present within the

3811

* calculate_imbalance - Calculate the amount of imbalance present within the

3812

* groups of a given sched_domain during load balance.

3812

* groups of a given sched_domain during load balance.

3813

* @sds: statistics of the sched_domain whose imbalance is to be calculated.

3813

* @sds: statistics of the sched_domain whose imbalance is to be calculated.

3814

* @this_cpu: Cpu for which currently load balance is being performed.

3814

* @this_cpu: Cpu for which currently load balance is being performed.

3815

* @imbalance: The variable to store the imbalance.

3815

* @imbalance: The variable to store the imbalance.

3816

*/

3816

*/

3817

static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,

3817

static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,

3818

unsigned long *imbalance)

3818

unsigned long *imbalance)

3819

{

3819

{

3820

unsigned long max_pull;

3820

unsigned long max_pull;

3821

/*

3821

/*

3822

* In the presence of smp nice balancing, certain scenarios can have

3822

* In the presence of smp nice balancing, certain scenarios can have

3823

* max load less than avg load(as we skip the groups at or below

3823

* max load less than avg load(as we skip the groups at or below

3824

* its cpu_power, while calculating max_load..)

3824

* its cpu_power, while calculating max_load..)

3825

*/

3825

*/

3826

if (sds->max_load < sds->avg_load) {

3826

if (sds->max_load < sds->avg_load) {

3827

*imbalance = 0;

3827

*imbalance = 0;

3828

return fix_small_imbalance(sds, this_cpu, imbalance);

3828

return fix_small_imbalance(sds, this_cpu, imbalance);

3829

}

3829

}

3830

3831

/* Don't want to pull so many tasks that a group would go idle */

3831

/* Don't want to pull so many tasks that a group would go idle */

3832

max_pull = min(sds->max_load - sds->avg_load,

3832

max_pull = min(sds->max_load - sds->avg_load,

3833

sds->max_load - sds->busiest_load_per_task);

3833

sds->max_load - sds->busiest_load_per_task);

3834

3835

/* How much load to actually move to equalise the imbalance */

3835

/* How much load to actually move to equalise the imbalance */

3836

*imbalance = min(max_pull * sds->busiest->__cpu_power,

3836

*imbalance = min(max_pull * sds->busiest->__cpu_power,

3837

(sds->avg_load - sds->this_load) * sds->this->__cpu_power)

3837

(sds->avg_load - sds->this_load) * sds->this->__cpu_power)

3838

/ SCHED_LOAD_SCALE;

3838

/ SCHED_LOAD_SCALE;

3839

3840

/*

3840

/*

3841

* if *imbalance is less than the average load per runnable task

3841

* if *imbalance is less than the average load per runnable task

3842

* there is no gaurantee that any tasks will be moved so we'll have

3842

* there is no gaurantee that any tasks will be moved so we'll have

3843

* a think about bumping its value to force at least one task to be

3843

* a think about bumping its value to force at least one task to be

3844

* moved

3844

* moved

3845

*/

3845

*/

3846

if (*imbalance < sds->busiest_load_per_task)

3846

if (*imbalance < sds->busiest_load_per_task)

3847

return fix_small_imbalance(sds, this_cpu, imbalance);

3847

return fix_small_imbalance(sds, this_cpu, imbalance);

3848

3849

}

3849

}

/******* find_busiest_group() helpers end here *********************/


3852

/**

3852

/**

3853

* find_busiest_group - Returns the busiest group within the sched_domain

3853

* find_busiest_group - Returns the busiest group within the sched_domain

3854

* if there is an imbalance. If there isn't an imbalance, and

3854

* if there is an imbalance. If there isn't an imbalance, and

3855

* the user has opted for power-savings, it returns a group whose

3855

* the user has opted for power-savings, it returns a group whose

3856

* CPUs can be put to idle by rebalancing those tasks elsewhere, if

3856

* CPUs can be put to idle by rebalancing those tasks elsewhere, if

3857

* such a group exists.

3857

* such a group exists.

3858

*

3858

*

3859

* Also calculates the amount of weighted load which should be moved

3859

* Also calculates the amount of weighted load which should be moved

3860

* to restore balance.

3860

* to restore balance.

3861

*

3861

*

3862

* @sd: The sched_domain whose busiest group is to be returned.

3862

* @sd: The sched_domain whose busiest group is to be returned.

3863

* @this_cpu: The cpu for which load balancing is currently being performed.

3863

* @this_cpu: The cpu for which load balancing is currently being performed.

3864

* @imbalance: Variable which stores amount of weighted load which should

3864

* @imbalance: Variable which stores amount of weighted load which should

3865

* be moved to restore balance/put a group to idle.

3865

* be moved to restore balance/put a group to idle.

3866

* @idle: The idle status of this_cpu.

3866

* @idle: The idle status of this_cpu.

3867

* @sd_idle: The idleness of sd

3867

* @sd_idle: The idleness of sd

3868

* @cpus: The set of CPUs under consideration for load-balancing.

3868

* @cpus: The set of CPUs under consideration for load-balancing.

3869

* @balance: Pointer to a variable indicating if this_cpu

3869

* @balance: Pointer to a variable indicating if this_cpu

3870

* is the appropriate cpu to perform load balancing at this_level.

3870

* is the appropriate cpu to perform load balancing at this_level.

3871

*

3871

*

3872

* Returns: - the busiest group if imbalance exists.

3872

* Returns: - the busiest group if imbalance exists.

3873

* - If no imbalance and user has opted for power-savings balance,

3873

* - If no imbalance and user has opted for power-savings balance,

3874

* return the least loaded group whose CPUs can be

3874

* return the least loaded group whose CPUs can be

3875

* put to idle by rebalancing its tasks onto our group.

3875

* put to idle by rebalancing its tasks onto our group.

3876

*/

3876

*/

3877

static struct sched_group *

3877

static struct sched_group *

3878

find_busiest_group(struct sched_domain *sd, int this_cpu,

3878

find_busiest_group(struct sched_domain *sd, int this_cpu,

3879

unsigned long *imbalance, enum cpu_idle_type idle,

3879

unsigned long *imbalance, enum cpu_idle_type idle,

3880

int *sd_idle, const struct cpumask *cpus, int *balance)

3880

int *sd_idle, const struct cpumask *cpus, int *balance)

3881

{

3881

{

3882

struct sd_lb_stats sds;

3882

struct sd_lb_stats sds;

3883

3884

memset(&sds, 0, sizeof(sds));

3884

memset(&sds, 0, sizeof(sds));

3885

3886

/*

3886

/*

3887

* Compute the various statistics relavent for load balancing at

3887

* Compute the various statistics relavent for load balancing at

3888

* this level.

3888

* this level.

3889

*/

3889

*/

3890

update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,

3890

update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,

3891

balance, &sds);

3891

balance, &sds);

3892

3893

/* Cases where imbalance does not exist from POV of this_cpu */

3893

/* Cases where imbalance does not exist from POV of this_cpu */

3894

/* 1) this_cpu is not the appropriate cpu to perform load balancing

3894

/* 1) this_cpu is not the appropriate cpu to perform load balancing

3895

* at this level.

3895

* at this level.

3896

* 2) There is no busy sibling group to pull from.

3896

* 2) There is no busy sibling group to pull from.

3897

* 3) This group is the busiest group.

3897

* 3) This group is the busiest group.

3898

* 4) This group is more busy than the avg busieness at this

3898

* 4) This group is more busy than the avg busieness at this

3899

* sched_domain.

3899

* sched_domain.

3900

* 5) The imbalance is within the specified limit.

3900

* 5) The imbalance is within the specified limit.

3901

* 6) Any rebalance would lead to ping-pong

3901

* 6) Any rebalance would lead to ping-pong

3902

*/

3902

*/

3903

if (balance && !(*balance))

3903

if (balance && !(*balance))

3904

goto ret;

3904

goto ret;

3905

3906

if (!sds.busiest || sds.busiest_nr_running == 0)

3906

if (!sds.busiest || sds.busiest_nr_running == 0)

3907

goto out_balanced;

3907

goto out_balanced;

3908

3909

if (sds.this_load >= sds.max_load)

3909

if (sds.this_load >= sds.max_load)

3910

goto out_balanced;

3910

goto out_balanced;

3911

3912

sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

3912

sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

3913

3914

if (sds.this_load >= sds.avg_load)

3914

if (sds.this_load >= sds.avg_load)

3915

goto out_balanced;

3915

goto out_balanced;

3916

3917

if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)

3917

if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)

3918

goto out_balanced;

3918

goto out_balanced;

3919

3920

sds.busiest_load_per_task /= sds.busiest_nr_running;

3920

sds.busiest_load_per_task /= sds.busiest_nr_running;

3921

if (sds.group_imb)

3921

if (sds.group_imb)

3922

sds.busiest_load_per_task =

3922

sds.busiest_load_per_task =

3923

min(sds.busiest_load_per_task, sds.avg_load);

3923

min(sds.busiest_load_per_task, sds.avg_load);

3924

3925

/*

3925

/*

3926

* We're trying to get all the cpus to the average_load, so we don't

3926

* We're trying to get all the cpus to the average_load, so we don't

3927

* want to push ourselves above the average load, nor do we wish to

3927

* want to push ourselves above the average load, nor do we wish to

3928

* reduce the max loaded cpu below the average load, as either of these

3928

* reduce the max loaded cpu below the average load, as either of these

3929

* actions would just result in more rebalancing later, and ping-pong

3929

* actions would just result in more rebalancing later, and ping-pong

3930

* tasks around. Thus we look for the minimum possible imbalance.

3930

* tasks around. Thus we look for the minimum possible imbalance.

3931

* Negative imbalances (*we* are more loaded than anyone else) will

3931

* Negative imbalances (*we* are more loaded than anyone else) will

3932

* be counted as no imbalance for these purposes -- we can't fix that

3932

* be counted as no imbalance for these purposes -- we can't fix that

3933

* by pulling tasks to us. Be careful of negative numbers as they'll

3933

* by pulling tasks to us. Be careful of negative numbers as they'll

3934

* appear as very large values with unsigned longs.

3934

* appear as very large values with unsigned longs.

3935

*/

3935

*/

3936

if (sds.max_load <= sds.busiest_load_per_task)

3936

if (sds.max_load <= sds.busiest_load_per_task)

3937

goto out_balanced;

3937

goto out_balanced;

3938

3939

/* Looks like there is an imbalance. Compute it */

3939

/* Looks like there is an imbalance. Compute it */

3940

calculate_imbalance(&sds, this_cpu, imbalance);

3940

calculate_imbalance(&sds, this_cpu, imbalance);

3941

return sds.busiest;

3941

return sds.busiest;

3942

3943

out_balanced:

3943

out_balanced:

3944

/*

3944

/*

3945

* There is no obvious imbalance. But check if we can do some balancing

3945

* There is no obvious imbalance. But check if we can do some balancing

3946

* to save power.

3946

* to save power.

3947

*/

3947

*/

3948

if (check_power_save_busiest_group(&sds, this_cpu, imbalance))

3948

if (check_power_save_busiest_group(&sds, this_cpu, imbalance))

3949

return sds.busiest;

3949

return sds.busiest;

3950

ret:

3950

ret:

3951

*imbalance = 0;

3951

*imbalance = 0;

3952

return NULL;

3952

return NULL;

3953

}

3953

}

3954

3955

/*

3955

/*

3956

* find_busiest_queue - find the busiest runqueue among the cpus in group.

3956

* find_busiest_queue - find the busiest runqueue among the cpus in group.

3957

*/

3957

*/

3958

static struct rq *

3958

static struct rq *

3959

find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,

3959

find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,

3960

unsigned long imbalance, const struct cpumask *cpus)

3960

unsigned long imbalance, const struct cpumask *cpus)

3961

{

3961

{

3962

struct rq *busiest = NULL, *rq;

3962

struct rq *busiest = NULL, *rq;

3963

unsigned long max_load = 0;

3963

unsigned long max_load = 0;

3964

int i;

3964

int i;

3965

3966

for_each_cpu(i, sched_group_cpus(group)) {

3966

for_each_cpu(i, sched_group_cpus(group)) {

3967

unsigned long wl;

3967

unsigned long wl;

3968

3969

if (!cpumask_test_cpu(i, cpus))

3969

if (!cpumask_test_cpu(i, cpus))

3970

continue;

3970

continue;

3971

3972

rq = cpu_rq(i);

3972

rq = cpu_rq(i);

3973

wl = weighted_cpuload(i);

3973

wl = weighted_cpuload(i);

3974

3975

if (rq->nr_running == 1 && wl > imbalance)

3975

if (rq->nr_running == 1 && wl > imbalance)

3976

continue;

3976

continue;

3977

3978

if (wl > max_load) {

3978

if (wl > max_load) {

3979

max_load = wl;

3979

max_load = wl;

3980

busiest = rq;

3980

busiest = rq;

3981

}

3981

}

3982

}

3982

}

3983

3984

return busiest;

3984

return busiest;

3985

}

3985

}

3986

/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);


3996

/*

3996

/*

3997

* Check this_cpu to ensure it is balanced within domain. Attempt to move

3997

* Check this_cpu to ensure it is balanced within domain. Attempt to move

3998

* tasks if there is an imbalance.

3998

* tasks if there is an imbalance.

3999

*/

3999

*/

4000

static int load_balance(int this_cpu, struct rq *this_rq,

4000

static int load_balance(int this_cpu, struct rq *this_rq,

4001

struct sched_domain *sd, enum cpu_idle_type idle,

4001

struct sched_domain *sd, enum cpu_idle_type idle,

4002

int *balance)

4002

int *balance)

4003

{

4003

{

4004

int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;

4004

int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;

4005

struct sched_group *group;

4005

struct sched_group *group;

4006

unsigned long imbalance;

4006

unsigned long imbalance;

4007

struct rq *busiest;

4007

struct rq *busiest;

4008

unsigned long flags;

4008

unsigned long flags;

4009

struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

4009

struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

4010

4011

cpumask_setall(cpus);

4011

cpumask_setall(cpus);

4012

4013

/*

4013

/*

4014

* When power savings policy is enabled for the parent domain, idle

4014

* When power savings policy is enabled for the parent domain, idle

4015

* sibling can pick up load irrespective of busy siblings. In this case,

4015

* sibling can pick up load irrespective of busy siblings. In this case,

4016

* let the state of idle sibling percolate up as CPU_IDLE, instead of

4016

* let the state of idle sibling percolate up as CPU_IDLE, instead of

4017

* portraying it as CPU_NOT_IDLE.

4017

* portraying it as CPU_NOT_IDLE.

4018

*/

4018

*/

4019

if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&

4019

if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&

4020

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4020

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4021

sd_idle = 1;

4021

sd_idle = 1;

4022

4023

schedstat_inc(sd, lb_count[idle]);

4023

schedstat_inc(sd, lb_count[idle]);

4024

4025

redo:

4025

redo:

4026

update_shares(sd);

4026

update_shares(sd);

4027

group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,

4027

group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,

4028

cpus, balance);

4028

cpus, balance);

4029

4030

if (*balance == 0)

4030

if (*balance == 0)

4031

goto out_balanced;

4031

goto out_balanced;

4032

4033

if (!group) {

4033

if (!group) {

4034

schedstat_inc(sd, lb_nobusyg[idle]);

4034

schedstat_inc(sd, lb_nobusyg[idle]);

4035

goto out_balanced;

4035

goto out_balanced;

4036

}

4036

}

4037

4038

busiest = find_busiest_queue(group, idle, imbalance, cpus);

4038

busiest = find_busiest_queue(group, idle, imbalance, cpus);

4039

if (!busiest) {

4039

if (!busiest) {

4040

schedstat_inc(sd, lb_nobusyq[idle]);

4040

schedstat_inc(sd, lb_nobusyq[idle]);

4041

goto out_balanced;

4041

goto out_balanced;

4042

}

4042

}

4043

4044

BUG_ON(busiest == this_rq);

4044

BUG_ON(busiest == this_rq);

4045

4046

schedstat_add(sd, lb_imbalance[idle], imbalance);

4046

schedstat_add(sd, lb_imbalance[idle], imbalance);

4047

4048

ld_moved = 0;

4048

ld_moved = 0;

4049

if (busiest->nr_running > 1) {

4049

if (busiest->nr_running > 1) {

4050

/*

4050

/*

4051

* Attempt to move tasks. If find_busiest_group has found

4051

* Attempt to move tasks. If find_busiest_group has found

4052

* an imbalance but busiest->nr_running <= 1, the group is

4052

* an imbalance but busiest->nr_running <= 1, the group is

4053

* still unbalanced. ld_moved simply stays zero, so it is

4053

* still unbalanced. ld_moved simply stays zero, so it is

4054

* correctly treated as an imbalance.

4054

* correctly treated as an imbalance.

4055

*/

4055

*/

4056

local_irq_save(flags);

4056

local_irq_save(flags);

4057

double_rq_lock(this_rq, busiest);

4057

double_rq_lock(this_rq, busiest);

4058

ld_moved = move_tasks(this_rq, this_cpu, busiest,

4058

ld_moved = move_tasks(this_rq, this_cpu, busiest,

4059

imbalance, sd, idle, &all_pinned);

4059

imbalance, sd, idle, &all_pinned);

4060

double_rq_unlock(this_rq, busiest);

4060

double_rq_unlock(this_rq, busiest);

4061

local_irq_restore(flags);

4061

local_irq_restore(flags);

4062

4063

/*

4063

/*

4064

* some other cpu did the load balance for us.

4064

* some other cpu did the load balance for us.

4065

*/

4065

*/

4066

if (ld_moved && this_cpu != smp_processor_id())

4066

if (ld_moved && this_cpu != smp_processor_id())

4067

resched_cpu(this_cpu);

4067

resched_cpu(this_cpu);

4068

4069

/* All tasks on this runqueue were pinned by CPU affinity */

4069

/* All tasks on this runqueue were pinned by CPU affinity */

4070

if (unlikely(all_pinned)) {

4070

if (unlikely(all_pinned)) {

4071

cpumask_clear_cpu(cpu_of(busiest), cpus);

4071

cpumask_clear_cpu(cpu_of(busiest), cpus);

4072

if (!cpumask_empty(cpus))

4072

if (!cpumask_empty(cpus))

4073

goto redo;

4073

goto redo;

4074

goto out_balanced;

4074

goto out_balanced;

4075

}

4075

}

4076

}

4076

}

4077

4078

if (!ld_moved) {

4078

if (!ld_moved) {

4079

schedstat_inc(sd, lb_failed[idle]);

4079

schedstat_inc(sd, lb_failed[idle]);

4080

sd->nr_balance_failed++;

4080

sd->nr_balance_failed++;

4081

4082

if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

4082

if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

4083

4084

spin_lock_irqsave(&busiest->lock, flags);

4084

spin_lock_irqsave(&busiest->lock, flags);

4085

4086

/* don't kick the migration_thread, if the curr

4086

/* don't kick the migration_thread, if the curr

4087

* task on busiest cpu can't be moved to this_cpu

4087

* task on busiest cpu can't be moved to this_cpu

4088

*/

4088

*/

4089

if (!cpumask_test_cpu(this_cpu,

4089

if (!cpumask_test_cpu(this_cpu,

4090

&busiest->curr->cpus_allowed)) {

4090

&busiest->curr->cpus_allowed)) {

4091

spin_unlock_irqrestore(&busiest->lock, flags);

4091

spin_unlock_irqrestore(&busiest->lock, flags);

4092

all_pinned = 1;

4092

all_pinned = 1;

4093

goto out_one_pinned;

4093

goto out_one_pinned;

4094

}

4094

}

4095

4096

if (!busiest->active_balance) {

4096

if (!busiest->active_balance) {

4097

busiest->active_balance = 1;

4097

busiest->active_balance = 1;

4098

busiest->push_cpu = this_cpu;

4098

busiest->push_cpu = this_cpu;

4099

active_balance = 1;

4099

active_balance = 1;

4100

}

4100

}

4101

spin_unlock_irqrestore(&busiest->lock, flags);

4101

spin_unlock_irqrestore(&busiest->lock, flags);

4102

if (active_balance)

4102

if (active_balance)

4103

wake_up_process(busiest->migration_thread);

4103

wake_up_process(busiest->migration_thread);

4104

4105

/*

4105

/*

4106

* We've kicked active balancing, reset the failure

4106

* We've kicked active balancing, reset the failure

4107

* counter.

4107

* counter.

4108

*/

4108

*/

4109

sd->nr_balance_failed = sd->cache_nice_tries+1;

4109

sd->nr_balance_failed = sd->cache_nice_tries+1;

4110

}

4110

}

4111

} else

4111

} else

4112

sd->nr_balance_failed = 0;

4112

sd->nr_balance_failed = 0;

4113

4114

if (likely(!active_balance)) {

4114

if (likely(!active_balance)) {

4115

/* We were unbalanced, so reset the balancing interval */

4115

/* We were unbalanced, so reset the balancing interval */

4116

sd->balance_interval = sd->min_interval;

4116

sd->balance_interval = sd->min_interval;

4117

} else {

4117

} else {

4118

/*

4118

/*

4119

* If we've begun active balancing, start to back off. This

4119

* If we've begun active balancing, start to back off. This

4120

* case may not be covered by the all_pinned logic if there

4120

* case may not be covered by the all_pinned logic if there

4121

* is only 1 task on the busy runqueue (because we don't call

4121

* is only 1 task on the busy runqueue (because we don't call

4122

* move_tasks).

4122

* move_tasks).

4123

*/

4123

*/

4124

if (sd->balance_interval < sd->max_interval)

4124

if (sd->balance_interval < sd->max_interval)

4125

sd->balance_interval *= 2;

4125

sd->balance_interval *= 2;

4126

}

4126

}

4127

4128

if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4128

if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4129

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4129

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4130

ld_moved = -1;

4130

ld_moved = -1;

4131

4132

goto out;

4132

goto out;

4133

4134

out_balanced:

4134

out_balanced:

4135

schedstat_inc(sd, lb_balanced[idle]);

4135

schedstat_inc(sd, lb_balanced[idle]);

4136

4137

sd->nr_balance_failed = 0;

4137

sd->nr_balance_failed = 0;

4138

4139

out_one_pinned:

4139

out_one_pinned:

4140

/* tune up the balancing interval */

4140

/* tune up the balancing interval */

4141

if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||

4141

if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||

4142

(sd->balance_interval < sd->max_interval))

4142

(sd->balance_interval < sd->max_interval))

4143

sd->balance_interval *= 2;

4143

sd->balance_interval *= 2;

4144

4145

if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4145

if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4146

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4146

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4147

ld_moved = -1;

4147

ld_moved = -1;

4148

else

4148

else

4149

ld_moved = 0;

4149

ld_moved = 0;

4150

out:

4150

out:

4151

if (ld_moved)

4151

if (ld_moved)

4152

update_shares(sd);

4152

update_shares(sd);

4153

return ld_moved;

4153

return ld_moved;

4154

}

4154

}

4155

4156

/*

4156

/*

4157

* Check this_cpu to ensure it is balanced within domain. Attempt to move

4157

* Check this_cpu to ensure it is balanced within domain. Attempt to move

4158

* tasks if there is an imbalance.

4158

* tasks if there is an imbalance.

4159

*

4159

*

4160

* Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).

4160

* Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).

4161

* this_rq is locked.

4161

* this_rq is locked.

4162

*/

4162

*/

4163

static int

4163

static int

4164

load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)

4164

load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)

4165

{

4165

{

4166

struct sched_group *group;

4166

struct sched_group *group;

4167

struct rq *busiest = NULL;

4167

struct rq *busiest = NULL;

4168

unsigned long imbalance;

4168

unsigned long imbalance;

4169

int ld_moved = 0;

4169

int ld_moved = 0;

4170

int sd_idle = 0;

4170

int sd_idle = 0;

4171

int all_pinned = 0;

4171

int all_pinned = 0;

4172

struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

4172

struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

4173

4174

cpumask_setall(cpus);

4174

cpumask_setall(cpus);

4175

4176

/*

4176

/*

4177

* When power savings policy is enabled for the parent domain, idle

4177

* When power savings policy is enabled for the parent domain, idle

4178

* sibling can pick up load irrespective of busy siblings. In this case,

4178

* sibling can pick up load irrespective of busy siblings. In this case,

4179

* let the state of idle sibling percolate up as IDLE, instead of

4179

* let the state of idle sibling percolate up as IDLE, instead of

4180

* portraying it as CPU_NOT_IDLE.

4180

* portraying it as CPU_NOT_IDLE.

4181

*/

4181

*/

4182

if (sd->flags & SD_SHARE_CPUPOWER &&

4182

if (sd->flags & SD_SHARE_CPUPOWER &&

4183

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4183

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4184

sd_idle = 1;

4184

sd_idle = 1;

4185

4186

schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);

4186

schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);

4187

redo:

4187

redo:

4188

update_shares_locked(this_rq, sd);

4188

update_shares_locked(this_rq, sd);

4189

group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,

4189

group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,

4190

&sd_idle, cpus, NULL);

4190

&sd_idle, cpus, NULL);

4191

if (!group) {

4191

if (!group) {

4192

schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);

4192

schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);

4193

goto out_balanced;

4193

goto out_balanced;

4194

}

4194

}

4195

4196

busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);

4196

busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);

4197

if (!busiest) {

4197

if (!busiest) {

4198

schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);

4198

schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);

4199

goto out_balanced;

4199

goto out_balanced;

4200

}

4200

}

4201

4202

BUG_ON(busiest == this_rq);

4202

BUG_ON(busiest == this_rq);

4203

4204

schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);

4204

schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);

4205

4206

ld_moved = 0;

4206

ld_moved = 0;

4207

if (busiest->nr_running > 1) {

4207

if (busiest->nr_running > 1) {

4208

/* Attempt to move tasks */

4208

/* Attempt to move tasks */

4209

double_lock_balance(this_rq, busiest);

4209

double_lock_balance(this_rq, busiest);

4210

/* this_rq->clock is already updated */

4210

/* this_rq->clock is already updated */

4211

update_rq_clock(busiest);

4211

update_rq_clock(busiest);

4212

ld_moved = move_tasks(this_rq, this_cpu, busiest,

4212

ld_moved = move_tasks(this_rq, this_cpu, busiest,

4213

imbalance, sd, CPU_NEWLY_IDLE,

4213

imbalance, sd, CPU_NEWLY_IDLE,

4214

&all_pinned);

4214

&all_pinned);

4215

double_unlock_balance(this_rq, busiest);

4215

double_unlock_balance(this_rq, busiest);

4216

4217

if (unlikely(all_pinned)) {

4217

if (unlikely(all_pinned)) {

4218

cpumask_clear_cpu(cpu_of(busiest), cpus);

4218

cpumask_clear_cpu(cpu_of(busiest), cpus);

4219

if (!cpumask_empty(cpus))

4219

if (!cpumask_empty(cpus))

4220

goto redo;

4220

goto redo;

4221

}

4221

}

4222

}

4222

}

4223

4224

if (!ld_moved) {

4224

if (!ld_moved) {

4225

int active_balance = 0;

4225

int active_balance = 0;

4226

4227

schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);

4227

schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);

4228

if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4228

if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4229

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4229

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4230

return -1;

4230

return -1;

4231

4232

if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)

4232

if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)

4233

return -1;

4233

return -1;

4234

4235

if (sd->nr_balance_failed++ < 2)

4235

if (sd->nr_balance_failed++ < 2)

4236

return -1;

4236

return -1;

4237

4238

/*

4238

/*

4239

* The only task running in a non-idle cpu can be moved to this

4239

* The only task running in a non-idle cpu can be moved to this

4240

* cpu in an attempt to completely freeup the other CPU

4240

* cpu in an attempt to completely freeup the other CPU

4241

* package. The same method used to move task in load_balance()

4241

* package. The same method used to move task in load_balance()

4242

* have been extended for load_balance_newidle() to speedup

4242

* have been extended for load_balance_newidle() to speedup

4243

* consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)

4243

* consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)

4244

*

4244

*

4245

* The package power saving logic comes from

4245

* The package power saving logic comes from

4246

* find_busiest_group(). If there are no imbalance, then

4246

* find_busiest_group(). If there are no imbalance, then

4247

* f_b_g() will return NULL. However when sched_mc={1,2} then

4247

* f_b_g() will return NULL. However when sched_mc={1,2} then

4248

* f_b_g() will select a group from which a running task may be

4248

* f_b_g() will select a group from which a running task may be

4249

* pulled to this cpu in order to make the other package idle.

4249

* pulled to this cpu in order to make the other package idle.

4250

* If there is no opportunity to make a package idle and if

4250

* If there is no opportunity to make a package idle and if

4251

* there are no imbalance, then f_b_g() will return NULL and no

4251

* there are no imbalance, then f_b_g() will return NULL and no

4252

* action will be taken in load_balance_newidle().

4252

* action will be taken in load_balance_newidle().

4253

*

4253

*

4254

* Under normal task pull operation due to imbalance, there

4254

* Under normal task pull operation due to imbalance, there

4255

* will be more than one task in the source run queue and

4255

* will be more than one task in the source run queue and

4256

* move_tasks() will succeed. ld_moved will be true and this

4256

* move_tasks() will succeed. ld_moved will be true and this

4257

* active balance code will not be triggered.

4257

* active balance code will not be triggered.

4258

*/

4258

*/

4259

4260

/* Lock busiest in correct order while this_rq is held */

4260

/* Lock busiest in correct order while this_rq is held */

4261

double_lock_balance(this_rq, busiest);

4261

double_lock_balance(this_rq, busiest);

4262

4263

/*

4263

/*

4264

* don't kick the migration_thread, if the curr

4264

* don't kick the migration_thread, if the curr

4265

* task on busiest cpu can't be moved to this_cpu

4265

* task on busiest cpu can't be moved to this_cpu

4266

*/

4266

*/

4267

if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {

4267

if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {

4268

double_unlock_balance(this_rq, busiest);

4268

double_unlock_balance(this_rq, busiest);

4269

all_pinned = 1;

4269

all_pinned = 1;

4270

return ld_moved;

4270

return ld_moved;

4271

}

4271

}

4272

4273

if (!busiest->active_balance) {

4273

if (!busiest->active_balance) {

4274

busiest->active_balance = 1;

4274

busiest->active_balance = 1;

4275

busiest->push_cpu = this_cpu;

4275

busiest->push_cpu = this_cpu;

4276

active_balance = 1;

4276

active_balance = 1;

4277

}

4277

}

4278

4279

double_unlock_balance(this_rq, busiest);

4279

double_unlock_balance(this_rq, busiest);

4280

/*

4280

/*

4281

* Should not call ttwu while holding a rq->lock

4281

* Should not call ttwu while holding a rq->lock

4282

*/

4282

*/

4283

spin_unlock(&this_rq->lock);

4283

spin_unlock(&this_rq->lock);

4284

if (active_balance)

4284

if (active_balance)

4285

wake_up_process(busiest->migration_thread);

4285

wake_up_process(busiest->migration_thread);

4286

spin_lock(&this_rq->lock);

4286

spin_lock(&this_rq->lock);

4287

4288

} else

4288

} else

4289

sd->nr_balance_failed = 0;

4289

sd->nr_balance_failed = 0;

4290

4291

update_shares_locked(this_rq, sd);

4291

update_shares_locked(this_rq, sd);

4292

return ld_moved;

4292

return ld_moved;

4293

4294

out_balanced:

4294

out_balanced:

4295

schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);

4295

schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);

4296

if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4296

if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&

4297

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4297

!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))

4298

return -1;

4298

return -1;

4299

sd->nr_balance_failed = 0;

4299

sd->nr_balance_failed = 0;

4300

4301

return 0;

4301

return 0;

4302

}

4302

}

4303

4304

/*

4304

/*

4305

* idle_balance is called by schedule() if this_cpu is about to become

4305

* idle_balance is called by schedule() if this_cpu is about to become

4306

* idle. Attempts to pull tasks from other CPUs.

4306

* idle. Attempts to pull tasks from other CPUs.

4307

*/

4307

*/

4308

static void idle_balance(int this_cpu, struct rq *this_rq)

4308

static void idle_balance(int this_cpu, struct rq *this_rq)

4309

{

4309

{

4310

struct sched_domain *sd;

4310

struct sched_domain *sd;

4311

int pulled_task = 0;

4311

int pulled_task = 0;

4312

unsigned long next_balance = jiffies + HZ;

4312

unsigned long next_balance = jiffies + HZ;

4313

4314

for_each_domain(this_cpu, sd) {

4314

for_each_domain(this_cpu, sd) {

4315

unsigned long interval;

4315

unsigned long interval;

4316

4317

if (!(sd->flags & SD_LOAD_BALANCE))

4317

if (!(sd->flags & SD_LOAD_BALANCE))

4318

continue;

4318

continue;

4319

4320

if (sd->flags & SD_BALANCE_NEWIDLE)

4320

if (sd->flags & SD_BALANCE_NEWIDLE)

4321

/* If we've pulled tasks over stop searching: */

4321

/* If we've pulled tasks over stop searching: */

4322

pulled_task = load_balance_newidle(this_cpu, this_rq,

4322

pulled_task = load_balance_newidle(this_cpu, this_rq,

4323

sd);

4323

sd);

4324

4325

interval = msecs_to_jiffies(sd->balance_interval);

4325

interval = msecs_to_jiffies(sd->balance_interval);

4326

if (time_after(next_balance, sd->last_balance + interval))

4326

if (time_after(next_balance, sd->last_balance + interval))

4327

next_balance = sd->last_balance + interval;

4327

next_balance = sd->last_balance + interval;

4328

if (pulled_task)

4328

if (pulled_task)

4329

break;

4329

break;

4330

}

4330

}

4331

if (pulled_task || time_after(jiffies, this_rq->next_balance)) {

4331

if (pulled_task || time_after(jiffies, this_rq->next_balance)) {

4332

/*

4332

/*

4333

* We are going idle. next_balance may be set based on

4333

* We are going idle. next_balance may be set based on

4334

* a busy processor. So reset next_balance.

4334

* a busy processor. So reset next_balance.

4335

*/

4335

*/

4336

this_rq->next_balance = next_balance;

4336

this_rq->next_balance = next_balance;

4337

}

4337

}

4338

}

4338

}

4339

4340

/*

4340

/*

4341

* active_load_balance is run by migration threads. It pushes running tasks

4341

* active_load_balance is run by migration threads. It pushes running tasks

4342

* off the busiest CPU onto idle CPUs. It requires at least 1 task to be

4342

* off the busiest CPU onto idle CPUs. It requires at least 1 task to be

4343

* running on each physical CPU where possible, and avoids physical /

4343

* running on each physical CPU where possible, and avoids physical /

4344

* logical imbalances.

4344

* logical imbalances.

4345

*

4345

*

4346

* Called with busiest_rq locked.

4346

* Called with busiest_rq locked.

4347

*/

4347

*/

4348

static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)

4348

static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)

4349

{

4349

{

4350

int target_cpu = busiest_rq->push_cpu;

4350

int target_cpu = busiest_rq->push_cpu;

4351

struct sched_domain *sd;

4351

struct sched_domain *sd;

4352

struct rq *target_rq;

4352

struct rq *target_rq;

4353

4354

/* Is there any task to move? */

4354

/* Is there any task to move? */

4355

if (busiest_rq->nr_running <= 1)

4355

if (busiest_rq->nr_running <= 1)

4356

return;

4356

return;

4357

4358

target_rq = cpu_rq(target_cpu);

4358

target_rq = cpu_rq(target_cpu);

4359

4360

/*

4360

/*

4361

* This condition is "impossible", if it occurs

4361

* This condition is "impossible", if it occurs

4362

* we need to fix it. Originally reported by

4362

* we need to fix it. Originally reported by

4363

* Bjorn Helgaas on a 128-cpu setup.

4363

* Bjorn Helgaas on a 128-cpu setup.

4364

*/

4364

*/

4365

BUG_ON(busiest_rq == target_rq);

4365

BUG_ON(busiest_rq == target_rq);

4366

4367

/* move a task from busiest_rq to target_rq */

4367

/* move a task from busiest_rq to target_rq */

4368

double_lock_balance(busiest_rq, target_rq);

4368

double_lock_balance(busiest_rq, target_rq);

4369

update_rq_clock(busiest_rq);

4369

update_rq_clock(busiest_rq);

4370

update_rq_clock(target_rq);

4370

update_rq_clock(target_rq);

4371

4372

/* Search for an sd spanning us and the target CPU. */

4372

/* Search for an sd spanning us and the target CPU. */

4373

for_each_domain(target_cpu, sd) {

4373

for_each_domain(target_cpu, sd) {

4374

if ((sd->flags & SD_LOAD_BALANCE) &&

4374

if ((sd->flags & SD_LOAD_BALANCE) &&

4375

cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))

4375

cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))

4376

break;

4376

break;

4377

}

4377

}

4378

4379

if (likely(sd)) {

4379

if (likely(sd)) {

4380

schedstat_inc(sd, alb_count);

4380

schedstat_inc(sd, alb_count);

4381

4382

if (move_one_task(target_rq, target_cpu, busiest_rq,

4382

if (move_one_task(target_rq, target_cpu, busiest_rq,

4383

sd, CPU_IDLE))

4383

sd, CPU_IDLE))

4384

schedstat_inc(sd, alb_pushed);

4384

schedstat_inc(sd, alb_pushed);

4385

else

4385

else

4386

schedstat_inc(sd, alb_failed);

4386

schedstat_inc(sd, alb_failed);

4387

}

4387

}

4388

double_unlock_balance(busiest_rq, target_rq);

4388

double_unlock_balance(busiest_rq, target_rq);

4389

}

4389

}

4390

4391

#ifdef CONFIG_NO_HZ

4391

#ifdef CONFIG_NO_HZ

4392

static struct {

4392

static struct {

4393

atomic_t load_balancer;

4393

atomic_t load_balancer;

4394

cpumask_var_t cpu_mask;

4394

cpumask_var_t cpu_mask;

4395

cpumask_var_t ilb_grp_nohz_mask;

4395

cpumask_var_t ilb_grp_nohz_mask;

4396

} nohz ____cacheline_aligned = {

4396

} nohz ____cacheline_aligned = {

4397

.load_balancer = ATOMIC_INIT(-1),

4397

.load_balancer = ATOMIC_INIT(-1),

4398

};

4398

};

4399

4400

int get_nohz_load_balancer(void)

4400

int get_nohz_load_balancer(void)

4401

{

4401

{

4402

return atomic_read(&nohz.load_balancer);

4402

return atomic_read(&nohz.load_balancer);

4403

}

4403

}

4404

4405

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

4405

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

4406

/**

4406

/**

4407

* lowest_flag_domain - Return lowest sched_domain containing flag.

4407

* lowest_flag_domain - Return lowest sched_domain containing flag.

4408

* @cpu: The cpu whose lowest level of sched domain is to

4408

* @cpu: The cpu whose lowest level of sched domain is to

4409

* be returned.

4409

* be returned.

4410

* @flag: The flag to check for the lowest sched_domain

4410

* @flag: The flag to check for the lowest sched_domain

4411

* for the given cpu.

4411

* for the given cpu.

4412

*

4412

*

4413

* Returns the lowest sched_domain of a cpu which contains the given flag.

4413

* Returns the lowest sched_domain of a cpu which contains the given flag.

4414

*/

4414

*/

4415

static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)

4415

static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)

4416

{

4416

{

4417

struct sched_domain *sd;

4417

struct sched_domain *sd;

4418

4419

for_each_domain(cpu, sd)

4419

for_each_domain(cpu, sd)

4420

if (sd && (sd->flags & flag))

4420

if (sd && (sd->flags & flag))

4421

break;

4421

break;

4422

4423

return sd;

4423

return sd;

4424

}

4424

}

4425

4426

/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest. Iteration stops
 * at the first domain (walking ->parent) that does not have 'flag' set.
 *
 * Every macro argument is parenthesized in the expansion so that compound
 * expressions (e.g. a flag built as "A | B") keep their intended meaning;
 * without that, "sd->flags & A | B" would be true whenever B is non-zero.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain((cpu), (flag)); \
	     ((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)

4439

4440

/**

4440

/**

4441

* is_semi_idle_group - Checks if the given sched_group is semi-idle.

4441

* is_semi_idle_group - Checks if the given sched_group is semi-idle.

4442

* @ilb_group: group to be checked for semi-idleness

4442

* @ilb_group: group to be checked for semi-idleness

4443

*

4443

*

4444

* Returns: 1 if the group is semi-idle. 0 otherwise.

4444

* Returns: 1 if the group is semi-idle. 0 otherwise.

4445

*

4445

*

4446

* We define a sched_group to be semi idle if it has atleast one idle-CPU

4446

* We define a sched_group to be semi idle if it has atleast one idle-CPU

4447

* and atleast one non-idle CPU. This helper function checks if the given

4447

* and atleast one non-idle CPU. This helper function checks if the given

4448

* sched_group is semi-idle or not.

4448

* sched_group is semi-idle or not.

4449

*/

4449

*/

4450

static inline int is_semi_idle_group(struct sched_group *ilb_group)

4450

static inline int is_semi_idle_group(struct sched_group *ilb_group)

4451

{

4451

{

4452

cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,

4452

cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,

4453

sched_group_cpus(ilb_group));

4453

sched_group_cpus(ilb_group));

4454

4455

/*

4455

/*

4456

* A sched_group is semi-idle when it has atleast one busy cpu

4456

* A sched_group is semi-idle when it has atleast one busy cpu

4457

* and atleast one idle cpu.

4457

* and atleast one idle cpu.

4458

*/

4458

*/

4459

if (cpumask_empty(nohz.ilb_grp_nohz_mask))

4459

if (cpumask_empty(nohz.ilb_grp_nohz_mask))

4460

return 0;

4460

return 0;

4461

4462

if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))

4462

if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))

4463

return 0;

4463

return 0;

4464

4465

return 1;

4465

return 1;

4466

}

4466

}

4467

/**

4467

/**

4468

* find_new_ilb - Finds the optimum idle load balancer for nomination.

4468

* find_new_ilb - Finds the optimum idle load balancer for nomination.

4469

* @cpu: The cpu which is nominating a new idle_load_balancer.

4469

* @cpu: The cpu which is nominating a new idle_load_balancer.

4470

*

4470

*

4471

* Returns: Returns the id of the idle load balancer if it exists,

4471

* Returns: Returns the id of the idle load balancer if it exists,

4472

* Else, returns >= nr_cpu_ids.

4472

* Else, returns >= nr_cpu_ids.

4473

*

4473

*

4474

* This algorithm picks the idle load balancer such that it belongs to a

4474

* This algorithm picks the idle load balancer such that it belongs to a

4475

* semi-idle powersavings sched_domain. The idea is to try and avoid

4475

* semi-idle powersavings sched_domain. The idea is to try and avoid

4476

* completely idle packages/cores just for the purpose of idle load balancing

4476

* completely idle packages/cores just for the purpose of idle load balancing

4477

* when there are other idle cpu's which are better suited for that job.

4477

* when there are other idle cpu's which are better suited for that job.

4478

*/

4478

*/

4479

static int find_new_ilb(int cpu)

4479

static int find_new_ilb(int cpu)

4480

{

4480

{

4481

struct sched_domain *sd;

4481

struct sched_domain *sd;

4482

struct sched_group *ilb_group;

4482

struct sched_group *ilb_group;

4483

4484

/*

4484

/*

4485

* Have idle load balancer selection from semi-idle packages only

4485

* Have idle load balancer selection from semi-idle packages only

4486

* when power-aware load balancing is enabled

4486

* when power-aware load balancing is enabled

4487

*/

4487

*/

4488

if (!(sched_smt_power_savings || sched_mc_power_savings))

4488

if (!(sched_smt_power_savings || sched_mc_power_savings))

4489

goto out_done;

4489

goto out_done;

4490

4491

/*

4491

/*

4492

* Optimize for the case when we have no idle CPUs or only one

4492

* Optimize for the case when we have no idle CPUs or only one

4493

* idle CPU. Don't walk the sched_domain hierarchy in such cases

4493

* idle CPU. Don't walk the sched_domain hierarchy in such cases

4494

*/

4494

*/

4495

if (cpumask_weight(nohz.cpu_mask) < 2)

4495

if (cpumask_weight(nohz.cpu_mask) < 2)

4496

goto out_done;

4496

goto out_done;

4497

4498

for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {

4498

for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {

4499

ilb_group = sd->groups;

4499

ilb_group = sd->groups;

4500

4501

do {

4501

do {

4502

if (is_semi_idle_group(ilb_group))

4502

if (is_semi_idle_group(ilb_group))

4503

return cpumask_first(nohz.ilb_grp_nohz_mask);

4503

return cpumask_first(nohz.ilb_grp_nohz_mask);

4504

4505

ilb_group = ilb_group->next;

4505

ilb_group = ilb_group->next;

4506

4507

} while (ilb_group != sd->groups);

4507

} while (ilb_group != sd->groups);

4508

}

4508

}

4509

4510

out_done:

4510

out_done:

4511

return cpumask_first(nohz.cpu_mask);

4511

return cpumask_first(nohz.cpu_mask);

4512

}

4512

}

4513

#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */

4513

#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */

4514

static inline int find_new_ilb(int call_cpu)

4514

static inline int find_new_ilb(int call_cpu)

4515

{

4515

{

4516

return cpumask_first(nohz.cpu_mask);

4516

return cpumask_first(nohz.cpu_mask);

4517

}

4517

}

4518

#endif

4518

#endif

4519

4520

/*

4520

/*

4521

* This routine will try to nominate the ilb (idle load balancing)

4521

* This routine will try to nominate the ilb (idle load balancing)

4522

* owner among the cpus whose ticks are stopped. ilb owner will do the idle

4522

* owner among the cpus whose ticks are stopped. ilb owner will do the idle

4523

* load balancing on behalf of all those cpus. If all the cpus in the system

4523

* load balancing on behalf of all those cpus. If all the cpus in the system

4524

* go into this tickless mode, then there will be no ilb owner (as there is

4524

* go into this tickless mode, then there will be no ilb owner (as there is

4525

* no need for one) and all the cpus will sleep till the next wakeup event

4525

* no need for one) and all the cpus will sleep till the next wakeup event

4526

* arrives...

4526

* arrives...

4527

*

4527

*

4528

* For the ilb owner, tick is not stopped. And this tick will be used

4528

* For the ilb owner, tick is not stopped. And this tick will be used

4529

* for idle load balancing. ilb owner will still be part of

4529

* for idle load balancing. ilb owner will still be part of

4530

* nohz.cpu_mask..

4530

* nohz.cpu_mask..

4531

*

4531

*

4532

* While stopping the tick, this cpu will become the ilb owner if there

4532

* While stopping the tick, this cpu will become the ilb owner if there

4533

* is no other owner. And will be the owner till that cpu becomes busy

4533

* is no other owner. And will be the owner till that cpu becomes busy

4534

* or if all cpus in the system stop their ticks at which point

4534

* or if all cpus in the system stop their ticks at which point

4535

* there is no need for ilb owner.

4535

* there is no need for ilb owner.

4536

*

4536

*

4537

* When the ilb owner becomes busy, it nominates another owner, during the

4537

* When the ilb owner becomes busy, it nominates another owner, during the

4538

* next busy scheduler_tick()

4538

* next busy scheduler_tick()

4539

*/

4539

*/

4540

int select_nohz_load_balancer(int stop_tick)

4540

int select_nohz_load_balancer(int stop_tick)

4541

{

4541

{

4542

int cpu = smp_processor_id();

4542

int cpu = smp_processor_id();

4543

4544

if (stop_tick) {

4544

if (stop_tick) {

4545

cpu_rq(cpu)->in_nohz_recently = 1;

4545

cpu_rq(cpu)->in_nohz_recently = 1;

4546

4547

if (!cpu_active(cpu)) {

4547

if (!cpu_active(cpu)) {

4548

if (atomic_read(&nohz.load_balancer) != cpu)

4548

if (atomic_read(&nohz.load_balancer) != cpu)

4549

return 0;

4549

return 0;

4550

4551

/*

4551

/*

4552

* If we are going offline and still the leader,

4552

* If we are going offline and still the leader,

4553

* give up!

4553

* give up!

4554

*/

4554

*/

4555

if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)

4555

if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)

4556

BUG();

4556

BUG();

4557

4558

return 0;

4558

return 0;

4559

}

4559

}

4560

4561

cpumask_set_cpu(cpu, nohz.cpu_mask);

4561

cpumask_set_cpu(cpu, nohz.cpu_mask);

4562

4563

/* time for ilb owner also to sleep */

4563

/* time for ilb owner also to sleep */

4564

if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {

4564

if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {

4565

if (atomic_read(&nohz.load_balancer) == cpu)

4565

if (atomic_read(&nohz.load_balancer) == cpu)

4566

atomic_set(&nohz.load_balancer, -1);

4566

atomic_set(&nohz.load_balancer, -1);

4567

return 0;

4567

return 0;

4568

}

4568

}

4569

4570

if (atomic_read(&nohz.load_balancer) == -1) {

4570

if (atomic_read(&nohz.load_balancer) == -1) {

4571

/* make me the ilb owner */

4571

/* make me the ilb owner */

4572

if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)

4572

if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)

4573

return 1;

4573

return 1;

4574

} else if (atomic_read(&nohz.load_balancer) == cpu) {

4574

} else if (atomic_read(&nohz.load_balancer) == cpu) {

4575

int new_ilb;

4575

int new_ilb;

4576

4577

if (!(sched_smt_power_savings ||

4577

if (!(sched_smt_power_savings ||

4578

sched_mc_power_savings))

4578

sched_mc_power_savings))

4579

return 1;

4579

return 1;

4580

/*

4580

/*

4581

* Check to see if there is a more power-efficient

4581

* Check to see if there is a more power-efficient

4582

* ilb.

4582

* ilb.

4583

*/

4583

*/

4584

new_ilb = find_new_ilb(cpu);

4584

new_ilb = find_new_ilb(cpu);

4585

if (new_ilb < nr_cpu_ids && new_ilb != cpu) {

4585

if (new_ilb < nr_cpu_ids && new_ilb != cpu) {

4586

atomic_set(&nohz.load_balancer, -1);

4586

atomic_set(&nohz.load_balancer, -1);

4587

resched_cpu(new_ilb);

4587

resched_cpu(new_ilb);

4588

return 0;

4588

return 0;

4589

}

4589

}

4590

return 1;

4590

return 1;

4591

}

4591

}

4592

} else {

4592

} else {

4593

if (!cpumask_test_cpu(cpu, nohz.cpu_mask))

4593

if (!cpumask_test_cpu(cpu, nohz.cpu_mask))

4594

return 0;

4594

return 0;

4595

4596

cpumask_clear_cpu(cpu, nohz.cpu_mask);

4596

cpumask_clear_cpu(cpu, nohz.cpu_mask);

4597

4598

if (atomic_read(&nohz.load_balancer) == cpu)

4598

if (atomic_read(&nohz.load_balancer) == cpu)

4599

if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)

4599

if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)

4600

BUG();

4600

BUG();

4601

}

4601

}

4602

return 0;

4602

return 0;

4603

}

4603

}

4604

#endif

4604

#endif

4605

4606

static DEFINE_SPINLOCK(balancing);

4606

static DEFINE_SPINLOCK(balancing);

4607

4608

/*

4608

/*

4609

* It checks each scheduling domain to see if it is due to be balanced,

4609

* It checks each scheduling domain to see if it is due to be balanced,

4610

* and initiates a balancing operation if so.

4610

* and initiates a balancing operation if so.

4611

*

4611

*

4612

* Balancing parameters are set up in arch_init_sched_domains.

4612

* Balancing parameters are set up in arch_init_sched_domains.

4613

*/

4613

*/

4614

static void rebalance_domains(int cpu, enum cpu_idle_type idle)

4614

static void rebalance_domains(int cpu, enum cpu_idle_type idle)

4615

{

4615

{

4616

int balance = 1;

4616

int balance = 1;

4617

struct rq *rq = cpu_rq(cpu);

4617

struct rq *rq = cpu_rq(cpu);

4618

unsigned long interval;

4618

unsigned long interval;

4619

struct sched_domain *sd;

4619

struct sched_domain *sd;

4620

/* Earliest time when we have to do rebalance again */

4620

/* Earliest time when we have to do rebalance again */

4621

unsigned long next_balance = jiffies + 60*HZ;

4621

unsigned long next_balance = jiffies + 60*HZ;

4622

int update_next_balance = 0;

4622

int update_next_balance = 0;

4623

int need_serialize;

4623

int need_serialize;

4624

4625

for_each_domain(cpu, sd) {

4625

for_each_domain(cpu, sd) {

4626

if (!(sd->flags & SD_LOAD_BALANCE))

4626

if (!(sd->flags & SD_LOAD_BALANCE))

4627

continue;

4627

continue;

4628

4629

interval = sd->balance_interval;

4629

interval = sd->balance_interval;

4630

if (idle != CPU_IDLE)

4630

if (idle != CPU_IDLE)

4631

interval *= sd->busy_factor;

4631

interval *= sd->busy_factor;

4632

4633

/* scale ms to jiffies */

4633

/* scale ms to jiffies */

4634

interval = msecs_to_jiffies(interval);

4634

interval = msecs_to_jiffies(interval);

4635

if (unlikely(!interval))

4635

if (unlikely(!interval))

4636

interval = 1;

4636

interval = 1;

4637

if (interval > HZ*NR_CPUS/10)

4637

if (interval > HZ*NR_CPUS/10)

4638

interval = HZ*NR_CPUS/10;

4638

interval = HZ*NR_CPUS/10;

4639

4640

need_serialize = sd->flags & SD_SERIALIZE;

4640

need_serialize = sd->flags & SD_SERIALIZE;

4641

4642

if (need_serialize) {

4642

if (need_serialize) {

4643

if (!spin_trylock(&balancing))

4643

if (!spin_trylock(&balancing))

4644

goto out;

4644

goto out;

4645

}

4645

}

4646

4647

if (time_after_eq(jiffies, sd->last_balance + interval)) {

4647

if (time_after_eq(jiffies, sd->last_balance + interval)) {

4648

if (load_balance(cpu, rq, sd, idle, &balance)) {

4648

if (load_balance(cpu, rq, sd, idle, &balance)) {

4649

/*

4649

/*

4650

* We've pulled tasks over so either we're no

4650

* We've pulled tasks over so either we're no

4651

* longer idle, or one of our SMT siblings is

4651

* longer idle, or one of our SMT siblings is

4652

* not idle.

4652

* not idle.

4653

*/

4653

*/

4654

idle = CPU_NOT_IDLE;

4654

idle = CPU_NOT_IDLE;

4655

}

4655

}

4656

sd->last_balance = jiffies;

4656

sd->last_balance = jiffies;

4657

}

4657

}

4658

if (need_serialize)

4658

if (need_serialize)

4659

spin_unlock(&balancing);

4659

spin_unlock(&balancing);

4660

out:

4660

out:

4661

if (time_after(next_balance, sd->last_balance + interval)) {

4661

if (time_after(next_balance, sd->last_balance + interval)) {

4662

next_balance = sd->last_balance + interval;

4662

next_balance = sd->last_balance + interval;

4663

update_next_balance = 1;

4663

update_next_balance = 1;

4664

}

4664

}

4665

4666

/*

4666

/*

4667

* Stop the load balance at this level. There is another

4667

* Stop the load balance at this level. There is another

4668

* CPU in our sched group which is doing load balancing more

4668

* CPU in our sched group which is doing load balancing more

4669

* actively.

4669

* actively.

4670

*/

4670

*/

4671

if (!balance)

4671

if (!balance)

4672

break;

4672

break;

4673

}

4673

}

4674

4675

/*

4675

/*

4676

* next_balance will be updated only when there is a need.

4676

* next_balance will be updated only when there is a need.

4677

* When the cpu is attached to null domain for ex, it will not be

4677

* When the cpu is attached to null domain for ex, it will not be

4678

* updated.

4678

* updated.

4679

*/

4679

*/

4680

if (likely(update_next_balance))

4680

if (likely(update_next_balance))

4681

rq->next_balance = next_balance;

4681

rq->next_balance = next_balance;

4682

}

4682

}

4683

4684

/*

4684

/*

4685

* run_rebalance_domains is triggered when needed from the scheduler tick.

4685

* run_rebalance_domains is triggered when needed from the scheduler tick.

4686

* In CONFIG_NO_HZ case, the idle load balance owner will do the

4686

* In CONFIG_NO_HZ case, the idle load balance owner will do the

4687

* rebalancing for all the cpus for whom scheduler ticks are stopped.

4687

* rebalancing for all the cpus for whom scheduler ticks are stopped.

4688

*/

4688

*/

4689

static void run_rebalance_domains(struct softirq_action *h)

4689

static void run_rebalance_domains(struct softirq_action *h)

4690

{

4690

{

4691

int this_cpu = smp_processor_id();

4691

int this_cpu = smp_processor_id();

4692

struct rq *this_rq = cpu_rq(this_cpu);

4692

struct rq *this_rq = cpu_rq(this_cpu);

4693

enum cpu_idle_type idle = this_rq->idle_at_tick ?

4693

enum cpu_idle_type idle = this_rq->idle_at_tick ?

4694

CPU_IDLE : CPU_NOT_IDLE;

4694

CPU_IDLE : CPU_NOT_IDLE;

4695

4696

rebalance_domains(this_cpu, idle);

4696

rebalance_domains(this_cpu, idle);

4697

4698

#ifdef CONFIG_NO_HZ

4698

#ifdef CONFIG_NO_HZ

4699

/*

4699

/*

4700

* If this cpu is the owner for idle load balancing, then do the

4700

* If this cpu is the owner for idle load balancing, then do the

4701

* balancing on behalf of the other idle cpus whose ticks are

4701

* balancing on behalf of the other idle cpus whose ticks are

4702

* stopped.

4702

* stopped.

4703

*/

4703

*/

4704

if (this_rq->idle_at_tick &&

4704

if (this_rq->idle_at_tick &&

4705

atomic_read(&nohz.load_balancer) == this_cpu) {

4705

atomic_read(&nohz.load_balancer) == this_cpu) {

4706

struct rq *rq;

4706

struct rq *rq;

4707

int balance_cpu;

4707

int balance_cpu;

4708

4709

for_each_cpu(balance_cpu, nohz.cpu_mask) {

4709

for_each_cpu(balance_cpu, nohz.cpu_mask) {

4710

if (balance_cpu == this_cpu)

4710

if (balance_cpu == this_cpu)

4711

continue;

4711

continue;

4712

4713

/*

4713

/*

4714

* If this cpu gets work to do, stop the load balancing

4714

* If this cpu gets work to do, stop the load balancing

4715

* work being done for other cpus. Next load

4715

* work being done for other cpus. Next load

4716

* balancing owner will pick it up.

4716

* balancing owner will pick it up.

4717

*/

4717

*/

4718

if (need_resched())

4718

if (need_resched())

4719

break;

4719

break;

4720

4721

rebalance_domains(balance_cpu, CPU_IDLE);

4721

rebalance_domains(balance_cpu, CPU_IDLE);

4722

4723

rq = cpu_rq(balance_cpu);

4723

rq = cpu_rq(balance_cpu);

4724

if (time_after(this_rq->next_balance, rq->next_balance))

4724

if (time_after(this_rq->next_balance, rq->next_balance))

4725

this_rq->next_balance = rq->next_balance;

4725

this_rq->next_balance = rq->next_balance;

4726

}

4726

}

4727

}

4727

}

4728

#endif

4728

#endif

4729

}

4729

}

4730

4731

static inline int on_null_domain(int cpu)

4731

static inline int on_null_domain(int cpu)

4732

{

4732

{

4733

return !rcu_dereference(cpu_rq(cpu)->sd);

4733

return !rcu_dereference(cpu_rq(cpu)->sd);

4734

}

4734

}

4735

4736

/*

4736

/*

4737

* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.

4737

* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.

4738

*

4738

*

4739

* In case of CONFIG_NO_HZ, this is the place where we nominate a new

4739

* In case of CONFIG_NO_HZ, this is the place where we nominate a new

4740

* idle load balancing owner or decide to stop the periodic load balancing,

4740

* idle load balancing owner or decide to stop the periodic load balancing,

4741

* if the whole system is idle.

4741

* if the whole system is idle.

4742

*/

4742

*/

4743

static inline void trigger_load_balance(struct rq *rq, int cpu)

4743

static inline void trigger_load_balance(struct rq *rq, int cpu)

4744

{

4744

{

4745

#ifdef CONFIG_NO_HZ

4745

#ifdef CONFIG_NO_HZ

4746

/*

4746

/*

4747

* If we were in the nohz mode recently and busy at the current

4747

* If we were in the nohz mode recently and busy at the current

4748

* scheduler tick, then check if we need to nominate new idle

4748

* scheduler tick, then check if we need to nominate new idle

4749

* load balancer.

4749

* load balancer.

4750

*/

4750

*/

4751

if (rq->in_nohz_recently && !rq->idle_at_tick) {

4751

if (rq->in_nohz_recently && !rq->idle_at_tick) {

4752

rq->in_nohz_recently = 0;

4752

rq->in_nohz_recently = 0;

4753

4754

if (atomic_read(&nohz.load_balancer) == cpu) {

4754

if (atomic_read(&nohz.load_balancer) == cpu) {

4755

cpumask_clear_cpu(cpu, nohz.cpu_mask);

4755

cpumask_clear_cpu(cpu, nohz.cpu_mask);

4756

atomic_set(&nohz.load_balancer, -1);

4756

atomic_set(&nohz.load_balancer, -1);

4757

}

4757

}

4758

4759

if (atomic_read(&nohz.load_balancer) == -1) {

4759

if (atomic_read(&nohz.load_balancer) == -1) {

4760

int ilb = find_new_ilb(cpu);

4760

int ilb = find_new_ilb(cpu);

4761

4762

if (ilb < nr_cpu_ids)

4762

if (ilb < nr_cpu_ids)

4763

resched_cpu(ilb);

4763

resched_cpu(ilb);

4764

}

4764

}

4765

}

4765

}

4766

4767

/*

4767

/*

4768

* If this cpu is idle and doing idle load balancing for all the

4768

* If this cpu is idle and doing idle load balancing for all the

4769

* cpus with ticks stopped, is it time for that to stop?

4769

* cpus with ticks stopped, is it time for that to stop?

4770

*/

4770

*/

4771

if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&

4771

if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&

4772

cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {

4772

cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {

4773

resched_cpu(cpu);

4773

resched_cpu(cpu);

4774

return;

4774

return;

4775

}

4775

}

4776

4777

/*

4777

/*

4778

* If this cpu is idle and the idle load balancing is done by

4778

* If this cpu is idle and the idle load balancing is done by

4779

* someone else, then no need raise the SCHED_SOFTIRQ

4779

* someone else, then no need raise the SCHED_SOFTIRQ

4780

*/

4780

*/

4781

if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&

4781

if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&

4782

cpumask_test_cpu(cpu, nohz.cpu_mask))

4782

cpumask_test_cpu(cpu, nohz.cpu_mask))

4783

return;

4783

return;

4784

#endif

4784

#endif

4785

/* Don't need to rebalance while attached to NULL domain */

4785

/* Don't need to rebalance while attached to NULL domain */

4786

if (time_after_eq(jiffies, rq->next_balance) &&

4786

if (time_after_eq(jiffies, rq->next_balance) &&

4787

likely(!on_null_domain(cpu)))

4787

likely(!on_null_domain(cpu)))

4788

raise_softirq(SCHED_SOFTIRQ);

4788

raise_softirq(SCHED_SOFTIRQ);

4789

}

4789

}

4790

4791

#else /* CONFIG_SMP */

4791

#else /* CONFIG_SMP */

4792

4793

/*
 * Uniprocessor (!CONFIG_SMP) stub: on UP there is nothing to balance
 * between CPUs, so this does nothing.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}

4799

4800

#endif

4800

#endif

4801

4802

/* Per-CPU instance of struct kernel_stat, exported as a per-CPU symbol. */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);

4805

4806

/*

4806

/*

4807

* Return any ns on the sched_clock that have not yet been accounted in

4807

* Return any ns on the sched_clock that have not yet been accounted in

4808

* @p in case that task is currently running.

4808

* @p in case that task is currently running.

4809

*

4809

*

4810

* Called with task_rq_lock() held on @rq.

4810

* Called with task_rq_lock() held on @rq.

4811

*/

4811

*/

4812

static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)

4812

static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)

4813

{

4813

{

4814

u64 ns = 0;

4814

u64 ns = 0;

4815

4816

if (task_current(rq, p)) {

4816

if (task_current(rq, p)) {

4817

update_rq_clock(rq);

4817

update_rq_clock(rq);

4818

ns = rq->clock - p->se.exec_start;

4818

ns = rq->clock - p->se.exec_start;

4819

if ((s64)ns < 0)

4819

if ((s64)ns < 0)

4820

ns = 0;

4820

ns = 0;

4821

}

4821

}

4822

4823

return ns;

4823

return ns;

4824

}

4824

}

4825

4826

unsigned long long task_delta_exec(struct task_struct *p)

4826

unsigned long long task_delta_exec(struct task_struct *p)

4827

{

4827

{

4828

unsigned long flags;

4828

unsigned long flags;

4829

struct rq *rq;

4829

struct rq *rq;

4830

u64 ns = 0;

4830

u64 ns = 0;

4831

4832

rq = task_rq_lock(p, &flags);

4832

rq = task_rq_lock(p, &flags);

4833

ns = do_task_delta_exec(p, rq);

4833

ns = do_task_delta_exec(p, rq);

4834

task_rq_unlock(rq, &flags);

4834

task_rq_unlock(rq, &flags);

4835

4836

return ns;

4836

return ns;

4837

}

4837

}

4838

4839

/*

4839

/*

4840

* Return accounted runtime for the task.

4840

* Return accounted runtime for the task.

4841

* In case the task is currently running, return the runtime plus current's

4841

* In case the task is currently running, return the runtime plus current's

4842

* pending runtime that have not been accounted yet.

4842

* pending runtime that have not been accounted yet.

4843

*/

4843

*/

4844

unsigned long long task_sched_runtime(struct task_struct *p)

4844

unsigned long long task_sched_runtime(struct task_struct *p)

4845

{

4845

{

4846

unsigned long flags;

4846

unsigned long flags;

4847

struct rq *rq;

4847

struct rq *rq;

4848

u64 ns = 0;

4848

u64 ns = 0;

4849

4850

rq = task_rq_lock(p, &flags);

4850

rq = task_rq_lock(p, &flags);

4851

ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);

4851

ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);

4852

task_rq_unlock(rq, &flags);

4852

task_rq_unlock(rq, &flags);

4853

4854

return ns;

4854

return ns;

4855

}

4855

}

4856

4857

/*

4857

/*

4858

* Return sum_exec_runtime for the thread group.

4858

* Return sum_exec_runtime for the thread group.

4859

* In case the task is currently running, return the sum plus current's

4859

* In case the task is currently running, return the sum plus current's

4860

* pending runtime that have not been accounted yet.

4860

* pending runtime that have not been accounted yet.

4861

*

4861

*

4862

* Note that the thread group might have other running tasks as well,

4862

* Note that the thread group might have other running tasks as well,

4863

* so the return value not includes other pending runtime that other

4863

* so the return value not includes other pending runtime that other

4864

* running tasks might have.

4864

* running tasks might have.

4865

*/

4865

*/

4866

unsigned long long thread_group_sched_runtime(struct task_struct *p)

4866

unsigned long long thread_group_sched_runtime(struct task_struct *p)

4867

{

4867

{

4868

struct task_cputime totals;

4868

struct task_cputime totals;

4869

unsigned long flags;

4869

unsigned long flags;

4870

struct rq *rq;

4870

struct rq *rq;

4871

u64 ns;

4871

u64 ns;

4872

4873

rq = task_rq_lock(p, &flags);

4873

rq = task_rq_lock(p, &flags);

4874

thread_group_cputime(p, &totals);

4874

thread_group_cputime(p, &totals);

4875

ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);

4875

ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);

4876

task_rq_unlock(rq, &flags);

4876

task_rq_unlock(rq, &flags);

4877

4878

return ns;

4878

return ns;

4879

}

4879

}

4880

4881

/*

4881

/*

4882

* Account user cpu time to a process.

4882

* Account user cpu time to a process.

4883

* @p: the process that the cpu time gets accounted to

4883

* @p: the process that the cpu time gets accounted to

4884

* @cputime: the cpu time spent in user space since the last update

4884

* @cputime: the cpu time spent in user space since the last update

4885

* @cputime_scaled: cputime scaled by cpu frequency

4885

* @cputime_scaled: cputime scaled by cpu frequency

4886

*/

4886

*/

4887

void account_user_time(struct task_struct *p, cputime_t cputime,

4887

void account_user_time(struct task_struct *p, cputime_t cputime,

4888

cputime_t cputime_scaled)

4888

cputime_t cputime_scaled)

4889

{

4889

{

4890

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4890

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4891

cputime64_t tmp;

4891

cputime64_t tmp;

4892

4893

/* Add user time to process. */

4893

/* Add user time to process. */

4894

p->utime = cputime_add(p->utime, cputime);

4894

p->utime = cputime_add(p->utime, cputime);

4895

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

4895

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

4896

account_group_user_time(p, cputime);

4896

account_group_user_time(p, cputime);

4897

4898

/* Add user time to cpustat. */

4898

/* Add user time to cpustat. */

4899

tmp = cputime_to_cputime64(cputime);

4899

tmp = cputime_to_cputime64(cputime);

4900

if (TASK_NICE(p) > 0)

4900

if (TASK_NICE(p) > 0)

4901

cpustat->nice = cputime64_add(cpustat->nice, tmp);

4901

cpustat->nice = cputime64_add(cpustat->nice, tmp);

4902

else

4902

else

4903

cpustat->user = cputime64_add(cpustat->user, tmp);

4903

cpustat->user = cputime64_add(cpustat->user, tmp);

4904

4905

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);

4905

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);

4906

/* Account for user time used */

4906

/* Account for user time used */

4907

acct_update_integrals(p);

4907

acct_update_integrals(p);

4908

}

4908

}

4909

4910

/*

4910

/*

4911

* Account guest cpu time to a process.

4911

* Account guest cpu time to a process.

4912

* @p: the process that the cpu time gets accounted to

4912

* @p: the process that the cpu time gets accounted to

4913

* @cputime: the cpu time spent in virtual machine since the last update

4913

* @cputime: the cpu time spent in virtual machine since the last update

4914

* @cputime_scaled: cputime scaled by cpu frequency

4914

* @cputime_scaled: cputime scaled by cpu frequency

4915

*/

4915

*/

4916

static void account_guest_time(struct task_struct *p, cputime_t cputime,

4916

static void account_guest_time(struct task_struct *p, cputime_t cputime,

4917

cputime_t cputime_scaled)

4917

cputime_t cputime_scaled)

4918

{

4918

{

4919

cputime64_t tmp;

4919

cputime64_t tmp;

4920

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4920

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4921

4922

tmp = cputime_to_cputime64(cputime);

4922

tmp = cputime_to_cputime64(cputime);

4923

4924

/* Add guest time to process. */

4924

/* Add guest time to process. */

4925

p->utime = cputime_add(p->utime, cputime);

4925

p->utime = cputime_add(p->utime, cputime);

4926

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

4926

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

4927

account_group_user_time(p, cputime);

4927

account_group_user_time(p, cputime);

4928

p->gtime = cputime_add(p->gtime, cputime);

4928

p->gtime = cputime_add(p->gtime, cputime);

4929

4930

/* Add guest time to cpustat. */

4930

/* Add guest time to cpustat. */

4931

cpustat->user = cputime64_add(cpustat->user, tmp);

4931

cpustat->user = cputime64_add(cpustat->user, tmp);

4932

cpustat->guest = cputime64_add(cpustat->guest, tmp);

4932

cpustat->guest = cputime64_add(cpustat->guest, tmp);

4933

}

4933

}

4934

4935

/*

4935

/*

4936

* Account system cpu time to a process.

4936

* Account system cpu time to a process.

4937

* @p: the process that the cpu time gets accounted to

4937

* @p: the process that the cpu time gets accounted to

4938

* @hardirq_offset: the offset to subtract from hardirq_count()

4938

* @hardirq_offset: the offset to subtract from hardirq_count()

4939

* @cputime: the cpu time spent in kernel space since the last update

4939

* @cputime: the cpu time spent in kernel space since the last update

4940

* @cputime_scaled: cputime scaled by cpu frequency

4940

* @cputime_scaled: cputime scaled by cpu frequency

4941

*/

4941

*/

4942

void account_system_time(struct task_struct *p, int hardirq_offset,

4942

void account_system_time(struct task_struct *p, int hardirq_offset,

4943

cputime_t cputime, cputime_t cputime_scaled)

4943

cputime_t cputime, cputime_t cputime_scaled)

4944

{

4944

{

4945

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4945

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4946

cputime64_t tmp;

4946

cputime64_t tmp;

4947

4948

if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {

4948

if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {

4949

account_guest_time(p, cputime, cputime_scaled);

4949

account_guest_time(p, cputime, cputime_scaled);

4950

return;

4950

return;

4951

}

4951

}

4952

4953

/* Add system time to process. */

4953

/* Add system time to process. */

4954

p->stime = cputime_add(p->stime, cputime);

4954

p->stime = cputime_add(p->stime, cputime);

4955

p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);

4955

p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);

4956

account_group_system_time(p, cputime);

4956

account_group_system_time(p, cputime);

4957

4958

/* Add system time to cpustat. */

4958

/* Add system time to cpustat. */

4959

tmp = cputime_to_cputime64(cputime);

4959

tmp = cputime_to_cputime64(cputime);

4960

if (hardirq_count() - hardirq_offset)

4960

if (hardirq_count() - hardirq_offset)

4961

cpustat->irq = cputime64_add(cpustat->irq, tmp);

4961

cpustat->irq = cputime64_add(cpustat->irq, tmp);

4962

else if (softirq_count())

4962

else if (softirq_count())

4963

cpustat->softirq = cputime64_add(cpustat->softirq, tmp);

4963

cpustat->softirq = cputime64_add(cpustat->softirq, tmp);

4964

else

4964

else

4965

cpustat->system = cputime64_add(cpustat->system, tmp);

4965

cpustat->system = cputime64_add(cpustat->system, tmp);

4966

4967

cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

4967

cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

4968

4969

/* Account for system time used */

4969

/* Account for system time used */

4970

acct_update_integrals(p);

4970

acct_update_integrals(p);

4971

}

4971

}

4972

4973

/*

4973

/*

4974

* Account for involuntary wait time.

4974

* Account for involuntary wait time.

4975

* @steal: the cpu time spent in involuntary wait

4975

* @steal: the cpu time spent in involuntary wait

4976

*/

4976

*/

4977

void account_steal_time(cputime_t cputime)

4977

void account_steal_time(cputime_t cputime)

4978

{

4978

{

4979

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4979

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4980

cputime64_t cputime64 = cputime_to_cputime64(cputime);

4980

cputime64_t cputime64 = cputime_to_cputime64(cputime);

4981

4982

cpustat->steal = cputime64_add(cpustat->steal, cputime64);

4982

cpustat->steal = cputime64_add(cpustat->steal, cputime64);

4983

}

4983

}

4984

4985

/*

4985

/*

4986

* Account for idle time.

4986

* Account for idle time.

4987

* @cputime: the cpu time spent in idle wait

4987

* @cputime: the cpu time spent in idle wait

4988

*/

4988

*/

4989

void account_idle_time(cputime_t cputime)

4989

void account_idle_time(cputime_t cputime)

4990

{

4990

{

4991

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4991

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

4992

cputime64_t cputime64 = cputime_to_cputime64(cputime);

4992

cputime64_t cputime64 = cputime_to_cputime64(cputime);

4993

struct rq *rq = this_rq();

4993

struct rq *rq = this_rq();

4994

4995

if (atomic_read(&rq->nr_iowait) > 0)

4995

if (atomic_read(&rq->nr_iowait) > 0)

4996

cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);

4996

cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);

4997

else

4997

else

4998

cpustat->idle = cputime64_add(cpustat->idle, cputime64);

4998

cpustat->idle = cputime64_add(cpustat->idle, cputime64);

4999

}

4999

}

5000

5001

#ifndef CONFIG_VIRT_CPU_ACCOUNTING

5001

#ifndef CONFIG_VIRT_CPU_ACCOUNTING

5002

5003

/*

5003

/*

5004

* Account a single tick of cpu time.

5004

* Account a single tick of cpu time.

5005

* @p: the process that the cpu time gets accounted to

5005

* @p: the process that the cpu time gets accounted to

5006

* @user_tick: indicates if the tick is a user or a system tick

5006

* @user_tick: indicates if the tick is a user or a system tick

5007

*/

5007

*/

5008

void account_process_tick(struct task_struct *p, int user_tick)

5008

void account_process_tick(struct task_struct *p, int user_tick)

5009

{

5009

{

5010

cputime_t one_jiffy = jiffies_to_cputime(1);

5010

cputime_t one_jiffy = jiffies_to_cputime(1);

5011

cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);

5011

cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);

5012

struct rq *rq = this_rq();

5012

struct rq *rq = this_rq();

5013

5014

if (user_tick)

5014

if (user_tick)

5015

account_user_time(p, one_jiffy, one_jiffy_scaled);

5015

account_user_time(p, one_jiffy, one_jiffy_scaled);

5016

else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))

5016

else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))

5017

account_system_time(p, HARDIRQ_OFFSET, one_jiffy,

5017

account_system_time(p, HARDIRQ_OFFSET, one_jiffy,

5018

one_jiffy_scaled);

5018

one_jiffy_scaled);

5019

else

5019

else

5020

account_idle_time(one_jiffy);

5020

account_idle_time(one_jiffy);

5021

}

5021

}

5022

5023

/*

5023

/*

5024

* Account multiple ticks of steal time.

5024

* Account multiple ticks of steal time.

5025

* @p: the process from which the cpu time has been stolen

5025

* @p: the process from which the cpu time has been stolen

5026

* @ticks: number of stolen ticks

5026

* @ticks: number of stolen ticks

5027

*/

5027

*/

5028

void account_steal_ticks(unsigned long ticks)

5028

void account_steal_ticks(unsigned long ticks)

5029

{

5029

{

5030

account_steal_time(jiffies_to_cputime(ticks));

5030

account_steal_time(jiffies_to_cputime(ticks));

5031

}

5031

}

5032

5033

/*

5033

/*

5034

* Account multiple ticks of idle time.

5034

* Account multiple ticks of idle time.

5035

* @ticks: number of stolen ticks

5035

* @ticks: number of stolen ticks

5036

*/

5036

*/

5037

void account_idle_ticks(unsigned long ticks)

5037

void account_idle_ticks(unsigned long ticks)

5038

{

5038

{

5039

account_idle_time(jiffies_to_cputime(ticks));

5039

account_idle_time(jiffies_to_cputime(ticks));

5040

}

5040

}

5041

5042

#endif

5042

#endif

5043

5044

/*

5044

/*

5045

* Use precise platform statistics if available:

5045

* Use precise platform statistics if available:

5046

*/

5046

*/

5047

#ifdef CONFIG_VIRT_CPU_ACCOUNTING

5047

#ifdef CONFIG_VIRT_CPU_ACCOUNTING

5048

cputime_t task_utime(struct task_struct *p)

5048

cputime_t task_utime(struct task_struct *p)

5049

{

5049

{

5050

return p->utime;

5050

return p->utime;

5051

}

5051

}

5052

5053

cputime_t task_stime(struct task_struct *p)

5053

cputime_t task_stime(struct task_struct *p)

5054

{

5054

{

5055

return p->stime;

5055

return p->stime;

5056

}

5056

}

5057

#else

5057

#else

5058

cputime_t task_utime(struct task_struct *p)

5058

cputime_t task_utime(struct task_struct *p)

5059

{

5059

{

5060

clock_t utime = cputime_to_clock_t(p->utime),

5060

clock_t utime = cputime_to_clock_t(p->utime),

5061

total = utime + cputime_to_clock_t(p->stime);

5061

total = utime + cputime_to_clock_t(p->stime);

5062

u64 temp;

5062

u64 temp;

5063

5064

/*

5064

/*

5065

* Use CFS's precise accounting:

5065

* Use CFS's precise accounting:

5066

*/

5066

*/

5067

temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);

5067

temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);

5068

5069

if (total) {

5069

if (total) {

5070

temp *= utime;

5070

temp *= utime;

5071

do_div(temp, total);

5071

do_div(temp, total);

5072

}

5072

}

5073

utime = (clock_t)temp;

5073

utime = (clock_t)temp;

5074

5075

p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));

5075

p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));

5076

return p->prev_utime;

5076

return p->prev_utime;

5077

}

5077

}

5078

5079

cputime_t task_stime(struct task_struct *p)

5079

cputime_t task_stime(struct task_struct *p)

5080

{

5080

{

5081

clock_t stime;

5081

clock_t stime;

5082

5083

/*

5083

/*

5084

* Use CFS's precise accounting. (we subtract utime from

5084

* Use CFS's precise accounting. (we subtract utime from

5085

* the total, to make sure the total observed by userspace

5085

* the total, to make sure the total observed by userspace

5086

* grows monotonically - apps rely on that):

5086

* grows monotonically - apps rely on that):

5087

*/

5087

*/

5088

stime = nsec_to_clock_t(p->se.sum_exec_runtime) -

5088

stime = nsec_to_clock_t(p->se.sum_exec_runtime) -

5089

cputime_to_clock_t(task_utime(p));

5089

cputime_to_clock_t(task_utime(p));

5090

5091

if (stime >= 0)

5091

if (stime >= 0)

5092

p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));

5092

p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));

5093

5094

return p->prev_stime;

5094

return p->prev_stime;

5095

}

5095

}

5096

#endif

5096

#endif

5097

5098

inline cputime_t task_gtime(struct task_struct *p)

5098

inline cputime_t task_gtime(struct task_struct *p)

5099

{

5099

{

5100

return p->gtime;

5100

return p->gtime;

5101

}

5101

}

5102

5103

/*

5103

/*

5104

* This function gets called by the timer code, with HZ frequency.

5104

* This function gets called by the timer code, with HZ frequency.

5105

* We call it with interrupts disabled.

5105

* We call it with interrupts disabled.

5106

*

5106

*

5107

* It also gets called by the fork code, when changing the parent's

5107

* It also gets called by the fork code, when changing the parent's

5108

* timeslices.

5108

* timeslices.

5109

*/

5109

*/

5110

void scheduler_tick(void)

5110

void scheduler_tick(void)

5111

{

5111

{

5112

int cpu = smp_processor_id();

5112

int cpu = smp_processor_id();

5113

struct rq *rq = cpu_rq(cpu);

5113

struct rq *rq = cpu_rq(cpu);

5114

struct task_struct *curr = rq->curr;

5114

struct task_struct *curr = rq->curr;

5115

5116

sched_clock_tick();

5116

sched_clock_tick();

5117

5118

spin_lock(&rq->lock);

5118

spin_lock(&rq->lock);

5119

update_rq_clock(rq);

5119

update_rq_clock(rq);

5120

update_cpu_load(rq);

5120

update_cpu_load(rq);

5121

curr->sched_class->task_tick(rq, curr, 0);

5121

curr->sched_class->task_tick(rq, curr, 0);

5122

spin_unlock(&rq->lock);

5122

spin_unlock(&rq->lock);

5123

5124

perf_counter_task_tick(curr, cpu);

5124

perf_counter_task_tick(curr, cpu);

5125

5126

#ifdef CONFIG_SMP

5126

#ifdef CONFIG_SMP

5127

rq->idle_at_tick = idle_cpu(cpu);

5127

rq->idle_at_tick = idle_cpu(cpu);

5128

trigger_load_balance(rq, cpu);

5128

trigger_load_balance(rq, cpu);

5129

#endif

5129

#endif

5130

}

5130

}

5131

5132

notrace unsigned long get_parent_ip(unsigned long addr)

5132

notrace unsigned long get_parent_ip(unsigned long addr)

5133

{

5133

{

5134

if (in_lock_functions(addr)) {

5134

if (in_lock_functions(addr)) {

5135

addr = CALLER_ADDR2;

5135

addr = CALLER_ADDR2;

5136

if (in_lock_functions(addr))

5136

if (in_lock_functions(addr))

5137

addr = CALLER_ADDR3;

5137

addr = CALLER_ADDR3;

5138

}

5138

}

5139

return addr;

5139

return addr;

5140

}

5140

}

5141

5142

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

5142

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

5143

defined(CONFIG_PREEMPT_TRACER))

5143

defined(CONFIG_PREEMPT_TRACER))

5144

5145

/*
 * Add @val to the preempt count.
 *
 * With CONFIG_DEBUG_PREEMPT, a negative count triggers a warning and leaves
 * the count untouched, and a warning is issued when the PREEMPT_MASK part
 * of the count gets within 10 of PREEMPT_MASK.  trace_preempt_off() is
 * called when the count equals @val after the addition.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Underflow? */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/* Spinlock count overflowing soon? */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);

5166

5167

/*
 * Subtract @val from the preempt count.
 *
 * With CONFIG_DEBUG_PREEMPT, the call warns and returns without changing
 * the count when @val exceeds the current count, or when @val is below
 * PREEMPT_MASK while the PREEMPT_MASK part of the count is already zero.
 * trace_preempt_on() is called when the count equals @val before the
 * subtraction.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Underflow? */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;

	/* Is the spinlock portion underflowing? */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);

5188

5189

#endif

5189

#endif

5190

5191

/*

5191

/*

5192

* Print scheduling while atomic bug:

5192

* Print scheduling while atomic bug:

5193

*/

5193

*/

5194

static noinline void __schedule_bug(struct task_struct *prev)

5194

static noinline void __schedule_bug(struct task_struct *prev)

5195

{

5195

{

5196

struct pt_regs *regs = get_irq_regs();

5196

struct pt_regs *regs = get_irq_regs();

5197

5198

printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",

5198

printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",

5199

prev->comm, prev->pid, preempt_count());

5199

prev->comm, prev->pid, preempt_count());

5200

5201

debug_show_held_locks(prev);

5201

debug_show_held_locks(prev);

5202

print_modules();

5202

print_modules();

5203

if (irqs_disabled())

5203

if (irqs_disabled())

5204

print_irqtrace_events(prev);

5204

print_irqtrace_events(prev);

5205

5206

if (regs)

5206

if (regs)

5207

show_regs(regs);

5207

show_regs(regs);

5208

else

5208

else

5209

dump_stack();

5209

dump_stack();

5210

}

5210

}

5211

5212

/*

5212

/*

5213

* Various schedule()-time debugging checks and statistics:

5213

* Various schedule()-time debugging checks and statistics:

5214

*/

5214

*/

5215

static inline void schedule_debug(struct task_struct *prev)

5215

static inline void schedule_debug(struct task_struct *prev)

5216

{

5216

{

5217

/*

5217

/*

5218

* Test if we are atomic. Since do_exit() needs to call into

5218

* Test if we are atomic. Since do_exit() needs to call into

5219

* schedule() atomically, we ignore that path for now.

5219

* schedule() atomically, we ignore that path for now.

5220

* Otherwise, whine if we are scheduling when we should not be.

5220

* Otherwise, whine if we are scheduling when we should not be.

5221

*/

5221

*/

5222

if (unlikely(in_atomic_preempt_off() && !prev->exit_state))

5222

if (unlikely(in_atomic_preempt_off() && !prev->exit_state))

5223

__schedule_bug(prev);

5223

__schedule_bug(prev);

5224

5225

profile_hit(SCHED_PROFILING, __builtin_return_address(0));

5225

profile_hit(SCHED_PROFILING, __builtin_return_address(0));

5226

5227

schedstat_inc(this_rq(), sched_count);

5227

schedstat_inc(this_rq(), sched_count);

5228

#ifdef CONFIG_SCHEDSTATS

5228

#ifdef CONFIG_SCHEDSTATS

5229

if (unlikely(prev->lock_depth >= 0)) {

5229

if (unlikely(prev->lock_depth >= 0)) {

5230

schedstat_inc(this_rq(), bkl_count);

5230

schedstat_inc(this_rq(), bkl_count);

5231

schedstat_inc(prev, sched_info.bkl_count);

5231

schedstat_inc(prev, sched_info.bkl_count);

5232

}

5232

}

5233

#endif

5233

#endif

5234

}

5234

}

5235

5236

static void put_prev_task(struct rq *rq, struct task_struct *prev)

5236

static void put_prev_task(struct rq *rq, struct task_struct *prev)

5237

{

5237

{

5238

if (prev->state == TASK_RUNNING) {

5238

if (prev->state == TASK_RUNNING) {

5239

u64 runtime = prev->se.sum_exec_runtime;

5239

u64 runtime = prev->se.sum_exec_runtime;

5240

5241

runtime -= prev->se.prev_sum_exec_runtime;

5241

runtime -= prev->se.prev_sum_exec_runtime;

5242

runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);

5242

runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);

5243

5244

/*

5244

/*

5245

* In order to avoid avg_overlap growing stale when we are

5245

* In order to avoid avg_overlap growing stale when we are

5246

* indeed overlapping and hence not getting put to sleep, grow

5246

* indeed overlapping and hence not getting put to sleep, grow

5247

* the avg_overlap on preemption.

5247

* the avg_overlap on preemption.

5248

*

5248

*

5249

* We use the average preemption runtime because that

5249

* We use the average preemption runtime because that

5250

* correlates to the amount of cache footprint a task can

5250

* correlates to the amount of cache footprint a task can

5251

* build up.

5251

* build up.

5252

*/

5252

*/

5253

update_avg(&prev->se.avg_overlap, runtime);

5253

update_avg(&prev->se.avg_overlap, runtime);

5254

}

5254

}

5255

prev->sched_class->put_prev_task(rq, prev);

5255

prev->sched_class->put_prev_task(rq, prev);

5256

}

5256

}

5257

5258

/*

5258

/*

5259

* Pick up the highest-prio task:

5259

* Pick up the highest-prio task:

5260

*/

5260

*/

5261

static inline struct task_struct *

5261

static inline struct task_struct *

5262

pick_next_task(struct rq *rq)

5262

pick_next_task(struct rq *rq)

5263

{

5263

{

5264

const struct sched_class *class;

5264

const struct sched_class *class;

5265

struct task_struct *p;

5265

struct task_struct *p;

5266

5267

/*

5267

/*

5268

* Optimization: we know that if all tasks are in

5268

* Optimization: we know that if all tasks are in

5269

* the fair class we can call that function directly:

5269

* the fair class we can call that function directly:

5270

*/

5270

*/

5271

if (likely(rq->nr_running == rq->cfs.nr_running)) {

5271

if (likely(rq->nr_running == rq->cfs.nr_running)) {

5272

p = fair_sched_class.pick_next_task(rq);

5272

p = fair_sched_class.pick_next_task(rq);

5273

if (likely(p))

5273

if (likely(p))

5274

return p;

5274

return p;

5275

}

5275

}

5276

5277

class = sched_class_highest;

5277

class = sched_class_highest;

5278

for ( ; ; ) {

5278

for ( ; ; ) {

5279

p = class->pick_next_task(rq);

5279

p = class->pick_next_task(rq);

5280

if (p)

5280

if (p)

5281

return p;

5281

return p;

5282

/*

5282

/*

5283

* Will never be NULL as the idle class always

5283

* Will never be NULL as the idle class always

5284

* returns a non-NULL p:

5284

* returns a non-NULL p:

5285

*/

5285

*/

5286

class = class->next;

5286

class = class->next;

5287

}

5287

}

5288

}

5288

}

5289

5290

/*

5290

/*

5291

* schedule() is the main scheduler function.

5291

* schedule() is the main scheduler function.

5292

*/

5292

*/

5293

asmlinkage void __sched schedule(void)

5293

asmlinkage void __sched schedule(void)

5294

{

5294

{

5295

struct task_struct *prev, *next;

5295

struct task_struct *prev, *next;

5296

unsigned long *switch_count;

5296

unsigned long *switch_count;

5297

struct rq *rq;

5297

struct rq *rq;

5298

int cpu;

5298

int cpu;

5299

5300

need_resched:

5300

need_resched:

5301

preempt_disable();

5301

preempt_disable();

5302

cpu = smp_processor_id();

5302

cpu = smp_processor_id();

5303

rq = cpu_rq(cpu);

5303

rq = cpu_rq(cpu);

5304

rcu_qsctr_inc(cpu);

5304

rcu_qsctr_inc(cpu);

5305

prev = rq->curr;

5305

prev = rq->curr;

5306

switch_count = &prev->nivcsw;

5306

switch_count = &prev->nivcsw;

5307

5308

release_kernel_lock(prev);

5308

release_kernel_lock(prev);

5309

need_resched_nonpreemptible:

5309

need_resched_nonpreemptible:

5310

5311

schedule_debug(prev);

5311

schedule_debug(prev);

5312

5313

if (sched_feat(HRTICK))

5313

if (sched_feat(HRTICK))

5314

hrtick_clear(rq);

5314

hrtick_clear(rq);

5315

5316

spin_lock_irq(&rq->lock);

5316

spin_lock_irq(&rq->lock);

5317

update_rq_clock(rq);

5317

update_rq_clock(rq);

5318

clear_tsk_need_resched(prev);

5318

clear_tsk_need_resched(prev);

5319

5320

if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

5320

if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

5321

if (unlikely(signal_pending_state(prev->state, prev)))

5321

if (unlikely(signal_pending_state(prev->state, prev)))

5322

prev->state = TASK_RUNNING;

5322

prev->state = TASK_RUNNING;

5323

else

5323

else

5324

deactivate_task(rq, prev, 1);

5324

deactivate_task(rq, prev, 1);

5325

switch_count = &prev->nvcsw;

5325

switch_count = &prev->nvcsw;

5326

}

5326

}

5327

5328

#ifdef CONFIG_SMP

5328

#ifdef CONFIG_SMP

5329

if (prev->sched_class->pre_schedule)

5329

if (prev->sched_class->pre_schedule)

5330

prev->sched_class->pre_schedule(rq, prev);

5330

prev->sched_class->pre_schedule(rq, prev);

5331

#endif

5331

#endif

5332

5333

if (unlikely(!rq->nr_running))

5333

if (unlikely(!rq->nr_running))

5334

idle_balance(cpu, rq);

5334

idle_balance(cpu, rq);

5335

5336

put_prev_task(rq, prev);

5336

put_prev_task(rq, prev);

5337

next = pick_next_task(rq);

5337

next = pick_next_task(rq);

5338

5339

if (likely(prev != next)) {

5339

if (likely(prev != next)) {

5340

sched_info_switch(prev, next);

5340

sched_info_switch(prev, next);

5341

perf_counter_task_sched_out(prev, next, cpu);

5341

perf_counter_task_sched_out(prev, next, cpu);

5342

5343

rq->nr_switches++;

5343

rq->nr_switches++;

5344

rq->curr = next;

5344

rq->curr = next;

5345

++*switch_count;

5345

++*switch_count;

5346

5347

context_switch(rq, prev, next); /* unlocks the rq */

5347

context_switch(rq, prev, next); /* unlocks the rq */

5348

/*

5348

/*

5349

* the context switch might have flipped the stack from under

5349

* the context switch might have flipped the stack from under

5350

* us, hence refresh the local variables.

5350

* us, hence refresh the local variables.

5351

*/

5351

*/

5352

cpu = smp_processor_id();

5352

cpu = smp_processor_id();

5353

rq = cpu_rq(cpu);

5353

rq = cpu_rq(cpu);

5354

} else

5354

} else

5355

spin_unlock_irq(&rq->lock);

5355

spin_unlock_irq(&rq->lock);

5356

5357

if (unlikely(reacquire_kernel_lock(current) < 0))

5357

if (unlikely(reacquire_kernel_lock(current) < 0))

5358

goto need_resched_nonpreemptible;

5358

goto need_resched_nonpreemptible;

5359

5360

preempt_enable_no_resched();

5360

preempt_enable_no_resched();

5361

if (need_resched())

5361

if (need_resched())

5362

goto need_resched;

5362

goto need_resched;

5363

}

5363

}

5364

EXPORT_SYMBOL(schedule);

5364

EXPORT_SYMBOL(schedule);

5365

5366

#ifdef CONFIG_SMP
/*
 * Spin while @owner still holds @lock and is running on its CPU.
 *
 * Look out! "owner" is an entirely speculative pointer access and not
 * reliable.
 *
 * Returns 0 when the owner is no longer running on the sampled CPU,
 * when the spinner itself needs to reschedule, or when the OWNER_SPIN
 * feature is off.  Returns 1 when the owner changed or when the
 * owner's cpu could not be validated.
 */
int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
{
	unsigned int cpu;
	struct rq *rq;

	if (!sched_feat(OWNER_SPIN))
		return 0;

#ifdef CONFIG_DEBUG_PAGEALLOC
	/*
	 * Need to access the cpu field knowing that
	 * DEBUG_PAGEALLOC could have unmapped it if
	 * the mutex owner just released it and exited.
	 */
	if (probe_kernel_address(&owner->cpu, cpu))
		return 1;
#else
	cpu = owner->cpu;
#endif

	/*
	 * Even if the access succeeded (likely case),
	 * the cpu field may no longer be valid.
	 */
	if (cpu >= nr_cpumask_bits)
		return 1;

	/*
	 * We need to validate that we can do a
	 * get_cpu() and that we have the percpu area.
	 */
	if (!cpu_online(cpu))
		return 1;

	rq = cpu_rq(cpu);

	/* Leave the loop as soon as the owner changes, to re-assess state. */
	while (lock->owner == owner) {
		/*
		 * Is that owner really running on that cpu?
		 */
		if (task_thread_info(rq->curr) != owner || need_resched())
			return 0;

		cpu_relax();
	}

	return 1;
}
#endif

5426

5427

#ifdef CONFIG_PREEMPT

5427

#ifdef CONFIG_PREEMPT

5428

/*

5428

/*

5429

* this is the entry point to schedule() from in-kernel preemption

5429

* this is the entry point to schedule() from in-kernel preemption

5430

* off of preempt_enable. Kernel preemptions off return from interrupt

5430

* off of preempt_enable. Kernel preemptions off return from interrupt

5431

* occur there and call schedule directly.

5431

* occur there and call schedule directly.

5432

*/

5432

*/

5433

asmlinkage void __sched preempt_schedule(void)

5433

asmlinkage void __sched preempt_schedule(void)

5434

{

5434

{

5435

struct thread_info *ti = current_thread_info();

5435

struct thread_info *ti = current_thread_info();

5436

5437

/*

5437

/*

5438

* If there is a non-zero preempt_count or interrupts are disabled,

5438

* If there is a non-zero preempt_count or interrupts are disabled,

5439

* we do not want to preempt the current task. Just return..

5439

* we do not want to preempt the current task. Just return..

5440

*/

5440

*/

5441

if (likely(ti->preempt_count || irqs_disabled()))

5441

if (likely(ti->preempt_count || irqs_disabled()))

5442

return;

5442

return;

5443

5444

do {

5444

do {

5445

add_preempt_count(PREEMPT_ACTIVE);

5445

add_preempt_count(PREEMPT_ACTIVE);

5446

schedule();

5446

schedule();

5447

sub_preempt_count(PREEMPT_ACTIVE);

5447

sub_preempt_count(PREEMPT_ACTIVE);

5448

5449

/*

5449

/*

5450

* Check again in case we missed a preemption opportunity

5450

* Check again in case we missed a preemption opportunity

5451

* between schedule and now.

5451

* between schedule and now.

5452

*/

5452

*/

5453

barrier();

5453

barrier();

5454

} while (need_resched());

5454

} while (need_resched());

5455

}

5455

}

5456

EXPORT_SYMBOL(preempt_schedule);

5456

EXPORT_SYMBOL(preempt_schedule);

5457

5458

/*

5458

/*

5459

* this is the entry point to schedule() from kernel preemption

5459

* this is the entry point to schedule() from kernel preemption

5460

* off of irq context.

5460

* off of irq context.

5461

* Note, that this is called and return with irqs disabled. This will

5461

* Note, that this is called and return with irqs disabled. This will

5462

* protect us against recursive calling from irq.

5462

* protect us against recursive calling from irq.

5463

*/

5463

*/

5464

asmlinkage void __sched preempt_schedule_irq(void)

5464

asmlinkage void __sched preempt_schedule_irq(void)

5465

{

5465

{

5466

struct thread_info *ti = current_thread_info();

5466

struct thread_info *ti = current_thread_info();

5467

5468

/* Catch callers which need to be fixed */

5468

/* Catch callers which need to be fixed */

5469

BUG_ON(ti->preempt_count || !irqs_disabled());

5469

BUG_ON(ti->preempt_count || !irqs_disabled());

5470

5471

do {

5471

do {

5472

add_preempt_count(PREEMPT_ACTIVE);

5472

add_preempt_count(PREEMPT_ACTIVE);

5473

local_irq_enable();

5473

local_irq_enable();

5474

schedule();

5474

schedule();

5475

local_irq_disable();

5475

local_irq_disable();

5476

sub_preempt_count(PREEMPT_ACTIVE);

5476

sub_preempt_count(PREEMPT_ACTIVE);

5477

5478

/*

5478

/*

5479

* Check again in case we missed a preemption opportunity

5479

* Check again in case we missed a preemption opportunity

5480

* between schedule and now.

5480

* between schedule and now.

5481

*/

5481

*/

5482

barrier();

5482

barrier();

5483

} while (need_resched());

5483

} while (need_resched());

5484

}

5484

}

5485

5486

#endif /* CONFIG_PREEMPT */

5486

#endif /* CONFIG_PREEMPT */

5487

5488

/*
 * Default wait-queue callback: wake the task stored in @curr->private.
 * @key is not used.  Returns the result of try_to_wake_up().
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
			  void *key)
{
	struct task_struct *task = curr->private;

	return try_to_wake_up(task, mode, sync);
}
EXPORT_SYMBOL(default_wake_function);

5494

5495

/*

5495

/*

5496

* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just

5496

* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just

5497

* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve

5497

* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve

5498

* number) then we wake all the non-exclusive tasks and one exclusive task.

5498

* number) then we wake all the non-exclusive tasks and one exclusive task.

5499

*

5499

*

5500

* There are circumstances in which we can try to wake a task which has already

5500

* There are circumstances in which we can try to wake a task which has already

5501

* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns

5501

* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns

5502

* zero in this (rare) case, and we handle it by continuing to scan the queue.

5502

* zero in this (rare) case, and we handle it by continuing to scan the queue.

5503

*/

5503

*/

5504

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,

5504

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,

5505

int nr_exclusive, int sync, void *key)

5505

int nr_exclusive, int sync, void *key)

5506

{

5506

{

5507

wait_queue_t *curr, *next;

5507

wait_queue_t *curr, *next;

5508

5509

list_for_each_entry_safe(curr, next, &q->task_list, task_list) {

5509

list_for_each_entry_safe(curr, next, &q->task_list, task_list) {

5510

unsigned flags = curr->flags;

5510

unsigned flags = curr->flags;

5511

5512

if (curr->func(curr, mode, sync, key) &&

5512

if (curr->func(curr, mode, sync, key) &&

5513

(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)

5513

(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)

5514

break;

5514

break;

5515

}

5515

}

5516

}

5516

}

5517

5518

/**

5518

/**

5519

* __wake_up - wake up threads blocked on a waitqueue.

5519

* __wake_up - wake up threads blocked on a waitqueue.

5520

* @q: the waitqueue

5520

* @q: the waitqueue

5521

* @mode: which threads

5521

* @mode: which threads

5522

* @nr_exclusive: how many wake-one or wake-many threads to wake up

5522

* @nr_exclusive: how many wake-one or wake-many threads to wake up

5523

* @key: is directly passed to the wakeup function

5523

* @key: is directly passed to the wakeup function

5524

*

5524

*

5525

* It may be assumed that this function implies a write memory barrier before

5525

* It may be assumed that this function implies a write memory barrier before

5526

* changing the task state if and only if any tasks are woken up.

5526

* changing the task state if and only if any tasks are woken up.

5527

*/

5527

*/

5528

void __wake_up(wait_queue_head_t *q, unsigned int mode,

5528

void __wake_up(wait_queue_head_t *q, unsigned int mode,

5529

int nr_exclusive, void *key)

5529

int nr_exclusive, void *key)

5530

{

5530

{

5531

unsigned long flags;

5531

unsigned long flags;

5532

5533

spin_lock_irqsave(&q->lock, flags);

5533

spin_lock_irqsave(&q->lock, flags);

5534

__wake_up_common(q, mode, nr_exclusive, 0, key);

5534

__wake_up_common(q, mode, nr_exclusive, 0, key);

5535

spin_unlock_irqrestore(&q->lock, flags);

5535

spin_unlock_irqrestore(&q->lock, flags);

5536

}

5536

}

5537

EXPORT_SYMBOL(__wake_up);

5537

EXPORT_SYMBOL(__wake_up);

5538

5539

/*

5539

/*

5540

* Same as __wake_up but called with the spinlock in wait_queue_head_t held.

5540

* Same as __wake_up but called with the spinlock in wait_queue_head_t held.

5541

*/

5541

*/

5542

void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)

5542

void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)

5543

{

5543

{

5544

__wake_up_common(q, mode, 1, 0, NULL);

5544

__wake_up_common(q, mode, 1, 0, NULL);

5545

}

5545

}

5546

5547

/*
 * Like __wake_up_locked(), but forwards @key to the wakeup functions.
 * Takes no lock itself.
 */
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
	const int nr_exclusive = 1;

	__wake_up_common(q, mode, nr_exclusive, 0, key);
}

5551

5552

/**

5552

/**

5553

* __wake_up_sync_key - wake up threads blocked on a waitqueue.

5553

* __wake_up_sync_key - wake up threads blocked on a waitqueue.

5554

* @q: the waitqueue

5554

* @q: the waitqueue

5555

* @mode: which threads

5555

* @mode: which threads

5556

* @nr_exclusive: how many wake-one or wake-many threads to wake up

5556

* @nr_exclusive: how many wake-one or wake-many threads to wake up

5557

* @key: opaque value to be passed to wakeup targets

5557

* @key: opaque value to be passed to wakeup targets

5558

*

5558

*

5559

* The sync wakeup differs that the waker knows that it will schedule

5559

* The sync wakeup differs that the waker knows that it will schedule

5560

* away soon, so while the target thread will be woken up, it will not

5560

* away soon, so while the target thread will be woken up, it will not

5561

* be migrated to another CPU - ie. the two threads are 'synchronized'

5561

* be migrated to another CPU - ie. the two threads are 'synchronized'

5562

* with each other. This can prevent needless bouncing between CPUs.

5562

* with each other. This can prevent needless bouncing between CPUs.

5563

*

5563

*

5564

* On UP it can prevent extra preemption.

5564

* On UP it can prevent extra preemption.

5565

*

5565

*

5566

* It may be assumed that this function implies a write memory barrier before

5566

* It may be assumed that this function implies a write memory barrier before

5567

* changing the task state if and only if any tasks are woken up.

5567

* changing the task state if and only if any tasks are woken up.

5568

*/

5568

*/

5569

void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,

5569

void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,

5570

int nr_exclusive, void *key)

5570

int nr_exclusive, void *key)

5571

{

5571

{

5572

unsigned long flags;

5572

unsigned long flags;

5573

int sync = 1;

5573

int sync = 1;

5574

5575

if (unlikely(!q))

5575

if (unlikely(!q))

5576

return;

5576

return;

5577

5578

if (unlikely(!nr_exclusive))

5578

if (unlikely(!nr_exclusive))

5579

sync = 0;

5579

sync = 0;

5580

5581

spin_lock_irqsave(&q->lock, flags);

5581

spin_lock_irqsave(&q->lock, flags);

5582

__wake_up_common(q, mode, nr_exclusive, sync, key);

5582

__wake_up_common(q, mode, nr_exclusive, sync, key);

5583

spin_unlock_irqrestore(&q->lock, flags);

5583

spin_unlock_irqrestore(&q->lock, flags);

5584

}

5584

}

5585

EXPORT_SYMBOL_GPL(__wake_up_sync_key);

5585

EXPORT_SYMBOL_GPL(__wake_up_sync_key);

5586

5587

/*

5587

/*

5588

* __wake_up_sync - see __wake_up_sync_key()

5588

* __wake_up_sync - see __wake_up_sync_key()

5589

*/

5589

*/

5590

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)

5590

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)

5591

{

5591

{

5592

__wake_up_sync_key(q, mode, nr_exclusive, NULL);

5592

__wake_up_sync_key(q, mode, nr_exclusive, NULL);

5593

}

5593

}

5594

EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */

5594

EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */

5595

5596

/**

5596

/**

5597

* complete: - signals a single thread waiting on this completion

5597

* complete: - signals a single thread waiting on this completion

5598

* @x: holds the state of this particular completion

5598

* @x: holds the state of this particular completion

5599

*

5599

*

5600

* This will wake up a single thread waiting on this completion. Threads will be

5600

* This will wake up a single thread waiting on this completion. Threads will be

5601

* awakened in the same order in which they were queued.

5601

* awakened in the same order in which they were queued.

5602

*

5602

*

5603

* See also complete_all(), wait_for_completion() and related routines.

5603

* See also complete_all(), wait_for_completion() and related routines.

5604

*

5604

*

5605

* It may be assumed that this function implies a write memory barrier before

5605

* It may be assumed that this function implies a write memory barrier before

5606

* changing the task state if and only if any tasks are woken up.

5606

* changing the task state if and only if any tasks are woken up.

5607

*/

5607

*/

5608

void complete(struct completion *x)

5608

void complete(struct completion *x)

5609

{

5609

{

5610

unsigned long flags;

5610

unsigned long flags;

5611

5612

spin_lock_irqsave(&x->wait.lock, flags);

5612

spin_lock_irqsave(&x->wait.lock, flags);

5613

x->done++;

5613

x->done++;

5614

__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);

5614

__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);

5615

spin_unlock_irqrestore(&x->wait.lock, flags);

5615

spin_unlock_irqrestore(&x->wait.lock, flags);

5616

}

5616

}

5617

EXPORT_SYMBOL(complete);

5617

EXPORT_SYMBOL(complete);

5618

5619

/**

5619

/**

5620

* complete_all: - signals all threads waiting on this completion

5620

* complete_all: - signals all threads waiting on this completion

5621

* @x: holds the state of this particular completion

5621

* @x: holds the state of this particular completion

5622

*

5622

*

5623

* This will wake up all threads waiting on this particular completion event.

5623

* This will wake up all threads waiting on this particular completion event.

5624

*

5624

*

5625

* It may be assumed that this function implies a write memory barrier before

5625

* It may be assumed that this function implies a write memory barrier before

5626

* changing the task state if and only if any tasks are woken up.

5626

* changing the task state if and only if any tasks are woken up.

5627

*/

5627

*/

5628

void complete_all(struct completion *x)

5628

void complete_all(struct completion *x)

5629

{

5629

{

5630

unsigned long flags;

5630

unsigned long flags;

5631

5632

spin_lock_irqsave(&x->wait.lock, flags);

5632

spin_lock_irqsave(&x->wait.lock, flags);

5633

x->done += UINT_MAX/2;

5633

x->done += UINT_MAX/2;

5634

__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);

5634

__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);

5635

spin_unlock_irqrestore(&x->wait.lock, flags);

5635

spin_unlock_irqrestore(&x->wait.lock, flags);

5636

}

5636

}

5637

EXPORT_SYMBOL(complete_all);

5637

EXPORT_SYMBOL(complete_all);

5638

5639

static inline long __sched

5639

static inline long __sched

5640

do_wait_for_common(struct completion *x, long timeout, int state)

5640

do_wait_for_common(struct completion *x, long timeout, int state)

5641

{

5641

{

5642

if (!x->done) {

5642

if (!x->done) {

5643

DECLARE_WAITQUEUE(wait, current);

5643

DECLARE_WAITQUEUE(wait, current);

5644

5645

wait.flags |= WQ_FLAG_EXCLUSIVE;

5645

wait.flags |= WQ_FLAG_EXCLUSIVE;

5646

__add_wait_queue_tail(&x->wait, &wait);

5646

__add_wait_queue_tail(&x->wait, &wait);

5647

do {

5647

do {

5648

if (signal_pending_state(state, current)) {

5648

if (signal_pending_state(state, current)) {

5649

timeout = -ERESTARTSYS;

5649

timeout = -ERESTARTSYS;

5650

break;

5650

break;

5651

}

5651

}

5652

__set_current_state(state);

5652

__set_current_state(state);

5653

spin_unlock_irq(&x->wait.lock);

5653

spin_unlock_irq(&x->wait.lock);

5654

timeout = schedule_timeout(timeout);

5654

timeout = schedule_timeout(timeout);

5655

spin_lock_irq(&x->wait.lock);

5655

spin_lock_irq(&x->wait.lock);

5656

} while (!x->done && timeout);

5656

} while (!x->done && timeout);

5657

__remove_wait_queue(&x->wait, &wait);

5657

__remove_wait_queue(&x->wait, &wait);

5658

if (!x->done)

5658

if (!x->done)

5659

return timeout;

5659

return timeout;

5660

}

5660

}

5661

x->done--;

5661

x->done--;

5662

return timeout ?: 1;

5662

return timeout ?: 1;

5663

}

5663

}

5664

5665

static long __sched

5665

static long __sched

5666

wait_for_common(struct completion *x, long timeout, int state)

5666

wait_for_common(struct completion *x, long timeout, int state)

5667

{

5667

{

5668

might_sleep();

5668

might_sleep();

5669

5670

spin_lock_irq(&x->wait.lock);

5670

spin_lock_irq(&x->wait.lock);

5671

timeout = do_wait_for_common(x, timeout, state);

5671

timeout = do_wait_for_common(x, timeout, state);

5672

spin_unlock_irq(&x->wait.lock);

5672

spin_unlock_irq(&x->wait.lock);

5673

return timeout;

5673

return timeout;

5674

}

5674

}

5675

5676

/**

5676

/**

5677

* wait_for_completion: - waits for completion of a task

5677

* wait_for_completion: - waits for completion of a task

5678

* @x: holds the state of this particular completion

5678

* @x: holds the state of this particular completion

5679

*

5679

*

5680

* This waits to be signaled for completion of a specific task. It is NOT

5680

* This waits to be signaled for completion of a specific task. It is NOT

5681

* interruptible and there is no timeout.

5681

* interruptible and there is no timeout.

5682

*

5682

*

5683

* See also similar routines (i.e. wait_for_completion_timeout()) with timeout

5683

* See also similar routines (i.e. wait_for_completion_timeout()) with timeout

5684

* and interrupt capability. Also see complete().

5684

* and interrupt capability. Also see complete().

5685

*/

5685

*/

5686

void __sched wait_for_completion(struct completion *x)

5686

void __sched wait_for_completion(struct completion *x)

5687

{

5687

{

5688

wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);

5688

wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);

5689

}

5689

}

5690

EXPORT_SYMBOL(wait_for_completion);

5690

EXPORT_SYMBOL(wait_for_completion);

5691

5692

/**

5692

/**

5693

* wait_for_completion_timeout: - waits for completion of a task (w/timeout)

5693

* wait_for_completion_timeout: - waits for completion of a task (w/timeout)

5694

* @x: holds the state of this particular completion

5694

* @x: holds the state of this particular completion

5695

* @timeout: timeout value in jiffies

5695

* @timeout: timeout value in jiffies

5696

*

5696

*

5697

* This waits for either a completion of a specific task to be signaled or for a

5697

* This waits for either a completion of a specific task to be signaled or for a

5698

* specified timeout to expire. The timeout is in jiffies. It is not

5698

* specified timeout to expire. The timeout is in jiffies. It is not

5699

* interruptible.

5699

* interruptible.

5700

*/

5700

*/

5701

unsigned long __sched

5701

unsigned long __sched

5702

wait_for_completion_timeout(struct completion *x, unsigned long timeout)

5702

wait_for_completion_timeout(struct completion *x, unsigned long timeout)

5703

{

5703

{

5704

return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);

5704

return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);

5705

}

5705

}

5706

EXPORT_SYMBOL(wait_for_completion_timeout);

5706

EXPORT_SYMBOL(wait_for_completion_timeout);

5707

5708

/**

5708

/**

5709

* wait_for_completion_interruptible: - waits for completion of a task (w/intr)

5709

* wait_for_completion_interruptible: - waits for completion of a task (w/intr)

5710

* @x: holds the state of this particular completion

5710

* @x: holds the state of this particular completion

5711

*

5711

*

5712

* This waits for completion of a specific task to be signaled. It is

5712

* This waits for completion of a specific task to be signaled. It is

5713

* interruptible.

5713

* interruptible.

5714

*/

5714

*/

5715

int __sched wait_for_completion_interruptible(struct completion *x)

5715

int __sched wait_for_completion_interruptible(struct completion *x)

5716

{

5716

{

5717

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

5717

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

5718

if (t == -ERESTARTSYS)

5718

if (t == -ERESTARTSYS)

5719

return t;

5719

return t;

5720

return 0;

5720

return 0;

5721

}

5721

}

5722

EXPORT_SYMBOL(wait_for_completion_interruptible);

5722

EXPORT_SYMBOL(wait_for_completion_interruptible);

5723

5724

/**

5724

/**

5725

* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))

5725

* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))

5726

* @x: holds the state of this particular completion

5726

* @x: holds the state of this particular completion

5727

* @timeout: timeout value in jiffies

5727

* @timeout: timeout value in jiffies

5728

*

5728

*

5729

* This waits for either a completion of a specific task to be signaled or for a

5729

* This waits for either a completion of a specific task to be signaled or for a

5730

* specified timeout to expire. It is interruptible. The timeout is in jiffies.

5730

* specified timeout to expire. It is interruptible. The timeout is in jiffies.

5731

*/

5731

*/

5732

unsigned long __sched

5732

unsigned long __sched

5733

wait_for_completion_interruptible_timeout(struct completion *x,

5733

wait_for_completion_interruptible_timeout(struct completion *x,

5734

unsigned long timeout)

5734

unsigned long timeout)

5735

{

5735

{

5736

return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);

5736

return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);

5737

}

5737

}

5738

EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);

5738

EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);

5739

5740

/**

5740

/**

5741

* wait_for_completion_killable: - waits for completion of a task (killable)

5741

* wait_for_completion_killable: - waits for completion of a task (killable)

5742

* @x: holds the state of this particular completion

5742

* @x: holds the state of this particular completion

5743

*

5743

*

5744

* This waits to be signaled for completion of a specific task. It can be

5744

* This waits to be signaled for completion of a specific task. It can be

5745

* interrupted by a kill signal.

5745

* interrupted by a kill signal.

5746

*/

5746

*/

5747

int __sched wait_for_completion_killable(struct completion *x)

5747

int __sched wait_for_completion_killable(struct completion *x)

5748

{

5748

{

5749

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);

5749

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);

5750

if (t == -ERESTARTSYS)

5750

if (t == -ERESTARTSYS)

5751

return t;

5751

return t;

5752

return 0;

5752

return 0;

5753

}

5753

}

5754

EXPORT_SYMBOL(wait_for_completion_killable);

5754

EXPORT_SYMBOL(wait_for_completion_killable);

5755

5756

/**

5756

/**

5757

* try_wait_for_completion - try to decrement a completion without blocking

5757

* try_wait_for_completion - try to decrement a completion without blocking

5758

* @x: completion structure

5758

* @x: completion structure

5759

*

5759

*

5760

* Returns: 0 if a decrement cannot be done without blocking

5760

* Returns: 0 if a decrement cannot be done without blocking

5761

* 1 if a decrement succeeded.

5761

* 1 if a decrement succeeded.

5762

*

5762

*

5763

* If a completion is being used as a counting completion,

5763

* If a completion is being used as a counting completion,

5764

* attempt to decrement the counter without blocking. This

5764

* attempt to decrement the counter without blocking. This

5765

* enables us to avoid waiting if the resource the completion

5765

* enables us to avoid waiting if the resource the completion

5766

* is protecting is not available.

5766

* is protecting is not available.

5767

*/

5767

*/

5768

bool try_wait_for_completion(struct completion *x)

5768

bool try_wait_for_completion(struct completion *x)

5769

{

5769

{

5770

int ret = 1;

5770

int ret = 1;

5771

5772

spin_lock_irq(&x->wait.lock);

5772

spin_lock_irq(&x->wait.lock);

5773

if (!x->done)

5773

if (!x->done)

5774

ret = 0;

5774

ret = 0;

5775

else

5775

else

5776

x->done--;

5776

x->done--;

5777

spin_unlock_irq(&x->wait.lock);

5777

spin_unlock_irq(&x->wait.lock);

5778

return ret;

5778

return ret;

5779

}

5779

}

5780

EXPORT_SYMBOL(try_wait_for_completion);

5780

EXPORT_SYMBOL(try_wait_for_completion);

5781

5782

/**

5782

/**

5783

* completion_done - Test to see if a completion has any waiters

5783

* completion_done - Test to see if a completion has any waiters

5784

* @x: completion structure

5784

* @x: completion structure

5785

*

5785

*

5786

* Returns: 0 if there are waiters (wait_for_completion() in progress)

5786

* Returns: 0 if there are waiters (wait_for_completion() in progress)

5787

* 1 if there are no waiters.

5787

* 1 if there are no waiters.

5788

*

5788

*

5789

*/

5789

*/

5790

bool completion_done(struct completion *x)

5790

bool completion_done(struct completion *x)

5791

{

5791

{

5792

int ret = 1;

5792

int ret = 1;

5793

5794

spin_lock_irq(&x->wait.lock);

5794

spin_lock_irq(&x->wait.lock);

5795

if (!x->done)

5795

if (!x->done)

5796

ret = 0;

5796

ret = 0;

5797

spin_unlock_irq(&x->wait.lock);

5797

spin_unlock_irq(&x->wait.lock);

5798

return ret;

5798

return ret;

5799

}

5799

}

5800

EXPORT_SYMBOL(completion_done);

5800

EXPORT_SYMBOL(completion_done);

5801

5802

static long __sched

5802

static long __sched

5803

sleep_on_common(wait_queue_head_t *q, int state, long timeout)

5803

sleep_on_common(wait_queue_head_t *q, int state, long timeout)

5804

{

5804

{

5805

unsigned long flags;

5805

unsigned long flags;

5806

wait_queue_t wait;

5806

wait_queue_t wait;

5807

5808

init_waitqueue_entry(&wait, current);

5808

init_waitqueue_entry(&wait, current);

5809

5810

__set_current_state(state);

5810

__set_current_state(state);

5811

5812

spin_lock_irqsave(&q->lock, flags);

5812

spin_lock_irqsave(&q->lock, flags);

5813

__add_wait_queue(q, &wait);

5813

__add_wait_queue(q, &wait);

5814

spin_unlock(&q->lock);

5814

spin_unlock(&q->lock);

5815

timeout = schedule_timeout(timeout);

5815

timeout = schedule_timeout(timeout);

5816

spin_lock_irq(&q->lock);

5816

spin_lock_irq(&q->lock);

5817

__remove_wait_queue(q, &wait);

5817

__remove_wait_queue(q, &wait);

5818

spin_unlock_irqrestore(&q->lock, flags);

5818

spin_unlock_irqrestore(&q->lock, flags);

5819

5820

return timeout;

5820

return timeout;

5821

}

5821

}

5822

5823

/*
 * interruptible_sleep_on - sleep on @q in TASK_INTERRUPTIBLE, no timeout.
 * @q:	wait queue to sleep on
 *
 * Thin wrapper around sleep_on_common(); its return value is discarded.
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	(void)sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);

5828

5829

long __sched

5829

long __sched

5830

interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)

5830

interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)

5831

{

5831

{

5832

return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);

5832

return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);

5833

}

5833

}

5834

EXPORT_SYMBOL(interruptible_sleep_on_timeout);

5834

EXPORT_SYMBOL(interruptible_sleep_on_timeout);

5835

5836

/*
 * sleep_on - sleep on @q in TASK_UNINTERRUPTIBLE, no timeout.
 * @q:	wait queue to sleep on
 *
 * Thin wrapper around sleep_on_common(); its return value is discarded.
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	(void)sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);

5841

5842

/*
 * sleep_on_timeout - sleep on @q in TASK_UNINTERRUPTIBLE.
 * @q:		wait queue to sleep on
 * @timeout:	timeout passed through to sleep_on_common()
 *
 * Returns whatever sleep_on_common() returns.
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	long left;

	left = sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
	return left;
}
EXPORT_SYMBOL(sleep_on_timeout);

5847

5848

#ifdef CONFIG_RT_MUTEXES

/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * Changes the 'effective' priority (->prio) of @p and selects the matching
 * scheduling class; unlike __setscheduler() it leaves ->normal_prio alone.
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 *
 * The whole update runs under the task's runqueue lock. A queued task is
 * dequeued before the change and re-enqueued after it; a task that is the
 * runqueue's current task gets put_prev_task() from its old class and
 * set_curr_task() from its new one.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	const struct sched_class *prev_class = p->sched_class;
	int prev_prio, queued, is_curr;
	unsigned long flags;
	struct rq *rq;

	/* Note: prio == MAX_PRIO itself passes this check. */
	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = task_rq_lock(p, &flags);
	update_rq_clock(rq);

	prev_prio = p->prio;
	queued = p->se.on_rq;
	is_curr = task_current(rq, p);

	/* Detach from the old class before touching class/priority. */
	if (queued)
		dequeue_task(rq, p, 0);
	if (is_curr)
		p->sched_class->put_prev_task(rq, p);

	p->sched_class = rt_prio(prio) ? &rt_sched_class : &fair_sched_class;
	p->prio = prio;

	/* Re-attach through the (possibly new) class. */
	if (is_curr)
		p->sched_class->set_curr_task(rq);
	if (queued) {
		enqueue_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, prev_prio, is_curr);
	}
	task_rq_unlock(rq, &flags);
}

#endif

5898

5899

void set_user_nice(struct task_struct *p, long nice)

5899

void set_user_nice(struct task_struct *p, long nice)

5900

{

5900

{

5901

int old_prio, delta, on_rq;

5901

int old_prio, delta, on_rq;

5902

unsigned long flags;

5902

unsigned long flags;

5903

struct rq *rq;

5903

struct rq *rq;

5904

5905

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)

5905

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)

5906

return;

5906

return;

5907

/*

5907

/*

5908

* We have to be careful, if called from sys_setpriority(),

5908

* We have to be careful, if called from sys_setpriority(),

5909

* the task might be in the middle of scheduling on another CPU.

5909

* the task might be in the middle of scheduling on another CPU.

5910

*/

5910

*/

5911

rq = task_rq_lock(p, &flags);

5911

rq = task_rq_lock(p, &flags);

5912

update_rq_clock(rq);

5912

update_rq_clock(rq);

5913

/*

5913

/*

5914

* The RT priorities are set via sched_setscheduler(), but we still

5914

* The RT priorities are set via sched_setscheduler(), but we still

5915

* allow the 'normal' nice value to be set - but as expected

5915

* allow the 'normal' nice value to be set - but as expected

5916

* it wont have any effect on scheduling until the task is

5916

* it wont have any effect on scheduling until the task is

5917

* SCHED_FIFO/SCHED_RR:

5917

* SCHED_FIFO/SCHED_RR:

5918

*/

5918

*/

5919

if (task_has_rt_policy(p)) {

5919

if (task_has_rt_policy(p)) {

5920

p->static_prio = NICE_TO_PRIO(nice);

5920

p->static_prio = NICE_TO_PRIO(nice);

5921

goto out_unlock;

5921

goto out_unlock;

5922

}

5922

}

5923

on_rq = p->se.on_rq;

5923

on_rq = p->se.on_rq;

5924

if (on_rq)

5924

if (on_rq)

5925

dequeue_task(rq, p, 0);

5925

dequeue_task(rq, p, 0);

5926

5927

p->static_prio = NICE_TO_PRIO(nice);

5927

p->static_prio = NICE_TO_PRIO(nice);

5928

set_load_weight(p);

5928

set_load_weight(p);

5929

old_prio = p->prio;

5929

old_prio = p->prio;

5930

p->prio = effective_prio(p);

5930

p->prio = effective_prio(p);

5931

delta = p->prio - old_prio;

5931

delta = p->prio - old_prio;

5932

5933

if (on_rq) {

5933

if (on_rq) {

5934

enqueue_task(rq, p, 0);

5934

enqueue_task(rq, p, 0);

5935

/*

5935

/*

5936

* If the task increased its priority or is running and

5936

* If the task increased its priority or is running and

5937

* lowered its priority, then reschedule its CPU:

5937

* lowered its priority, then reschedule its CPU:

5938

*/

5938

*/

5939

if (delta < 0 || (delta > 0 && task_running(rq, p)))

5939

if (delta < 0 || (delta > 0 && task_running(rq, p)))

5940

resched_task(rq->curr);

5940

resched_task(rq->curr);

5941

}

5941

}

5942

out_unlock:

5942

out_unlock:

5943

task_rq_unlock(rq, &flags);

5943

task_rq_unlock(rq, &flags);

5944

}

5944

}

5945

EXPORT_SYMBOL(set_user_nice);

5945

EXPORT_SYMBOL(set_user_nice);

5946

5947

/*

5947

/*

5948

* can_nice - check if a task can reduce its nice value

5948

* can_nice - check if a task can reduce its nice value

5949

* @p: task

5949

* @p: task

5950

* @nice: nice value

5950

* @nice: nice value

5951

*/

5951

*/

5952

int can_nice(const struct task_struct *p, const int nice)

5952

int can_nice(const struct task_struct *p, const int nice)

5953

{

5953

{

5954

/* convert nice value [19,-20] to rlimit style value [1,40] */

5954

/* convert nice value [19,-20] to rlimit style value [1,40] */

5955

int nice_rlim = 20 - nice;

5955

int nice_rlim = 20 - nice;

5956

5957

return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||

5957

return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||

5958

capable(CAP_SYS_NICE));

5958

capable(CAP_SYS_NICE));

5959

}

5959

}

5960

5961

#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 *
 * @increment is clamped to -40..40 and the resulting nice value to
 * -20..19. Returns -EPERM when a negative increment is refused by
 * can_nice(), the security hook's error if it rejects the change,
 * and 0 on success.
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	if (increment < -40)
		increment = -40;
	else if (increment > 40)
		increment = 40;

	nice = TASK_NICE(current) + increment;
	if (nice < -20)
		nice = -20;
	else if (nice > 19)
		nice = 19;

	/* Only a negative increment needs the permission check. */
	if (increment < 0) {
		if (!can_nice(current, nice))
			return -EPERM;
	}

	retval = security_task_setnice(current, nice);
	if (retval != 0)
		return retval;

	set_user_nice(current, nice);
	return 0;
}

#endif

6002

6003

/**

6003

/**

6004

* task_prio - return the priority value of a given task.

6004

* task_prio - return the priority value of a given task.

6005

* @p: the task in question.

6005

* @p: the task in question.

6006

*

6006

*

6007

* This is the priority value as seen by users in /proc.

6007

* This is the priority value as seen by users in /proc.

6008

* RT tasks are offset by -200. Normal tasks are centered

6008

* RT tasks are offset by -200. Normal tasks are centered

6009

* around 0, value goes from -16 to +15.

6009

* around 0, value goes from -16 to +15.

6010

*/

6010

*/

6011

int task_prio(const struct task_struct *p)

6011

int task_prio(const struct task_struct *p)

6012

{

6012

{

6013

return p->prio - MAX_RT_PRIO;

6013

return p->prio - MAX_RT_PRIO;

6014

}

6014

}

6015

6016

/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Function form of the TASK_NICE() macro, exported for modules.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL(task_nice);

6025

6026

/**

6026

/**

6027

* idle_cpu - is a given cpu idle currently?

6027

* idle_cpu - is a given cpu idle currently?

6028

* @cpu: the processor in question.

6028

* @cpu: the processor in question.

6029

*/

6029

*/

6030

int idle_cpu(int cpu)

6030

int idle_cpu(int cpu)

6031

{

6031

{

6032

return cpu_curr(cpu) == cpu_rq(cpu)->idle;

6032

return cpu_curr(cpu) == cpu_rq(cpu)->idle;

6033

}

6033

}

6034

6035

/**

6035

/**

6036

* idle_task - return the idle task for a given cpu.

6036

* idle_task - return the idle task for a given cpu.

6037

* @cpu: the processor in question.

6037

* @cpu: the processor in question.

6038

*/

6038

*/

6039

struct task_struct *idle_task(int cpu)

6039

struct task_struct *idle_task(int cpu)

6040

{

6040

{

6041

return cpu_rq(cpu)->idle;

6041

return cpu_rq(cpu)->idle;

6042

}

6042

}

6043

6044

/**

6044

/**

6045

* find_process_by_pid - find a process with a matching PID value.

6045

* find_process_by_pid - find a process with a matching PID value.

6046

* @pid: the pid in question.

6046

* @pid: the pid in question.

6047

*/

6047

*/

6048

static struct task_struct *find_process_by_pid(pid_t pid)

6048

static struct task_struct *find_process_by_pid(pid_t pid)

6049

{

6049

{

6050

return pid ? find_task_by_vpid(pid) : current;

6050

return pid ? find_task_by_vpid(pid) : current;

6051

}

6051

}

6052

6053

/* Actually do priority change: must hold rq lock. */

6053

/* Actually do priority change: must hold rq lock. */

6054

static void

6054

static void

6055

__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)

6055

__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)

6056

{

6056

{

6057

BUG_ON(p->se.on_rq);

6057

BUG_ON(p->se.on_rq);

6058

6059

p->policy = policy;

6059

p->policy = policy;

6060

switch (p->policy) {

6060

switch (p->policy) {

6061

case SCHED_NORMAL:

6061

case SCHED_NORMAL:

6062

case SCHED_BATCH:

6062

case SCHED_BATCH:

6063

case SCHED_IDLE:

6063

case SCHED_IDLE:

6064

p->sched_class = &fair_sched_class;

6064

p->sched_class = &fair_sched_class;

6065

break;

6065

break;

6066

case SCHED_FIFO:

6066

case SCHED_FIFO:

6067

case SCHED_RR:

6067

case SCHED_RR:

6068

p->sched_class = &rt_sched_class;

6068

p->sched_class = &rt_sched_class;

6069

break;

6069

break;

6070

}

6070

}

6071

6072

p->rt_priority = prio;

6072

p->rt_priority = prio;

6073

p->normal_prio = normal_prio(p);

6073

p->normal_prio = normal_prio(p);

6074

/* we are holding p->pi_lock already */

6074

/* we are holding p->pi_lock already */

6075

p->prio = rt_mutex_getprio(p);

6075

p->prio = rt_mutex_getprio(p);

6076

set_load_weight(p);

6076

set_load_weight(p);

6077

}

6077

}

6078

6079

/*

6079

/*

6080

* check the target process has a UID that matches the current process's

6080

* check the target process has a UID that matches the current process's

6081

*/

6081

*/

6082

static bool check_same_owner(struct task_struct *p)

6082

static bool check_same_owner(struct task_struct *p)

6083

{

6083

{

6084

const struct cred *cred = current_cred(), *pcred;

6084

const struct cred *cred = current_cred(), *pcred;

6085

bool match;

6085

bool match;

6086

6087

rcu_read_lock();

6087

rcu_read_lock();

6088

pcred = __task_cred(p);

6088

pcred = __task_cred(p);

6089

match = (cred->euid == pcred->euid ||

6089

match = (cred->euid == pcred->euid ||

6090

cred->euid == pcred->uid);

6090

cred->euid == pcred->uid);

6091

rcu_read_unlock();

6091

rcu_read_unlock();

6092

return match;

6092

return match;

6093

}

6093

}

6094

6095

/*
 * __sched_setscheduler - validate and apply a policy/priority change.
 * @p:		task to change
 * @policy:	new policy; a negative value means "keep p's current policy"
 * @param:	carries the requested sched_priority
 * @user:	when true, permission, RT-group and security-hook checks are
 *		performed; when false they are skipped
 *
 * Returns 0 on success, -EINVAL for an unknown policy or an invalid
 * priority/policy combination, -EPERM when a permission check fails,
 * -ESRCH when the task's sighand cannot be locked, or the error returned
 * by security_task_setscheduler().
 *
 * Must not be called from interrupt context (BUG_ON). The change itself
 * is made with p->pi_lock and the task's runqueue lock held; if the
 * task's policy changed between the unlocked checks and taking the locks,
 * everything is re-validated from the "recheck" label.
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				struct sched_param *param, bool user)
{
	const struct sched_class *prev_class = p->sched_class;
	int err, prev_prio, seen_policy = -1, queued, is_curr;
	int max_prio;
	unsigned long flags;
	struct rq *rq;

	/* may grab non-irq protected spin_locks */
	BUG_ON(in_interrupt());
recheck:
	/* double check policy once rq lock held */
	if (policy < 0) {
		policy = seen_policy = p->policy;
	} else {
		switch (policy) {
		case SCHED_FIFO:
		case SCHED_RR:
		case SCHED_NORMAL:
		case SCHED_BATCH:
		case SCHED_IDLE:
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 * Tasks without an mm are checked against MAX_RT_PRIO-1 instead.
	 */
	max_prio = p->mm ? MAX_USER_RT_PRIO - 1 : MAX_RT_PRIO - 1;
	if (param->sched_priority < 0 || param->sched_priority > max_prio)
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Allow unprivileged RT tasks to decrease priority:
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rtprio_limit;

			if (!lock_task_sighand(p, &flags))
				return -ESRCH;
			rtprio_limit = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
			unlock_task_sighand(p, &flags);

			/* can't set/change the rt policy */
			if (policy != p->policy && !rtprio_limit)
				return -EPERM;

			/* can't increase priority */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rtprio_limit)
				return -EPERM;
		}

		/*
		 * Like positive nice levels, dont allow tasks to
		 * move out of SCHED_IDLE either:
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
			return -EPERM;

		/* can't change other user's priorities */
		if (!check_same_owner(p))
			return -EPERM;
	}

	if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
		/*
		 * Do not allow realtime tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
		    task_group(p)->rt_bandwidth.rt_runtime == 0)
			return -EPERM;
#endif

		err = security_task_setscheduler(p, policy, param);
		if (err)
			return err;
	}

	/*
	 * make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 */
	spin_lock_irqsave(&p->pi_lock, flags);
	/*
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = __task_rq_lock(p);

	/* recheck policy now with rq lock held */
	if (unlikely(seen_policy != -1 && seen_policy != p->policy)) {
		policy = seen_policy = -1;
		__task_rq_unlock(rq);
		spin_unlock_irqrestore(&p->pi_lock, flags);
		goto recheck;
	}

	update_rq_clock(rq);
	queued = p->se.on_rq;
	is_curr = task_current(rq, p);

	/* Detach from the old class before changing policy/priority. */
	if (queued)
		deactivate_task(rq, p, 0);
	if (is_curr)
		p->sched_class->put_prev_task(rq, p);

	prev_prio = p->prio;
	__setscheduler(rq, p, policy, param->sched_priority);

	/* Re-attach through the (possibly new) class. */
	if (is_curr)
		p->sched_class->set_curr_task(rq);
	if (queued) {
		activate_task(rq, p, 0);

		check_class_changed(rq, p, prev_class, prev_prio, is_curr);
	}

	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);

	rt_mutex_adjust_pi(p);

	return 0;
}

6216

6217

/**

6217

/**

6218

* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.

6218

* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.

6219

* @p: the task in question.

6219

* @p: the task in question.

6220

* @policy: new policy.

6220

* @policy: new policy.

6221

* @param: structure containing the new RT priority.

6221

* @param: structure containing the new RT priority.

6222

*

6222

*

6223

* NOTE that the task may be already dead.

6223

* NOTE that the task may be already dead.

6224

*/

6224

*/

6225

int sched_setscheduler(struct task_struct *p, int policy,

6225

int sched_setscheduler(struct task_struct *p, int policy,

6226

struct sched_param *param)

6226

struct sched_param *param)

6227

{

6227

{

6228

return __sched_setscheduler(p, policy, param, true);

6228

return __sched_setscheduler(p, policy, param, true);

6229

}

6229

}

6230

EXPORT_SYMBOL_GPL(sched_setscheduler);

6230

EXPORT_SYMBOL_GPL(sched_setscheduler);

6231

6232

/**

6232

/**

6233

* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.

6233

* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.

6234

* @p: the task in question.

6234

* @p: the task in question.

6235

* @policy: new policy.

6235

* @policy: new policy.

6236

* @param: structure containing the new RT priority.

6236

* @param: structure containing the new RT priority.

6237

*

6237

*

6238

* Just like sched_setscheduler, only don't bother checking if the

6238

* Just like sched_setscheduler, only don't bother checking if the

6239

* current context has permission. For example, this is needed in

6239

* current context has permission. For example, this is needed in

6240

* stop_machine(): we create temporary high priority worker threads,

6240

* stop_machine(): we create temporary high priority worker threads,

6241

* but our caller might not have that capability.

6241

* but our caller might not have that capability.

6242

*/

6242

*/

6243

int sched_setscheduler_nocheck(struct task_struct *p, int policy,

6243

int sched_setscheduler_nocheck(struct task_struct *p, int policy,

6244

struct sched_param *param)

6244

struct sched_param *param)

6245

{

6245

{

6246

return __sched_setscheduler(p, policy, param, false);

6246

return __sched_setscheduler(p, policy, param, false);

6247

}

6247

}

6248

6249

static int

6249

static int

6250

do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)

6250

do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)

6251

{

6251

{

6252

struct sched_param lparam;

6252

struct sched_param lparam;

6253

struct task_struct *p;

6253

struct task_struct *p;

6254

int retval;

6254

int retval;

6255

6256

if (!param || pid < 0)

6256

if (!param || pid < 0)

6257

return -EINVAL;

6257

return -EINVAL;

6258

if (copy_from_user(&lparam, param, sizeof(struct sched_param)))

6258

if (copy_from_user(&lparam, param, sizeof(struct sched_param)))

6259

return -EFAULT;

6259

return -EFAULT;

6260

6261

rcu_read_lock();

6261

rcu_read_lock();

6262

retval = -ESRCH;

6262

retval = -ESRCH;

6263

p = find_process_by_pid(pid);

6263

p = find_process_by_pid(pid);

6264

if (p != NULL)

6264

if (p != NULL)

6265

retval = sched_setscheduler(p, policy, &lparam);

6265

retval = sched_setscheduler(p, policy, &lparam);

6266

rcu_read_unlock();

6266

rcu_read_unlock();

6267

6268

return retval;

6268

return retval;

6269

}

6269

}

6270

6271

/**

6271

/**

6272

* sys_sched_setscheduler - set/change the scheduler policy and RT priority

6272

* sys_sched_setscheduler - set/change the scheduler policy and RT priority

6273

* @pid: the pid in question.

6273

* @pid: the pid in question.

6274

* @policy: new policy.

6274

* @policy: new policy.

6275

* @param: structure containing the new RT priority.

6275

* @param: structure containing the new RT priority.

6276

*/

6276

*/

6277

SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,

6277

SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,

6278

struct sched_param __user *, param)

6278

struct sched_param __user *, param)

6279

{

6279

{

6280

/* negative values for policy are not valid */

6280

/* negative values for policy are not valid */

6281

if (policy < 0)

6281

if (policy < 0)

6282

return -EINVAL;

6282

return -EINVAL;

6283

6284

return do_sched_setscheduler(pid, policy, param);

6284

return do_sched_setscheduler(pid, policy, param);

6285

}

6285

}

6286

6287

/**

6287

/**

6288

* sys_sched_setparam - set/change the RT priority of a thread

6288

* sys_sched_setparam - set/change the RT priority of a thread

6289

* @pid: the pid in question.

6289

* @pid: the pid in question.

6290

* @param: structure containing the new RT priority.

6290

* @param: structure containing the new RT priority.

6291

*/

6291

*/

6292

SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)

6292

SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)

6293

{

6293

{

6294

return do_sched_setscheduler(pid, -1, param);

6294

return do_sched_setscheduler(pid, -1, param);

6295

}

6295

}

6296

6297

/**

6297

/**

6298

* sys_sched_getscheduler - get the policy (scheduling class) of a thread

6298

* sys_sched_getscheduler - get the policy (scheduling class) of a thread

6299

* @pid: the pid in question.

6299

* @pid: the pid in question.

6300

*/

6300

*/

6301

SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)

6301

SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)

6302

{

6302

{

6303

struct task_struct *p;

6303

struct task_struct *p;

6304

int retval;

6304

int retval;

6305

6306

if (pid < 0)

6306

if (pid < 0)

6307

return -EINVAL;

6307

return -EINVAL;

6308

6309

retval = -ESRCH;

6309

retval = -ESRCH;

6310

read_lock(&tasklist_lock);

6310

read_lock(&tasklist_lock);

6311

p = find_process_by_pid(pid);

6311

p = find_process_by_pid(pid);

6312

if (p) {

6312

if (p) {

6313

retval = security_task_getscheduler(p);

6313

retval = security_task_getscheduler(p);

6314

if (!retval)

6314

if (!retval)

6315

retval = p->policy;

6315

retval = p->policy;

6316

}

6316

}

6317

read_unlock(&tasklist_lock);

6317

read_unlock(&tasklist_lock);

6318

return retval;

6318

return retval;

6319

}

6319

}

6320

6321

/**

6321

/**

6322

* sys_sched_getscheduler - get the RT priority of a thread

6322

* sys_sched_getscheduler - get the RT priority of a thread

6323

* @pid: the pid in question.

6323

* @pid: the pid in question.

6324

* @param: structure containing the RT priority.

6324

* @param: structure containing the RT priority.

6325

*/

6325

*/

6326

SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)

6326

SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)

6327

{

6327

{

6328

struct sched_param lp;

6328

struct sched_param lp;

6329

struct task_struct *p;

6329

struct task_struct *p;

6330

int retval;

6330

int retval;

6331

6332

if (!param || pid < 0)

6332

if (!param || pid < 0)

6333

return -EINVAL;

6333

return -EINVAL;

6334

6335

read_lock(&tasklist_lock);

6335

read_lock(&tasklist_lock);

6336

p = find_process_by_pid(pid);

6336

p = find_process_by_pid(pid);

6337

retval = -ESRCH;

6337

retval = -ESRCH;

6338

if (!p)

6338

if (!p)

6339

goto out_unlock;

6339

goto out_unlock;

6340

6341

retval = security_task_getscheduler(p);

6341

retval = security_task_getscheduler(p);

6342

if (retval)

6342

if (retval)

6343

goto out_unlock;

6343

goto out_unlock;

6344

6345

lp.sched_priority = p->rt_priority;

6345

lp.sched_priority = p->rt_priority;

6346

read_unlock(&tasklist_lock);

6346

read_unlock(&tasklist_lock);

6347

6348

/*

6348

/*

6349

* This one might sleep, we cannot do it with a spinlock held ...

6349

* This one might sleep, we cannot do it with a spinlock held ...

6350

*/

6350

*/

6351

retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

6351

retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

6352

6353

return retval;

6353

return retval;

6354

6355

out_unlock:

6355

out_unlock:

6356

read_unlock(&tasklist_lock);

6356

read_unlock(&tasklist_lock);

6357

return retval;

6357

return retval;

6358

}

6358

}

6359

6360

/*
 * sched_setaffinity - restrict the task identified by @pid to the cpus in
 * @in_mask, intersected with what its cpuset allows.
 *
 * Returns 0 on success, -ESRCH when no task matches @pid, -ENOMEM when a
 * scratch cpumask cannot be allocated, -EPERM when the caller neither
 * passes check_same_owner() nor has CAP_SYS_NICE, or the error returned by
 * security_task_setscheduler() / set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpuset_mask, effective;
	struct task_struct *task;
	int err;

	get_online_cpus();

	read_lock(&tasklist_lock);
	task = find_process_by_pid(pid);
	if (!task) {
		read_unlock(&tasklist_lock);
		put_online_cpus();
		return -ESRCH;
	}

	/*
	 * set_cpus_allowed must not be called with tasklist_lock held, so
	 * pin the task with a reference and release the lock first.
	 */
	get_task_struct(task);
	read_unlock(&tasklist_lock);

	if (!alloc_cpumask_var(&cpuset_mask, GFP_KERNEL)) {
		err = -ENOMEM;
		goto out_put_task;
	}
	if (!alloc_cpumask_var(&effective, GFP_KERNEL)) {
		err = -ENOMEM;
		goto out_free_cpuset_mask;
	}

	if (!check_same_owner(task) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_free_effective;
	}

	err = security_task_setscheduler(task, 0, NULL);
	if (err)
		goto out_free_effective;

	cpuset_cpus_allowed(task, cpuset_mask);
	cpumask_and(effective, in_mask, cpuset_mask);

	for (;;) {
		err = set_cpus_allowed_ptr(task, effective);
		if (err)
			break;

		/* Re-read the cpuset mask to detect a concurrent update. */
		cpuset_cpus_allowed(task, cpuset_mask);
		if (cpumask_subset(effective, cpuset_mask))
			break;

		/*
		 * We raced with a cpuset update: fall back to the cpuset's
		 * own mask and apply it again.
		 */
		cpumask_copy(effective, cpuset_mask);
	}

out_free_effective:
	free_cpumask_var(effective);
out_free_cpuset_mask:
	free_cpumask_var(cpuset_mask);
out_put_task:
	put_task_struct(task);
	put_online_cpus();
	return err;
}

6426

6427

/*
 * Copy a user-space cpu bitmask of @len bytes into @new_mask.
 *
 * A user buffer longer than cpumask_size() is truncated to that size; for
 * a shorter one @new_mask is cleared first, so the bytes not covered by
 * the copy end up zero.
 *
 * Returns 0 on success or -EFAULT when the user buffer cannot be read.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	if (len > cpumask_size())
		len = cpumask_size();
	else if (len < cpumask_size())
		cpumask_clear(new_mask);

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}

6437

6438

/**

6438

/**

6439

* sys_sched_setaffinity - set the cpu affinity of a process

6439

* sys_sched_setaffinity - set the cpu affinity of a process

6440

* @pid: pid of the process

6440

* @pid: pid of the process

6441

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

6441

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

6442

* @user_mask_ptr: user-space pointer to the new cpu mask

6442

* @user_mask_ptr: user-space pointer to the new cpu mask

6443

*/

6443

*/

6444

SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,

6444

SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,

6445

unsigned long __user *, user_mask_ptr)

6445

unsigned long __user *, user_mask_ptr)

6446

{

6446

{

6447

cpumask_var_t new_mask;

6447

cpumask_var_t new_mask;

6448

int retval;

6448

int retval;

6449

6450

if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))

6450

if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))

6451

return -ENOMEM;

6451

return -ENOMEM;

6452

6453

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);

6453

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);

6454

if (retval == 0)

6454

if (retval == 0)

6455

retval = sched_setaffinity(pid, new_mask);

6455

retval = sched_setaffinity(pid, new_mask);

6456

free_cpumask_var(new_mask);

6456

free_cpumask_var(new_mask);

6457

return retval;

6457

return retval;

6458

}

6458

}

6459

6460

/*
 * sched_getaffinity - store in @mask the cpus the task identified by @pid
 * is allowed on, restricted to the cpus that are currently online.
 *
 * Returns 0 on success, -ESRCH when no task matches @pid, or the error
 * from security_task_getscheduler(); @mask is only written on success.
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *task;
	int err = -ESRCH;

	get_online_cpus();
	read_lock(&tasklist_lock);

	task = find_process_by_pid(pid);
	if (task) {
		err = security_task_getscheduler(task);
		if (!err)
			cpumask_and(mask, &task->cpus_allowed, cpu_online_mask);
	}

	read_unlock(&tasklist_lock);
	put_online_cpus();

	return err;
}

6485

6486

/**

6486

/**

6487

* sys_sched_getaffinity - get the cpu affinity of a process

6487

* sys_sched_getaffinity - get the cpu affinity of a process

6488

* @pid: pid of the process

6488

* @pid: pid of the process

6489

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

6489

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

6490

* @user_mask_ptr: user-space pointer to hold the current cpu mask

6490

* @user_mask_ptr: user-space pointer to hold the current cpu mask

6491

*/

6491

*/

6492

SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,

6492

SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,

6493

unsigned long __user *, user_mask_ptr)

6493

unsigned long __user *, user_mask_ptr)

6494

{

6494

{

6495

int ret;

6495

int ret;

6496

cpumask_var_t mask;

6496

cpumask_var_t mask;

6497

6498

if (len < cpumask_size())

6498

if (len < cpumask_size())

6499

return -EINVAL;

6499

return -EINVAL;

6500

6501

if (!alloc_cpumask_var(&mask, GFP_KERNEL))

6501

if (!alloc_cpumask_var(&mask, GFP_KERNEL))

6502

return -ENOMEM;

6502

return -ENOMEM;

6503

6504

ret = sched_getaffinity(pid, mask);

6504

ret = sched_getaffinity(pid, mask);

6505

if (ret == 0) {

6505

if (ret == 0) {

6506

if (copy_to_user(user_mask_ptr, mask, cpumask_size()))

6506

if (copy_to_user(user_mask_ptr, mask, cpumask_size()))

6507

ret = -EFAULT;

6507

ret = -EFAULT;

6508

else

6508

else

6509

ret = cpumask_size();

6509

ret = cpumask_size();

6510

}

6510

}

6511

free_cpumask_var(mask);

6511

free_cpumask_var(mask);

6512

6513

return ret;

6513

return ret;

6514

}

6514

}

6515

6516

/**

6516

/**

6517

* sys_sched_yield - yield the current processor to other threads.

6517

* sys_sched_yield - yield the current processor to other threads.

6518

*

6518

*

6519

* This function yields the current CPU to other tasks. If there are no

6519

* This function yields the current CPU to other tasks. If there are no

6520

* other threads running on this CPU then this function will return.

6520

* other threads running on this CPU then this function will return.

6521

*/

6521

*/

6522

SYSCALL_DEFINE0(sched_yield)

6522

SYSCALL_DEFINE0(sched_yield)

6523

{

6523

{

6524

struct rq *rq = this_rq_lock();

6524

struct rq *rq = this_rq_lock();

6525

6526

schedstat_inc(rq, yld_count);

6526

schedstat_inc(rq, yld_count);

6527

current->sched_class->yield_task(rq);

6527

current->sched_class->yield_task(rq);

6528

6529

/*

6529

/*

6530

* Since we are going to call schedule() anyway, there's

6530

* Since we are going to call schedule() anyway, there's

6531

* no need to preempt or enable interrupts:

6531

* no need to preempt or enable interrupts:

6532

*/

6532

*/

6533

__release(rq->lock);

6533

__release(rq->lock);

6534

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

6534

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

6535

_raw_spin_unlock(&rq->lock);

6535

_raw_spin_unlock(&rq->lock);

6536

preempt_enable_no_resched();

6536

preempt_enable_no_resched();

6537

6538

schedule();

6538

schedule();

6539

6540

return 0;

6540

return 0;

6541

}

6541

}

6542

6543

static void __cond_resched(void)

6543

static void __cond_resched(void)

6544

{

6544

{

6545

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP

6545

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP

6546

__might_sleep(__FILE__, __LINE__);

6546

__might_sleep(__FILE__, __LINE__);

6547

#endif

6547

#endif

6548

/*

6548

/*

6549

* The BKS might be reacquired before we have dropped

6549

* The BKS might be reacquired before we have dropped

6550

* PREEMPT_ACTIVE, which could trigger a second

6550

* PREEMPT_ACTIVE, which could trigger a second

6551

* cond_resched() call.

6551

* cond_resched() call.

6552

*/

6552

*/

6553

do {

6553

do {

6554

add_preempt_count(PREEMPT_ACTIVE);

6554

add_preempt_count(PREEMPT_ACTIVE);

6555

schedule();

6555

schedule();

6556

sub_preempt_count(PREEMPT_ACTIVE);

6556

sub_preempt_count(PREEMPT_ACTIVE);

6557

} while (need_resched());

6557

} while (need_resched());

6558

}

6558

}

6559

6560

/*
 * Reschedule if one is pending, PREEMPT_ACTIVE is not already set in the
 * preempt count and the system is in the SYSTEM_RUNNING state.
 *
 * Returns 1 if __cond_resched() was called, 0 otherwise.
 */
int __sched _cond_resched(void)
{
	if (!need_resched())
		return 0;
	if (preempt_count() & PREEMPT_ACTIVE)
		return 0;
	if (system_state != SYSTEM_RUNNING)
		return 0;

	__cond_resched();
	return 1;
}
EXPORT_SYMBOL(_cond_resched);

6570

6571

/*

6571

/*

6572

* cond_resched_lock() - if a reschedule is pending, drop the given lock,

6572

* cond_resched_lock() - if a reschedule is pending, drop the given lock,

6573

* call schedule, and on return reacquire the lock.

6573

* call schedule, and on return reacquire the lock.

6574

*

6574

*

6575

* This works OK both with and without CONFIG_PREEMPT. We do strange low-level

6575

* This works OK both with and without CONFIG_PREEMPT. We do strange low-level

6576

* operations here to prevent schedule() from being called twice (once via

6576

* operations here to prevent schedule() from being called twice (once via

6577

* spin_unlock(), once by hand).

6577

* spin_unlock(), once by hand).

6578

*/

6578

*/

6579

int cond_resched_lock(spinlock_t *lock)

6579

int cond_resched_lock(spinlock_t *lock)

6580

{

6580

{

6581

int resched = need_resched() && system_state == SYSTEM_RUNNING;

6581

int resched = need_resched() && system_state == SYSTEM_RUNNING;

6582

int ret = 0;

6582

int ret = 0;

6583

6584

if (spin_needbreak(lock) || resched) {

6584

if (spin_needbreak(lock) || resched) {

6585

spin_unlock(lock);

6585

spin_unlock(lock);

6586

if (resched && need_resched())

6586

if (resched && need_resched())

6587

__cond_resched();

6587

__cond_resched();

6588

else

6588

else

6589

cpu_relax();

6589

cpu_relax();

6590

ret = 1;

6590

ret = 1;

6591

spin_lock(lock);

6591

spin_lock(lock);

6592

}

6592

}

6593

return ret;

6593

return ret;

6594

}

6594

}

6595

EXPORT_SYMBOL(cond_resched_lock);

6595

EXPORT_SYMBOL(cond_resched_lock);

6596

6597

/*
 * Softirq flavour of cond_resched(): must be called from softirq context
 * (BUG otherwise). When a reschedule is pending and the system is in the
 * SYSTEM_RUNNING state, bottom halves are enabled around the call to
 * __cond_resched() and disabled again afterwards.
 *
 * Returns 1 if __cond_resched() was called, 0 otherwise.
 */
int __sched cond_resched_softirq(void)
{
	BUG_ON(!in_softirq());

	if (!need_resched() || system_state != SYSTEM_RUNNING)
		return 0;

	local_bh_enable();
	__cond_resched();
	local_bh_disable();

	return 1;
}
EXPORT_SYMBOL(cond_resched_softirq);

6610

6611

/**

6611

/**

6612

* yield - yield the current processor to other threads.

6612

* yield - yield the current processor to other threads.

6613

*

6613

*

6614

* This is a shortcut for kernel-space yielding - it marks the

6614

* This is a shortcut for kernel-space yielding - it marks the

6615

* thread runnable and calls sys_sched_yield().

6615

* thread runnable and calls sys_sched_yield().

6616

*/

6616

*/

6617

void __sched yield(void)

6617

void __sched yield(void)

6618

{

6618

{

6619

set_current_state(TASK_RUNNING);

6619

set_current_state(TASK_RUNNING);

6620

sys_sched_yield();

6620

sys_sched_yield();

6621

}

6621

}

6622

EXPORT_SYMBOL(yield);

6622

EXPORT_SYMBOL(yield);

6623

6624

/*

6624

/*

6625

* This task is about to go to sleep on IO. Increment rq->nr_iowait so

6625

* This task is about to go to sleep on IO. Increment rq->nr_iowait so

6626

* that process accounting knows that this is a task in IO wait state.

6626

* that process accounting knows that this is a task in IO wait state.

6627

*

6627

*

6628

* But don't do that if it is a deliberate, throttling IO wait (this task

6628

* But don't do that if it is a deliberate, throttling IO wait (this task

6629

* has set its backing_dev_info: the queue against which it should throttle)

6629

* has set its backing_dev_info: the queue against which it should throttle)

6630

*/

6630

*/

6631

void __sched io_schedule(void)

6631

void __sched io_schedule(void)

6632

{

6632

{

6633

struct rq *rq = &__raw_get_cpu_var(runqueues);

6633

struct rq *rq = &__raw_get_cpu_var(runqueues);

6634

6635

delayacct_blkio_start();

6635

delayacct_blkio_start();

6636

atomic_inc(&rq->nr_iowait);

6636

atomic_inc(&rq->nr_iowait);

6637

schedule();

6637

schedule();

6638

atomic_dec(&rq->nr_iowait);

6638

atomic_dec(&rq->nr_iowait);

6639

delayacct_blkio_end();

6639

delayacct_blkio_end();

6640

}

6640

}

6641

EXPORT_SYMBOL(io_schedule);

6641

EXPORT_SYMBOL(io_schedule);

6642

6643

/*
 * Same accounting as io_schedule() (nr_iowait on the runqueue sampled at
 * entry plus block-IO delay accounting), but sleeps through
 * schedule_timeout(@timeout) and returns its result.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *runq = &__raw_get_cpu_var(runqueues);
	long remaining;

	delayacct_blkio_start();
	atomic_inc(&runq->nr_iowait);

	remaining = schedule_timeout(timeout);

	atomic_dec(&runq->nr_iowait);
	delayacct_blkio_end();

	return remaining;
}

6655

6656

/**

6656

/**

6657

* sys_sched_get_priority_max - return maximum RT priority.

6657

* sys_sched_get_priority_max - return maximum RT priority.

6658

* @policy: scheduling class.

6658

* @policy: scheduling class.

6659

*

6659

*

6660

* this syscall returns the maximum rt_priority that can be used

6660

* this syscall returns the maximum rt_priority that can be used

6661

* by a given scheduling class.

6661

* by a given scheduling class.

6662

*/

6662

*/

6663

SYSCALL_DEFINE1(sched_get_priority_max, int, policy)

6663

SYSCALL_DEFINE1(sched_get_priority_max, int, policy)

6664

{

6664

{

6665

int ret = -EINVAL;

6665

int ret = -EINVAL;

6666

6667

switch (policy) {

6667

switch (policy) {

6668

case SCHED_FIFO:

6668

case SCHED_FIFO:

6669

case SCHED_RR:

6669

case SCHED_RR:

6670

ret = MAX_USER_RT_PRIO-1;

6670

ret = MAX_USER_RT_PRIO-1;

6671

break;

6671

break;

6672

case SCHED_NORMAL:

6672

case SCHED_NORMAL:

6673

case SCHED_BATCH:

6673

case SCHED_BATCH:

6674

case SCHED_IDLE:

6674

case SCHED_IDLE:

6675

ret = 0;

6675

ret = 0;

6676

break;

6676

break;

6677

}

6677

}

6678

return ret;

6678

return ret;

6679

}

6679

}

6680

6681

/**

6681

/**

6682

* sys_sched_get_priority_min - return minimum RT priority.

6682

* sys_sched_get_priority_min - return minimum RT priority.

6683

* @policy: scheduling class.

6683

* @policy: scheduling class.

6684

*

6684

*

6685

* this syscall returns the minimum rt_priority that can be used

6685

* this syscall returns the minimum rt_priority that can be used

6686

* by a given scheduling class.

6686

* by a given scheduling class.

6687

*/

6687

*/

6688

SYSCALL_DEFINE1(sched_get_priority_min, int, policy)

6688

SYSCALL_DEFINE1(sched_get_priority_min, int, policy)

6689

{

6689

{

6690

int ret = -EINVAL;

6690

int ret = -EINVAL;

6691

6692

switch (policy) {

6692

switch (policy) {

6693

case SCHED_FIFO:

6693

case SCHED_FIFO:

6694

case SCHED_RR:

6694

case SCHED_RR:

6695

ret = 1;

6695

ret = 1;

6696

break;

6696

break;

6697

case SCHED_NORMAL:

6697

case SCHED_NORMAL:

6698

case SCHED_BATCH:

6698

case SCHED_BATCH:

6699

case SCHED_IDLE:

6699

case SCHED_IDLE:

6700

ret = 0;

6700

ret = 0;

6701

}

6701

}

6702

return ret;

6702

return ret;

6703

}

6703

}

6704

6705

/**

6705

/**

6706

* sys_sched_rr_get_interval - return the default timeslice of a process.

6706

* sys_sched_rr_get_interval - return the default timeslice of a process.

6707

* @pid: pid of the process.

6707

* @pid: pid of the process.

6708

* @interval: userspace pointer to the timeslice value.

6708

* @interval: userspace pointer to the timeslice value.

6709

*

6709

*

6710

* this syscall writes the default timeslice value of a given process

6710

* this syscall writes the default timeslice value of a given process

6711

* into the user-space timespec buffer. A value of '0' means infinity.

6711

* into the user-space timespec buffer. A value of '0' means infinity.

6712

*/

6712

*/

6713

SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

6713

SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

6714

struct timespec __user *, interval)

6714

struct timespec __user *, interval)

6715

{

6715

{

6716

struct task_struct *p;

6716

struct task_struct *p;

6717

unsigned int time_slice;

6717

unsigned int time_slice;

6718

int retval;

6718

int retval;

6719

struct timespec t;

6719

struct timespec t;

6720

6721

if (pid < 0)

6721

if (pid < 0)

6722

return -EINVAL;

6722

return -EINVAL;

6723

6724

retval = -ESRCH;

6724

retval = -ESRCH;

6725

read_lock(&tasklist_lock);

6725

read_lock(&tasklist_lock);

6726

p = find_process_by_pid(pid);

6726

p = find_process_by_pid(pid);

6727

if (!p)

6727

if (!p)

6728

goto out_unlock;

6728

goto out_unlock;

6729

6730

retval = security_task_getscheduler(p);

6730

retval = security_task_getscheduler(p);

6731

if (retval)

6731

if (retval)

6732

goto out_unlock;

6732

goto out_unlock;

6733

6734

/*

6734

/*

6735

* Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER

6735

* Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER

6736

* tasks that are on an otherwise idle runqueue:

6736

* tasks that are on an otherwise idle runqueue:

6737

*/

6737

*/

6738

time_slice = 0;

6738

time_slice = 0;

6739

if (p->policy == SCHED_RR) {

6739

if (p->policy == SCHED_RR) {

6740

time_slice = DEF_TIMESLICE;

6740

time_slice = DEF_TIMESLICE;

6741

} else if (p->policy != SCHED_FIFO) {

6741

} else if (p->policy != SCHED_FIFO) {

6742

struct sched_entity *se = &p->se;

6742

struct sched_entity *se = &p->se;

6743

unsigned long flags;

6743

unsigned long flags;

6744

struct rq *rq;

6744

struct rq *rq;

6745

6746

rq = task_rq_lock(p, &flags);

6746

rq = task_rq_lock(p, &flags);

6747

if (rq->cfs.load.weight)

6747

if (rq->cfs.load.weight)

6748

time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));

6748

time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));

6749

task_rq_unlock(rq, &flags);

6749

task_rq_unlock(rq, &flags);

6750

}

6750

}

6751

read_unlock(&tasklist_lock);

6751

read_unlock(&tasklist_lock);

6752

jiffies_to_timespec(time_slice, &t);

6752

jiffies_to_timespec(time_slice, &t);

6753

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

6753

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

6754

return retval;

6754

return retval;

6755

6756

out_unlock:

6756

out_unlock:

6757

read_unlock(&tasklist_lock);

6757

read_unlock(&tasklist_lock);

6758

return retval;

6758

return retval;

6759

}

6759

}

6760

6761

/* Table of one-character task state names, indexed in sched_show_task(). */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

6762

6763

void sched_show_task(struct task_struct *p)

6763

void sched_show_task(struct task_struct *p)

6764

{

6764

{

6765

unsigned long free = 0;

6765

unsigned long free = 0;

6766

unsigned state;

6766

unsigned state;

6767

6768

state = p->state ? __ffs(p->state) + 1 : 0;

6768

state = p->state ? __ffs(p->state) + 1 : 0;

6769

printk(KERN_INFO "%-13.13s %c", p->comm,

6769

printk(KERN_INFO "%-13.13s %c", p->comm,

6770

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

6770

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

6771

#if BITS_PER_LONG == 32

6771

#if BITS_PER_LONG == 32

6772

if (state == TASK_RUNNING)

6772

if (state == TASK_RUNNING)

6773

printk(KERN_CONT " running ");

6773

printk(KERN_CONT " running ");

6774

else

6774

else

6775

printk(KERN_CONT " %08lx ", thread_saved_pc(p));

6775

printk(KERN_CONT " %08lx ", thread_saved_pc(p));

6776

#else

6776

#else

6777

if (state == TASK_RUNNING)

6777

if (state == TASK_RUNNING)

6778

printk(KERN_CONT " running task ");

6778

printk(KERN_CONT " running task ");

6779

else

6779

else

6780

printk(KERN_CONT " %016lx ", thread_saved_pc(p));

6780

printk(KERN_CONT " %016lx ", thread_saved_pc(p));

6781

#endif

6781

#endif

6782

#ifdef CONFIG_DEBUG_STACK_USAGE

6782

#ifdef CONFIG_DEBUG_STACK_USAGE

6783

free = stack_not_used(p);

6783

free = stack_not_used(p);

6784

#endif

6784

#endif

6785

printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,

6785

printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,

6786

task_pid_nr(p), task_pid_nr(p->real_parent),

6786

task_pid_nr(p), task_pid_nr(p->real_parent),

6787

(unsigned long)task_thread_info(p)->flags);

6787

(unsigned long)task_thread_info(p)->flags);

6788

6789

show_stack(p, NULL);

6789

show_stack(p, NULL);

6790

}

6790

}

6791

6792

void show_state_filter(unsigned long state_filter)

6792

void show_state_filter(unsigned long state_filter)

6793

{

6793

{

6794

struct task_struct *g, *p;

6794

struct task_struct *g, *p;

6795

6796

#if BITS_PER_LONG == 32

6796

#if BITS_PER_LONG == 32

6797

printk(KERN_INFO

6797

printk(KERN_INFO

6798

" task PC stack pid father\n");

6798

" task PC stack pid father\n");

6799

#else

6799

#else

6800

printk(KERN_INFO

6800

printk(KERN_INFO

6801

" task PC stack pid father\n");

6801

" task PC stack pid father\n");

6802

#endif

6802

#endif

6803

read_lock(&tasklist_lock);

6803

read_lock(&tasklist_lock);

6804

do_each_thread(g, p) {

6804

do_each_thread(g, p) {

6805

/*

6805

/*

6806

* reset the NMI-timeout, listing all files on a slow

6806

* reset the NMI-timeout, listing all files on a slow

6807

* console might take alot of time:

6807

* console might take alot of time:

6808

*/

6808

*/

6809

touch_nmi_watchdog();

6809

touch_nmi_watchdog();

6810

if (!state_filter || (p->state & state_filter))

6810

if (!state_filter || (p->state & state_filter))

6811

sched_show_task(p);

6811

sched_show_task(p);

6812

} while_each_thread(g, p);

6812

} while_each_thread(g, p);

6813

6814

touch_all_softlockup_watchdogs();

6814

touch_all_softlockup_watchdogs();

6815

6816

#ifdef CONFIG_SCHED_DEBUG

6816

#ifdef CONFIG_SCHED_DEBUG

6817

sysrq_sched_debug_show();

6817

sysrq_sched_debug_show();

6818

#endif

6818

#endif

6819

read_unlock(&tasklist_lock);

6819

read_unlock(&tasklist_lock);

6820

/*

6820

/*

6821

* Only show locks if all tasks are dumped:

6821

* Only show locks if all tasks are dumped:

6822

*/

6822

*/

6823

if (state_filter == -1)

6823

if (state_filter == -1)

6824

debug_show_all_locks();

6824

debug_show_all_locks();

6825

}

6825

}

6826

6827

/*
 * Put the given bootup idle task into the idle scheduling class.
 */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}

6831

6832

/**

6832

/**

6833

* init_idle - set up an idle thread for a given CPU

6833

* init_idle - set up an idle thread for a given CPU

6834

* @idle: task in question

6834

* @idle: task in question

6835

* @cpu: cpu the idle task belongs to

6835

* @cpu: cpu the idle task belongs to

6836

*

6836

*

6837

* NOTE: this function does not set the idle thread's NEED_RESCHED

6837

* NOTE: this function does not set the idle thread's NEED_RESCHED

6838

* flag, to make booting more robust.

6838

* flag, to make booting more robust.

6839

*/

6839

*/

6840

void __cpuinit init_idle(struct task_struct *idle, int cpu)

6840

void __cpuinit init_idle(struct task_struct *idle, int cpu)

6841

{

6841

{

6842

struct rq *rq = cpu_rq(cpu);

6842

struct rq *rq = cpu_rq(cpu);

6843

unsigned long flags;

6843

unsigned long flags;

6844

6845

spin_lock_irqsave(&rq->lock, flags);

6845

spin_lock_irqsave(&rq->lock, flags);

6846

6847

__sched_fork(idle);

6847

__sched_fork(idle);

6848

idle->se.exec_start = sched_clock();

6848

idle->se.exec_start = sched_clock();

6849

6850

idle->prio = idle->normal_prio = MAX_PRIO;

6850

idle->prio = idle->normal_prio = MAX_PRIO;

6851

cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));

6851

cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));

6852

__set_task_cpu(idle, cpu);

6852

__set_task_cpu(idle, cpu);

6853

6854

rq->curr = rq->idle = idle;

6854

rq->curr = rq->idle = idle;

6855

#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)

6855

#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)

6856

idle->oncpu = 1;

6856

idle->oncpu = 1;

6857

#endif

6857

#endif

6858

spin_unlock_irqrestore(&rq->lock, flags);

6858

spin_unlock_irqrestore(&rq->lock, flags);

6859

6860

/* Set the preempt count _outside_ the spinlocks! */

6860

/* Set the preempt count _outside_ the spinlocks! */

6861

#if defined(CONFIG_PREEMPT)

6861

#if defined(CONFIG_PREEMPT)

6862

task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);

6862

task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);

6863

#else

6863

#else

6864

task_thread_info(idle)->preempt_count = 0;

6864

task_thread_info(idle)->preempt_count = 0;

6865

#endif

6865

#endif

6866

/*

6866

/*

6867

* The idle tasks have their own, simple scheduling class:

6867

* The idle tasks have their own, simple scheduling class:

6868

*/

6868

*/

6869

idle->sched_class = &idle_sched_class;

6869

idle->sched_class = &idle_sched_class;

6870

ftrace_graph_init_task(idle);

6870

ftrace_graph_init_task(idle);

6871

}

6871

}

6872

6873

/*

6873

/*

6874

* In a system that switches off the HZ timer nohz_cpu_mask

6874

* In a system that switches off the HZ timer nohz_cpu_mask

6875

* indicates which cpus entered this state. This is used

6875

* indicates which cpus entered this state. This is used

6876

* in the rcu update to wait only for active cpus. For systems

6876

* in the rcu update to wait only for active cpus. For systems

6877

* which do not switch off the HZ timer nohz_cpu_mask should

6877

* which do not switch off the HZ timer nohz_cpu_mask should

6878

* always be CPU_BITS_NONE.

6878

* always be CPU_BITS_NONE.

6879

*/

6879

*/

6880

cpumask_var_t nohz_cpu_mask;

6880

cpumask_var_t nohz_cpu_mask;

6881

6882

/*

6882

/*

6883

* Increase the granularity value when there are more CPUs,

6883

* Increase the granularity value when there are more CPUs,

6884

* because with more CPUs the 'effective latency' as visible

6884

* because with more CPUs the 'effective latency' as visible

6885

* to users decreases. But the relationship is not linear,

6885

* to users decreases. But the relationship is not linear,

6886

* so pick a second-best guess by going with the log2 of the

6886

* so pick a second-best guess by going with the log2 of the

6887

* number of CPUs.

6887

* number of CPUs.

6888

*

6888

*

6889

* This idea comes from the SD scheduler of Con Kolivas:

6889

* This idea comes from the SD scheduler of Con Kolivas:

6890

*/

6890

*/

6891

static inline void sched_init_granularity(void)

6891

static inline void sched_init_granularity(void)

6892

{

6892

{

6893

unsigned int factor = 1 + ilog2(num_online_cpus());

6893

unsigned int factor = 1 + ilog2(num_online_cpus());

6894

const unsigned long limit = 200000000;

6894

const unsigned long limit = 200000000;

6895

6896

sysctl_sched_min_granularity *= factor;

6896

sysctl_sched_min_granularity *= factor;

6897

if (sysctl_sched_min_granularity > limit)

6897

if (sysctl_sched_min_granularity > limit)

6898

sysctl_sched_min_granularity = limit;

6898

sysctl_sched_min_granularity = limit;

6899

6900

sysctl_sched_latency *= factor;

6900

sysctl_sched_latency *= factor;

6901

if (sysctl_sched_latency > limit)

6901

if (sysctl_sched_latency > limit)

6902

sysctl_sched_latency = limit;

6902

sysctl_sched_latency = limit;

6903

6904

sysctl_sched_wakeup_granularity *= factor;

6904

sysctl_sched_wakeup_granularity *= factor;

6905

6906

sysctl_sched_shares_ratelimit *= factor;

6906

sysctl_sched_shares_ratelimit *= factor;

6907

}

6907

}

6908

6909

#ifdef CONFIG_SMP

6909

#ifdef CONFIG_SMP

6910

/*

6910

/*

6911

* This is how migration works:

6911

* This is how migration works:

6912

*

6912

*

6913

* 1) we queue a struct migration_req structure in the source CPU's

6913

* 1) we queue a struct migration_req structure in the source CPU's

6914

* runqueue and wake up that CPU's migration thread.

6914

* runqueue and wake up that CPU's migration thread.

6915

* 2) we down() the locked semaphore => thread blocks.

6915

* 2) we down() the locked semaphore => thread blocks.

6916

* 3) migration thread wakes up (implicitly it forces the migrated

6916

* 3) migration thread wakes up (implicitly it forces the migrated

6917

* thread off the CPU)

6917

* thread off the CPU)

6918

* 4) it gets the migration request and checks whether the migrated

6918

* 4) it gets the migration request and checks whether the migrated

6919

* task is still in the wrong runqueue.

6919

* task is still in the wrong runqueue.

6920

* 5) if it's in the wrong runqueue then the migration thread removes

6920

* 5) if it's in the wrong runqueue then the migration thread removes

6921

* it and puts it into the right queue.

6921

* it and puts it into the right queue.

6922

* 6) migration thread up()s the semaphore.

6922

* 6) migration thread up()s the semaphore.

6923

* 7) we wake up and the migration is done.

6923

* 7) we wake up and the migration is done.

6924

*/

6924

*/

6925

6926

/*

6926

/*

6927

* Change a given task's CPU affinity. Migrate the thread to a

6927

* Change a given task's CPU affinity. Migrate the thread to a

6928

* proper CPU and schedule it away if the CPU it's executing on

6928

* proper CPU and schedule it away if the CPU it's executing on

6929

* is removed from the allowed bitmask.

6929

* is removed from the allowed bitmask.

6930

*

6930

*

6931

* NOTE: the caller must have a valid reference to the task, the

6931

* NOTE: the caller must have a valid reference to the task, the

6932

* task must not exit() & deallocate itself prematurely. The

6932

* task must not exit() & deallocate itself prematurely. The

6933

* call is not atomic; no spinlocks may be held.

6933

* call is not atomic; no spinlocks may be held.

6934

*/

6934

*/

6935

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)

6935

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)

6936

{

6936

{

6937

struct migration_req req;

6937

struct migration_req req;

6938

unsigned long flags;

6938

unsigned long flags;

6939

struct rq *rq;

6939

struct rq *rq;

6940

int ret = 0;

6940

int ret = 0;

6941

6942

rq = task_rq_lock(p, &flags);

6942

rq = task_rq_lock(p, &flags);

6943

if (!cpumask_intersects(new_mask, cpu_online_mask)) {

6943

if (!cpumask_intersects(new_mask, cpu_online_mask)) {

6944

ret = -EINVAL;

6944

ret = -EINVAL;

6945

goto out;

6945

goto out;

6946

}

6946

}

6947

6948

if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&

6948

if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&

6949

!cpumask_equal(&p->cpus_allowed, new_mask))) {

6949

!cpumask_equal(&p->cpus_allowed, new_mask))) {

6950

ret = -EINVAL;

6950

ret = -EINVAL;

6951

goto out;

6951

goto out;

6952

}

6952

}

6953

6954

if (p->sched_class->set_cpus_allowed)

6954

if (p->sched_class->set_cpus_allowed)

6955

p->sched_class->set_cpus_allowed(p, new_mask);

6955

p->sched_class->set_cpus_allowed(p, new_mask);

6956

else {

6956

else {

6957

cpumask_copy(&p->cpus_allowed, new_mask);

6957

cpumask_copy(&p->cpus_allowed, new_mask);

6958

p->rt.nr_cpus_allowed = cpumask_weight(new_mask);

6958

p->rt.nr_cpus_allowed = cpumask_weight(new_mask);

6959

}

6959

}

6960

6961

/* Can the task run on the task's current CPU? If so, we're done */

6961

/* Can the task run on the task's current CPU? If so, we're done */

6962

if (cpumask_test_cpu(task_cpu(p), new_mask))

6962

if (cpumask_test_cpu(task_cpu(p), new_mask))

6963

goto out;

6963

goto out;

6964

6965

if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {

6965

if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {

6966

/* Need help from migration thread: drop lock and wait. */

6966

/* Need help from migration thread: drop lock and wait. */

6967

task_rq_unlock(rq, &flags);

6967

task_rq_unlock(rq, &flags);

6968

wake_up_process(rq->migration_thread);

6968

wake_up_process(rq->migration_thread);

6969

wait_for_completion(&req.done);

6969

wait_for_completion(&req.done);

6970

tlb_migrate_finish(p->mm);

6970

tlb_migrate_finish(p->mm);

6971

return 0;

6971

return 0;

6972

}

6972

}

6973

out:

6973

out:

6974

task_rq_unlock(rq, &flags);

6974

task_rq_unlock(rq, &flags);

6975

6976

return ret;

6976

return ret;

6977

}

6977

}

6978

EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

6978

EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

6979

6980

/*

6980

/*

6981

* Move (not current) task off this cpu, onto dest cpu. We're doing

6981

* Move (not current) task off this cpu, onto dest cpu. We're doing

6982

* this because either it can't run here any more (set_cpus_allowed()

6982

* this because either it can't run here any more (set_cpus_allowed()

6983

* away from this CPU, or CPU going down), or because we're

6983

* away from this CPU, or CPU going down), or because we're

6984

* attempting to rebalance this task on exec (sched_exec).

6984

* attempting to rebalance this task on exec (sched_exec).

6985

*

6985

*

6986

* So we race with normal scheduler movements, but that's OK, as long

6986

* So we race with normal scheduler movements, but that's OK, as long

6987

* as the task is no longer on this CPU.

6987

* as the task is no longer on this CPU.

6988

*

6988

*

6989

* Returns non-zero if task was successfully migrated.

6989

* Returns non-zero if task was successfully migrated.

6990

*/

6990

*/

6991

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

6991

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

6992

{

6992

{

6993

struct rq *rq_dest, *rq_src;

6993

struct rq *rq_dest, *rq_src;

6994

int ret = 0, on_rq;

6994

int ret = 0, on_rq;

6995

6996

if (unlikely(!cpu_active(dest_cpu)))

6996

if (unlikely(!cpu_active(dest_cpu)))

6997

return ret;

6997

return ret;

6998

6999

rq_src = cpu_rq(src_cpu);

6999

rq_src = cpu_rq(src_cpu);

7000

rq_dest = cpu_rq(dest_cpu);

7000

rq_dest = cpu_rq(dest_cpu);

7001

7002

double_rq_lock(rq_src, rq_dest);

7002

double_rq_lock(rq_src, rq_dest);

7003

/* Already moved. */

7003

/* Already moved. */

7004

if (task_cpu(p) != src_cpu)

7004

if (task_cpu(p) != src_cpu)

7005

goto done;

7005

goto done;

7006

/* Affinity changed (again). */

7006

/* Affinity changed (again). */

7007

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

7007

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

7008

goto fail;

7008

goto fail;

7009

7010

on_rq = p->se.on_rq;

7010

on_rq = p->se.on_rq;

7011

if (on_rq)

7011

if (on_rq)

7012

deactivate_task(rq_src, p, 0);

7012

deactivate_task(rq_src, p, 0);

7013

7014

set_task_cpu(p, dest_cpu);

7014

set_task_cpu(p, dest_cpu);

7015

if (on_rq) {

7015

if (on_rq) {

7016

activate_task(rq_dest, p, 0);

7016

activate_task(rq_dest, p, 0);

7017

check_preempt_curr(rq_dest, p, 0);

7017

check_preempt_curr(rq_dest, p, 0);

7018

}

7018

}

7019

done:

7019

done:

7020

ret = 1;

7020

ret = 1;

7021

fail:

7021

fail:

7022

double_rq_unlock(rq_src, rq_dest);

7022

double_rq_unlock(rq_src, rq_dest);

7023

return ret;

7023

return ret;

7024

}

7024

}

7025

7026

/*

7026

/*

7027

* migration_thread - this is a highprio system thread that performs

7027

* migration_thread - this is a highprio system thread that performs

7028

* thread migration by bumping thread off CPU then 'pushing' onto

7028

* thread migration by bumping thread off CPU then 'pushing' onto

7029

* another runqueue.

7029

* another runqueue.

7030

*/

7030

*/

7031

static int migration_thread(void *data)

7031

static int migration_thread(void *data)

7032

{

7032

{

7033

int cpu = (long)data;

7033

int cpu = (long)data;

7034

struct rq *rq;

7034

struct rq *rq;

7035

7036

rq = cpu_rq(cpu);

7036

rq = cpu_rq(cpu);

7037

BUG_ON(rq->migration_thread != current);

7037

BUG_ON(rq->migration_thread != current);

7038

7039

set_current_state(TASK_INTERRUPTIBLE);

7039

set_current_state(TASK_INTERRUPTIBLE);

7040

while (!kthread_should_stop()) {

7040

while (!kthread_should_stop()) {

7041

struct migration_req *req;

7041

struct migration_req *req;

7042

struct list_head *head;

7042

struct list_head *head;

7043

7044

spin_lock_irq(&rq->lock);

7044

spin_lock_irq(&rq->lock);

7045

7046

if (cpu_is_offline(cpu)) {

7046

if (cpu_is_offline(cpu)) {

7047

spin_unlock_irq(&rq->lock);

7047

spin_unlock_irq(&rq->lock);

7048

goto wait_to_die;

7048

goto wait_to_die;

7049

}

7049

}

7050

7051

if (rq->active_balance) {

7051

if (rq->active_balance) {

7052

active_load_balance(rq, cpu);

7052

active_load_balance(rq, cpu);

7053

rq->active_balance = 0;

7053

rq->active_balance = 0;

7054

}

7054

}

7055

7056

head = &rq->migration_queue;

7056

head = &rq->migration_queue;

7057

7058

if (list_empty(head)) {

7058

if (list_empty(head)) {

7059

spin_unlock_irq(&rq->lock);

7059

spin_unlock_irq(&rq->lock);

7060

schedule();

7060

schedule();

7061

set_current_state(TASK_INTERRUPTIBLE);

7061

set_current_state(TASK_INTERRUPTIBLE);

7062

continue;

7062

continue;

7063

}

7063

}

7064

req = list_entry(head->next, struct migration_req, list);

7064

req = list_entry(head->next, struct migration_req, list);

7065

list_del_init(head->next);

7065

list_del_init(head->next);

7066

7067

spin_unlock(&rq->lock);

7067

spin_unlock(&rq->lock);

7068

__migrate_task(req->task, cpu, req->dest_cpu);

7068

__migrate_task(req->task, cpu, req->dest_cpu);

7069

local_irq_enable();

7069

local_irq_enable();

7070

7071

complete(&req->done);

7071

complete(&req->done);

7072

}

7072

}

7073

__set_current_state(TASK_RUNNING);

7073

__set_current_state(TASK_RUNNING);

7074

return 0;

7074

return 0;

7075

7076

wait_to_die:

7076

wait_to_die:

7077

/* Wait for kthread_stop */

7077

/* Wait for kthread_stop */

7078

set_current_state(TASK_INTERRUPTIBLE);

7078

set_current_state(TASK_INTERRUPTIBLE);

7079

while (!kthread_should_stop()) {

7079

while (!kthread_should_stop()) {

7080

schedule();

7080

schedule();

7081

set_current_state(TASK_INTERRUPTIBLE);

7081

set_current_state(TASK_INTERRUPTIBLE);

7082

}

7082

}

7083

__set_current_state(TASK_RUNNING);

7083

__set_current_state(TASK_RUNNING);

7084

return 0;

7084

return 0;

7085

}

7085

}

7086

7087

#ifdef CONFIG_HOTPLUG_CPU

7087

#ifdef CONFIG_HOTPLUG_CPU

7088

7089

/*
 * Run __migrate_task() with local interrupts disabled and re-enable
 * them afterwards.  Returns whatever __migrate_task() returned.
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int migrated;

	local_irq_disable();
	migrated = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();

	return migrated;
}

7098

7099

/*

7099

/*

7100

* Figure out where task on dead CPU should go, use force if necessary.

7100

* Figure out where task on dead CPU should go, use force if necessary.

7101

*/

7101

*/

7102

static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)

7102

static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)

7103

{

7103

{

7104

int dest_cpu;

7104

int dest_cpu;

7105

const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));

7105

const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));

7106

7107

again:

7107

again:

7108

/* Look for allowed, online CPU in same node. */

7108

/* Look for allowed, online CPU in same node. */

7109

for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)

7109

for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)

7110

if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

7110

if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

7111

goto move;

7111

goto move;

7112

7113

/* Any allowed, online CPU? */

7113

/* Any allowed, online CPU? */

7114

dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);

7114

dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);

7115

if (dest_cpu < nr_cpu_ids)

7115

if (dest_cpu < nr_cpu_ids)

7116

goto move;

7116

goto move;

7117

7118

/* No more Mr. Nice Guy. */

7118

/* No more Mr. Nice Guy. */

7119

if (dest_cpu >= nr_cpu_ids) {

7119

if (dest_cpu >= nr_cpu_ids) {

7120

cpuset_cpus_allowed_locked(p, &p->cpus_allowed);

7120

cpuset_cpus_allowed_locked(p, &p->cpus_allowed);

7121

dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);

7121

dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);

7122

7123

/*

7123

/*

7124

* Don't tell them about moving exiting tasks or

7124

* Don't tell them about moving exiting tasks or

7125

* kernel threads (both mm NULL), since they never

7125

* kernel threads (both mm NULL), since they never

7126

* leave kernel.

7126

* leave kernel.

7127

*/

7127

*/

7128

if (p->mm && printk_ratelimit()) {

7128

if (p->mm && printk_ratelimit()) {

7129

printk(KERN_INFO "process %d (%s) no "

7129

printk(KERN_INFO "process %d (%s) no "

7130

"longer affine to cpu%d\n",

7130

"longer affine to cpu%d\n",

7131

task_pid_nr(p), p->comm, dead_cpu);

7131

task_pid_nr(p), p->comm, dead_cpu);

7132

}

7132

}

7133

}

7133

}

7134

7135

move:

7135

move:

7136

/* It can have affinity changed while we were choosing. */

7136

/* It can have affinity changed while we were choosing. */

7137

if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))

7137

if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))

7138

goto again;

7138

goto again;

7139

}

7139

}

7140

7141

/*

7141

/*

7142

* While a dead CPU has no uninterruptible tasks queued at this point,

7142

* While a dead CPU has no uninterruptible tasks queued at this point,

7143

* it might still have a nonzero ->nr_uninterruptible counter, because

7143

* it might still have a nonzero ->nr_uninterruptible counter, because

7144

* for performance reasons the counter is not stricly tracking tasks to

7144

* for performance reasons the counter is not stricly tracking tasks to

7145

* their home CPUs. So we just add the counter to another CPU's counter,

7145

* their home CPUs. So we just add the counter to another CPU's counter,

7146

* to keep the global sum constant after CPU-down:

7146

* to keep the global sum constant after CPU-down:

7147

*/

7147

*/

7148

static void migrate_nr_uninterruptible(struct rq *rq_src)

7148

static void migrate_nr_uninterruptible(struct rq *rq_src)

7149

{

7149

{

7150

struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));

7150

struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));

7151

unsigned long flags;

7151

unsigned long flags;

7152

7153

local_irq_save(flags);

7153

local_irq_save(flags);

7154

double_rq_lock(rq_src, rq_dest);

7154

double_rq_lock(rq_src, rq_dest);

7155

rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;

7155

rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;

7156

rq_src->nr_uninterruptible = 0;

7156

rq_src->nr_uninterruptible = 0;

7157

double_rq_unlock(rq_src, rq_dest);

7157

double_rq_unlock(rq_src, rq_dest);

7158

local_irq_restore(flags);

7158

local_irq_restore(flags);

7159

}

7159

}

7160

7161

/* Run through task list and migrate tasks from the dead cpu. */

7161

/* Run through task list and migrate tasks from the dead cpu. */

7162

static void migrate_live_tasks(int src_cpu)

7162

static void migrate_live_tasks(int src_cpu)

7163

{

7163

{

7164

struct task_struct *p, *t;

7164

struct task_struct *p, *t;

7165

7166

read_lock(&tasklist_lock);

7166

read_lock(&tasklist_lock);

7167

7168

do_each_thread(t, p) {

7168

do_each_thread(t, p) {

7169

if (p == current)

7169

if (p == current)

7170

continue;

7170

continue;

7171

7172

if (task_cpu(p) == src_cpu)

7172

if (task_cpu(p) == src_cpu)

7173

move_task_off_dead_cpu(src_cpu, p);

7173

move_task_off_dead_cpu(src_cpu, p);

7174

} while_each_thread(t, p);

7174

} while_each_thread(t, p);

7175

7176

read_unlock(&tasklist_lock);

7176

read_unlock(&tasklist_lock);

7177

}

7177

}

7178

7179

/*

7179

/*

7180

* Schedules idle task to be the next runnable task on current CPU.

7180

* Schedules idle task to be the next runnable task on current CPU.

7181

* It does so by boosting its priority to highest possible.

7181

* It does so by boosting its priority to highest possible.

7182

* Used by CPU offline code.

7182

* Used by CPU offline code.

7183

*/

7183

*/

7184

void sched_idle_next(void)

7184

void sched_idle_next(void)

7185

{

7185

{

7186

int this_cpu = smp_processor_id();

7186

int this_cpu = smp_processor_id();

7187

struct rq *rq = cpu_rq(this_cpu);

7187

struct rq *rq = cpu_rq(this_cpu);

7188

struct task_struct *p = rq->idle;

7188

struct task_struct *p = rq->idle;

7189

unsigned long flags;

7189

unsigned long flags;

7190

7191

/* cpu has to be offline */

7191

/* cpu has to be offline */

7192

BUG_ON(cpu_online(this_cpu));

7192

BUG_ON(cpu_online(this_cpu));

7193

7194

/*

7194

/*

7195

* Strictly not necessary since rest of the CPUs are stopped by now

7195

* Strictly not necessary since rest of the CPUs are stopped by now

7196

* and interrupts disabled on the current cpu.

7196

* and interrupts disabled on the current cpu.

7197

*/

7197

*/

7198

spin_lock_irqsave(&rq->lock, flags);

7198

spin_lock_irqsave(&rq->lock, flags);

7199

7200

__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

7200

__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

7201

7202

update_rq_clock(rq);

7202

update_rq_clock(rq);

7203

activate_task(rq, p, 0);

7203

activate_task(rq, p, 0);

7204

7205

spin_unlock_irqrestore(&rq->lock, flags);

7205

spin_unlock_irqrestore(&rq->lock, flags);

7206

}

7206

}

7207

7208

/*

7208

/*

7209

* Ensures that the idle task is using init_mm right before its cpu goes

7209

* Ensures that the idle task is using init_mm right before its cpu goes

7210

* offline.

7210

* offline.

7211

*/

7211

*/

7212

void idle_task_exit(void)

7212

void idle_task_exit(void)

7213

{

7213

{

7214

struct mm_struct *mm = current->active_mm;

7214

struct mm_struct *mm = current->active_mm;

7215

7216

BUG_ON(cpu_online(smp_processor_id()));

7216

BUG_ON(cpu_online(smp_processor_id()));

7217

7218

if (mm != &init_mm)

7218

if (mm != &init_mm)

7219

switch_mm(mm, &init_mm, current);

7219

switch_mm(mm, &init_mm, current);

7220

mmdrop(mm);

7220

mmdrop(mm);

7221

}

7221

}

7222

7223

/* called under rq->lock with disabled interrupts */

7223

/* called under rq->lock with disabled interrupts */

7224

static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)

7224

static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)

7225

{

7225

{

7226

struct rq *rq = cpu_rq(dead_cpu);

7226

struct rq *rq = cpu_rq(dead_cpu);

7227

7228

/* Must be exiting, otherwise would be on tasklist. */

7228

/* Must be exiting, otherwise would be on tasklist. */

7229

BUG_ON(!p->exit_state);

7229

BUG_ON(!p->exit_state);

7230

7231

/* Cannot have done final schedule yet: would have vanished. */

7231

/* Cannot have done final schedule yet: would have vanished. */

7232

BUG_ON(p->state == TASK_DEAD);

7232

BUG_ON(p->state == TASK_DEAD);

7233

7234

get_task_struct(p);

7234

get_task_struct(p);

7235

7236

/*

7236

/*

7237

* Drop lock around migration; if someone else moves it,

7237

* Drop lock around migration; if someone else moves it,

7238

* that's OK. No task can be added to this CPU, so iteration is

7238

* that's OK. No task can be added to this CPU, so iteration is

7239

* fine.

7239

* fine.

7240

*/

7240

*/

7241

spin_unlock_irq(&rq->lock);

7241

spin_unlock_irq(&rq->lock);

7242

move_task_off_dead_cpu(dead_cpu, p);

7242

move_task_off_dead_cpu(dead_cpu, p);

7243

spin_lock_irq(&rq->lock);

7243

spin_lock_irq(&rq->lock);

7244

7245

put_task_struct(p);

7245

put_task_struct(p);

7246

}

7246

}

7247

7248

/* release_task() removes task from tasklist, so we won't find dead tasks. */

7248

/* release_task() removes task from tasklist, so we won't find dead tasks. */

7249

static void migrate_dead_tasks(unsigned int dead_cpu)

7249

static void migrate_dead_tasks(unsigned int dead_cpu)

7250

{

7250

{

7251

struct rq *rq = cpu_rq(dead_cpu);

7251

struct rq *rq = cpu_rq(dead_cpu);

7252

struct task_struct *next;

7252

struct task_struct *next;

7253

7254

for ( ; ; ) {

7254

for ( ; ; ) {

7255

if (!rq->nr_running)

7255

if (!rq->nr_running)

7256

break;

7256

break;

7257

update_rq_clock(rq);

7257

update_rq_clock(rq);

7258

next = pick_next_task(rq);

7258

next = pick_next_task(rq);

7259

if (!next)

7259

if (!next)

7260

break;

7260

break;

7261

next->sched_class->put_prev_task(rq, next);

7261

next->sched_class->put_prev_task(rq, next);

7262

migrate_dead(dead_cpu, next);

7262

migrate_dead(dead_cpu, next);

7263

7264

}

7264

}

7265

}

7265

}

7266

7267

/*

7267

/*

7268

* remove the tasks which were accounted by rq from calc_load_tasks.

7268

* remove the tasks which were accounted by rq from calc_load_tasks.

7269

*/

7269

*/

7270

static void calc_global_load_remove(struct rq *rq)

7270

static void calc_global_load_remove(struct rq *rq)

7271

{

7271

{

7272

atomic_long_sub(rq->calc_load_active, &calc_load_tasks);

7272

atomic_long_sub(rq->calc_load_active, &calc_load_tasks);

7273

}

7273

}

7274

#endif /* CONFIG_HOTPLUG_CPU */

7274

#endif /* CONFIG_HOTPLUG_CPU */

7275

7276

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

7276

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

7277

7278

/*
 * "sched_domain" directory entry; its ->child is filled in at runtime
 * by register_sched_domain_sysctl().
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{ 0 },	/* terminator */
};

7285

7286

static struct ctl_table sd_ctl_root[] = {

7286

static struct ctl_table sd_ctl_root[] = {

7287

{

7287

{

7288

.ctl_name = CTL_KERN,

7288

.ctl_name = CTL_KERN,

7289

.procname = "kernel",

7289

.procname = "kernel",

7290

.mode = 0555,

7290

.mode = 0555,

7291

.child = sd_ctl_dir,

7291

.child = sd_ctl_dir,

7292

},

7292

},

7293

{0, },

7293

{0, },

7294

};

7294

};

7295

7296

static struct ctl_table *sd_alloc_ctl_entry(int n)

7296

static struct ctl_table *sd_alloc_ctl_entry(int n)

7297

{

7297

{

7298

struct ctl_table *entry =

7298

struct ctl_table *entry =

7299

kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

7299

kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

7300

7301

return entry;

7301

return entry;

7302

}

7302

}

7303

7304

static void sd_free_ctl_entry(struct ctl_table **tablep)

7304

static void sd_free_ctl_entry(struct ctl_table **tablep)

7305

{

7305

{

7306

struct ctl_table *entry;

7306

struct ctl_table *entry;

7307

7308

/*

7308

/*

7309

* In the intermediate directories, both the child directory and

7309

* In the intermediate directories, both the child directory and

7310

* procname are dynamically allocated and could fail but the mode

7310

* procname are dynamically allocated and could fail but the mode

7311

* will always be set. In the lowest directory the names are

7311

* will always be set. In the lowest directory the names are

7312

* static strings and all have proc handlers.

7312

* static strings and all have proc handlers.

7313

*/

7313

*/

7314

for (entry = *tablep; entry->mode; entry++) {

7314

for (entry = *tablep; entry->mode; entry++) {

7315

if (entry->child)

7315

if (entry->child)

7316

sd_free_ctl_entry(&entry->child);

7316

sd_free_ctl_entry(&entry->child);

7317

if (entry->proc_handler == NULL)

7317

if (entry->proc_handler == NULL)

7318

kfree(entry->procname);

7318

kfree(entry->procname);

7319

}

7319

}

7320

7321

kfree(*tablep);

7321

kfree(*tablep);

7322

*tablep = NULL;

7322

*tablep = NULL;

7323

}

7323

}

7324

7325

static void

7325

static void

7326

set_table_entry(struct ctl_table *entry,

7326

set_table_entry(struct ctl_table *entry,

7327

const char *procname, void *data, int maxlen,

7327

const char *procname, void *data, int maxlen,

7328

mode_t mode, proc_handler *proc_handler)

7328

mode_t mode, proc_handler *proc_handler)

7329

{

7329

{

7330

entry->procname = procname;

7330

entry->procname = procname;

7331

entry->data = data;

7331

entry->data = data;

7332

entry->maxlen = maxlen;

7332

entry->maxlen = maxlen;

7333

entry->mode = mode;

7333

entry->mode = mode;

7334

entry->proc_handler = proc_handler;

7334

entry->proc_handler = proc_handler;

7335

}

7335

}

7336

7337

static struct ctl_table *

7337

static struct ctl_table *

7338

sd_alloc_ctl_domain_table(struct sched_domain *sd)

7338

sd_alloc_ctl_domain_table(struct sched_domain *sd)

7339

{

7339

{

7340

struct ctl_table *table = sd_alloc_ctl_entry(13);

7340

struct ctl_table *table = sd_alloc_ctl_entry(13);

7341

7342

if (table == NULL)

7342

if (table == NULL)

7343

return NULL;

7343

return NULL;

7344

7345

set_table_entry(&table[0], "min_interval", &sd->min_interval,

7345

set_table_entry(&table[0], "min_interval", &sd->min_interval,

7346

sizeof(long), 0644, proc_doulongvec_minmax);

7346

sizeof(long), 0644, proc_doulongvec_minmax);

7347

set_table_entry(&table[1], "max_interval", &sd->max_interval,

7347

set_table_entry(&table[1], "max_interval", &sd->max_interval,

7348

sizeof(long), 0644, proc_doulongvec_minmax);

7348

sizeof(long), 0644, proc_doulongvec_minmax);

7349

set_table_entry(&table[2], "busy_idx", &sd->busy_idx,

7349

set_table_entry(&table[2], "busy_idx", &sd->busy_idx,

7350

sizeof(int), 0644, proc_dointvec_minmax);

7350

sizeof(int), 0644, proc_dointvec_minmax);

7351

set_table_entry(&table[3], "idle_idx", &sd->idle_idx,

7351

set_table_entry(&table[3], "idle_idx", &sd->idle_idx,

7352

sizeof(int), 0644, proc_dointvec_minmax);

7352

sizeof(int), 0644, proc_dointvec_minmax);

7353

set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,

7353

set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,

7354

sizeof(int), 0644, proc_dointvec_minmax);

7354

sizeof(int), 0644, proc_dointvec_minmax);

7355

set_table_entry(&table[5], "wake_idx", &sd->wake_idx,

7355

set_table_entry(&table[5], "wake_idx", &sd->wake_idx,

7356

sizeof(int), 0644, proc_dointvec_minmax);

7356

sizeof(int), 0644, proc_dointvec_minmax);

7357

set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,

7357

set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,

7358

sizeof(int), 0644, proc_dointvec_minmax);

7358

sizeof(int), 0644, proc_dointvec_minmax);

7359

set_table_entry(&table[7], "busy_factor", &sd->busy_factor,

7359

set_table_entry(&table[7], "busy_factor", &sd->busy_factor,

7360

sizeof(int), 0644, proc_dointvec_minmax);

7360

sizeof(int), 0644, proc_dointvec_minmax);

7361

set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,

7361

set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,

7362

sizeof(int), 0644, proc_dointvec_minmax);

7362

sizeof(int), 0644, proc_dointvec_minmax);

7363

set_table_entry(&table[9], "cache_nice_tries",

7363

set_table_entry(&table[9], "cache_nice_tries",

7364

&sd->cache_nice_tries,

7364

&sd->cache_nice_tries,

7365

sizeof(int), 0644, proc_dointvec_minmax);

7365

sizeof(int), 0644, proc_dointvec_minmax);

7366

set_table_entry(&table[10], "flags", &sd->flags,

7366

set_table_entry(&table[10], "flags", &sd->flags,

7367

sizeof(int), 0644, proc_dointvec_minmax);

7367

sizeof(int), 0644, proc_dointvec_minmax);

7368

set_table_entry(&table[11], "name", sd->name,

7368

set_table_entry(&table[11], "name", sd->name,

7369

CORENAME_MAX_SIZE, 0444, proc_dostring);

7369

CORENAME_MAX_SIZE, 0444, proc_dostring);

7370

/* &table[12] is terminator */

7370

/* &table[12] is terminator */

7371

7372

return table;

7372

return table;

7373

}

7373

}

7374

7375

static ctl_table *sd_alloc_ctl_cpu_table(int cpu)

7375

static ctl_table *sd_alloc_ctl_cpu_table(int cpu)

7376

{

7376

{

7377

struct ctl_table *entry, *table;

7377

struct ctl_table *entry, *table;

7378

struct sched_domain *sd;

7378

struct sched_domain *sd;

7379

int domain_num = 0, i;

7379

int domain_num = 0, i;

7380

char buf[32];

7380

char buf[32];

7381

7382

for_each_domain(cpu, sd)

7382

for_each_domain(cpu, sd)

7383

domain_num++;

7383

domain_num++;

7384

entry = table = sd_alloc_ctl_entry(domain_num + 1);

7384

entry = table = sd_alloc_ctl_entry(domain_num + 1);

7385

if (table == NULL)

7385

if (table == NULL)

7386

return NULL;

7386

return NULL;

7387

7388

i = 0;

7388

i = 0;

7389

for_each_domain(cpu, sd) {

7389

for_each_domain(cpu, sd) {

7390

snprintf(buf, 32, "domain%d", i);

7390

snprintf(buf, 32, "domain%d", i);

7391

entry->procname = kstrdup(buf, GFP_KERNEL);

7391

entry->procname = kstrdup(buf, GFP_KERNEL);

7392

entry->mode = 0555;

7392

entry->mode = 0555;

7393

entry->child = sd_alloc_ctl_domain_table(sd);

7393

entry->child = sd_alloc_ctl_domain_table(sd);

7394

entry++;

7394

entry++;

7395

i++;

7395

i++;

7396

}

7396

}

7397

return table;

7397

return table;

7398

}

7398

}

7399

7400

static struct ctl_table_header *sd_sysctl_header;

7400

static struct ctl_table_header *sd_sysctl_header;

7401

static void register_sched_domain_sysctl(void)

7401

static void register_sched_domain_sysctl(void)

7402

{

7402

{

7403

int i, cpu_num = num_online_cpus();

7403

int i, cpu_num = num_online_cpus();

7404

struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);

7404

struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);

7405

char buf[32];

7405

char buf[32];

7406

7407

WARN_ON(sd_ctl_dir[0].child);

7407

WARN_ON(sd_ctl_dir[0].child);

7408

sd_ctl_dir[0].child = entry;

7408

sd_ctl_dir[0].child = entry;

7409

7410

if (entry == NULL)

7410

if (entry == NULL)

7411

return;

7411

return;

7412

7413

for_each_online_cpu(i) {

7413

for_each_online_cpu(i) {

7414

snprintf(buf, 32, "cpu%d", i);

7414

snprintf(buf, 32, "cpu%d", i);

7415

entry->procname = kstrdup(buf, GFP_KERNEL);

7415

entry->procname = kstrdup(buf, GFP_KERNEL);

7416

entry->mode = 0555;

7416

entry->mode = 0555;

7417

entry->child = sd_alloc_ctl_cpu_table(i);

7417

entry->child = sd_alloc_ctl_cpu_table(i);

7418

entry++;

7418

entry++;

7419

}

7419

}

7420

7421

WARN_ON(sd_sysctl_header);

7421

WARN_ON(sd_sysctl_header);

7422

sd_sysctl_header = register_sysctl_table(sd_ctl_root);

7422

sd_sysctl_header = register_sysctl_table(sd_ctl_root);

7423

}

7423

}

7424

7425

/* may be called multiple times per register */

7425

/* may be called multiple times per register */

7426

static void unregister_sched_domain_sysctl(void)

7426

static void unregister_sched_domain_sysctl(void)

7427

{

7427

{

7428

if (sd_sysctl_header)

7428

if (sd_sysctl_header)

7429

unregister_sysctl_table(sd_sysctl_header);

7429

unregister_sysctl_table(sd_sysctl_header);

7430

sd_sysctl_header = NULL;

7430

sd_sysctl_header = NULL;

7431

if (sd_ctl_dir[0].child)

7431

if (sd_ctl_dir[0].child)

7432

sd_free_ctl_entry(&sd_ctl_dir[0].child);

7432

sd_free_ctl_entry(&sd_ctl_dir[0].child);

7433

}

7433

}

7434

#else

7434

#else

7435

static void register_sched_domain_sysctl(void)

7435

static void register_sched_domain_sysctl(void)

7436

{

7436

{

7437

}

7437

}

7438

static void unregister_sched_domain_sysctl(void)

7438

static void unregister_sched_domain_sysctl(void)

7439

{

7439

{

7440

}

7440

}

7441

#endif

7441

#endif

7442

7443

static void set_rq_online(struct rq *rq)

7443

static void set_rq_online(struct rq *rq)

7444

{

7444

{

7445

if (!rq->online) {

7445

if (!rq->online) {

7446

const struct sched_class *class;

7446

const struct sched_class *class;

7447

7448

cpumask_set_cpu(rq->cpu, rq->rd->online);

7448

cpumask_set_cpu(rq->cpu, rq->rd->online);

7449

rq->online = 1;

7449

rq->online = 1;

7450

7451

for_each_class(class) {

7451

for_each_class(class) {

7452

if (class->rq_online)

7452

if (class->rq_online)

7453

class->rq_online(rq);

7453

class->rq_online(rq);

7454

}

7454

}

7455

}

7455

}

7456

}

7456

}

7457

7458

static void set_rq_offline(struct rq *rq)

7458

static void set_rq_offline(struct rq *rq)

7459

{

7459

{

7460

if (rq->online) {

7460

if (rq->online) {

7461

const struct sched_class *class;

7461

const struct sched_class *class;

7462

7463

for_each_class(class) {

7463

for_each_class(class) {

7464

if (class->rq_offline)

7464

if (class->rq_offline)

7465

class->rq_offline(rq);

7465

class->rq_offline(rq);

7466

}

7466

}

7467

7468

cpumask_clear_cpu(rq->cpu, rq->rd->online);

7468

cpumask_clear_cpu(rq->cpu, rq->rd->online);

7469

rq->online = 0;

7469

rq->online = 0;

7470

}

7470

}

7471

}

7471

}

7472

7473

/*

7473

/*

7474

* migration_call - callback that gets triggered when a CPU is added.

7474

* migration_call - callback that gets triggered when a CPU is added.

7475

* Here we can start up the necessary migration thread for the new CPU.

7475

* Here we can start up the necessary migration thread for the new CPU.

7476

*/

7476

*/

7477

static int __cpuinit

7477

static int __cpuinit

7478

migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)

7478

migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)

7479

{

7479

{

7480

struct task_struct *p;

7480

struct task_struct *p;

7481

int cpu = (long)hcpu;

7481

int cpu = (long)hcpu;

7482

unsigned long flags;

7482

unsigned long flags;

7483

struct rq *rq;

7483

struct rq *rq;

7484

7485

switch (action) {

7485

switch (action) {

7486

7487

case CPU_UP_PREPARE:

7487

case CPU_UP_PREPARE:

7488

case CPU_UP_PREPARE_FROZEN:

7488

case CPU_UP_PREPARE_FROZEN:

7489

p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);

7489

p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);

7490

if (IS_ERR(p))

7490

if (IS_ERR(p))

7491

return NOTIFY_BAD;

7491

return NOTIFY_BAD;

7492

kthread_bind(p, cpu);

7492

kthread_bind(p, cpu);

7493

/* Must be high prio: stop_machine expects to yield to it. */

7493

/* Must be high prio: stop_machine expects to yield to it. */

7494

rq = task_rq_lock(p, &flags);

7494

rq = task_rq_lock(p, &flags);

7495

__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

7495

__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

7496

task_rq_unlock(rq, &flags);

7496

task_rq_unlock(rq, &flags);

7497

cpu_rq(cpu)->migration_thread = p;

7497

cpu_rq(cpu)->migration_thread = p;

7498

break;

7498

break;

7499

7500

case CPU_ONLINE:

7500

case CPU_ONLINE:

7501

case CPU_ONLINE_FROZEN:

7501

case CPU_ONLINE_FROZEN:

7502

/* Strictly unnecessary, as first user will wake it. */

7502

/* Strictly unnecessary, as first user will wake it. */

7503

wake_up_process(cpu_rq(cpu)->migration_thread);

7503

wake_up_process(cpu_rq(cpu)->migration_thread);

7504

7505

/* Update our root-domain */

7505

/* Update our root-domain */

7506

rq = cpu_rq(cpu);

7506

rq = cpu_rq(cpu);

7507

spin_lock_irqsave(&rq->lock, flags);

7507

spin_lock_irqsave(&rq->lock, flags);

7508

rq->calc_load_update = calc_load_update;

7508

rq->calc_load_update = calc_load_update;

7509

rq->calc_load_active = 0;

7509

rq->calc_load_active = 0;

7510

if (rq->rd) {

7510

if (rq->rd) {

7511

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

7511

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

7512

7513

set_rq_online(rq);

7513

set_rq_online(rq);

7514

}

7514

}

7515

spin_unlock_irqrestore(&rq->lock, flags);

7515

spin_unlock_irqrestore(&rq->lock, flags);

7516

break;

7516

break;

7517

7518

#ifdef CONFIG_HOTPLUG_CPU

7518

#ifdef CONFIG_HOTPLUG_CPU

7519

case CPU_UP_CANCELED:

7519

case CPU_UP_CANCELED:

7520

case CPU_UP_CANCELED_FROZEN:

7520

case CPU_UP_CANCELED_FROZEN:

7521

if (!cpu_rq(cpu)->migration_thread)

7521

if (!cpu_rq(cpu)->migration_thread)

7522

break;

7522

break;

7523

/* Unbind it from offline cpu so it can run. Fall thru. */

7523

/* Unbind it from offline cpu so it can run. Fall thru. */

7524

kthread_bind(cpu_rq(cpu)->migration_thread,

7524

kthread_bind(cpu_rq(cpu)->migration_thread,

7525

cpumask_any(cpu_online_mask));

7525

cpumask_any(cpu_online_mask));

7526

kthread_stop(cpu_rq(cpu)->migration_thread);

7526

kthread_stop(cpu_rq(cpu)->migration_thread);

7527

cpu_rq(cpu)->migration_thread = NULL;

7527

cpu_rq(cpu)->migration_thread = NULL;

7528

break;

7528

break;

7529

7530

case CPU_DEAD:

7530

case CPU_DEAD:

7531

case CPU_DEAD_FROZEN:

7531

case CPU_DEAD_FROZEN:

7532

cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */

7532

cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */

7533

migrate_live_tasks(cpu);

7533

migrate_live_tasks(cpu);

7534

rq = cpu_rq(cpu);

7534

rq = cpu_rq(cpu);

7535

kthread_stop(rq->migration_thread);

7535

kthread_stop(rq->migration_thread);

7536

rq->migration_thread = NULL;

7536

rq->migration_thread = NULL;

7537

/* Idle task back to normal (off runqueue, low prio) */

7537

/* Idle task back to normal (off runqueue, low prio) */

7538

spin_lock_irq(&rq->lock);

7538

spin_lock_irq(&rq->lock);

7539

update_rq_clock(rq);

7539

update_rq_clock(rq);

7540

deactivate_task(rq, rq->idle, 0);

7540

deactivate_task(rq, rq->idle, 0);

7541

rq->idle->static_prio = MAX_PRIO;

7541

rq->idle->static_prio = MAX_PRIO;

7542

__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);

7542

__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);

7543

rq->idle->sched_class = &idle_sched_class;

7543

rq->idle->sched_class = &idle_sched_class;

7544

migrate_dead_tasks(cpu);

7544

migrate_dead_tasks(cpu);

7545

spin_unlock_irq(&rq->lock);

7545

spin_unlock_irq(&rq->lock);

7546

cpuset_unlock();

7546

cpuset_unlock();

7547

migrate_nr_uninterruptible(rq);

7547

migrate_nr_uninterruptible(rq);

7548

BUG_ON(rq->nr_running != 0);

7548

BUG_ON(rq->nr_running != 0);

7549

calc_global_load_remove(rq);

7549

calc_global_load_remove(rq);

7550

/*

7550

/*

7551

* No need to migrate the tasks: it was best-effort if

7551

* No need to migrate the tasks: it was best-effort if

7552

* they didn't take sched_hotcpu_mutex. Just wake up

7552

* they didn't take sched_hotcpu_mutex. Just wake up

7553

* the requestors.

7553

* the requestors.

7554

*/

7554

*/

7555

spin_lock_irq(&rq->lock);

7555

spin_lock_irq(&rq->lock);

7556

while (!list_empty(&rq->migration_queue)) {

7556

while (!list_empty(&rq->migration_queue)) {

7557

struct migration_req *req;

7557

struct migration_req *req;

7558

7559

req = list_entry(rq->migration_queue.next,

7559

req = list_entry(rq->migration_queue.next,

7560

struct migration_req, list);

7560

struct migration_req, list);

7561

list_del_init(&req->list);

7561

list_del_init(&req->list);

7562

spin_unlock_irq(&rq->lock);

7562

spin_unlock_irq(&rq->lock);

7563

complete(&req->done);

7563

complete(&req->done);

7564

spin_lock_irq(&rq->lock);

7564

spin_lock_irq(&rq->lock);

7565

}

7565

}

7566

spin_unlock_irq(&rq->lock);

7566

spin_unlock_irq(&rq->lock);

7567

break;

7567

break;

7568

7569

case CPU_DYING:

7569

case CPU_DYING:

7570

case CPU_DYING_FROZEN:

7570

case CPU_DYING_FROZEN:

7571

/* Update our root-domain */

7571

/* Update our root-domain */

7572

rq = cpu_rq(cpu);

7572

rq = cpu_rq(cpu);

7573

spin_lock_irqsave(&rq->lock, flags);

7573

spin_lock_irqsave(&rq->lock, flags);

7574

if (rq->rd) {

7574

if (rq->rd) {

7575

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

7575

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

7576

set_rq_offline(rq);

7576

set_rq_offline(rq);

7577

}

7577

}

7578

spin_unlock_irqrestore(&rq->lock, flags);

7578

spin_unlock_irqrestore(&rq->lock, flags);

7579

break;

7579

break;

7580

#endif

7580

#endif

7581

}

7581

}

7582

return NOTIFY_OK;

7582

return NOTIFY_OK;

7583

}

7583

}

7584

7585

/*

7585

/*

7586

* Register at high priority so that task migration (migrate_all_tasks)

7586

* Register at high priority so that task migration (migrate_all_tasks)

7587

* happens before everything else. This has to be lower priority than

7587

* happens before everything else. This has to be lower priority than

7588

* the notifier in the perf_counter subsystem, though.

7588

* the notifier in the perf_counter subsystem, though.

7589

*/

7589

*/

7590

static struct notifier_block __cpuinitdata migration_notifier = {

7590

static struct notifier_block __cpuinitdata migration_notifier = {

7591

.notifier_call = migration_call,

7591

.notifier_call = migration_call,

7592

.priority = 10

7592

.priority = 10

7593

};

7593

};

7594

7595

static int __init migration_init(void)

7595

static int __init migration_init(void)

7596

{

7596

{

7597

void *cpu = (void *)(long)smp_processor_id();

7597

void *cpu = (void *)(long)smp_processor_id();

7598

int err;

7598

int err;

7599

7600

/* Start one for the boot CPU: */

7600

/* Start one for the boot CPU: */

7601

err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);

7601

err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);

7602

BUG_ON(err == NOTIFY_BAD);

7602

BUG_ON(err == NOTIFY_BAD);

7603

migration_call(&migration_notifier, CPU_ONLINE, cpu);

7603

migration_call(&migration_notifier, CPU_ONLINE, cpu);

7604

register_cpu_notifier(&migration_notifier);

7604

register_cpu_notifier(&migration_notifier);

7605

7606

return err;

7606

return err;

7607

}

7607

}

7608

early_initcall(migration_init);

7608

early_initcall(migration_init);

7609

#endif

7609

#endif

7610

7611

#ifdef CONFIG_SMP

7611

#ifdef CONFIG_SMP

7612

7613

#ifdef CONFIG_SCHED_DEBUG

7613

#ifdef CONFIG_SCHED_DEBUG

7614

7615

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

7615

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

7616

struct cpumask *groupmask)

7616

struct cpumask *groupmask)

7617

{

7617

{

7618

struct sched_group *group = sd->groups;

7618

struct sched_group *group = sd->groups;

7619

char str[256];

7619

char str[256];

7620

7621

cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));

7621

cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));

7622

cpumask_clear(groupmask);

7622

cpumask_clear(groupmask);

7623

7624

printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

7624

printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

7625

7626

if (!(sd->flags & SD_LOAD_BALANCE)) {

7626

if (!(sd->flags & SD_LOAD_BALANCE)) {

7627

printk("does not load-balance\n");

7627

printk("does not load-balance\n");

7628

if (sd->parent)

7628

if (sd->parent)

7629

printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"

7629

printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"

7630

" has parent");

7630

" has parent");

7631

return -1;

7631

return -1;

7632

}

7632

}

7633

7634

printk(KERN_CONT "span %s level %s\n", str, sd->name);

7634

printk(KERN_CONT "span %s level %s\n", str, sd->name);

7635

7636

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {

7636

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {

7637

printk(KERN_ERR "ERROR: domain->span does not contain "

7637

printk(KERN_ERR "ERROR: domain->span does not contain "

7638

"CPU%d\n", cpu);

7638

"CPU%d\n", cpu);

7639

}

7639

}

7640

if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {

7640

if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {

7641

printk(KERN_ERR "ERROR: domain->groups does not contain"

7641

printk(KERN_ERR "ERROR: domain->groups does not contain"

7642

" CPU%d\n", cpu);

7642

" CPU%d\n", cpu);

7643

}

7643

}

7644

7645

printk(KERN_DEBUG "%*s groups:", level + 1, "");

7645

printk(KERN_DEBUG "%*s groups:", level + 1, "");

7646

do {

7646

do {

7647

if (!group) {

7647

if (!group) {

7648

printk("\n");

7648

printk("\n");

7649

printk(KERN_ERR "ERROR: group is NULL\n");

7649

printk(KERN_ERR "ERROR: group is NULL\n");

7650

break;

7650

break;

7651

}

7651

}

7652

7653

if (!group->__cpu_power) {

7653

if (!group->__cpu_power) {

7654

printk(KERN_CONT "\n");

7654

printk(KERN_CONT "\n");

7655

printk(KERN_ERR "ERROR: domain->cpu_power not "

7655

printk(KERN_ERR "ERROR: domain->cpu_power not "

7656

"set\n");

7656

"set\n");

7657

break;

7657

break;

7658

}

7658

}

7659

7660

if (!cpumask_weight(sched_group_cpus(group))) {

7660

if (!cpumask_weight(sched_group_cpus(group))) {

7661

printk(KERN_CONT "\n");

7661

printk(KERN_CONT "\n");

7662

printk(KERN_ERR "ERROR: empty group\n");

7662

printk(KERN_ERR "ERROR: empty group\n");

7663

break;

7663

break;

7664

}

7664

}

7665

7666

if (cpumask_intersects(groupmask, sched_group_cpus(group))) {

7666

if (cpumask_intersects(groupmask, sched_group_cpus(group))) {

7667

printk(KERN_CONT "\n");

7667

printk(KERN_CONT "\n");

7668

printk(KERN_ERR "ERROR: repeated CPUs\n");

7668

printk(KERN_ERR "ERROR: repeated CPUs\n");

7669

break;

7669

break;

7670

}

7670

}

7671

7672

cpumask_or(groupmask, groupmask, sched_group_cpus(group));

7672

cpumask_or(groupmask, groupmask, sched_group_cpus(group));

7673

7674

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

7674

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

7675

7676

printk(KERN_CONT " %s", str);

7676

printk(KERN_CONT " %s", str);

7677

if (group->__cpu_power != SCHED_LOAD_SCALE) {

7677

if (group->__cpu_power != SCHED_LOAD_SCALE) {

7678

printk(KERN_CONT " (__cpu_power = %d)",

7678

printk(KERN_CONT " (__cpu_power = %d)",

7679

group->__cpu_power);

7679

group->__cpu_power);

7680

}

7680

}

7681

7682

group = group->next;

7682

group = group->next;

7683

} while (group != sd->groups);

7683

} while (group != sd->groups);

7684

printk(KERN_CONT "\n");

7684

printk(KERN_CONT "\n");

7685

7686

if (!cpumask_equal(sched_domain_span(sd), groupmask))

7686

if (!cpumask_equal(sched_domain_span(sd), groupmask))

7687

printk(KERN_ERR "ERROR: groups don't span domain->span\n");

7687

printk(KERN_ERR "ERROR: groups don't span domain->span\n");

7688

7689

if (sd->parent &&

7689

if (sd->parent &&

7690

!cpumask_subset(groupmask, sched_domain_span(sd->parent)))

7690

!cpumask_subset(groupmask, sched_domain_span(sd->parent)))

7691

printk(KERN_ERR "ERROR: parent span is not a superset "

7691

printk(KERN_ERR "ERROR: parent span is not a superset "

7692

"of domain->span\n");

7692

"of domain->span\n");

7693

return 0;

7693

return 0;

7694

}

7694

}

7695

7696

static void sched_domain_debug(struct sched_domain *sd, int cpu)

7696

static void sched_domain_debug(struct sched_domain *sd, int cpu)

7697

{

7697

{

7698

cpumask_var_t groupmask;

7698

cpumask_var_t groupmask;

7699

int level = 0;

7699

int level = 0;

7700

7701

if (!sd) {

7701

if (!sd) {

7702

printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);

7702

printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);

7703

return;

7703

return;

7704

}

7704

}

7705

7706

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

7706

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

7707

7708

if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {

7708

if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {

7709

printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");

7709

printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");

7710

return;

7710

return;

7711

}

7711

}

7712

7713

for (;;) {

7713

for (;;) {

7714

if (sched_domain_debug_one(sd, cpu, level, groupmask))

7714

if (sched_domain_debug_one(sd, cpu, level, groupmask))

7715

break;

7715

break;

7716

level++;

7716

level++;

7717

sd = sd->parent;

7717

sd = sd->parent;

7718

if (!sd)

7718

if (!sd)

7719

break;

7719

break;

7720

}

7720

}

7721

free_cpumask_var(groupmask);

7721

free_cpumask_var(groupmask);

7722

}

7722

}

7723

#else /* !CONFIG_SCHED_DEBUG */

7723

#else /* !CONFIG_SCHED_DEBUG */

7724

# define sched_domain_debug(sd, cpu) do { } while (0)

7724

# define sched_domain_debug(sd, cpu) do { } while (0)

7725

#endif /* CONFIG_SCHED_DEBUG */

7725

#endif /* CONFIG_SCHED_DEBUG */

7726

7727

static int sd_degenerate(struct sched_domain *sd)

7727

static int sd_degenerate(struct sched_domain *sd)

7728

{

7728

{

7729

if (cpumask_weight(sched_domain_span(sd)) == 1)

7729

if (cpumask_weight(sched_domain_span(sd)) == 1)

7730

return 1;

7730

return 1;

7731

7732

/* Following flags need at least 2 groups */

7732

/* Following flags need at least 2 groups */

7733

if (sd->flags & (SD_LOAD_BALANCE |

7733

if (sd->flags & (SD_LOAD_BALANCE |

7734

SD_BALANCE_NEWIDLE |

7734

SD_BALANCE_NEWIDLE |

7735

SD_BALANCE_FORK |

7735

SD_BALANCE_FORK |

7736

SD_BALANCE_EXEC |

7736

SD_BALANCE_EXEC |

7737

SD_SHARE_CPUPOWER |

7737

SD_SHARE_CPUPOWER |

7738

SD_SHARE_PKG_RESOURCES)) {

7738

SD_SHARE_PKG_RESOURCES)) {

7739

if (sd->groups != sd->groups->next)

7739

if (sd->groups != sd->groups->next)

7740

return 0;

7740

return 0;

7741

}

7741

}

7742

7743

/* Following flags don't use groups */

7743

/* Following flags don't use groups */

7744

if (sd->flags & (SD_WAKE_IDLE |

7744

if (sd->flags & (SD_WAKE_IDLE |

7745

SD_WAKE_AFFINE |

7745

SD_WAKE_AFFINE |

7746

SD_WAKE_BALANCE))

7746

SD_WAKE_BALANCE))

7747

return 0;

7747

return 0;

7748

7749

return 1;

7749

return 1;

7750

}

7750

}

7751

7752

static int

7752

static int

7753

sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

7753

sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

7754

{

7754

{

7755

unsigned long cflags = sd->flags, pflags = parent->flags;

7755

unsigned long cflags = sd->flags, pflags = parent->flags;

7756

7757

if (sd_degenerate(parent))

7757

if (sd_degenerate(parent))

7758

return 1;

7758

return 1;

7759

7760

if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))

7760

if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))

7761

return 0;

7761

return 0;

7762

7763

/* Does parent contain flags not in child? */

7763

/* Does parent contain flags not in child? */

7764

/* WAKE_BALANCE is a subset of WAKE_AFFINE */

7764

/* WAKE_BALANCE is a subset of WAKE_AFFINE */

7765

if (cflags & SD_WAKE_AFFINE)

7765

if (cflags & SD_WAKE_AFFINE)

7766

pflags &= ~SD_WAKE_BALANCE;

7766

pflags &= ~SD_WAKE_BALANCE;

7767

/* Flags needing groups don't count if only 1 group in parent */

7767

/* Flags needing groups don't count if only 1 group in parent */

7768

if (parent->groups == parent->groups->next) {

7768

if (parent->groups == parent->groups->next) {

7769

pflags &= ~(SD_LOAD_BALANCE |

7769

pflags &= ~(SD_LOAD_BALANCE |

7770

SD_BALANCE_NEWIDLE |

7770

SD_BALANCE_NEWIDLE |

7771

SD_BALANCE_FORK |

7771

SD_BALANCE_FORK |

7772

SD_BALANCE_EXEC |

7772

SD_BALANCE_EXEC |

7773

SD_SHARE_CPUPOWER |

7773

SD_SHARE_CPUPOWER |

7774

SD_SHARE_PKG_RESOURCES);

7774

SD_SHARE_PKG_RESOURCES);

7775

if (nr_node_ids == 1)

7775

if (nr_node_ids == 1)

7776

pflags &= ~SD_SERIALIZE;

7776

pflags &= ~SD_SERIALIZE;

7777

}

7777

}

7778

if (~cflags & pflags)

7778

if (~cflags & pflags)

7779

return 0;

7779

return 0;

7780

7781

return 1;

7781

return 1;

7782

}

7782

}

7783

7784

static void free_rootdomain(struct root_domain *rd)

7784

static void free_rootdomain(struct root_domain *rd)

7785

{

7785

{

7786

cpupri_cleanup(&rd->cpupri);

7786

cpupri_cleanup(&rd->cpupri);

7787

7788

free_cpumask_var(rd->rto_mask);

7788

free_cpumask_var(rd->rto_mask);

7789

free_cpumask_var(rd->online);

7789

free_cpumask_var(rd->online);

7790

free_cpumask_var(rd->span);

7790

free_cpumask_var(rd->span);

7791

kfree(rd);

7791

kfree(rd);

7792

}

7792

}

7793

7794

static void rq_attach_root(struct rq *rq, struct root_domain *rd)

7794

static void rq_attach_root(struct rq *rq, struct root_domain *rd)

7795

{

7795

{

7796

struct root_domain *old_rd = NULL;

7796

struct root_domain *old_rd = NULL;

7797

unsigned long flags;

7797

unsigned long flags;

7798

7799

spin_lock_irqsave(&rq->lock, flags);

7799

spin_lock_irqsave(&rq->lock, flags);

7800

7801

if (rq->rd) {

7801

if (rq->rd) {

7802

old_rd = rq->rd;

7802

old_rd = rq->rd;

7803

7804

if (cpumask_test_cpu(rq->cpu, old_rd->online))

7804

if (cpumask_test_cpu(rq->cpu, old_rd->online))

7805

set_rq_offline(rq);

7805

set_rq_offline(rq);

7806

7807

cpumask_clear_cpu(rq->cpu, old_rd->span);

7807

cpumask_clear_cpu(rq->cpu, old_rd->span);

7808

7809

/*

7809

/*

7810

* If we dont want to free the old_rt yet then

7810

* If we dont want to free the old_rt yet then

7811

* set old_rd to NULL to skip the freeing later

7811

* set old_rd to NULL to skip the freeing later

7812

* in this function:

7812

* in this function:

7813

*/

7813

*/

7814

if (!atomic_dec_and_test(&old_rd->refcount))

7814

if (!atomic_dec_and_test(&old_rd->refcount))

7815

old_rd = NULL;

7815

old_rd = NULL;

7816

}

7816

}

7817

7818

atomic_inc(&rd->refcount);

7818

atomic_inc(&rd->refcount);

7819

rq->rd = rd;

7819

rq->rd = rd;

7820

7821

cpumask_set_cpu(rq->cpu, rd->span);

7821

cpumask_set_cpu(rq->cpu, rd->span);

7822

if (cpumask_test_cpu(rq->cpu, cpu_online_mask))

7822

if (cpumask_test_cpu(rq->cpu, cpu_online_mask))

7823

set_rq_online(rq);

7823

set_rq_online(rq);

7824

7825

spin_unlock_irqrestore(&rq->lock, flags);

7825

spin_unlock_irqrestore(&rq->lock, flags);

7826

7827

if (old_rd)

7827

if (old_rd)

7828

free_rootdomain(old_rd);

7828

free_rootdomain(old_rd);

7829

}

7829

}

7830

7831

static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)

7831

static int init_rootdomain(struct root_domain *rd, bool bootmem)

7832

{

7832

{

7833

gfp_t gfp = GFP_KERNEL;

7833

gfp_t gfp = GFP_KERNEL;

7834

7835

memset(rd, 0, sizeof(*rd));

7835

memset(rd, 0, sizeof(*rd));

7836

7837

if (bootmem)

7837

if (bootmem)

7838

gfp = GFP_NOWAIT;

7838

gfp = GFP_NOWAIT;

7839

7840

if (!alloc_cpumask_var(&rd->span, gfp))

7840

if (!alloc_cpumask_var(&rd->span, gfp))

7841

goto out;

7841

goto out;

7842

if (!alloc_cpumask_var(&rd->online, gfp))

7842

if (!alloc_cpumask_var(&rd->online, gfp))

7843

goto free_span;

7843

goto free_span;

7844

if (!alloc_cpumask_var(&rd->rto_mask, gfp))

7844

if (!alloc_cpumask_var(&rd->rto_mask, gfp))

7845

goto free_online;

7845

goto free_online;

7846

7847

if (cpupri_init(&rd->cpupri, bootmem) != 0)

7847

if (cpupri_init(&rd->cpupri, bootmem) != 0)

7848

goto free_rto_mask;

7848

goto free_rto_mask;

7849

return 0;

7849

return 0;

7850

7851

free_rto_mask:

7851

free_rto_mask:

7852

free_cpumask_var(rd->rto_mask);

7852

free_cpumask_var(rd->rto_mask);

7853

free_online:

7853

free_online:

7854

free_cpumask_var(rd->online);

7854

free_cpumask_var(rd->online);

7855

free_span:

7855

free_span:

7856

free_cpumask_var(rd->span);

7856

free_cpumask_var(rd->span);

7857

out:

7857

out:

7858

return -ENOMEM;

7858

return -ENOMEM;

7859

}

7859

}

7860

7861

static void init_defrootdomain(void)

7861

static void init_defrootdomain(void)

7862

{

7862

{

7863

init_rootdomain(&def_root_domain, true);

7863

init_rootdomain(&def_root_domain, true);

7864

7865

atomic_set(&def_root_domain.refcount, 1);

7865

atomic_set(&def_root_domain.refcount, 1);

7866

}

7866

}

7867

7868

static struct root_domain *alloc_rootdomain(void)

7868

static struct root_domain *alloc_rootdomain(void)

7869

{

7869

{

7870

struct root_domain *rd;

7870

struct root_domain *rd;

7871

7872

rd = kmalloc(sizeof(*rd), GFP_KERNEL);

7872

rd = kmalloc(sizeof(*rd), GFP_KERNEL);

7873

if (!rd)

7873

if (!rd)

7874

return NULL;

7874

return NULL;

7875

7876

if (init_rootdomain(rd, false) != 0) {

7876

if (init_rootdomain(rd, false) != 0) {

7877

kfree(rd);

7877

kfree(rd);

7878

return NULL;

7878

return NULL;

7879

}

7879

}

7880

7881

return rd;

7881

return rd;

7882

}

7882

}

7883

7884

/*

7884

/*

7885

* Attach the domain 'sd' to 'cpu' as its base domain. Callers must

7885

* Attach the domain 'sd' to 'cpu' as its base domain. Callers must

7886

* hold the hotplug lock.

7886

* hold the hotplug lock.

7887

*/

7887

*/

7888

static void

7888

static void

7889

cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

7889

cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

7890

{

7890

{

7891

struct rq *rq = cpu_rq(cpu);

7891

struct rq *rq = cpu_rq(cpu);

7892

struct sched_domain *tmp;

7892

struct sched_domain *tmp;

7893

7894

/* Remove the sched domains which do not contribute to scheduling. */

7894

/* Remove the sched domains which do not contribute to scheduling. */

7895

for (tmp = sd; tmp; ) {

7895

for (tmp = sd; tmp; ) {

7896

struct sched_domain *parent = tmp->parent;

7896

struct sched_domain *parent = tmp->parent;

7897

if (!parent)

7897

if (!parent)

7898

break;

7898

break;

7899

7900

if (sd_parent_degenerate(tmp, parent)) {

7900

if (sd_parent_degenerate(tmp, parent)) {

7901

tmp->parent = parent->parent;

7901

tmp->parent = parent->parent;

7902

if (parent->parent)

7902

if (parent->parent)

7903

parent->parent->child = tmp;

7903

parent->parent->child = tmp;

7904

} else

7904

} else

7905

tmp = tmp->parent;

7905

tmp = tmp->parent;

7906

}

7906

}

7907

7908

if (sd && sd_degenerate(sd)) {

7908

if (sd && sd_degenerate(sd)) {

7909

sd = sd->parent;

7909

sd = sd->parent;

7910

if (sd)

7910

if (sd)

7911

sd->child = NULL;

7911

sd->child = NULL;

7912

}

7912

}

7913

7914

sched_domain_debug(sd, cpu);

7914

sched_domain_debug(sd, cpu);

7915

7916

rq_attach_root(rq, rd);

7916

rq_attach_root(rq, rd);

7917

rcu_assign_pointer(rq->sd, sd);

7917

rcu_assign_pointer(rq->sd, sd);

7918

}

7918

}

7919

7920

/* cpus with isolated domains */

7920

/* cpus with isolated domains */

7921

static cpumask_var_t cpu_isolated_map;

7921

static cpumask_var_t cpu_isolated_map;

7922

7923

/* Setup the mask of cpus configured for isolated domains */

7923

/* Setup the mask of cpus configured for isolated domains */

7924

static int __init isolated_cpu_setup(char *str)

7924

static int __init isolated_cpu_setup(char *str)

7925

{

7925

{

7926

cpulist_parse(str, cpu_isolated_map);

7926

cpulist_parse(str, cpu_isolated_map);

7927

return 1;

7927

return 1;

7928

}

7928

}

7929

7930

__setup("isolcpus=", isolated_cpu_setup);

7930

__setup("isolcpus=", isolated_cpu_setup);

7931

7932

/*

7932

/*

7933

* init_sched_build_groups takes the cpumask we wish to span, and a pointer

7933

* init_sched_build_groups takes the cpumask we wish to span, and a pointer

7934

* to a function which identifies what group(along with sched group) a CPU

7934

* to a function which identifies what group(along with sched group) a CPU

7935

* belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids

7935

* belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids

7936

* (due to the fact that we keep track of groups covered with a struct cpumask).

7936

* (due to the fact that we keep track of groups covered with a struct cpumask).

7937

*

7937

*

7938

* init_sched_build_groups will build a circular linked list of the groups

7938

* init_sched_build_groups will build a circular linked list of the groups

7939

* covered by the given span, and will set each group's ->cpumask correctly,

7939

* covered by the given span, and will set each group's ->cpumask correctly,

7940

* and ->cpu_power to 0.

7940

* and ->cpu_power to 0.

7941

*/

7941

*/

7942

static void

7942

static void

7943

init_sched_build_groups(const struct cpumask *span,

7943

init_sched_build_groups(const struct cpumask *span,

7944

const struct cpumask *cpu_map,

7944

const struct cpumask *cpu_map,

7945

int (*group_fn)(int cpu, const struct cpumask *cpu_map,

7945

int (*group_fn)(int cpu, const struct cpumask *cpu_map,

7946

struct sched_group **sg,

7946

struct sched_group **sg,

7947

struct cpumask *tmpmask),

7947

struct cpumask *tmpmask),

7948

struct cpumask *covered, struct cpumask *tmpmask)

7948

struct cpumask *covered, struct cpumask *tmpmask)

7949

{

7949

{

7950

struct sched_group *first = NULL, *last = NULL;

7950

struct sched_group *first = NULL, *last = NULL;

7951

int i;

7951

int i;

7952

7953

cpumask_clear(covered);

7953

cpumask_clear(covered);

7954

7955

for_each_cpu(i, span) {

7955

for_each_cpu(i, span) {

7956

struct sched_group *sg;

7956

struct sched_group *sg;

7957

int group = group_fn(i, cpu_map, &sg, tmpmask);

7957

int group = group_fn(i, cpu_map, &sg, tmpmask);

7958

int j;

7958

int j;

7959

7960

if (cpumask_test_cpu(i, covered))

7960

if (cpumask_test_cpu(i, covered))

7961

continue;

7961

continue;

7962

7963

cpumask_clear(sched_group_cpus(sg));

7963

cpumask_clear(sched_group_cpus(sg));

7964

sg->__cpu_power = 0;

7964

sg->__cpu_power = 0;

7965

7966

for_each_cpu(j, span) {

7966

for_each_cpu(j, span) {

7967

if (group_fn(j, cpu_map, NULL, tmpmask) != group)

7967

if (group_fn(j, cpu_map, NULL, tmpmask) != group)

7968

continue;

7968

continue;

7969

7970

cpumask_set_cpu(j, covered);

7970

cpumask_set_cpu(j, covered);

7971

cpumask_set_cpu(j, sched_group_cpus(sg));

7971

cpumask_set_cpu(j, sched_group_cpus(sg));

7972

}

7972

}

7973

if (!first)

7973

if (!first)

7974

first = sg;

7974

first = sg;

7975

if (last)

7975

if (last)

7976

last->next = sg;

7976

last->next = sg;

7977

last = sg;

7977

last = sg;

7978

}

7978

}

7979

last->next = first;

7979

last->next = first;

7980

}

7980

}

7981

7982

#define SD_NODES_PER_DOMAIN 16

7982

#define SD_NODES_PER_DOMAIN 16

7983

7984

#ifdef CONFIG_NUMA

7984

#ifdef CONFIG_NUMA

7985

7986

/**

7986

/**

7987

* find_next_best_node - find the next node to include in a sched_domain

7987

* find_next_best_node - find the next node to include in a sched_domain

7988

* @node: node whose sched_domain we're building

7988

* @node: node whose sched_domain we're building

7989

* @used_nodes: nodes already in the sched_domain

7989

* @used_nodes: nodes already in the sched_domain

7990

*

7990

*

7991

* Find the next node to include in a given scheduling domain. Simply

7991

* Find the next node to include in a given scheduling domain. Simply

7992

* finds the closest node not already in the @used_nodes map.

7992

* finds the closest node not already in the @used_nodes map.

7993

*

7993

*

7994

* Should use nodemask_t.

7994

* Should use nodemask_t.

7995

*/

7995

*/

7996

static int find_next_best_node(int node, nodemask_t *used_nodes)

7996

static int find_next_best_node(int node, nodemask_t *used_nodes)

7997

{

7997

{

7998

int i, n, val, min_val, best_node = 0;

7998

int i, n, val, min_val, best_node = 0;

7999

8000

min_val = INT_MAX;

8000

min_val = INT_MAX;

8001

8002

for (i = 0; i < nr_node_ids; i++) {

8002

for (i = 0; i < nr_node_ids; i++) {

8003

/* Start at @node */

8003

/* Start at @node */

8004

n = (node + i) % nr_node_ids;

8004

n = (node + i) % nr_node_ids;

8005

8006

if (!nr_cpus_node(n))

8006

if (!nr_cpus_node(n))

8007

continue;

8007

continue;

8008

8009

/* Skip already used nodes */

8009

/* Skip already used nodes */

8010

if (node_isset(n, *used_nodes))

8010

if (node_isset(n, *used_nodes))

8011

continue;

8011

continue;

8012

8013

/* Simple min distance search */

8013

/* Simple min distance search */

8014

val = node_distance(node, n);

8014

val = node_distance(node, n);

8015

8016

if (val < min_val) {

8016

if (val < min_val) {

8017

min_val = val;

8017

min_val = val;

8018

best_node = n;

8018

best_node = n;

8019

}

8019

}

8020

}

8020

}

8021

8022

node_set(best_node, *used_nodes);

8022

node_set(best_node, *used_nodes);

8023

return best_node;

8023

return best_node;

8024

}

8024

}

8025

8026

/**

8026

/**

8027

* sched_domain_node_span - get a cpumask for a node's sched_domain

8027

* sched_domain_node_span - get a cpumask for a node's sched_domain

8028

* @node: node whose cpumask we're constructing

8028

* @node: node whose cpumask we're constructing

8029

* @span: resulting cpumask

8029

* @span: resulting cpumask

8030

*

8030

*

8031

* Given a node, construct a good cpumask for its sched_domain to span. It

8031

* Given a node, construct a good cpumask for its sched_domain to span. It

8032

* should be one that prevents unnecessary balancing, but also spreads tasks

8032

* should be one that prevents unnecessary balancing, but also spreads tasks

8033

* out optimally.

8033

* out optimally.

8034

*/

8034

*/

8035

static void sched_domain_node_span(int node, struct cpumask *span)

8035

static void sched_domain_node_span(int node, struct cpumask *span)

8036

{

8036

{

8037

nodemask_t used_nodes;

8037

nodemask_t used_nodes;

8038

int i;

8038

int i;

8039

8040

cpumask_clear(span);

8040

cpumask_clear(span);

8041

nodes_clear(used_nodes);

8041

nodes_clear(used_nodes);

8042

8043

cpumask_or(span, span, cpumask_of_node(node));

8043

cpumask_or(span, span, cpumask_of_node(node));

8044

node_set(node, used_nodes);

8044

node_set(node, used_nodes);

8045

8046

for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {

8046

for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {

8047

int next_node = find_next_best_node(node, &used_nodes);

8047

int next_node = find_next_best_node(node, &used_nodes);

8048

8049

cpumask_or(span, span, cpumask_of_node(next_node));

8049

cpumask_or(span, span, cpumask_of_node(next_node));

8050

}

8050

}

8051

}

8051

}

8052

#endif /* CONFIG_NUMA */

8052

#endif /* CONFIG_NUMA */

8053

8054

int sched_smt_power_savings = 0, sched_mc_power_savings = 0;

8054

int sched_smt_power_savings = 0, sched_mc_power_savings = 0;

8055

8056

/*

8056

/*

8057

* The cpus mask in sched_group and sched_domain hangs off the end.

8057

* The cpus mask in sched_group and sched_domain hangs off the end.

8058

*

8058

*

8059

* ( See the the comments in include/linux/sched.h:struct sched_group

8059

* ( See the the comments in include/linux/sched.h:struct sched_group

8060

* and struct sched_domain. )

8060

* and struct sched_domain. )

8061

*/

8061

*/

8062

struct static_sched_group {

8062

struct static_sched_group {

8063

struct sched_group sg;

8063

struct sched_group sg;

8064

DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);

8064

DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);

8065

};

8065

};

8066

8067

struct static_sched_domain {

8067

struct static_sched_domain {

8068

struct sched_domain sd;

8068

struct sched_domain sd;

8069

DECLARE_BITMAP(span, CONFIG_NR_CPUS);

8069

DECLARE_BITMAP(span, CONFIG_NR_CPUS);

8070

};

8070

};

8071

8072

/*
 * SMT sched-domains:
 */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);

/*
 * Group function for the SMT level: every cpu forms its own group.
 *
 * Stores @cpu's per-cpu sched_group_cpus entry in *@sg when @sg is
 * non-NULL and returns @cpu as the group id.  @cpu_map and @unused are
 * not read.
 */
static int
cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
		 struct sched_group **sg, struct cpumask *unused)
{
	if (!sg)
		return cpu;

	*sg = &per_cpu(sched_group_cpus, cpu).sg;
	return cpu;
}
#endif /* CONFIG_SCHED_SMT */

8088

8089

/*
 * multi-core sched-domains:
 */
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
#endif /* CONFIG_SCHED_MC */

#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
/*
 * Group function for the MC level when SMT is also configured.
 *
 * The group id is the first cpu in the intersection of @cpu's thread
 * siblings and @cpu_map; @mask is overwritten with that intersection.
 * When @sg is non-NULL it receives that cpu's sched_group_core entry.
 */
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
		  struct sched_group **sg, struct cpumask *mask)
{
	int first_sibling;

	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
	first_sibling = cpumask_first(mask);

	if (sg != NULL)
		*sg = &per_cpu(sched_group_core, first_sibling).sg;

	return first_sibling;
}
#elif defined(CONFIG_SCHED_MC)
/*
 * Group function for the MC level without SMT: every cpu is its own group.
 * When @sg is non-NULL it receives @cpu's sched_group_core entry.
 * @cpu_map and @unused are not read.
 */
static int
cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
		  struct sched_group **sg, struct cpumask *unused)
{
	if (!sg)
		return cpu;

	*sg = &per_cpu(sched_group_core, cpu).sg;
	return cpu;
}
#endif

8120

8121

static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);

8121

static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);

8122

static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);

8122

static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);

8123

8124

static int

8124

static int

8125

cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,

8125

cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,

8126

struct sched_group **sg, struct cpumask *mask)

8126

struct sched_group **sg, struct cpumask *mask)

8127

{

8127

{

8128

int group;

8128

int group;

8129

#ifdef CONFIG_SCHED_MC

8129

#ifdef CONFIG_SCHED_MC

8130

cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);

8130

cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);

8131

group = cpumask_first(mask);

8131

group = cpumask_first(mask);

8132

#elif defined(CONFIG_SCHED_SMT)

8132

#elif defined(CONFIG_SCHED_SMT)

8133

cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);

8133

cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);

8134

group = cpumask_first(mask);

8134

group = cpumask_first(mask);

8135

#else

8135

#else

8136

group = cpu;

8136

group = cpu;

8137

#endif

8137

#endif

8138

if (sg)

8138

if (sg)

8139

*sg = &per_cpu(sched_group_phys, group).sg;

8139

*sg = &per_cpu(sched_group_phys, group).sg;

8140

return group;

8140

return group;

8141

}

8141

}

8142

8143

#ifdef CONFIG_NUMA

8143

#ifdef CONFIG_NUMA

8144

/*

8144

/*

8145

* The init_sched_build_groups can't handle what we want to do with node

8145

* The init_sched_build_groups can't handle what we want to do with node

8146

* groups, so roll our own. Now each node has its own list of groups which

8146

* groups, so roll our own. Now each node has its own list of groups which

8147

* gets dynamically allocated.

8147

* gets dynamically allocated.

8148

*/

8148

*/

8149

static DEFINE_PER_CPU(struct static_sched_domain, node_domains);

8149

static DEFINE_PER_CPU(struct static_sched_domain, node_domains);

8150

static struct sched_group ***sched_group_nodes_bycpu;

8150

static struct sched_group ***sched_group_nodes_bycpu;

8151

8152

static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);

8152

static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);

8153

static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);

8153

static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);

8154

8155

static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,

8155

static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,

8156

struct sched_group **sg,

8156

struct sched_group **sg,

8157

struct cpumask *nodemask)

8157

struct cpumask *nodemask)

8158

{

8158

{

8159

int group;

8159

int group;

8160

8161

cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);

8161

cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);

8162

group = cpumask_first(nodemask);

8162

group = cpumask_first(nodemask);

8163

8164

if (sg)

8164

if (sg)

8165

*sg = &per_cpu(sched_group_allnodes, group).sg;

8165

*sg = &per_cpu(sched_group_allnodes, group).sg;

8166

return group;

8166

return group;

8167

}

8167

}

8168

8169

static void init_numa_sched_groups_power(struct sched_group *group_head)

8169

static void init_numa_sched_groups_power(struct sched_group *group_head)

8170

{

8170

{

8171

struct sched_group *sg = group_head;

8171

struct sched_group *sg = group_head;

8172

int j;

8172

int j;

8173

8174

if (!sg)

8174

if (!sg)

8175

return;

8175

return;

8176

do {

8176

do {

8177

for_each_cpu(j, sched_group_cpus(sg)) {

8177

for_each_cpu(j, sched_group_cpus(sg)) {

8178

struct sched_domain *sd;

8178

struct sched_domain *sd;

8179

8180

sd = &per_cpu(phys_domains, j).sd;

8180

sd = &per_cpu(phys_domains, j).sd;

8181

if (j != group_first_cpu(sd->groups)) {

8181

if (j != group_first_cpu(sd->groups)) {

8182

/*

8182

/*

8183

* Only add "power" once for each

8183

* Only add "power" once for each

8184

* physical package.

8184

* physical package.

8185

*/

8185

*/

8186

continue;

8186

continue;

8187

}

8187

}

8188

8189

sg_inc_cpu_power(sg, sd->groups->__cpu_power);

8189

sg_inc_cpu_power(sg, sd->groups->__cpu_power);

8190

}

8190

}

8191

sg = sg->next;

8191

sg = sg->next;

8192

} while (sg != group_head);

8192

} while (sg != group_head);

8193

}

8193

}

8194

#endif /* CONFIG_NUMA */

8194

#endif /* CONFIG_NUMA */

8195

8196

#ifdef CONFIG_NUMA
/*
 * Free memory allocated for various sched_group structures.
 *
 * For every cpu in @cpu_map that has a per-node group array registered in
 * sched_group_nodes_bycpu, free each node's ring of groups, then the array
 * itself, and clear the registration.  @nodemask is used as scratch space.
 */
static void free_sched_groups(const struct cpumask *cpu_map,
			      struct cpumask *nodemask)
{
	int cpu;

	for_each_cpu(cpu, cpu_map) {
		struct sched_group **node_groups = sched_group_nodes_bycpu[cpu];
		int node;

		if (!node_groups)
			continue;

		for (node = 0; node < nr_node_ids; node++) {
			struct sched_group *head = node_groups[node];
			struct sched_group *victim, *cur;

			/* Nodes with no cpu in @cpu_map are skipped. */
			cpumask_and(nodemask, cpumask_of_node(node), cpu_map);
			if (cpumask_empty(nodemask))
				continue;

			if (head == NULL)
				continue;

			/*
			 * Walk the ring starting behind the head so that the
			 * head is freed last and ends the loop.
			 */
			cur = head->next;
			do {
				victim = cur;
				cur = cur->next;
				kfree(victim);
			} while (victim != head);
		}
		kfree(node_groups);
		sched_group_nodes_bycpu[cpu] = NULL;
	}
}
#else /* !CONFIG_NUMA */
/* Nothing is allocated per node without CONFIG_NUMA. */
static void free_sched_groups(const struct cpumask *cpu_map,
			      struct cpumask *nodemask)
{
}
#endif /* CONFIG_NUMA */

8237

8238

/*

8238

/*

8239

* Initialize sched groups cpu_power.

8239

* Initialize sched groups cpu_power.

8240

*

8240

*

8241

* cpu_power indicates the capacity of sched group, which is used while

8241

* cpu_power indicates the capacity of sched group, which is used while

8242

* distributing the load between different sched groups in a sched domain.

8242

* distributing the load between different sched groups in a sched domain.

8243

* Typically cpu_power for all the groups in a sched domain will be same unless

8243

* Typically cpu_power for all the groups in a sched domain will be same unless

8244

* there are asymmetries in the topology. If there are asymmetries, group

8244

* there are asymmetries in the topology. If there are asymmetries, group

8245

* having more cpu_power will pickup more load compared to the group having

8245

* having more cpu_power will pickup more load compared to the group having

8246

* less cpu_power.

8246

* less cpu_power.

8247

*

8247

*

8248

* cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents

8248

* cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents

8249

* the maximum number of tasks a group can handle in the presence of other idle

8249

* the maximum number of tasks a group can handle in the presence of other idle

8250

* or lightly loaded groups in the same sched domain.

8250

* or lightly loaded groups in the same sched domain.

8251

*/

8251

*/

8252

static void init_sched_groups_power(int cpu, struct sched_domain *sd)

8252

static void init_sched_groups_power(int cpu, struct sched_domain *sd)

8253

{

8253

{

8254

struct sched_domain *child;

8254

struct sched_domain *child;

8255

struct sched_group *group;

8255

struct sched_group *group;

8256

8257

WARN_ON(!sd || !sd->groups);

8257

WARN_ON(!sd || !sd->groups);

8258

8259

if (cpu != group_first_cpu(sd->groups))

8259

if (cpu != group_first_cpu(sd->groups))

8260

return;

8260

return;

8261

8262

child = sd->child;

8262

child = sd->child;

8263

8264

sd->groups->__cpu_power = 0;

8264

sd->groups->__cpu_power = 0;

8265

8266

/*

8266

/*

8267

* For perf policy, if the groups in child domain share resources

8267

* For perf policy, if the groups in child domain share resources

8268

* (for example cores sharing some portions of the cache hierarchy

8268

* (for example cores sharing some portions of the cache hierarchy

8269

* or SMT), then set this domain groups cpu_power such that each group

8269

* or SMT), then set this domain groups cpu_power such that each group

8270

* can handle only one task, when there are other idle groups in the

8270

* can handle only one task, when there are other idle groups in the

8271

* same sched domain.

8271

* same sched domain.

8272

*/

8272

*/

8273

if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&

8273

if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&

8274

(child->flags &

8274

(child->flags &

8275

(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {

8275

(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {

8276

sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);

8276

sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);

8277

return;

8277

return;

8278

}

8278

}

8279

8280

/*

8280

/*

8281

* add cpu_power of each child group to this groups cpu_power

8281

* add cpu_power of each child group to this groups cpu_power

8282

*/

8282

*/

8283

group = child->groups;

8283

group = child->groups;

8284

do {

8284

do {

8285

sg_inc_cpu_power(sd->groups, group->__cpu_power);

8285

sg_inc_cpu_power(sd->groups, group->__cpu_power);

8286

group = group->next;

8286

group = group->next;

8287

} while (group != child->groups);

8287

} while (group != child->groups);

8288

}

8288

}

8289

8290

/*

8290

/*

8291

* Initializers for schedule domains

8291

* Initializers for schedule domains

8292

* Non-inlined to reduce accumulated stack pressure in build_sched_domains()

8292

* Non-inlined to reduce accumulated stack pressure in build_sched_domains()

8293

*/

8293

*/

8294

8295

#ifdef CONFIG_SCHED_DEBUG

8295

#ifdef CONFIG_SCHED_DEBUG

8296

# define SD_INIT_NAME(sd, type) sd->name = #type

8296

# define SD_INIT_NAME(sd, type) sd->name = #type

8297

#else

8297

#else

8298

# define SD_INIT_NAME(sd, type) do { } while (0)

8298

# define SD_INIT_NAME(sd, type) do { } while (0)

8299

#endif

8299

#endif

8300

8301

#define SD_INIT(sd, type) sd_init_##type(sd)

8301

#define SD_INIT(sd, type) sd_init_##type(sd)

8302

8303

#define SD_INIT_FUNC(type) \

8303

#define SD_INIT_FUNC(type) \

8304

static noinline void sd_init_##type(struct sched_domain *sd) \

8304

static noinline void sd_init_##type(struct sched_domain *sd) \

8305

{ \

8305

{ \

8306

memset(sd, 0, sizeof(*sd)); \

8306

memset(sd, 0, sizeof(*sd)); \

8307

*sd = SD_##type##_INIT; \

8307

*sd = SD_##type##_INIT; \

8308

sd->level = SD_LV_##type; \

8308

sd->level = SD_LV_##type; \

8309

SD_INIT_NAME(sd, type); \

8309

SD_INIT_NAME(sd, type); \

8310

}

8310

}

8311

8312

SD_INIT_FUNC(CPU)

8312

SD_INIT_FUNC(CPU)

8313

#ifdef CONFIG_NUMA

8313

#ifdef CONFIG_NUMA

8314

SD_INIT_FUNC(ALLNODES)

8314

SD_INIT_FUNC(ALLNODES)

8315

SD_INIT_FUNC(NODE)

8315

SD_INIT_FUNC(NODE)

8316

#endif

8316

#endif

8317

#ifdef CONFIG_SCHED_SMT

8317

#ifdef CONFIG_SCHED_SMT

8318

SD_INIT_FUNC(SIBLING)

8318

SD_INIT_FUNC(SIBLING)

8319

#endif

8319

#endif

8320

#ifdef CONFIG_SCHED_MC

8320

#ifdef CONFIG_SCHED_MC

8321

SD_INIT_FUNC(MC)

8321

SD_INIT_FUNC(MC)

8322

#endif

8322

#endif

8323

8324

static int default_relax_domain_level = -1;

8324

static int default_relax_domain_level = -1;

8325

8326

static int __init setup_relax_domain_level(char *str)

8326

static int __init setup_relax_domain_level(char *str)

8327

{

8327

{

8328

unsigned long val;

8328

unsigned long val;

8329

8330

val = simple_strtoul(str, NULL, 0);

8330

val = simple_strtoul(str, NULL, 0);

8331

if (val < SD_LV_MAX)

8331

if (val < SD_LV_MAX)

8332

default_relax_domain_level = val;

8332

default_relax_domain_level = val;

8333

8334

return 1;

8334

return 1;

8335

}

8335

}

8336

__setup("relax_domain_level=", setup_relax_domain_level);

8336

__setup("relax_domain_level=", setup_relax_domain_level);

8337

8338

static void set_domain_attribute(struct sched_domain *sd,

8338

static void set_domain_attribute(struct sched_domain *sd,

8339

struct sched_domain_attr *attr)

8339

struct sched_domain_attr *attr)

8340

{

8340

{

8341

int request;

8341

int request;

8342

8343

if (!attr || attr->relax_domain_level < 0) {

8343

if (!attr || attr->relax_domain_level < 0) {

8344

if (default_relax_domain_level < 0)

8344

if (default_relax_domain_level < 0)

8345

return;

8345

return;

8346

else

8346

else

8347

request = default_relax_domain_level;

8347

request = default_relax_domain_level;

8348

} else

8348

} else

8349

request = attr->relax_domain_level;

8349

request = attr->relax_domain_level;

8350

if (request < sd->level) {

8350

if (request < sd->level) {

8351

/* turn off idle balance on this domain */

8351

/* turn off idle balance on this domain */

8352

sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);

8352

sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);

8353

} else {

8353

} else {

8354

/* turn on idle balance on this domain */

8354

/* turn on idle balance on this domain */

8355

sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);

8355

sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);

8356

}

8356

}

8357

}

8357

}

8358

8359

/*

8359

/*

8360

* Build sched domains for a given set of cpus and attach the sched domains

8360

* Build sched domains for a given set of cpus and attach the sched domains

8361

* to the individual cpus

8361

* to the individual cpus

8362

*/

8362

*/

8363

static int __build_sched_domains(const struct cpumask *cpu_map,

8363

static int __build_sched_domains(const struct cpumask *cpu_map,

8364

struct sched_domain_attr *attr)

8364

struct sched_domain_attr *attr)

8365

{

8365

{

8366

int i, err = -ENOMEM;

8366

int i, err = -ENOMEM;

8367

struct root_domain *rd;

8367

struct root_domain *rd;

8368

cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,

8368

cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,

8369

tmpmask;

8369

tmpmask;

8370

#ifdef CONFIG_NUMA

8370

#ifdef CONFIG_NUMA

8371

cpumask_var_t domainspan, covered, notcovered;

8371

cpumask_var_t domainspan, covered, notcovered;

8372

struct sched_group **sched_group_nodes = NULL;

8372

struct sched_group **sched_group_nodes = NULL;

8373

int sd_allnodes = 0;

8373

int sd_allnodes = 0;

8374

8375

if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))

8375

if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))

8376

goto out;

8376

goto out;

8377

if (!alloc_cpumask_var(&covered, GFP_KERNEL))

8377

if (!alloc_cpumask_var(&covered, GFP_KERNEL))

8378

goto free_domainspan;

8378

goto free_domainspan;

8379

if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))

8379

if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))

8380

goto free_covered;

8380

goto free_covered;

8381

#endif

8381

#endif

8382

8383

if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))

8383

if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))

8384

goto free_notcovered;

8384

goto free_notcovered;

8385

if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))

8385

if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))

8386

goto free_nodemask;

8386

goto free_nodemask;

8387

if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))

8387

if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))

8388

goto free_this_sibling_map;

8388

goto free_this_sibling_map;

8389

if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))

8389

if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))

8390

goto free_this_core_map;

8390

goto free_this_core_map;

8391

if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))

8391

if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))

8392

goto free_send_covered;

8392

goto free_send_covered;

8393

8394

#ifdef CONFIG_NUMA

8394

#ifdef CONFIG_NUMA

8395

/*

8395

/*

8396

* Allocate the per-node list of sched groups

8396

* Allocate the per-node list of sched groups

8397

*/

8397

*/

8398

sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),

8398

sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),

8399

GFP_KERNEL);

8399

GFP_KERNEL);

8400

if (!sched_group_nodes) {

8400

if (!sched_group_nodes) {

8401

printk(KERN_WARNING "Can not alloc sched group node list\n");

8401

printk(KERN_WARNING "Can not alloc sched group node list\n");

8402

goto free_tmpmask;

8402

goto free_tmpmask;

8403

}

8403

}

8404

#endif

8404

#endif

8405

8406

rd = alloc_rootdomain();

8406

rd = alloc_rootdomain();

8407

if (!rd) {

8407

if (!rd) {

8408

printk(KERN_WARNING "Cannot alloc root domain\n");

8408

printk(KERN_WARNING "Cannot alloc root domain\n");

8409

goto free_sched_groups;

8409

goto free_sched_groups;

8410

}

8410

}

8411

8412

#ifdef CONFIG_NUMA

8412

#ifdef CONFIG_NUMA

8413

sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;

8413

sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;

8414

#endif

8414

#endif

8415

8416

/*

8416

/*

8417

* Set up domains for cpus specified by the cpu_map.

8417

* Set up domains for cpus specified by the cpu_map.

8418

*/

8418

*/

8419

for_each_cpu(i, cpu_map) {

8419

for_each_cpu(i, cpu_map) {

8420

struct sched_domain *sd = NULL, *p;

8420

struct sched_domain *sd = NULL, *p;

8421

8422

cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);

8422

cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);

8423

8424

#ifdef CONFIG_NUMA

8424

#ifdef CONFIG_NUMA

8425

if (cpumask_weight(cpu_map) >

8425

if (cpumask_weight(cpu_map) >

8426

SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {

8426

SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {

8427

sd = &per_cpu(allnodes_domains, i).sd;

8427

sd = &per_cpu(allnodes_domains, i).sd;

8428

SD_INIT(sd, ALLNODES);

8428

SD_INIT(sd, ALLNODES);

8429

set_domain_attribute(sd, attr);

8429

set_domain_attribute(sd, attr);

8430

cpumask_copy(sched_domain_span(sd), cpu_map);

8430

cpumask_copy(sched_domain_span(sd), cpu_map);

8431

cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);

8431

cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);

8432

p = sd;

8432

p = sd;

8433

sd_allnodes = 1;

8433

sd_allnodes = 1;

8434

} else

8434

} else

8435

p = NULL;

8435

p = NULL;

8436

8437

sd = &per_cpu(node_domains, i).sd;

8437

sd = &per_cpu(node_domains, i).sd;

8438

SD_INIT(sd, NODE);

8438

SD_INIT(sd, NODE);

8439

set_domain_attribute(sd, attr);

8439

set_domain_attribute(sd, attr);

8440

sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));

8440

sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));

8441

sd->parent = p;

8441

sd->parent = p;

8442

if (p)

8442

if (p)

8443

p->child = sd;

8443

p->child = sd;

8444

cpumask_and(sched_domain_span(sd),

8444

cpumask_and(sched_domain_span(sd),

8445

sched_domain_span(sd), cpu_map);

8445

sched_domain_span(sd), cpu_map);

8446

#endif

8446

#endif

8447

8448

p = sd;

8448

p = sd;

8449

sd = &per_cpu(phys_domains, i).sd;

8449

sd = &per_cpu(phys_domains, i).sd;

8450

SD_INIT(sd, CPU);

8450

SD_INIT(sd, CPU);

8451

set_domain_attribute(sd, attr);

8451

set_domain_attribute(sd, attr);

8452

cpumask_copy(sched_domain_span(sd), nodemask);

8452

cpumask_copy(sched_domain_span(sd), nodemask);

8453

sd->parent = p;

8453

sd->parent = p;

8454

if (p)

8454

if (p)

8455

p->child = sd;

8455

p->child = sd;

8456

cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);

8456

cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);

8457

8458

#ifdef CONFIG_SCHED_MC

8458

#ifdef CONFIG_SCHED_MC

8459

p = sd;

8459

p = sd;

8460

sd = &per_cpu(core_domains, i).sd;

8460

sd = &per_cpu(core_domains, i).sd;

8461

SD_INIT(sd, MC);

8461

SD_INIT(sd, MC);

8462

set_domain_attribute(sd, attr);

8462

set_domain_attribute(sd, attr);

8463

cpumask_and(sched_domain_span(sd), cpu_map,

8463

cpumask_and(sched_domain_span(sd), cpu_map,

8464

cpu_coregroup_mask(i));

8464

cpu_coregroup_mask(i));

8465

sd->parent = p;

8465

sd->parent = p;

8466

p->child = sd;

8466

p->child = sd;

8467

cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);

8467

cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);

8468

#endif

8468

#endif

8469

8470

#ifdef CONFIG_SCHED_SMT

8470

#ifdef CONFIG_SCHED_SMT

8471

p = sd;

8471

p = sd;

8472

sd = &per_cpu(cpu_domains, i).sd;

8472

sd = &per_cpu(cpu_domains, i).sd;

8473

SD_INIT(sd, SIBLING);

8473

SD_INIT(sd, SIBLING);

8474

set_domain_attribute(sd, attr);

8474

set_domain_attribute(sd, attr);

8475

cpumask_and(sched_domain_span(sd),

8475

cpumask_and(sched_domain_span(sd),

8476

topology_thread_cpumask(i), cpu_map);

8476

topology_thread_cpumask(i), cpu_map);

8477

sd->parent = p;

8477

sd->parent = p;

8478

p->child = sd;

8478

p->child = sd;

8479

cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);

8479

cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);

8480

#endif

8480

#endif

8481

}

8481

}

8482

8483

#ifdef CONFIG_SCHED_SMT

8483

#ifdef CONFIG_SCHED_SMT

8484

/* Set up CPU (sibling) groups */

8484

/* Set up CPU (sibling) groups */

8485

for_each_cpu(i, cpu_map) {

8485

for_each_cpu(i, cpu_map) {

8486

cpumask_and(this_sibling_map,

8486

cpumask_and(this_sibling_map,

8487

topology_thread_cpumask(i), cpu_map);

8487

topology_thread_cpumask(i), cpu_map);

8488

if (i != cpumask_first(this_sibling_map))

8488

if (i != cpumask_first(this_sibling_map))

8489

continue;

8489

continue;

8490

8491

init_sched_build_groups(this_sibling_map, cpu_map,

8491

init_sched_build_groups(this_sibling_map, cpu_map,

8492

&cpu_to_cpu_group,

8492

&cpu_to_cpu_group,

8493

send_covered, tmpmask);

8493

send_covered, tmpmask);

8494

}

8494

}

8495

#endif

8495

#endif

8496

8497

#ifdef CONFIG_SCHED_MC

8497

#ifdef CONFIG_SCHED_MC

8498

/* Set up multi-core groups */

8498

/* Set up multi-core groups */

8499

for_each_cpu(i, cpu_map) {

8499

for_each_cpu(i, cpu_map) {

8500

cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);

8500

cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);

8501

if (i != cpumask_first(this_core_map))

8501

if (i != cpumask_first(this_core_map))

8502

continue;

8502

continue;

8503

8504

init_sched_build_groups(this_core_map, cpu_map,

8504

init_sched_build_groups(this_core_map, cpu_map,

8505

&cpu_to_core_group,

8505

&cpu_to_core_group,

8506

send_covered, tmpmask);

8506

send_covered, tmpmask);

8507

}

8507

}

8508

#endif

8508

#endif

8509

8510

/* Set up physical groups */

8510

/* Set up physical groups */

8511

for (i = 0; i < nr_node_ids; i++) {

8511

for (i = 0; i < nr_node_ids; i++) {

8512

cpumask_and(nodemask, cpumask_of_node(i), cpu_map);

8512

cpumask_and(nodemask, cpumask_of_node(i), cpu_map);

8513

if (cpumask_empty(nodemask))

8513

if (cpumask_empty(nodemask))

8514

continue;

8514

continue;

8515

8516

init_sched_build_groups(nodemask, cpu_map,

8516

init_sched_build_groups(nodemask, cpu_map,

8517

&cpu_to_phys_group,

8517

&cpu_to_phys_group,

8518

send_covered, tmpmask);

8518

send_covered, tmpmask);

8519

}

8519

}

8520

8521

#ifdef CONFIG_NUMA

8521

#ifdef CONFIG_NUMA

8522

/* Set up node groups */

8522

/* Set up node groups */

8523

if (sd_allnodes) {

8523

if (sd_allnodes) {

8524

init_sched_build_groups(cpu_map, cpu_map,

8524

init_sched_build_groups(cpu_map, cpu_map,

8525

&cpu_to_allnodes_group,

8525

&cpu_to_allnodes_group,

8526

send_covered, tmpmask);

8526

send_covered, tmpmask);

8527

}

8527

}

8528

8529

for (i = 0; i < nr_node_ids; i++) {

8529

for (i = 0; i < nr_node_ids; i++) {

8530

/* Set up node groups */

8530

/* Set up node groups */

8531

struct sched_group *sg, *prev;

8531

struct sched_group *sg, *prev;

8532

int j;

8532

int j;

8533

8534

cpumask_clear(covered);

8534

cpumask_clear(covered);

8535

cpumask_and(nodemask, cpumask_of_node(i), cpu_map);

8535

cpumask_and(nodemask, cpumask_of_node(i), cpu_map);

8536

if (cpumask_empty(nodemask)) {

8536

if (cpumask_empty(nodemask)) {

8537

sched_group_nodes[i] = NULL;

8537

sched_group_nodes[i] = NULL;

8538

continue;

8538

continue;

8539

}

8539

}

8540

8541

sched_domain_node_span(i, domainspan);

8541

sched_domain_node_span(i, domainspan);

8542

cpumask_and(domainspan, domainspan, cpu_map);

8542

cpumask_and(domainspan, domainspan, cpu_map);

8543

8544

sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),

8544

sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),

8545

GFP_KERNEL, i);

8545

GFP_KERNEL, i);

8546

if (!sg) {

8546

if (!sg) {

8547

printk(KERN_WARNING "Can not alloc domain group for "

8547

printk(KERN_WARNING "Can not alloc domain group for "

8548

"node %d\n", i);

8548

"node %d\n", i);

8549

goto error;

8549

goto error;

8550

}

8550

}

8551

sched_group_nodes[i] = sg;

8551

sched_group_nodes[i] = sg;

8552

for_each_cpu(j, nodemask) {

8552

for_each_cpu(j, nodemask) {

8553

struct sched_domain *sd;

8553

struct sched_domain *sd;

8554

8555

sd = &per_cpu(node_domains, j).sd;

8555

sd = &per_cpu(node_domains, j).sd;

8556

sd->groups = sg;

8556

sd->groups = sg;

8557

}

8557

}

8558

sg->__cpu_power = 0;

8558

sg->__cpu_power = 0;

8559

cpumask_copy(sched_group_cpus(sg), nodemask);

8559

cpumask_copy(sched_group_cpus(sg), nodemask);

8560

sg->next = sg;

8560

sg->next = sg;

8561

cpumask_or(covered, covered, nodemask);

8561

cpumask_or(covered, covered, nodemask);

8562

prev = sg;

8562

prev = sg;

8563

8564

for (j = 0; j < nr_node_ids; j++) {

8564

for (j = 0; j < nr_node_ids; j++) {

8565

int n = (i + j) % nr_node_ids;

8565

int n = (i + j) % nr_node_ids;

8566

8567

cpumask_complement(notcovered, covered);

8567

cpumask_complement(notcovered, covered);

8568

cpumask_and(tmpmask, notcovered, cpu_map);

8568

cpumask_and(tmpmask, notcovered, cpu_map);

8569

cpumask_and(tmpmask, tmpmask, domainspan);

8569

cpumask_and(tmpmask, tmpmask, domainspan);

8570

if (cpumask_empty(tmpmask))

8570

if (cpumask_empty(tmpmask))

8571

break;

8571

break;

8572

8573

cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));

8573

cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));

8574

if (cpumask_empty(tmpmask))

8574

if (cpumask_empty(tmpmask))

8575

continue;

8575

continue;

8576

8577

sg = kmalloc_node(sizeof(struct sched_group) +

8577

sg = kmalloc_node(sizeof(struct sched_group) +

8578

cpumask_size(),

8578

cpumask_size(),

8579

GFP_KERNEL, i);

8579

GFP_KERNEL, i);

8580

if (!sg) {

8580

if (!sg) {

8581

printk(KERN_WARNING

8581

printk(KERN_WARNING

8582

"Can not alloc domain group for node %d\n", j);

8582

"Can not alloc domain group for node %d\n", j);

8583

goto error;

8583

goto error;

8584

}

8584

}

8585

sg->__cpu_power = 0;

8585

sg->__cpu_power = 0;

8586

cpumask_copy(sched_group_cpus(sg), tmpmask);

8586

cpumask_copy(sched_group_cpus(sg), tmpmask);

8587

sg->next = prev->next;

8587

sg->next = prev->next;

8588

cpumask_or(covered, covered, tmpmask);

8588

cpumask_or(covered, covered, tmpmask);

8589

prev->next = sg;

8589

prev->next = sg;

8590

prev = sg;

8590

prev = sg;

8591

}

8591

}

8592

}

8592

}

8593

#endif

8593

#endif

8594

8595

/* Calculate CPU power for physical packages and nodes */

8595

/* Calculate CPU power for physical packages and nodes */

8596

#ifdef CONFIG_SCHED_SMT

8596

#ifdef CONFIG_SCHED_SMT

8597

for_each_cpu(i, cpu_map) {

8597

for_each_cpu(i, cpu_map) {

8598

struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;

8598

struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;

8599

8600

init_sched_groups_power(i, sd);

8600

init_sched_groups_power(i, sd);

8601

}

8601

}

8602

#endif

8602

#endif

8603

#ifdef CONFIG_SCHED_MC

8603

#ifdef CONFIG_SCHED_MC

8604

for_each_cpu(i, cpu_map) {

8604

for_each_cpu(i, cpu_map) {

8605

struct sched_domain *sd = &per_cpu(core_domains, i).sd;

8605

struct sched_domain *sd = &per_cpu(core_domains, i).sd;

8606

8607

init_sched_groups_power(i, sd);

8607

init_sched_groups_power(i, sd);

8608

}

8608

}

8609

#endif

8609

#endif

8610

8611

for_each_cpu(i, cpu_map) {

8611

for_each_cpu(i, cpu_map) {

8612

struct sched_domain *sd = &per_cpu(phys_domains, i).sd;

8612

struct sched_domain *sd = &per_cpu(phys_domains, i).sd;

8613

8614

init_sched_groups_power(i, sd);

8614

init_sched_groups_power(i, sd);

8615

}

8615

}

8616

8617

#ifdef CONFIG_NUMA

8617

#ifdef CONFIG_NUMA

8618

for (i = 0; i < nr_node_ids; i++)

8618

for (i = 0; i < nr_node_ids; i++)

8619

init_numa_sched_groups_power(sched_group_nodes[i]);

8619

init_numa_sched_groups_power(sched_group_nodes[i]);

8620

8621

if (sd_allnodes) {

8621

if (sd_allnodes) {

8622

struct sched_group *sg;

8622

struct sched_group *sg;

8623

8624

cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,

8624

cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,

8625

tmpmask);

8625

tmpmask);

8626

init_numa_sched_groups_power(sg);

8626

init_numa_sched_groups_power(sg);

8627

}

8627

}

8628

#endif

8628

#endif

8629

8630

/* Attach the domains */

8630

/* Attach the domains */

8631

for_each_cpu(i, cpu_map) {

8631

for_each_cpu(i, cpu_map) {

8632

struct sched_domain *sd;

8632

struct sched_domain *sd;

8633

#ifdef CONFIG_SCHED_SMT

8633

#ifdef CONFIG_SCHED_SMT

8634

sd = &per_cpu(cpu_domains, i).sd;

8634

sd = &per_cpu(cpu_domains, i).sd;

8635

#elif defined(CONFIG_SCHED_MC)

8635

#elif defined(CONFIG_SCHED_MC)

8636

sd = &per_cpu(core_domains, i).sd;

8636

sd = &per_cpu(core_domains, i).sd;

8637

#else

8637

#else

8638

sd = &per_cpu(phys_domains, i).sd;

8638

sd = &per_cpu(phys_domains, i).sd;

8639

#endif

8639

#endif

8640

cpu_attach_domain(sd, rd, i);

8640

cpu_attach_domain(sd, rd, i);

8641

}

8641

}

8642

8643

err = 0;

8643

err = 0;

8644

8645

free_tmpmask:

8645

free_tmpmask:

8646

free_cpumask_var(tmpmask);

8646

free_cpumask_var(tmpmask);

8647

free_send_covered:

8647

free_send_covered:

8648

free_cpumask_var(send_covered);

8648

free_cpumask_var(send_covered);

8649

free_this_core_map:

8649

free_this_core_map:

8650

free_cpumask_var(this_core_map);

8650

free_cpumask_var(this_core_map);

8651

free_this_sibling_map:

8651

free_this_sibling_map:

8652

free_cpumask_var(this_sibling_map);

8652

free_cpumask_var(this_sibling_map);

8653

free_nodemask:

8653

free_nodemask:

8654

free_cpumask_var(nodemask);

8654

free_cpumask_var(nodemask);

8655

free_notcovered:

8655

free_notcovered:

8656

#ifdef CONFIG_NUMA

8656

#ifdef CONFIG_NUMA

8657

free_cpumask_var(notcovered);

8657

free_cpumask_var(notcovered);

8658

free_covered:

8658

free_covered:

8659

free_cpumask_var(covered);

8659

free_cpumask_var(covered);

8660

free_domainspan:

8660

free_domainspan:

8661

free_cpumask_var(domainspan);

8661

free_cpumask_var(domainspan);

8662

out:

8662

out:

8663

#endif

8663

#endif

8664

return err;

8664

return err;

8665

8666

free_sched_groups:

8666

free_sched_groups:

8667

#ifdef CONFIG_NUMA

8667

#ifdef CONFIG_NUMA

8668

kfree(sched_group_nodes);

8668

kfree(sched_group_nodes);

8669

#endif

8669

#endif

8670

goto free_tmpmask;

8670

goto free_tmpmask;

8671

8672

#ifdef CONFIG_NUMA

8672

#ifdef CONFIG_NUMA

8673

error:

8673

error:

8674

free_sched_groups(cpu_map, tmpmask);

8674

free_sched_groups(cpu_map, tmpmask);

8675

free_rootdomain(rd);

8675

free_rootdomain(rd);

8676

goto free_tmpmask;

8676

goto free_tmpmask;

8677

#endif

8677

#endif

8678

}

8678

}

8679

8680

static int build_sched_domains(const struct cpumask *cpu_map)

8680

static int build_sched_domains(const struct cpumask *cpu_map)

8681

{

8681

{

8682

return __build_sched_domains(cpu_map, NULL);

8682

return __build_sched_domains(cpu_map, NULL);

8683

}

8683

}

8684

8685

static struct cpumask *doms_cur; /* current sched domains */

8685

static struct cpumask *doms_cur; /* current sched domains */

8686

static int ndoms_cur; /* number of sched domains in 'doms_cur' */

8686

static int ndoms_cur; /* number of sched domains in 'doms_cur' */

8687

static struct sched_domain_attr *dattr_cur;

8687

static struct sched_domain_attr *dattr_cur;

8688

/* attribues of custom domains in 'doms_cur' */

8688

/* attribues of custom domains in 'doms_cur' */

8689

8690

/*

8690

/*

8691

* Special case: If a kmalloc of a doms_cur partition (array of

8691

* Special case: If a kmalloc of a doms_cur partition (array of

8692

* cpumask) fails, then fallback to a single sched domain,

8692

* cpumask) fails, then fallback to a single sched domain,

8693

* as determined by the single cpumask fallback_doms.

8693

* as determined by the single cpumask fallback_doms.

8694

*/

8694

*/

8695

static cpumask_var_t fallback_doms;

8695

static cpumask_var_t fallback_doms;

8696

8697

/*
 * arch_update_cpu_topology() is a hook that lets virtualized architectures
 * update the cpu core maps.  An implementation is supposed to return 1 if
 * the topology changed and 0 if it stayed the same.
 *
 * This weak default does no work and always reports "unchanged".
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	const int topology_changed = 0;

	return topology_changed;
}

8706

8707

/*

8707

/*

8708

* Set up scheduler domains and groups. Callers must hold the hotplug lock.

8708

* Set up scheduler domains and groups. Callers must hold the hotplug lock.

8709

* For now this just excludes isolated cpus, but could be used to

8709

* For now this just excludes isolated cpus, but could be used to

8710

* exclude other special cases in the future.

8710

* exclude other special cases in the future.

8711

*/

8711

*/

8712

static int arch_init_sched_domains(const struct cpumask *cpu_map)

8712

static int arch_init_sched_domains(const struct cpumask *cpu_map)

8713

{

8713

{

8714

int err;

8714

int err;

8715

8716

arch_update_cpu_topology();

8716

arch_update_cpu_topology();

8717

ndoms_cur = 1;

8717

ndoms_cur = 1;

8718

doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);

8718

doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);

8719

if (!doms_cur)

8719

if (!doms_cur)

8720

doms_cur = fallback_doms;

8720

doms_cur = fallback_doms;

8721

cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);

8721

cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);

8722

dattr_cur = NULL;

8722

dattr_cur = NULL;

8723

err = build_sched_domains(doms_cur);

8723

err = build_sched_domains(doms_cur);

8724

register_sched_domain_sysctl();

8724

register_sched_domain_sysctl();

8725

8726

return err;

8726

return err;

8727

}

8727

}

8728

8729

/*
 * Release the sched groups belonging to the cpus in @cpu_map by handing
 * both arguments straight to free_sched_groups().
 */
static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
				       struct cpumask *tmpmask)
{
	free_sched_groups(cpu_map, tmpmask);
}

8734

8735

/*

8735

/*

8736

* Detach sched domains from a group of cpus specified in cpu_map

8736

* Detach sched domains from a group of cpus specified in cpu_map

8737

* These cpus will now be attached to the NULL domain

8737

* These cpus will now be attached to the NULL domain

8738

*/

8738

*/

8739

static void detach_destroy_domains(const struct cpumask *cpu_map)

8739

static void detach_destroy_domains(const struct cpumask *cpu_map)

8740

{

8740

{

8741

/* Save because hotplug lock held. */

8741

/* Save because hotplug lock held. */

8742

static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);

8742

static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);

8743

int i;

8743

int i;

8744

8745

for_each_cpu(i, cpu_map)

8745

for_each_cpu(i, cpu_map)

8746

cpu_attach_domain(NULL, &def_root_domain, i);

8746

cpu_attach_domain(NULL, &def_root_domain, i);

8747

synchronize_sched();

8747

synchronize_sched();

8748

arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));

8748

arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));

8749

}

8749

}

8750

8751

/* handle null as "default" */

8751

/* handle null as "default" */

8752

static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,

8752

static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,

8753

struct sched_domain_attr *new, int idx_new)

8753

struct sched_domain_attr *new, int idx_new)

8754

{

8754

{

8755

struct sched_domain_attr tmp;

8755

struct sched_domain_attr tmp;

8756

8757

/* fast path */

8757

/* fast path */

8758

if (!new && !cur)

8758

if (!new && !cur)

8759

return 1;

8759

return 1;

8760

8761

tmp = SD_ATTR_INIT;

8761

tmp = SD_ATTR_INIT;

8762

return !memcmp(cur ? (cur + idx_cur) : &tmp,

8762

return !memcmp(cur ? (cur + idx_cur) : &tmp,

8763

new ? (new + idx_new) : &tmp,

8763

new ? (new + idx_new) : &tmp,

8764

sizeof(struct sched_domain_attr));

8764

sizeof(struct sched_domain_attr));

8765

}

8765

}

8766

8767

/*

8767

/*

8768

* Partition sched domains as specified by the 'ndoms_new'

8768

* Partition sched domains as specified by the 'ndoms_new'

8769

* cpumasks in the array doms_new[] of cpumasks. This compares

8769

* cpumasks in the array doms_new[] of cpumasks. This compares

8770

* doms_new[] to the current sched domain partitioning, doms_cur[].

8770

* doms_new[] to the current sched domain partitioning, doms_cur[].

8771

* It destroys each deleted domain and builds each new domain.

8771

* It destroys each deleted domain and builds each new domain.

8772

*

8772

*

8773

* 'doms_new' is an array of cpumask's of length 'ndoms_new'.

8773

* 'doms_new' is an array of cpumask's of length 'ndoms_new'.

8774

* The masks don't intersect (don't overlap.) We should setup one

8774

* The masks don't intersect (don't overlap.) We should setup one

8775

* sched domain for each mask. CPUs not in any of the cpumasks will

8775

* sched domain for each mask. CPUs not in any of the cpumasks will

8776

* not be load balanced. If the same cpumask appears both in the

8776

* not be load balanced. If the same cpumask appears both in the

8777

* current 'doms_cur' domains and in the new 'doms_new', we can leave

8777

* current 'doms_cur' domains and in the new 'doms_new', we can leave

8778

* it as it is.

8778

* it as it is.

8779

*

8779

*

8780

* The passed in 'doms_new' should be kmalloc'd. This routine takes

8780

* The passed in 'doms_new' should be kmalloc'd. This routine takes

8781

* ownership of it and will kfree it when done with it. If the caller

8781

* ownership of it and will kfree it when done with it. If the caller

8782

* failed the kmalloc call, then it can pass in doms_new == NULL &&

8782

* failed the kmalloc call, then it can pass in doms_new == NULL &&

8783

* ndoms_new == 1, and partition_sched_domains() will fallback to

8783

* ndoms_new == 1, and partition_sched_domains() will fallback to

8784

* the single partition 'fallback_doms', it also forces the domains

8784

* the single partition 'fallback_doms', it also forces the domains

8785

* to be rebuilt.

8785

* to be rebuilt.

8786

*

8786

*

8787

* If doms_new == NULL it will be replaced with cpu_online_mask.

8787

* If doms_new == NULL it will be replaced with cpu_online_mask.

8788

* ndoms_new == 0 is a special case for destroying existing domains,

8788

* ndoms_new == 0 is a special case for destroying existing domains,

8789

* and it will not create the default domain.

8789

* and it will not create the default domain.

8790

*

8790

*

8791

* Call with hotplug lock held

8791

* Call with hotplug lock held

8792

*/

8792

*/

8793

/* FIXME: Change to struct cpumask *doms_new[] */

8793

/* FIXME: Change to struct cpumask *doms_new[] */

8794

void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,

8794

void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,

8795

struct sched_domain_attr *dattr_new)

8795

struct sched_domain_attr *dattr_new)

8796

{

8796

{

8797

int i, j, n;

8797

int i, j, n;

8798

int new_topology;

8798

int new_topology;

8799

8800

mutex_lock(&sched_domains_mutex);

8800

mutex_lock(&sched_domains_mutex);

8801

8802

/* always unregister in case we don't destroy any domains */

8802

/* always unregister in case we don't destroy any domains */

8803

unregister_sched_domain_sysctl();

8803

unregister_sched_domain_sysctl();

8804

8805

/* Let architecture update cpu core mappings. */

8805

/* Let architecture update cpu core mappings. */

8806

new_topology = arch_update_cpu_topology();

8806

new_topology = arch_update_cpu_topology();

8807

8808

n = doms_new ? ndoms_new : 0;

8808

n = doms_new ? ndoms_new : 0;

8809

8810

/* Destroy deleted domains */

8810

/* Destroy deleted domains */

8811

for (i = 0; i < ndoms_cur; i++) {

8811

for (i = 0; i < ndoms_cur; i++) {

8812

for (j = 0; j < n && !new_topology; j++) {

8812

for (j = 0; j < n && !new_topology; j++) {

8813

if (cpumask_equal(&doms_cur[i], &doms_new[j])

8813

if (cpumask_equal(&doms_cur[i], &doms_new[j])

8814

&& dattrs_equal(dattr_cur, i, dattr_new, j))

8814

&& dattrs_equal(dattr_cur, i, dattr_new, j))

8815

goto match1;

8815

goto match1;

8816

}

8816

}

8817

/* no match - a current sched domain not in new doms_new[] */

8817

/* no match - a current sched domain not in new doms_new[] */

8818

detach_destroy_domains(doms_cur + i);

8818

detach_destroy_domains(doms_cur + i);

8819

match1:

8819

match1:

8820

;

8820

;

8821

}

8821

}

8822

8823

if (doms_new == NULL) {

8823

if (doms_new == NULL) {

8824

ndoms_cur = 0;

8824

ndoms_cur = 0;

8825

doms_new = fallback_doms;

8825

doms_new = fallback_doms;

8826

cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);

8826

cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);

8827

WARN_ON_ONCE(dattr_new);

8827

WARN_ON_ONCE(dattr_new);

8828

}

8828

}

8829

8830

/* Build new domains */

8830

/* Build new domains */

8831

for (i = 0; i < ndoms_new; i++) {

8831

for (i = 0; i < ndoms_new; i++) {

8832

for (j = 0; j < ndoms_cur && !new_topology; j++) {

8832

for (j = 0; j < ndoms_cur && !new_topology; j++) {

8833

if (cpumask_equal(&doms_new[i], &doms_cur[j])

8833

if (cpumask_equal(&doms_new[i], &doms_cur[j])

8834

&& dattrs_equal(dattr_new, i, dattr_cur, j))

8834

&& dattrs_equal(dattr_new, i, dattr_cur, j))

8835

goto match2;

8835

goto match2;

8836

}

8836

}

8837

/* no match - add a new doms_new */

8837

/* no match - add a new doms_new */

8838

__build_sched_domains(doms_new + i,

8838

__build_sched_domains(doms_new + i,

8839

dattr_new ? dattr_new + i : NULL);

8839

dattr_new ? dattr_new + i : NULL);

8840

match2:

8840

match2:

8841

;

8841

;

8842

}

8842

}

8843

8844

/* Remember the new sched domains */

8844

/* Remember the new sched domains */

8845

if (doms_cur != fallback_doms)

8845

if (doms_cur != fallback_doms)

8846

kfree(doms_cur);

8846

kfree(doms_cur);

8847

kfree(dattr_cur); /* kfree(NULL) is safe */

8847

kfree(dattr_cur); /* kfree(NULL) is safe */

8848

doms_cur = doms_new;

8848

doms_cur = doms_new;

8849

dattr_cur = dattr_new;

8849

dattr_cur = dattr_new;

8850

ndoms_cur = ndoms_new;

8850

ndoms_cur = ndoms_new;

8851

8852

register_sched_domain_sysctl();

8852

register_sched_domain_sysctl();

8853

8854

mutex_unlock(&sched_domains_mutex);

8854

mutex_unlock(&sched_domains_mutex);

8855

}

8855

}

8856

8857

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

8857

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

8858

static void arch_reinit_sched_domains(void)

8858

static void arch_reinit_sched_domains(void)

8859

{

8859

{

8860

get_online_cpus();

8860

get_online_cpus();

8861

8862

/* Destroy domains first to force the rebuild */

8862

/* Destroy domains first to force the rebuild */

8863

partition_sched_domains(0, NULL, NULL);

8863

partition_sched_domains(0, NULL, NULL);

8864

8865

rebuild_sched_domains();

8865

rebuild_sched_domains();

8866

put_online_cpus();

8866

put_online_cpus();

8867

}

8867

}

8868

8869

/*
 * Common store handler for the power-savings sysfs attributes.
 *
 * Parses an unsigned decimal level from @buf, stores it into
 * sched_smt_power_savings when @smt is non-zero or into
 * sched_mc_power_savings otherwise, and then reinitializes the sched
 * domains.
 *
 * Returns @count on success, or -EINVAL when no number could be parsed
 * or the level is not below MAX_POWERSAVINGS_BALANCE_LEVELS.
 */
static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
{
	unsigned int level = 0;

	/*
	 * level is unsigned, so there is no need to check for
	 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
	 *
	 * Open question: what happens on a 0 or 1 byte write -- does
	 * count need to be checked as well?
	 */
	if (sscanf(buf, "%u", &level) != 1 ||
	    level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
		return -EINVAL;

	if (!smt)
		sched_mc_power_savings = level;
	else
		sched_smt_power_savings = level;

	arch_reinit_sched_domains();

	return count;
}

8895

8896

#ifdef CONFIG_SCHED_MC

8896

#ifdef CONFIG_SCHED_MC

8897

static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,

8897

static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,

8898

char *page)

8898

char *page)

8899

{

8899

{

8900

return sprintf(page, "%u\n", sched_mc_power_savings);

8900

return sprintf(page, "%u\n", sched_mc_power_savings);

8901

}

8901

}

8902

/*
 * sysfs store handler for the multi-core level: forwards to
 * sched_power_savings_store() with smt == 0.
 */
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
					    const char *buf, size_t count)
{
	const int smt = 0;

	return sched_power_savings_store(buf, count, smt);
}

8907

/* Mode-0644 class attribute tying together the MC show/store handlers. */
static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
	sched_mc_power_savings_show, sched_mc_power_savings_store);

8910

#endif

8910

#endif

8911

8912

#ifdef CONFIG_SCHED_SMT

8912

#ifdef CONFIG_SCHED_SMT

8913

static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,

8913

static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,

8914

char *page)

8914

char *page)

8915

{

8915

{

8916

return sprintf(page, "%u\n", sched_smt_power_savings);

8916

return sprintf(page, "%u\n", sched_smt_power_savings);

8917

}

8917

}

8918

/*
 * sysfs store handler for the SMT level: forwards to
 * sched_power_savings_store() with smt == 1.
 */
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
					     const char *buf, size_t count)
{
	const int smt = 1;

	return sched_power_savings_store(buf, count, smt);
}

8923

/* Mode-0644 class attribute tying together the SMT show/store handlers. */
static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
	sched_smt_power_savings_show, sched_smt_power_savings_store);

8926

#endif

8926

#endif

8927

8928

/*
 * Create the power-savings sysfs files under @cls.
 *
 * The SMT file is created when CONFIG_SCHED_SMT is set and smt_capable()
 * is true; the MC file when CONFIG_SCHED_MC is set, no earlier error
 * occurred, and mc_capable() is true.
 *
 * Returns 0 on success or the first sysfs_create_file() error.
 */
int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int ret = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable()) {
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
	}
#endif
#ifdef CONFIG_SCHED_MC
	/* Skip the MC entry (and the capability probe) after a failure. */
	if (ret == 0 && mc_capable()) {
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
	}
#endif
	return ret;
}

8944

#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

8944

#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

8945

8946

#ifndef CONFIG_CPUSETS

8946

#ifndef CONFIG_CPUSETS

8947

/*

8947

/*

8948

* Add online and remove offline CPUs from the scheduler domains.

8948

* Add online and remove offline CPUs from the scheduler domains.

8949

* When cpusets are enabled they take over this function.

8949

* When cpusets are enabled they take over this function.

8950

*/

8950

*/

8951

static int update_sched_domains(struct notifier_block *nfb,

8951

static int update_sched_domains(struct notifier_block *nfb,

8952

unsigned long action, void *hcpu)

8952

unsigned long action, void *hcpu)

8953

{

8953

{

8954

switch (action) {

8954

switch (action) {

8955

case CPU_ONLINE:

8955

case CPU_ONLINE:

8956

case CPU_ONLINE_FROZEN:

8956

case CPU_ONLINE_FROZEN:

8957

case CPU_DEAD:

8957

case CPU_DEAD:

8958

case CPU_DEAD_FROZEN:

8958

case CPU_DEAD_FROZEN:

8959

partition_sched_domains(1, NULL, NULL);

8959

partition_sched_domains(1, NULL, NULL);

8960

return NOTIFY_OK;

8960

return NOTIFY_OK;

8961

8962

default:

8962

default:

8963

return NOTIFY_DONE;

8963

return NOTIFY_DONE;

8964

}

8964

}

8965

}

8965

}

8966

#endif

8966

#endif

8967

8968

static int update_runtime(struct notifier_block *nfb,

8968

static int update_runtime(struct notifier_block *nfb,

8969

unsigned long action, void *hcpu)

8969

unsigned long action, void *hcpu)

8970

{

8970

{

8971

int cpu = (int)(long)hcpu;

8971

int cpu = (int)(long)hcpu;

8972

8973

switch (action) {

8973

switch (action) {

8974

case CPU_DOWN_PREPARE:

8974

case CPU_DOWN_PREPARE:

8975

case CPU_DOWN_PREPARE_FROZEN:

8975

case CPU_DOWN_PREPARE_FROZEN:

8976

disable_runtime(cpu_rq(cpu));

8976

disable_runtime(cpu_rq(cpu));

8977

return NOTIFY_OK;

8977

return NOTIFY_OK;

8978

8979

case CPU_DOWN_FAILED:

8979

case CPU_DOWN_FAILED:

8980

case CPU_DOWN_FAILED_FROZEN:

8980

case CPU_DOWN_FAILED_FROZEN:

8981

case CPU_ONLINE:

8981

case CPU_ONLINE:

8982

case CPU_ONLINE_FROZEN:

8982

case CPU_ONLINE_FROZEN:

8983

enable_runtime(cpu_rq(cpu));

8983

enable_runtime(cpu_rq(cpu));

8984

return NOTIFY_OK;

8984

return NOTIFY_OK;

8985

8986

default:

8986

default:

8987

return NOTIFY_DONE;

8987

return NOTIFY_DONE;

8988

}

8988

}

8989

}

8989

}

8990

8991

/*
 * SMP-time scheduler initialization: builds the initial sched domains
 * for the online cpus, registers the hotplug notifiers, moves the
 * calling (init) task onto a non-isolated cpu, and initializes the
 * hrtick, granularity and RT-class state.
 */
void __init sched_init_smp(void)
{
	cpumask_var_t non_isolated_cpus;

	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);

	/*
	 * fallback_doms must be allocated before anything can use it:
	 * arch_init_sched_domains() falls back to it when its kmalloc
	 * fails, and the hotplug notifier registered below ends up in
	 * partition_sched_domains(1, NULL, NULL), which writes to it.
	 * Allocating it only at the end of this function left both paths
	 * with an unallocated mask when cpumask_var_t is off-stack.
	 */
	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

#if defined(CONFIG_NUMA)
	sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
					  GFP_KERNEL);
	BUG_ON(sched_group_nodes_bycpu == NULL);
#endif
	get_online_cpus();
	mutex_lock(&sched_domains_mutex);
	arch_init_sched_domains(cpu_online_mask);
	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
	/* Never leave init without a cpu to run on. */
	if (cpumask_empty(non_isolated_cpus))
		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
	mutex_unlock(&sched_domains_mutex);
	put_online_cpus();

#ifndef CONFIG_CPUSETS
	/* XXX: Theoretical race here - CPU may be hotplugged now */
	hotcpu_notifier(update_sched_domains, 0);
#endif

	/* RT runtime code needs to handle some hotplug events */
	hotcpu_notifier(update_runtime, 0);

	init_hrtick();

	/* Move init over to a non-isolated CPU */
	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
		BUG();
	sched_init_granularity();
	free_cpumask_var(non_isolated_cpus);

	init_sched_rt_class();
}

9030

#else

9030

#else

9031

/*
 * !CONFIG_SMP variant of sched_init_smp(): the only work done here is the
 * call to sched_init_granularity().
 */
void __init sched_init_smp(void)
{
	sched_init_granularity();
}

9035

#endif /* CONFIG_SMP */

9035

#endif /* CONFIG_SMP */

9036

9037

/*
 * Timer-migration tunable, initialised to 1.
 * NOTE(review): presumably exposed through sysctl (per the name); the
 * readers of this variable are outside this part of the file - confirm.
 */
const_debug unsigned int sysctl_timer_migration = 1;

9038

9039

int in_sched_functions(unsigned long addr)

9039

int in_sched_functions(unsigned long addr)

9040

{

9040

{

9041

return in_lock_functions(addr) ||

9041

return in_lock_functions(addr) ||

9042

(addr >= (unsigned long)__sched_text_start

9042

(addr >= (unsigned long)__sched_text_start

9043

&& addr < (unsigned long)__sched_text_end);

9043

&& addr < (unsigned long)__sched_text_end);

9044

}

9044

}

9045

9046

static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)

9046

static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)

9047

{

9047

{

9048

cfs_rq->tasks_timeline = RB_ROOT;

9048

cfs_rq->tasks_timeline = RB_ROOT;

9049

INIT_LIST_HEAD(&cfs_rq->tasks);

9049

INIT_LIST_HEAD(&cfs_rq->tasks);

9050

#ifdef CONFIG_FAIR_GROUP_SCHED

9050

#ifdef CONFIG_FAIR_GROUP_SCHED

9051

cfs_rq->rq = rq;

9051

cfs_rq->rq = rq;

9052

#endif

9052

#endif

9053

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

9053

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

9054

}

9054

}

9055

9056

static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)

9056

static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)

9057

{

9057

{

9058

struct rt_prio_array *array;

9058

struct rt_prio_array *array;

9059

int i;

9059

int i;

9060

9061

array = &rt_rq->active;

9061

array = &rt_rq->active;

9062

for (i = 0; i < MAX_RT_PRIO; i++) {

9062

for (i = 0; i < MAX_RT_PRIO; i++) {

9063

INIT_LIST_HEAD(array->queue + i);

9063

INIT_LIST_HEAD(array->queue + i);

9064

__clear_bit(i, array->bitmap);

9064

__clear_bit(i, array->bitmap);

9065

}

9065

}

9066

/* delimiter for bitsearch: */

9066

/* delimiter for bitsearch: */

9067

__set_bit(MAX_RT_PRIO, array->bitmap);

9067

__set_bit(MAX_RT_PRIO, array->bitmap);

9068

9069

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

9069

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

9070

rt_rq->highest_prio.curr = MAX_RT_PRIO;

9070

rt_rq->highest_prio.curr = MAX_RT_PRIO;

9071

#ifdef CONFIG_SMP

9071

#ifdef CONFIG_SMP

9072

rt_rq->highest_prio.next = MAX_RT_PRIO;

9072

rt_rq->highest_prio.next = MAX_RT_PRIO;

9073

#endif

9073

#endif

9074

#endif

9074

#endif

9075

#ifdef CONFIG_SMP

9075

#ifdef CONFIG_SMP

9076

rt_rq->rt_nr_migratory = 0;

9076

rt_rq->rt_nr_migratory = 0;

9077

rt_rq->overloaded = 0;

9077

rt_rq->overloaded = 0;

9078

plist_head_init(&rq->rt.pushable_tasks, &rq->lock);

9078

plist_head_init(&rq->rt.pushable_tasks, &rq->lock);

9079

#endif

9079

#endif

9080

9081

rt_rq->rt_time = 0;

9081

rt_rq->rt_time = 0;

9082

rt_rq->rt_throttled = 0;

9082

rt_rq->rt_throttled = 0;

9083

rt_rq->rt_runtime = 0;

9083

rt_rq->rt_runtime = 0;

9084

spin_lock_init(&rt_rq->rt_runtime_lock);

9084

spin_lock_init(&rt_rq->rt_runtime_lock);

9085

9086

#ifdef CONFIG_RT_GROUP_SCHED

9086

#ifdef CONFIG_RT_GROUP_SCHED

9087

rt_rq->rt_nr_boosted = 0;

9087

rt_rq->rt_nr_boosted = 0;

9088

rt_rq->rq = rq;

9088

rt_rq->rq = rq;

9089

#endif

9089

#endif

9090

}

9090

}

9091

9092

#ifdef CONFIG_FAIR_GROUP_SCHED

9092

#ifdef CONFIG_FAIR_GROUP_SCHED

9093

static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

9093

static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

9094

struct sched_entity *se, int cpu, int add,

9094

struct sched_entity *se, int cpu, int add,

9095

struct sched_entity *parent)

9095

struct sched_entity *parent)

9096

{

9096

{

9097

struct rq *rq = cpu_rq(cpu);

9097

struct rq *rq = cpu_rq(cpu);

9098

tg->cfs_rq[cpu] = cfs_rq;

9098

tg->cfs_rq[cpu] = cfs_rq;

9099

init_cfs_rq(cfs_rq, rq);

9099

init_cfs_rq(cfs_rq, rq);

9100

cfs_rq->tg = tg;

9100

cfs_rq->tg = tg;

9101

if (add)

9101

if (add)

9102

list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);

9102

list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);

9103

9104

tg->se[cpu] = se;

9104

tg->se[cpu] = se;

9105

/* se could be NULL for init_task_group */

9105

/* se could be NULL for init_task_group */

9106

if (!se)

9106

if (!se)

9107

return;

9107

return;

9108

9109

if (!parent)

9109

if (!parent)

9110

se->cfs_rq = &rq->cfs;

9110

se->cfs_rq = &rq->cfs;

9111

else

9111

else

9112

se->cfs_rq = parent->my_q;

9112

se->cfs_rq = parent->my_q;

9113

9114

se->my_q = cfs_rq;

9114

se->my_q = cfs_rq;

9115

se->load.weight = tg->shares;

9115

se->load.weight = tg->shares;

9116

se->load.inv_weight = 0;

9116

se->load.inv_weight = 0;

9117

se->parent = parent;

9117

se->parent = parent;

9118

}

9118

}

9119

#endif

9119

#endif

9120

9121

#ifdef CONFIG_RT_GROUP_SCHED

9121

#ifdef CONFIG_RT_GROUP_SCHED

9122

static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,

9122

static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,

9123

struct sched_rt_entity *rt_se, int cpu, int add,

9123

struct sched_rt_entity *rt_se, int cpu, int add,

9124

struct sched_rt_entity *parent)

9124

struct sched_rt_entity *parent)

9125

{

9125

{

9126

struct rq *rq = cpu_rq(cpu);

9126

struct rq *rq = cpu_rq(cpu);

9127

9128

tg->rt_rq[cpu] = rt_rq;

9128

tg->rt_rq[cpu] = rt_rq;

9129

init_rt_rq(rt_rq, rq);

9129

init_rt_rq(rt_rq, rq);

9130

rt_rq->tg = tg;

9130

rt_rq->tg = tg;

9131

rt_rq->rt_se = rt_se;

9131

rt_rq->rt_se = rt_se;

9132

rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;

9132

rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;

9133

if (add)

9133

if (add)

9134

list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);

9134

list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);

9135

9136

tg->rt_se[cpu] = rt_se;

9136

tg->rt_se[cpu] = rt_se;

9137

if (!rt_se)

9137

if (!rt_se)

9138

return;

9138

return;

9139

9140

if (!parent)

9140

if (!parent)

9141

rt_se->rt_rq = &rq->rt;

9141

rt_se->rt_rq = &rq->rt;

9142

else

9142

else

9143

rt_se->rt_rq = parent->my_q;

9143

rt_se->rt_rq = parent->my_q;

9144

9145

rt_se->my_q = rt_rq;

9145

rt_se->my_q = rt_rq;

9146

rt_se->parent = parent;

9146

rt_se->parent = parent;

9147

INIT_LIST_HEAD(&rt_se->run_list);

9147

INIT_LIST_HEAD(&rt_se->run_list);

9148

}

9148

}

9149

#endif

9149

#endif

9150

9151

/*
 * sched_init - boot-time initialisation of the scheduler.
 *
 * Carves the per-CPU pointer arrays of the built-in task groups (and, with
 * CONFIG_CPUMASK_OFFSTACK, the per-CPU load_balance_tmpmask buffers) out of
 * one allocation, initialises the RT bandwidth structures and every possible
 * CPU's runqueue, turns the calling task into the idle task of the current
 * CPU and finally sets scheduler_running.
 */
void __init sched_init(void)
{
	int i, j;
	unsigned long alloc_size = 0, ptr;

	/*
	 * Size the single allocation: each group-scheduling flavour needs
	 * two arrays of nr_cpu_ids pointers, doubled when root_task_group
	 * exists as well (CONFIG_USER_SCHED).
	 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_USER_SCHED
	alloc_size *= 2;
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
	alloc_size += num_possible_cpus() * cpumask_size();
#endif
	/*
	 * The whole region comes from one zeroed kzalloc(GFP_NOWAIT) call
	 * and is handed out piecewise by advancing 'ptr'.
	 * NOTE(review): the kzalloc() result is not checked for NULL.
	 */
	if (alloc_size) {
		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		init_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);

		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
		for_each_possible_cpu(i) {
			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
			ptr += cpumask_size();
		}
#endif /* CONFIG_CPUMASK_OFFSTACK */
	}

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

	init_rt_bandwidth(&def_rt_bandwidth,
			global_rt_period(), global_rt_runtime());

#ifdef CONFIG_RT_GROUP_SCHED
	init_rt_bandwidth(&init_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
#ifdef CONFIG_USER_SCHED
	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), RUNTIME_INF);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_GROUP_SCHED
	list_add(&init_task_group.list, &task_groups);
	INIT_LIST_HEAD(&init_task_group.children);

#ifdef CONFIG_USER_SCHED
	INIT_LIST_HEAD(&root_task_group.children);
	init_task_group.parent = &root_task_group;
	list_add(&init_task_group.siblings, &root_task_group.children);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_GROUP_SCHED */

	/* Initialise the runqueue of every possible CPU. */
	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
		init_task_group.shares = init_task_group_load;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		/*
		 * How much cpu bandwidth does init_task_group get?
		 *
		 * In case of task-groups formed through the cgroup
		 * filesystem, it gets 100% of the cpu resources in the
		 * system. This overall system cpu resource is divided among
		 * the tasks of init_task_group and its child task-groups in
		 * a fair manner, based on each entity's (task or
		 * task-group's) weight (se->load.weight).
		 *
		 * In other words, if init_task_group has 10 tasks (of weight
		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
		 * then A0's share of the cpu resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
		 * We achieve this by letting init_task_group's tasks sit
		 * directly in rq->cfs (i.e. init_task_group->se[] = NULL).
		 */
		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		root_task_group.shares = NICE_0_LOAD;
		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
		/*
		 * In case of task-groups formed through the user id of tasks,
		 * init_task_group represents tasks belonging to root user.
		 * Hence it forms a sibling of all subsequent groups formed.
		 * In this case, init_task_group gets only a fraction of overall
		 * system cpu resource, based on the weight assigned to root
		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
		 * by letting tasks of init_task_group sit in a separate cfs_rq
		 * (init_cfs_rq) and having one entity represent this group of
		 * tasks in rq->cfs (i.e. init_task_group->se[] != NULL).
		 */
		init_tg_cfs_entry(&init_task_group,
				&per_cpu(init_cfs_rq, i),
				&per_cpu(init_sched_entity, i), i, 1,
				root_task_group.se[i]);

#endif
#endif /* CONFIG_FAIR_GROUP_SCHED */

		/* Overrides the 0 that init_rt_rq() stored above. */
		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
#ifdef CONFIG_CGROUP_SCHED
		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
		init_tg_rt_entry(&init_task_group,
				&per_cpu(init_rt_rq, i),
				&per_cpu(init_sched_rt_entity, i), i, 1,
				root_task_group.rt_se[i]);
#endif
#endif

		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
		rq->sd = NULL;
		rq->rd = NULL;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->online = 0;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
		rq_attach_root(rq, &def_root_domain);
#endif
		init_rq_hrtick(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

	set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());

	calc_load_update = jiffies + LOAD_FREQ;

	/*
	 * During early bootup we pretend to be a normal task:
	 */
	current->sched_class = &fair_sched_class;

	/*
	 * Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK.
	 * NOTE(review): none of the alloc_cpumask_var() results below are
	 * checked.
	 */
	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

	perf_counter_init();

	scheduler_running = 1;
}

9377

9378

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP

9378

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP

9379

/*
 * __might_sleep - complain when a sleeping function is called from a
 * context in which sleeping is not allowed.
 * @file: source file name to print in the report
 * @line: source line number to print in the report
 *
 * Does nothing unless the in_atomic macro is defined.  Otherwise, when
 * called in atomic context or with interrupts disabled while the system is
 * in SYSTEM_RUNNING state and no oops is in progress, prints a report,
 * the held locks, IRQ trace events (if IRQs are off) and a stack dump.
 * Reports are rate-limited through prev_jiffy.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* ratelimiting */

	/* Neither atomic nor IRQs off: sleeping is fine here. */
	if (!in_atomic() && !irqs_disabled())
		return;

	/* Stay quiet unless the system is fully running and not oopsing. */
	if (system_state != SYSTEM_RUNNING || oops_in_progress)
		return;

	/* Less than HZ jiffies since the previous report: skip this one. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR
		"BUG: sleeping function called from invalid context at %s:%d\n",
			file, line);
	printk(KERN_ERR
		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
			in_atomic(), irqs_disabled(),
			current->pid, current->comm);

	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);

9406

#endif

9406

#endif

9407

9408

#ifdef CONFIG_MAGIC_SYSRQ

9408

#ifdef CONFIG_MAGIC_SYSRQ

9409

static void normalize_task(struct rq *rq, struct task_struct *p)

9409

static void normalize_task(struct rq *rq, struct task_struct *p)

9410

{

9410

{

9411

int on_rq;

9411

int on_rq;

9412

9413

update_rq_clock(rq);

9413

update_rq_clock(rq);

9414

on_rq = p->se.on_rq;

9414

on_rq = p->se.on_rq;

9415

if (on_rq)

9415

if (on_rq)

9416

deactivate_task(rq, p, 0);

9416

deactivate_task(rq, p, 0);

9417

__setscheduler(rq, p, SCHED_NORMAL, 0);

9417

__setscheduler(rq, p, SCHED_NORMAL, 0);

9418

if (on_rq) {

9418

if (on_rq) {

9419

activate_task(rq, p, 0);

9419

activate_task(rq, p, 0);

9420

resched_task(rq->curr);

9420

resched_task(rq->curr);

9421

}

9421

}

9422

}

9422

}

9423

9424

/*
 * normalize_rt_tasks - walk all threads and demote user tasks to normal
 * scheduling.
 *
 * For every thread that has an mm: scheduler timestamps are reset; non-RT
 * tasks with a negative nice value are reniced to 0; RT tasks are passed
 * to normalize_task().  The whole walk runs with tasklist_lock read-held
 * and interrupts saved/disabled.
 */
void normalize_rt_tasks(void)
{
	struct task_struct *g, *p;
	unsigned long flags;
	struct rq *rq;

	read_lock_irqsave(&tasklist_lock, flags);
	do_each_thread(g, p) {
		/*
		 * Only normalize user tasks:
		 */
		if (!p->mm)
			continue;

		p->se.exec_start		= 0;
#ifdef CONFIG_SCHEDSTATS
		p->se.wait_start		= 0;
		p->se.sleep_start		= 0;
		p->se.block_start		= 0;
#endif

		if (!rt_task(p)) {
			/*
			 * Renice negative nice level userspace
			 * tasks back to 0:
			 */
			if (TASK_NICE(p) < 0 && p->mm)
				set_user_nice(p, 0);
			continue;
		}

		/* Lock order: the task's pi_lock first, then its runqueue. */
		spin_lock(&p->pi_lock);
		rq = __task_rq_lock(p);

		normalize_task(rq, p);

		__task_rq_unlock(rq);
		spin_unlock(&p->pi_lock);
	} while_each_thread(g, p);

	read_unlock_irqrestore(&tasklist_lock, flags);
}

9466

9467

#endif /* CONFIG_MAGIC_SYSRQ */

9467

#endif /* CONFIG_MAGIC_SYSRQ */

9468

9469

#ifdef CONFIG_IA64

9469

#ifdef CONFIG_IA64

9470

/*

9470

/*

9471

* These functions are only useful for the IA64 MCA handling.

9471

* These functions are only useful for the IA64 MCA handling.

9472

*

9472

*

9473

* They can only be called when the whole system has been

9473

* They can only be called when the whole system has been

9474

* stopped - every CPU needs to be quiescent, and no scheduling

9474

* stopped - every CPU needs to be quiescent, and no scheduling

9475

* activity can take place. Using them for anything else would

9475

* activity can take place. Using them for anything else would

9476

* be a serious bug, and as a result, they aren't even visible

9476

* be a serious bug, and as a result, they aren't even visible

9477

* under any other configuration.

9477

* under any other configuration.

9478

*/

9478

*/

9479

9480

/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Returns the value of cpu_curr(@cpu).
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	return cpu_curr(cpu);
}

9490

9491

/**
 * set_curr_task - set the current task for a given cpu.
 * @cpu: the processor in question.
 * @p: the task pointer to set.
 *
 * Description: This function must only be used when non-maskable interrupts
 * are serviced on a separate stack. It allows the architecture to switch the
 * notion of the current task on a cpu in a non-blocking manner. This function
 * must be called with all CPUs synchronized and interrupts disabled, and the
 * caller must save the original value of the current task (see
 * curr_task() above) and restore that value before reenabling interrupts and
 * re-starting the system.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
void set_curr_task(int cpu, struct task_struct *p)
{
	cpu_curr(cpu) = p;
}

9510

9511

#endif

9511

#endif

9512

9513

#ifdef CONFIG_FAIR_GROUP_SCHED

9513

#ifdef CONFIG_FAIR_GROUP_SCHED

9514

static void free_fair_sched_group(struct task_group *tg)

9514

static void free_fair_sched_group(struct task_group *tg)

9515

{

9515

{

9516

int i;

9516

int i;

9517

9518

for_each_possible_cpu(i) {

9518

for_each_possible_cpu(i) {

9519

if (tg->cfs_rq)

9519

if (tg->cfs_rq)

9520

kfree(tg->cfs_rq[i]);

9520

kfree(tg->cfs_rq[i]);

9521

if (tg->se)

9521

if (tg->se)

9522

kfree(tg->se[i]);

9522

kfree(tg->se[i]);

9523

}

9523

}

9524

9525

kfree(tg->cfs_rq);

9525

kfree(tg->cfs_rq);

9526

kfree(tg->se);

9526

kfree(tg->se);

9527

}

9527

}

9528

9529

static

9529

static

9530

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

9530

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

9531

{

9531

{

9532

struct cfs_rq *cfs_rq;

9532

struct cfs_rq *cfs_rq;

9533

struct sched_entity *se;

9533

struct sched_entity *se;

9534

struct rq *rq;

9534

struct rq *rq;

9535

int i;

9535

int i;

9536

9537

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

9537

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

9538

if (!tg->cfs_rq)

9538

if (!tg->cfs_rq)

9539

goto err;

9539

goto err;

9540

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

9540

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

9541

if (!tg->se)

9541

if (!tg->se)

9542

goto err;

9542

goto err;

9543

9544

tg->shares = NICE_0_LOAD;

9544

tg->shares = NICE_0_LOAD;

9545

9546

for_each_possible_cpu(i) {

9546

for_each_possible_cpu(i) {

9547

rq = cpu_rq(i);

9547

rq = cpu_rq(i);

9548

9549

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

9549

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

9550

GFP_KERNEL, cpu_to_node(i));

9550

GFP_KERNEL, cpu_to_node(i));

9551

if (!cfs_rq)

9551

if (!cfs_rq)

9552

goto err;

9552

goto err;

9553

9554

se = kzalloc_node(sizeof(struct sched_entity),

9554

se = kzalloc_node(sizeof(struct sched_entity),

9555

GFP_KERNEL, cpu_to_node(i));

9555

GFP_KERNEL, cpu_to_node(i));

9556

if (!se)

9556

if (!se)

9557

goto err;

9557

goto err;

9558

9559

init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);

9559

init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);

9560

}

9560

}

9561

9562

return 1;

9562

return 1;

9563

9564

err:

9564

err:

9565

return 0;

9565

return 0;

9566

}

9566

}

9567

9568

static inline void register_fair_sched_group(struct task_group *tg, int cpu)

9568

static inline void register_fair_sched_group(struct task_group *tg, int cpu)

9569

{

9569

{

9570

list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,

9570

list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,

9571

&cpu_rq(cpu)->leaf_cfs_rq_list);

9571

&cpu_rq(cpu)->leaf_cfs_rq_list);

9572

}

9572

}

9573

9574

static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)

9574

static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)

9575

{

9575

{

9576

list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);

9576

list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);

9577

}

9577

}

9578

#else /* !CONFIG_FAIR_GROUP_SCHED */

9578

#else /* !CONFIG_FAIR_GROUP_SCHED */

9579

/* Stub for builds without fair group scheduling: nothing to free. */
static inline void free_fair_sched_group(struct task_group *tg)
{
	/* intentionally empty */
}

9582

9583

/*
 * Stub for builds without fair group scheduling: nothing to allocate,
 * so always report success (1).
 */
static inline
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	const int ok = 1;

	return ok;
}

9588

9589

/* Stub for builds without fair group scheduling: nothing to register. */
static inline void register_fair_sched_group(struct task_group *tg, int cpu)
{
	/* intentionally empty */
}

9592

9593

/* Stub for builds without fair group scheduling: nothing to unregister. */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	/* intentionally empty */
}

9596

#endif /* CONFIG_FAIR_GROUP_SCHED */

9596

#endif /* CONFIG_FAIR_GROUP_SCHED */

9597

9598

#ifdef CONFIG_RT_GROUP_SCHED

9598

#ifdef CONFIG_RT_GROUP_SCHED

9599

static void free_rt_sched_group(struct task_group *tg)

9599

static void free_rt_sched_group(struct task_group *tg)

9600

{

9600

{

9601

int i;

9601

int i;

9602

9603

destroy_rt_bandwidth(&tg->rt_bandwidth);

9603

destroy_rt_bandwidth(&tg->rt_bandwidth);

9604

9605

for_each_possible_cpu(i) {

9605

for_each_possible_cpu(i) {

9606

if (tg->rt_rq)

9606

if (tg->rt_rq)

9607

kfree(tg->rt_rq[i]);

9607

kfree(tg->rt_rq[i]);

9608

if (tg->rt_se)

9608

if (tg->rt_se)

9609

kfree(tg->rt_se[i]);

9609

kfree(tg->rt_se[i]);

9610

}

9610

}

9611

9612

kfree(tg->rt_rq);

9612

kfree(tg->rt_rq);

9613

kfree(tg->rt_se);

9613

kfree(tg->rt_se);

9614

}

9614

}

9615

9616

static

9616

static

9617

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

9617

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

9618

{

9618

{

9619

struct rt_rq *rt_rq;

9619

struct rt_rq *rt_rq;

9620

struct sched_rt_entity *rt_se;

9620

struct sched_rt_entity *rt_se;

9621

struct rq *rq;

9621

struct rq *rq;

9622

int i;

9622

int i;

9623

9624

tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);

9624

tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);

9625

if (!tg->rt_rq)

9625

if (!tg->rt_rq)

9626

goto err;

9626

goto err;

9627

tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);

9627

tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);

9628

if (!tg->rt_se)

9628

if (!tg->rt_se)

9629

goto err;

9629

goto err;

9630

9631

init_rt_bandwidth(&tg->rt_bandwidth,

9631

init_rt_bandwidth(&tg->rt_bandwidth,

9632

ktime_to_ns(def_rt_bandwidth.rt_period), 0);

9632

ktime_to_ns(def_rt_bandwidth.rt_period), 0);

9633

9634

for_each_possible_cpu(i) {

9634

for_each_possible_cpu(i) {

9635

rq = cpu_rq(i);

9635

rq = cpu_rq(i);

9636

9637

rt_rq = kzalloc_node(sizeof(struct rt_rq),

9637

rt_rq = kzalloc_node(sizeof(struct rt_rq),

9638

GFP_KERNEL, cpu_to_node(i));

9638

GFP_KERNEL, cpu_to_node(i));

9639

if (!rt_rq)

9639

if (!rt_rq)

9640

goto err;

9640

goto err;

9641

9642

rt_se = kzalloc_node(sizeof(struct sched_rt_entity),

9642

rt_se = kzalloc_node(sizeof(struct sched_rt_entity),

9643

GFP_KERNEL, cpu_to_node(i));

9643

GFP_KERNEL, cpu_to_node(i));

9644

if (!rt_se)

9644

if (!rt_se)

9645

goto err;

9645

goto err;

9646

9647

init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);

9647

init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);

9648

}

9648

}

9649

9650

return 1;

9650

return 1;

9651

9652

err:

9652

err:

9653

return 0;

9653

return 0;

9654

}

9654

}

9655

9656

static inline void register_rt_sched_group(struct task_group *tg, int cpu)

9656

static inline void register_rt_sched_group(struct task_group *tg, int cpu)

9657

{

9657

{

9658

list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,

9658

list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,

9659

&cpu_rq(cpu)->leaf_rt_rq_list);

9659

&cpu_rq(cpu)->leaf_rt_rq_list);

9660

}

9660

}

9661

9662

static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)

9662

static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)

9663

{

9663

{

9664

list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);

9664

list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);

9665

}

9665

}

9666

#else /* !CONFIG_RT_GROUP_SCHED */

9666

#else /* !CONFIG_RT_GROUP_SCHED */

9667

/* Stub for builds without RT group scheduling: nothing to free. */
static inline void free_rt_sched_group(struct task_group *tg)
{
	/* intentionally empty */
}

9670

9671

/*
 * Stub for builds without RT group scheduling: nothing to allocate,
 * so always report success (1).
 */
static inline
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	const int ok = 1;

	return ok;
}

9676

9677

/* Stub for builds without RT group scheduling: nothing to register. */
static inline void register_rt_sched_group(struct task_group *tg, int cpu)
{
	/* intentionally empty */
}

9680

9681

/* Stub for builds without RT group scheduling: nothing to unregister. */
static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
{
	/* intentionally empty */
}

9684

#endif /* CONFIG_RT_GROUP_SCHED */

9684

#endif /* CONFIG_RT_GROUP_SCHED */

9685

9686

#ifdef CONFIG_GROUP_SCHED

9686

#ifdef CONFIG_GROUP_SCHED

9687

/*
 * Free everything hanging off @tg (fair state first, then RT state) and
 * finally the task_group structure itself.
 */
static void free_sched_group(struct task_group *tg)
{
	/* per-class state */
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);

	/* the group object */
	kfree(tg);
}

9693

9694

/* allocate runqueue etc for a new task group */

9694

/* allocate runqueue etc for a new task group */

9695

struct task_group *sched_create_group(struct task_group *parent)

9695

struct task_group *sched_create_group(struct task_group *parent)

9696

{

9696

{

9697

struct task_group *tg;

9697

struct task_group *tg;

9698

unsigned long flags;

9698

unsigned long flags;

9699

int i;

9699

int i;

9700

9701

tg = kzalloc(sizeof(*tg), GFP_KERNEL);

9701

tg = kzalloc(sizeof(*tg), GFP_KERNEL);

9702

if (!tg)

9702

if (!tg)

9703

return ERR_PTR(-ENOMEM);

9703

return ERR_PTR(-ENOMEM);

9704

9705

if (!alloc_fair_sched_group(tg, parent))

9705

if (!alloc_fair_sched_group(tg, parent))

9706

goto err;

9706

goto err;

9707

9708

if (!alloc_rt_sched_group(tg, parent))

9708

if (!alloc_rt_sched_group(tg, parent))

9709

goto err;

9709

goto err;

9710

9711

spin_lock_irqsave(&task_group_lock, flags);

9711

spin_lock_irqsave(&task_group_lock, flags);

9712

for_each_possible_cpu(i) {

9712

for_each_possible_cpu(i) {

9713

register_fair_sched_group(tg, i);

9713

register_fair_sched_group(tg, i);

9714

register_rt_sched_group(tg, i);

9714

register_rt_sched_group(tg, i);

9715

}

9715

}

9716

list_add_rcu(&tg->list, &task_groups);

9716

list_add_rcu(&tg->list, &task_groups);

9717

9718

WARN_ON(!parent); /* root should already exist */

9718

WARN_ON(!parent); /* root should already exist */

9719

9720

tg->parent = parent;

9720

tg->parent = parent;

9721

INIT_LIST_HEAD(&tg->children);

9721

INIT_LIST_HEAD(&tg->children);

9722

list_add_rcu(&tg->siblings, &parent->children);

9722

list_add_rcu(&tg->siblings, &parent->children);

9723

spin_unlock_irqrestore(&task_group_lock, flags);

9723

spin_unlock_irqrestore(&task_group_lock, flags);

9724

9725

return tg;

9725

return tg;

9726

9727

err:

9727

err:

9728

free_sched_group(tg);

9728

free_sched_group(tg);

9729

return ERR_PTR(-ENOMEM);

9729

return ERR_PTR(-ENOMEM);

9730

}

9730

}

9731

9732

/* rcu callback to free various structures associated with a task group */

9732

/* rcu callback to free various structures associated with a task group */

9733

static void free_sched_group_rcu(struct rcu_head *rhp)

9733

static void free_sched_group_rcu(struct rcu_head *rhp)

9734

{

9734

{

9735

/* now it should be safe to free those cfs_rqs */

9735

/* now it should be safe to free those cfs_rqs */

9736

free_sched_group(container_of(rhp, struct task_group, rcu));

9736

free_sched_group(container_of(rhp, struct task_group, rcu));

9737

}

9737

}

9738

9739

/* Destroy runqueue etc associated with a task group */

9739

/* Destroy runqueue etc associated with a task group */

9740

void sched_destroy_group(struct task_group *tg)

9740

void sched_destroy_group(struct task_group *tg)

9741

{

9741

{

9742

unsigned long flags;

9742

unsigned long flags;

9743

int i;

9743

int i;

9744

9745

spin_lock_irqsave(&task_group_lock, flags);

9745

spin_lock_irqsave(&task_group_lock, flags);

9746

for_each_possible_cpu(i) {

9746

for_each_possible_cpu(i) {

9747

unregister_fair_sched_group(tg, i);

9747

unregister_fair_sched_group(tg, i);

9748

unregister_rt_sched_group(tg, i);

9748

unregister_rt_sched_group(tg, i);

9749

}

9749

}

9750

list_del_rcu(&tg->list);

9750

list_del_rcu(&tg->list);

9751

list_del_rcu(&tg->siblings);

9751

list_del_rcu(&tg->siblings);

9752

spin_unlock_irqrestore(&task_group_lock, flags);

9752

spin_unlock_irqrestore(&task_group_lock, flags);

9753

9754

/* wait for possible concurrent references to cfs_rqs complete */

9754

/* wait for possible concurrent references to cfs_rqs complete */

9755

call_rcu(&tg->rcu, free_sched_group_rcu);

9755

call_rcu(&tg->rcu, free_sched_group_rcu);

9756

}

9756

}

9757

9758

/* change task's runqueue when it moves between groups.

9758

/* change task's runqueue when it moves between groups.

9759

* The caller of this function should have put the task in its new group

9759

* The caller of this function should have put the task in its new group

9760

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

9760

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

9761

* reflect its new group.

9761

* reflect its new group.

9762

*/

9762

*/

9763

void sched_move_task(struct task_struct *tsk)

9763

void sched_move_task(struct task_struct *tsk)

9764

{

9764

{

9765

int on_rq, running;

9765

int on_rq, running;

9766

unsigned long flags;

9766

unsigned long flags;

9767

struct rq *rq;

9767

struct rq *rq;

9768

9769

rq = task_rq_lock(tsk, &flags);

9769

rq = task_rq_lock(tsk, &flags);

9770

9771

update_rq_clock(rq);

9771

update_rq_clock(rq);

9772

9773

running = task_current(rq, tsk);

9773

running = task_current(rq, tsk);

9774

on_rq = tsk->se.on_rq;

9774

on_rq = tsk->se.on_rq;

9775

9776

if (on_rq)

9776

if (on_rq)

9777

dequeue_task(rq, tsk, 0);

9777

dequeue_task(rq, tsk, 0);

9778

if (unlikely(running))

9778

if (unlikely(running))

9779

tsk->sched_class->put_prev_task(rq, tsk);

9779

tsk->sched_class->put_prev_task(rq, tsk);

9780

9781

set_task_rq(tsk, task_cpu(tsk));

9781

set_task_rq(tsk, task_cpu(tsk));

9782

9783

#ifdef CONFIG_FAIR_GROUP_SCHED

9783

#ifdef CONFIG_FAIR_GROUP_SCHED

9784

if (tsk->sched_class->moved_group)

9784

if (tsk->sched_class->moved_group)

9785

tsk->sched_class->moved_group(tsk);

9785

tsk->sched_class->moved_group(tsk);

9786

#endif

9786

#endif

9787

9788

if (unlikely(running))

9788

if (unlikely(running))

9789

tsk->sched_class->set_curr_task(rq);

9789

tsk->sched_class->set_curr_task(rq);

9790

if (on_rq)

9790

if (on_rq)

9791

enqueue_task(rq, tsk, 0);

9791

enqueue_task(rq, tsk, 0);

9792

9793

task_rq_unlock(rq, &flags);

9793

task_rq_unlock(rq, &flags);

9794

}

9794

}

9795

#endif /* CONFIG_GROUP_SCHED */

9795

#endif /* CONFIG_GROUP_SCHED */

9796

9797

#ifdef CONFIG_FAIR_GROUP_SCHED

9797

#ifdef CONFIG_FAIR_GROUP_SCHED

9798

static void __set_se_shares(struct sched_entity *se, unsigned long shares)

9798

static void __set_se_shares(struct sched_entity *se, unsigned long shares)

9799

{

9799

{

9800

struct cfs_rq *cfs_rq = se->cfs_rq;

9800

struct cfs_rq *cfs_rq = se->cfs_rq;

9801

int on_rq;

9801

int on_rq;

9802

9803

on_rq = se->on_rq;

9803

on_rq = se->on_rq;

9804

if (on_rq)

9804

if (on_rq)

9805

dequeue_entity(cfs_rq, se, 0);

9805

dequeue_entity(cfs_rq, se, 0);

9806

9807

se->load.weight = shares;

9807

se->load.weight = shares;

9808

se->load.inv_weight = 0;

9808

se->load.inv_weight = 0;

9809

9810

if (on_rq)

9810

if (on_rq)

9811

enqueue_entity(cfs_rq, se, 0);

9811

enqueue_entity(cfs_rq, se, 0);

9812

}

9812

}

9813

9814

static void set_se_shares(struct sched_entity *se, unsigned long shares)

9814

static void set_se_shares(struct sched_entity *se, unsigned long shares)

9815

{

9815

{

9816

struct cfs_rq *cfs_rq = se->cfs_rq;

9816

struct cfs_rq *cfs_rq = se->cfs_rq;

9817

struct rq *rq = cfs_rq->rq;

9817

struct rq *rq = cfs_rq->rq;

9818

unsigned long flags;

9818

unsigned long flags;

9819

9820

spin_lock_irqsave(&rq->lock, flags);

9820

spin_lock_irqsave(&rq->lock, flags);

9821

__set_se_shares(se, shares);

9821

__set_se_shares(se, shares);

9822

spin_unlock_irqrestore(&rq->lock, flags);

9822

spin_unlock_irqrestore(&rq->lock, flags);

9823

}

9823

}

9824

9825

/* Held by sched_group_set_shares() for the duration of a shares update. */
static DEFINE_MUTEX(shares_mutex);

9826

9827

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

9827

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

9828

{

9828

{

9829

int i;

9829

int i;

9830

unsigned long flags;

9830

unsigned long flags;

9831

9832

/*

9832

/*

9833

* We can't change the weight of the root cgroup.

9833

* We can't change the weight of the root cgroup.

9834

*/

9834

*/

9835

if (!tg->se[0])

9835

if (!tg->se[0])

9836

return -EINVAL;

9836

return -EINVAL;

9837

9838

if (shares < MIN_SHARES)

9838

if (shares < MIN_SHARES)

9839

shares = MIN_SHARES;

9839

shares = MIN_SHARES;

9840

else if (shares > MAX_SHARES)

9840

else if (shares > MAX_SHARES)

9841

shares = MAX_SHARES;

9841

shares = MAX_SHARES;

9842

9843

mutex_lock(&shares_mutex);

9843

mutex_lock(&shares_mutex);

9844

if (tg->shares == shares)

9844

if (tg->shares == shares)

9845

goto done;

9845

goto done;

9846

9847

spin_lock_irqsave(&task_group_lock, flags);

9847

spin_lock_irqsave(&task_group_lock, flags);

9848

for_each_possible_cpu(i)

9848

for_each_possible_cpu(i)

9849

unregister_fair_sched_group(tg, i);

9849

unregister_fair_sched_group(tg, i);

9850

list_del_rcu(&tg->siblings);

9850

list_del_rcu(&tg->siblings);

9851

spin_unlock_irqrestore(&task_group_lock, flags);

9851

spin_unlock_irqrestore(&task_group_lock, flags);

9852

9853

/* wait for any ongoing reference to this group to finish */

9853

/* wait for any ongoing reference to this group to finish */

9854

synchronize_sched();

9854

synchronize_sched();

9855

9856

/*

9856

/*

9857

* Now we are free to modify the group's share on each cpu

9857

* Now we are free to modify the group's share on each cpu

9858

* w/o tripping rebalance_share or load_balance_fair.

9858

* w/o tripping rebalance_share or load_balance_fair.

9859

*/

9859

*/

9860

tg->shares = shares;

9860

tg->shares = shares;

9861

for_each_possible_cpu(i) {

9861

for_each_possible_cpu(i) {

9862

/*

9862

/*

9863

* force a rebalance

9863

* force a rebalance

9864

*/

9864

*/

9865

cfs_rq_set_shares(tg->cfs_rq[i], 0);

9865

cfs_rq_set_shares(tg->cfs_rq[i], 0);

9866

set_se_shares(tg->se[i], shares);

9866

set_se_shares(tg->se[i], shares);

9867

}

9867

}

9868

9869

/*

9869

/*

9870

* Enable load balance activity on this group, by inserting it back on

9870

* Enable load balance activity on this group, by inserting it back on

9871

* each cpu's rq->leaf_cfs_rq_list.

9871

* each cpu's rq->leaf_cfs_rq_list.

9872

*/

9872

*/

9873

spin_lock_irqsave(&task_group_lock, flags);

9873

spin_lock_irqsave(&task_group_lock, flags);

9874

for_each_possible_cpu(i)

9874

for_each_possible_cpu(i)

9875

register_fair_sched_group(tg, i);

9875

register_fair_sched_group(tg, i);

9876

list_add_rcu(&tg->siblings, &tg->parent->children);

9876

list_add_rcu(&tg->siblings, &tg->parent->children);

9877

spin_unlock_irqrestore(&task_group_lock, flags);

9877

spin_unlock_irqrestore(&task_group_lock, flags);

9878

done:

9878

done:

9879

mutex_unlock(&shares_mutex);

9879

mutex_unlock(&shares_mutex);

9880

return 0;

9880

return 0;

9881

}

9881

}

9882

9883

unsigned long sched_group_shares(struct task_group *tg)

9883

unsigned long sched_group_shares(struct task_group *tg)

9884

{

9884

{

9885

return tg->shares;

9885

return tg->shares;

9886

}

9886

}

9887

#endif

9887

#endif

9888

9889

#ifdef CONFIG_RT_GROUP_SCHED

9889

#ifdef CONFIG_RT_GROUP_SCHED

9890

/*
 * Ensure that the real time constraints are schedulable.
 *
 * This mutex is held by tg_set_bandwidth() and
 * sched_rt_global_constraints() around their calls to __rt_schedulable().
 */
static DEFINE_MUTEX(rt_constraints_mutex);

9894

9895

/*
 * Express @runtime / @period as a fixed-point fraction scaled by 2^20.
 * An unlimited runtime (RUNTIME_INF) maps to exactly 1 << 20; in that
 * case @period is not used at all.
 */
static unsigned long to_ratio(u64 period, u64 runtime)
{
	return (runtime == RUNTIME_INF) ? 1ULL << 20
					: div64_u64(runtime << 20, period);
}

9902

9903

/* Must be called with tasklist_lock held */

9903

/* Must be called with tasklist_lock held */

9904

static inline int tg_has_rt_tasks(struct task_group *tg)

9904

static inline int tg_has_rt_tasks(struct task_group *tg)

9905

{

9905

{

9906

struct task_struct *g, *p;

9906

struct task_struct *g, *p;

9907

9908

do_each_thread(g, p) {

9908

do_each_thread(g, p) {

9909

if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)

9909

if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)

9910

return 1;

9910

return 1;

9911

} while_each_thread(g, p);

9911

} while_each_thread(g, p);

9912

9913

return 0;

9913

return 0;

9914

}

9914

}

9915

9916

struct rt_schedulable_data {

9916

struct rt_schedulable_data {

9917

struct task_group *tg;

9917

struct task_group *tg;

9918

u64 rt_period;

9918

u64 rt_period;

9919

u64 rt_runtime;

9919

u64 rt_runtime;

9920

};

9920

};

9921

9922

static int tg_schedulable(struct task_group *tg, void *data)

9922

static int tg_schedulable(struct task_group *tg, void *data)

9923

{

9923

{

9924

struct rt_schedulable_data *d = data;

9924

struct rt_schedulable_data *d = data;

9925

struct task_group *child;

9925

struct task_group *child;

9926

unsigned long total, sum = 0;

9926

unsigned long total, sum = 0;

9927

u64 period, runtime;

9927

u64 period, runtime;

9928

9929

period = ktime_to_ns(tg->rt_bandwidth.rt_period);

9929

period = ktime_to_ns(tg->rt_bandwidth.rt_period);

9930

runtime = tg->rt_bandwidth.rt_runtime;

9930

runtime = tg->rt_bandwidth.rt_runtime;

9931

9932

if (tg == d->tg) {

9932

if (tg == d->tg) {

9933

period = d->rt_period;

9933

period = d->rt_period;

9934

runtime = d->rt_runtime;

9934

runtime = d->rt_runtime;

9935

}

9935

}

9936

9937

#ifdef CONFIG_USER_SCHED

9937

#ifdef CONFIG_USER_SCHED

9938

if (tg == &root_task_group) {

9938

if (tg == &root_task_group) {

9939

period = global_rt_period();

9939

period = global_rt_period();

9940

runtime = global_rt_runtime();

9940

runtime = global_rt_runtime();

9941

}

9941

}

9942

#endif

9942

#endif

9943

9944

/*

9944

/*

9945

* Cannot have more runtime than the period.

9945

* Cannot have more runtime than the period.

9946

*/

9946

*/

9947

if (runtime > period && runtime != RUNTIME_INF)

9947

if (runtime > period && runtime != RUNTIME_INF)

9948

return -EINVAL;

9948

return -EINVAL;

9949

9950

/*

9950

/*

9951

* Ensure we don't starve existing RT tasks.

9951

* Ensure we don't starve existing RT tasks.

9952

*/

9952

*/

9953

if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))

9953

if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))

9954

return -EBUSY;

9954

return -EBUSY;

9955

9956

total = to_ratio(period, runtime);

9956

total = to_ratio(period, runtime);

9957

9958

/*

9958

/*

9959

* Nobody can have more than the global setting allows.

9959

* Nobody can have more than the global setting allows.

9960

*/

9960

*/

9961

if (total > to_ratio(global_rt_period(), global_rt_runtime()))

9961

if (total > to_ratio(global_rt_period(), global_rt_runtime()))

9962

return -EINVAL;

9962

return -EINVAL;

9963

9964

/*

9964

/*

9965

* The sum of our children's runtime should not exceed our own.

9965

* The sum of our children's runtime should not exceed our own.

9966

*/

9966

*/

9967

list_for_each_entry_rcu(child, &tg->children, siblings) {

9967

list_for_each_entry_rcu(child, &tg->children, siblings) {

9968

period = ktime_to_ns(child->rt_bandwidth.rt_period);

9968

period = ktime_to_ns(child->rt_bandwidth.rt_period);

9969

runtime = child->rt_bandwidth.rt_runtime;

9969

runtime = child->rt_bandwidth.rt_runtime;

9970

9971

if (child == d->tg) {

9971

if (child == d->tg) {

9972

period = d->rt_period;

9972

period = d->rt_period;

9973

runtime = d->rt_runtime;

9973

runtime = d->rt_runtime;

9974

}

9974

}

9975

9976

sum += to_ratio(period, runtime);

9976

sum += to_ratio(period, runtime);

9977

}

9977

}

9978

9979

if (sum > total)

9979

if (sum > total)

9980

return -EINVAL;

9980

return -EINVAL;

9981

9982

return 0;

9982

return 0;

9983

}

9983

}

9984

9985

/*
 * Run tg_schedulable() over the task group tree via walk_tg_tree(), with
 * @period/@runtime substituted for @tg's stored values. Returns whatever
 * walk_tg_tree() returns.
 */
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	struct rt_schedulable_data data;

	data.tg = tg;
	data.rt_period = period;
	data.rt_runtime = runtime;

	return walk_tg_tree(tg_schedulable, tg_nop, &data);
}

9995

9996

/*
 * Validate and then install a new RT period/runtime pair for @tg.
 *
 * The check and the update both run under rt_constraints_mutex and
 * tasklist_lock (read). Returns 0 on success, or the error from
 * __rt_schedulable(), in which case nothing is modified.
 */
static int tg_set_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int cpu, err;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);

	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (!err) {
		spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
		tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
		tg->rt_bandwidth.rt_runtime = rt_runtime;

		/* Propagate the new runtime to every per-cpu rt_rq. */
		for_each_possible_cpu(cpu) {
			struct rt_rq *rt_rq = tg->rt_rq[cpu];

			spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_runtime = rt_runtime;
			spin_unlock(&rt_rq->rt_runtime_lock);
		}
		spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	}

	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return err;
}

10025

10026

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)

10026

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)

10027

{

10027

{

10028

u64 rt_runtime, rt_period;

10028

u64 rt_runtime, rt_period;

10029

10030

rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);

10030

rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);

10031

rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;

10031

rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;

10032

if (rt_runtime_us < 0)

10032

if (rt_runtime_us < 0)

10033

rt_runtime = RUNTIME_INF;

10033

rt_runtime = RUNTIME_INF;

10034

10035

return tg_set_bandwidth(tg, rt_period, rt_runtime);

10035

return tg_set_bandwidth(tg, rt_period, rt_runtime);

10036

}

10036

}

10037

10038

long sched_group_rt_runtime(struct task_group *tg)

10038

long sched_group_rt_runtime(struct task_group *tg)

10039

{

10039

{

10040

u64 rt_runtime_us;

10040

u64 rt_runtime_us;

10041

10042

if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)

10042

if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)

10043

return -1;

10043

return -1;

10044

10045

rt_runtime_us = tg->rt_bandwidth.rt_runtime;

10045

rt_runtime_us = tg->rt_bandwidth.rt_runtime;

10046

do_div(rt_runtime_us, NSEC_PER_USEC);

10046

do_div(rt_runtime_us, NSEC_PER_USEC);

10047

return rt_runtime_us;

10047

return rt_runtime_us;

10048

}

10048

}

10049

10050

int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)

10050

int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)

10051

{

10051

{

10052

u64 rt_runtime, rt_period;

10052

u64 rt_runtime, rt_period;

10053

10054

rt_period = (u64)rt_period_us * NSEC_PER_USEC;

10054

rt_period = (u64)rt_period_us * NSEC_PER_USEC;

10055

rt_runtime = tg->rt_bandwidth.rt_runtime;

10055

rt_runtime = tg->rt_bandwidth.rt_runtime;

10056

10057

if (rt_period == 0)

10057

if (rt_period == 0)

10058

return -EINVAL;

10058

return -EINVAL;

10059

10060

return tg_set_bandwidth(tg, rt_period, rt_runtime);

10060

return tg_set_bandwidth(tg, rt_period, rt_runtime);

10061

}

10061

}

10062

10063

long sched_group_rt_period(struct task_group *tg)

10063

long sched_group_rt_period(struct task_group *tg)

10064

{

10064

{

10065

u64 rt_period_us;

10065

u64 rt_period_us;

10066

10067

rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);

10067

rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);

10068

do_div(rt_period_us, NSEC_PER_USEC);

10068

do_div(rt_period_us, NSEC_PER_USEC);

10069

return rt_period_us;

10069

return rt_period_us;

10070

}

10070

}

10071

10072

static int sched_rt_global_constraints(void)

10072

static int sched_rt_global_constraints(void)

10073

{

10073

{

10074

u64 runtime, period;

10074

u64 runtime, period;

10075

int ret = 0;

10075

int ret = 0;

10076

10077

if (sysctl_sched_rt_period <= 0)

10077

if (sysctl_sched_rt_period <= 0)

10078

return -EINVAL;

10078

return -EINVAL;

10079

10080

runtime = global_rt_runtime();

10080

runtime = global_rt_runtime();

10081

period = global_rt_period();

10081

period = global_rt_period();

10082

10083

/*

10083

/*

10084

* Sanity check on the sysctl variables.

10084

* Sanity check on the sysctl variables.

10085

*/

10085

*/

10086

if (runtime > period && runtime != RUNTIME_INF)

10086

if (runtime > period && runtime != RUNTIME_INF)

10087

return -EINVAL;

10087

return -EINVAL;

10088

10089

mutex_lock(&rt_constraints_mutex);

10089

mutex_lock(&rt_constraints_mutex);

10090

read_lock(&tasklist_lock);

10090

read_lock(&tasklist_lock);

10091

ret = __rt_schedulable(NULL, 0, 0);

10091

ret = __rt_schedulable(NULL, 0, 0);

10092

read_unlock(&tasklist_lock);

10092

read_unlock(&tasklist_lock);

10093

mutex_unlock(&rt_constraints_mutex);

10093

mutex_unlock(&rt_constraints_mutex);

10094

10095

return ret;

10095

return ret;

10096

}

10096

}

10097

10098

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)

10098

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)

10099

{

10099

{

10100

/* Don't accept realtime tasks when there is no way for them to run */

10100

/* Don't accept realtime tasks when there is no way for them to run */

10101

if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)

10101

if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)

10102

return 0;

10102

return 0;

10103

10104

return 1;

10104

return 1;

10105

}

10105

}

10106

10107

#else /* !CONFIG_RT_GROUP_SCHED */

10107

#else /* !CONFIG_RT_GROUP_SCHED */

10108

static int sched_rt_global_constraints(void)

10108

static int sched_rt_global_constraints(void)

10109

{

10109

{

10110

unsigned long flags;

10110

unsigned long flags;

10111

int i;

10111

int i;

10112

10113

if (sysctl_sched_rt_period <= 0)

10113

if (sysctl_sched_rt_period <= 0)

10114

return -EINVAL;

10114

return -EINVAL;

10115

10116

/*

10116

/*

10117

* There's always some RT tasks in the root group

10117

* There's always some RT tasks in the root group

10118

* -- migration, kstopmachine etc..

10118

* -- migration, kstopmachine etc..

10119

*/

10119

*/

10120

if (sysctl_sched_rt_runtime == 0)

10120

if (sysctl_sched_rt_runtime == 0)

10121

return -EBUSY;

10121

return -EBUSY;

10122

10123

spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);

10123

spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);

10124

for_each_possible_cpu(i) {

10124

for_each_possible_cpu(i) {

10125

struct rt_rq *rt_rq = &cpu_rq(i)->rt;

10125

struct rt_rq *rt_rq = &cpu_rq(i)->rt;

10126

10127

spin_lock(&rt_rq->rt_runtime_lock);

10127

spin_lock(&rt_rq->rt_runtime_lock);

10128

rt_rq->rt_runtime = global_rt_runtime();

10128

rt_rq->rt_runtime = global_rt_runtime();

10129

spin_unlock(&rt_rq->rt_runtime_lock);

10129

spin_unlock(&rt_rq->rt_runtime_lock);

10130

}

10130

}

10131

spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

10131

spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

10132

10133

return 0;

10133

return 0;

10134

}

10134

}

10135

#endif /* CONFIG_RT_GROUP_SCHED */

10135

#endif /* CONFIG_RT_GROUP_SCHED */

10136

10137

int sched_rt_handler(struct ctl_table *table, int write,

10137

int sched_rt_handler(struct ctl_table *table, int write,

10138

struct file *filp, void __user *buffer, size_t *lenp,

10138

struct file *filp, void __user *buffer, size_t *lenp,

10139

loff_t *ppos)

10139

loff_t *ppos)

10140

{

10140

{

10141

int ret;

10141

int ret;

10142

int old_period, old_runtime;

10142

int old_period, old_runtime;

10143

static DEFINE_MUTEX(mutex);

10143

static DEFINE_MUTEX(mutex);

10144

10145

mutex_lock(&mutex);

10145

mutex_lock(&mutex);

10146

old_period = sysctl_sched_rt_period;

10146

old_period = sysctl_sched_rt_period;

10147

old_runtime = sysctl_sched_rt_runtime;

10147

old_runtime = sysctl_sched_rt_runtime;

10148

10149

ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);

10149

ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);

10150

10151

if (!ret && write) {

10151

if (!ret && write) {

10152

ret = sched_rt_global_constraints();

10152

ret = sched_rt_global_constraints();

10153

if (ret) {

10153

if (ret) {

10154

sysctl_sched_rt_period = old_period;

10154

sysctl_sched_rt_period = old_period;

10155

sysctl_sched_rt_runtime = old_runtime;

10155

sysctl_sched_rt_runtime = old_runtime;

10156

} else {

10156

} else {

10157

def_rt_bandwidth.rt_runtime = global_rt_runtime();

10157

def_rt_bandwidth.rt_runtime = global_rt_runtime();

10158

def_rt_bandwidth.rt_period =

10158

def_rt_bandwidth.rt_period =

10159

ns_to_ktime(global_rt_period());

10159

ns_to_ktime(global_rt_period());

10160

}

10160

}

10161

}

10161

}

10162

mutex_unlock(&mutex);

10162

mutex_unlock(&mutex);

10163

10164

return ret;

10164

return ret;

10165

}

10165

}

10166

10167

#ifdef CONFIG_CGROUP_SCHED

10167

#ifdef CONFIG_CGROUP_SCHED

10168

10169

/* return corresponding task_group object of a cgroup */

10169

/* return corresponding task_group object of a cgroup */

10170

static inline struct task_group *cgroup_tg(struct cgroup *cgrp)

10170

static inline struct task_group *cgroup_tg(struct cgroup *cgrp)

10171

{

10171

{

10172

return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),

10172

return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),

10173

struct task_group, css);

10173

struct task_group, css);

10174

}

10174

}

10175

10176

static struct cgroup_subsys_state *

10176

static struct cgroup_subsys_state *

10177

cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)

10177

cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)

10178

{

10178

{

10179

struct task_group *tg, *parent;

10179

struct task_group *tg, *parent;

10180

10181

if (!cgrp->parent) {

10181

if (!cgrp->parent) {

10182

/* This is early initialization for the top cgroup */

10182

/* This is early initialization for the top cgroup */

10183

return &init_task_group.css;

10183

return &init_task_group.css;

10184

}

10184

}

10185

10186

parent = cgroup_tg(cgrp->parent);

10186

parent = cgroup_tg(cgrp->parent);

10187

tg = sched_create_group(parent);

10187

tg = sched_create_group(parent);

10188

if (IS_ERR(tg))

10188

if (IS_ERR(tg))

10189

return ERR_PTR(-ENOMEM);

10189

return ERR_PTR(-ENOMEM);

10190

10191

return &tg->css;

10191

return &tg->css;

10192

}

10192

}

10193

10194

/* cgroup "destroy" callback: tear down the task_group backing @cgrp. */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}

10201

10202

static int

10202

static int

10203

cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,

10203

cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,

10204

struct task_struct *tsk)

10204

struct task_struct *tsk)

10205

{

10205

{

10206

#ifdef CONFIG_RT_GROUP_SCHED

10206

#ifdef CONFIG_RT_GROUP_SCHED

10207

if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))

10207

if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))

10208

return -EINVAL;

10208

return -EINVAL;

10209

#else

10209

#else

10210

/* We don't support RT-tasks being in separate groups */

10210

/* We don't support RT-tasks being in separate groups */

10211

if (tsk->sched_class != &fair_sched_class)

10211

if (tsk->sched_class != &fair_sched_class)

10212

return -EINVAL;

10212

return -EINVAL;

10213

#endif

10213

#endif

10214

10215

return 0;

10215

return 0;

10216

}

10216

}

10217

10218

/*
 * cgroup "attach" callback for the cpu subsystem: hands @tsk to
 * sched_move_task(). The subsystem and cgroup arguments are unused.
 */
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		  struct cgroup *old_cont, struct task_struct *tsk)
{
	sched_move_task(tsk);
}

10224

10225

#ifdef CONFIG_FAIR_GROUP_SCHED

10225

#ifdef CONFIG_FAIR_GROUP_SCHED

10226

static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,

10226

static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,

10227

u64 shareval)

10227

u64 shareval)

10228

{

10228

{

10229

return sched_group_set_shares(cgroup_tg(cgrp), shareval);

10229

return sched_group_set_shares(cgroup_tg(cgrp), shareval);

10230

}

10230

}

10231

10232

static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)

10232

static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)

10233

{

10233

{

10234

struct task_group *tg = cgroup_tg(cgrp);

10234

struct task_group *tg = cgroup_tg(cgrp);

10235

10236

return (u64) tg->shares;

10236

return (u64) tg->shares;

10237

}

10237

}

10238

#endif /* CONFIG_FAIR_GROUP_SCHED */

10238

#endif /* CONFIG_FAIR_GROUP_SCHED */

10239

10240

#ifdef CONFIG_RT_GROUP_SCHED

10240

#ifdef CONFIG_RT_GROUP_SCHED

10241

static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,

10241

static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,

10242

s64 val)

10242

s64 val)

10243

{

10243

{

10244

return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);

10244

return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);

10245

}

10245

}

10246

10247

/* Read handler for "rt_runtime_us": report the group's RT runtime. */
static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return sched_group_rt_runtime(tg);
}

10251

10252

static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,

10252

static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,

10253

u64 rt_period_us)

10253

u64 rt_period_us)

10254

{

10254

{

10255

return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);

10255

return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);

10256

}

10256

}

10257

10258

/* Read handler for "rt_period_us": report the group's RT period. */
static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return sched_group_rt_period(tg);
}

10262

#endif /* CONFIG_RT_GROUP_SCHED */

10262

#endif /* CONFIG_RT_GROUP_SCHED */

10263

10264

/*
 * Control files registered by cpu_cgroup_populate(). Which entries are
 * present depends on the group-scheduling options compiled in.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* Fair-scheduling shares of the group. */
	{
		.name		= "shares",
		.read_u64	= cpu_shares_read_u64,
		.write_u64	= cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	/* RT runtime of the group (signed accessors). */
	{
		.name		= "rt_runtime_us",
		.read_s64	= cpu_rt_runtime_read,
		.write_s64	= cpu_rt_runtime_write,
	},
	/* RT period of the group. */
	{
		.name		= "rt_period_us",
		.read_u64	= cpu_rt_period_read_uint,
		.write_u64	= cpu_rt_period_write_uint,
	},
#endif
};

10285

10286

static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)

10286

static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)

10287

{

10287

{

10288

return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));

10288

return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));

10289

}

10289

}

10290

10291

struct cgroup_subsys cpu_cgroup_subsys = {

10291

struct cgroup_subsys cpu_cgroup_subsys = {

10292

.name = "cpu",

10292

.name = "cpu",

10293

.create = cpu_cgroup_create,

10293

.create = cpu_cgroup_create,

10294

.destroy = cpu_cgroup_destroy,

10294

.destroy = cpu_cgroup_destroy,

10295

.can_attach = cpu_cgroup_can_attach,

10295

.can_attach = cpu_cgroup_can_attach,

10296

.attach = cpu_cgroup_attach,

10296

.attach = cpu_cgroup_attach,

10297

.populate = cpu_cgroup_populate,

10297

.populate = cpu_cgroup_populate,

10298

.subsys_id = cpu_cgroup_subsys_id,

10298

.subsys_id = cpu_cgroup_subsys_id,

10299

.early_init = 1,

10299

.early_init = 1,

10300

};

10300

};

10301

10302

#endif /* CONFIG_CGROUP_SCHED */

10302

#endif /* CONFIG_CGROUP_SCHED */

10303

10304

#ifdef CONFIG_CGROUP_CPUACCT

10304

#ifdef CONFIG_CGROUP_CPUACCT

10305

10306

/*

10306

/*

10307

* CPU accounting code for task groups.

10307

* CPU accounting code for task groups.

10308

*

10308

*

10309

* Based on the work by Paul Menage (menage@google.com) and Balbir Singh

10309

* Based on the work by Paul Menage (menage@google.com) and Balbir Singh

10310

* (balbir@in.ibm.com).

10310

* (balbir@in.ibm.com).

10311

*/

10311

*/

10312

10313

/* track cpu usage of a group of tasks and its child groups */

10313

/* track cpu usage of a group of tasks and its child groups */

10314

struct cpuacct {

10314

struct cpuacct {

10315

struct cgroup_subsys_state css;

10315

struct cgroup_subsys_state css;

10316

/* cpuusage holds pointer to a u64-type object on every cpu */

10316

/* cpuusage holds pointer to a u64-type object on every cpu */

10317

u64 *cpuusage;

10317

u64 *cpuusage;

10318

struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];

10318

struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];

10319

struct cpuacct *parent;

10319

struct cpuacct *parent;

10320

};

10320

};

10321

10322

struct cgroup_subsys cpuacct_subsys;

10322

struct cgroup_subsys cpuacct_subsys;

10323

10324

/* return cpu accounting group corresponding to this container */

10324

/* return cpu accounting group corresponding to this container */

10325

static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)

10325

static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)

10326

{

10326

{

10327

return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),

10327

return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),

10328

struct cpuacct, css);

10328

struct cpuacct, css);

10329

}

10329

}

10330

10331

/* return cpu accounting group to which this task belongs */

10331

/* return cpu accounting group to which this task belongs */

10332

static inline struct cpuacct *task_ca(struct task_struct *tsk)

10332

static inline struct cpuacct *task_ca(struct task_struct *tsk)

10333

{

10333

{

10334

return container_of(task_subsys_state(tsk, cpuacct_subsys_id),

10334

return container_of(task_subsys_state(tsk, cpuacct_subsys_id),

10335

struct cpuacct, css);

10335

struct cpuacct, css);

10336

}

10336

}

10337

10338

/* create a new cpu accounting group */

10338

/* create a new cpu accounting group */

10339

static struct cgroup_subsys_state *cpuacct_create(

10339

static struct cgroup_subsys_state *cpuacct_create(

10340

struct cgroup_subsys *ss, struct cgroup *cgrp)

10340

struct cgroup_subsys *ss, struct cgroup *cgrp)

10341

{

10341

{

10342

struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

10342

struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

10343

int i;

10343

int i;

10344

10345

if (!ca)

10345

if (!ca)

10346

goto out;

10346

goto out;

10347

10348

ca->cpuusage = alloc_percpu(u64);

10348

ca->cpuusage = alloc_percpu(u64);

10349

if (!ca->cpuusage)

10349

if (!ca->cpuusage)

10350

goto out_free_ca;

10350

goto out_free_ca;

10351

10352

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

10352

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

10353

if (percpu_counter_init(&ca->cpustat[i], 0))

10353

if (percpu_counter_init(&ca->cpustat[i], 0))

10354

goto out_free_counters;

10354

goto out_free_counters;

10355

10356

if (cgrp->parent)

10356

if (cgrp->parent)

10357

ca->parent = cgroup_ca(cgrp->parent);

10357

ca->parent = cgroup_ca(cgrp->parent);

10358

10359

return &ca->css;

10359

return &ca->css;

10360

10361

out_free_counters:

10361

out_free_counters:

10362

while (--i >= 0)

10362

while (--i >= 0)

10363

percpu_counter_destroy(&ca->cpustat[i]);

10363

percpu_counter_destroy(&ca->cpustat[i]);

10364

free_percpu(ca->cpuusage);

10364

free_percpu(ca->cpuusage);

10365

out_free_ca:

10365

out_free_ca:

10366

kfree(ca);

10366

kfree(ca);

10367

out:

10367

out:

10368

return ERR_PTR(-ENOMEM);

10368

return ERR_PTR(-ENOMEM);

10369

}

10369

}

10370

10371

/* destroy an existing cpu accounting group */

10371

/* destroy an existing cpu accounting group */

10372

static void

10372

static void

10373

cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)

10373

cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)

10374

{

10374

{

10375

struct cpuacct *ca = cgroup_ca(cgrp);

10375

struct cpuacct *ca = cgroup_ca(cgrp);

10376

int i;

10376

int i;

10377

10378

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

10378

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

10379

percpu_counter_destroy(&ca->cpustat[i]);

10379

percpu_counter_destroy(&ca->cpustat[i]);

10380

free_percpu(ca->cpuusage);

10380

free_percpu(ca->cpuusage);

10381

kfree(ca);

10381

kfree(ca);

10382

}

10382

}

10383

10384

static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)

10384

static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)

10385

{

10385

{

10386

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

10386

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

10387

u64 data;

10387

u64 data;

10388

10389

#ifndef CONFIG_64BIT

10389

#ifndef CONFIG_64BIT

10390

/*

10390

/*

10391

* Take rq->lock to make 64-bit read safe on 32-bit platforms.

10391

* Take rq->lock to make 64-bit read safe on 32-bit platforms.

10392

*/

10392

*/

10393

spin_lock_irq(&cpu_rq(cpu)->lock);

10393

spin_lock_irq(&cpu_rq(cpu)->lock);

10394

data = *cpuusage;

10394

data = *cpuusage;

10395

spin_unlock_irq(&cpu_rq(cpu)->lock);

10395

spin_unlock_irq(&cpu_rq(cpu)->lock);

10396

#else

10396

#else

10397

data = *cpuusage;

10397

data = *cpuusage;

10398

#endif

10398

#endif

10399

10400

return data;

10400

return data;

10401

}

10401

}

10402

10403

/* Overwrite the usage value that @ca holds for @cpu with @val. */
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
	u64 *slot = per_cpu_ptr(ca->cpuusage, cpu);

#ifdef CONFIG_64BIT
	*slot = val;
#else
	/*
	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
	 */
	spin_lock_irq(&cpu_rq(cpu)->lock);
	*slot = val;
	spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}

10418

10419

/* return total cpu usage (in nanoseconds) of a group */

10419

/* return total cpu usage (in nanoseconds) of a group */

10420

static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)

10420

static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)

10421

{

10421

{

10422

struct cpuacct *ca = cgroup_ca(cgrp);

10422

struct cpuacct *ca = cgroup_ca(cgrp);

10423

u64 totalcpuusage = 0;

10423

u64 totalcpuusage = 0;

10424

int i;

10424

int i;

10425

10426

for_each_present_cpu(i)

10426

for_each_present_cpu(i)

10427

totalcpuusage += cpuacct_cpuusage_read(ca, i);

10427

totalcpuusage += cpuacct_cpuusage_read(ca, i);

10428

10429

return totalcpuusage;

10429

return totalcpuusage;

10430

}

10430

}

10431

10432

static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,

10432

static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,

10433

u64 reset)

10433

u64 reset)

10434

{

10434

{

10435

struct cpuacct *ca = cgroup_ca(cgrp);

10435

struct cpuacct *ca = cgroup_ca(cgrp);

10436

int err = 0;

10436

int err = 0;

10437

int i;

10437

int i;

10438

10439

if (reset) {

10439

if (reset) {

10440

err = -EINVAL;

10440

err = -EINVAL;

10441

goto out;

10441

goto out;

10442

}

10442

}

10443

10444

for_each_present_cpu(i)

10444

for_each_present_cpu(i)

10445

cpuacct_cpuusage_write(ca, i, 0);

10445

cpuacct_cpuusage_write(ca, i, 0);

10446

10447

out:

10447

out:

10448

return err;

10448

return err;

10449

}

10449

}

10450

10451

static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,

10451

static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,

10452

struct seq_file *m)

10452

struct seq_file *m)

10453

{

10453

{

10454

struct cpuacct *ca = cgroup_ca(cgroup);

10454

struct cpuacct *ca = cgroup_ca(cgroup);

10455

u64 percpu;

10455

u64 percpu;

10456

int i;

10456

int i;

10457

10458

for_each_present_cpu(i) {

10458

for_each_present_cpu(i) {

10459

percpu = cpuacct_cpuusage_read(ca, i);

10459

percpu = cpuacct_cpuusage_read(ca, i);

10460

seq_printf(m, "%llu ", (unsigned long long) percpu);

10460

seq_printf(m, "%llu ", (unsigned long long) percpu);

10461

}

10461

}

10462

seq_printf(m, "\n");

10462

seq_printf(m, "\n");

10463

return 0;

10463

return 0;

10464

}

10464

}

10465

10466

static const char *cpuacct_stat_desc[] = {

10466

static const char *cpuacct_stat_desc[] = {

10467

[CPUACCT_STAT_USER] = "user",

10467

[CPUACCT_STAT_USER] = "user",

10468

[CPUACCT_STAT_SYSTEM] = "system",

10468

[CPUACCT_STAT_SYSTEM] = "system",

10469

};

10469

};

10470

10471

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,

10471

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,

10472

struct cgroup_map_cb *cb)

10472

struct cgroup_map_cb *cb)

10473

{

10473

{

10474

struct cpuacct *ca = cgroup_ca(cgrp);

10474

struct cpuacct *ca = cgroup_ca(cgrp);

10475

int i;

10475

int i;

10476

10477

for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {

10477

for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {

10478

s64 val = percpu_counter_read(&ca->cpustat[i]);

10478

s64 val = percpu_counter_read(&ca->cpustat[i]);

10479

val = cputime64_to_clock_t(val);

10479

val = cputime64_to_clock_t(val);

10480

cb->fill(cb, cpuacct_stat_desc[i], val);

10480

cb->fill(cb, cpuacct_stat_desc[i], val);

10481

}

10481

}

10482

return 0;

10482

return 0;

10483

}

10483

}

10484

10485

static struct cftype files[] = {

10485

static struct cftype files[] = {

10486

{

10486

{

10487

.name = "usage",

10487

.name = "usage",

10488

.read_u64 = cpuusage_read,

10488

.read_u64 = cpuusage_read,

10489

.write_u64 = cpuusage_write,

10489

.write_u64 = cpuusage_write,

10490

},

10490

},

10491

{

10491

{

10492

.name = "usage_percpu",

10492

.name = "usage_percpu",

10493

.read_seq_string = cpuacct_percpu_seq_read,

10493

.read_seq_string = cpuacct_percpu_seq_read,

10494

},

10494

},

10495

{

10495

{

10496

.name = "stat",

10496

.name = "stat",

10497

.read_map = cpuacct_stats_show,

10497

.read_map = cpuacct_stats_show,

10498

},

10498

},

10499

};

10499

};

10500

10501

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)

10501

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)

10502

{

10502

{

10503

return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));

10503

return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));

10504

}

10504

}

10505

10506

/*

10506

/*

10507

* charge this task's execution time to its accounting group.

10507

* charge this task's execution time to its accounting group.

10508

*

10508

*

10509

* called with rq->lock held.

10509

* called with rq->lock held.

10510

*/

10510

*/

10511

static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

10511

static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

10512

{

10512

{

10513

struct cpuacct *ca;

10513

struct cpuacct *ca;

10514

int cpu;

10514

int cpu;

10515

10516

if (unlikely(!cpuacct_subsys.active))

10516

if (unlikely(!cpuacct_subsys.active))

10517

return;

10517

return;

10518

10519

cpu = task_cpu(tsk);

10519

cpu = task_cpu(tsk);

10520

10521

rcu_read_lock();

10521

rcu_read_lock();

10522

10523

ca = task_ca(tsk);

10523

ca = task_ca(tsk);

10524

10525

for (; ca; ca = ca->parent) {

10525

for (; ca; ca = ca->parent) {

10526

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

10526

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

10527

*cpuusage += cputime;

10527

*cpuusage += cputime;

10528

}

10528

}

10529

10530

rcu_read_unlock();

10530

rcu_read_unlock();

10531

}

10531

}

10532

10533

/*

10533

/*

10534

* Charge the system/user time to the task's accounting group.

10534

* Charge the system/user time to the task's accounting group.

10535

*/

10535

*/

10536

static void cpuacct_update_stats(struct task_struct *tsk,

10536

static void cpuacct_update_stats(struct task_struct *tsk,

10537

enum cpuacct_stat_index idx, cputime_t val)

10537

enum cpuacct_stat_index idx, cputime_t val)

10538

{

10538

{

10539

struct cpuacct *ca;

10539

struct cpuacct *ca;

10540

10541

if (unlikely(!cpuacct_subsys.active))

10541

if (unlikely(!cpuacct_subsys.active))

10542

return;

10542

return;

10543

10544

rcu_read_lock();

10544

rcu_read_lock();

10545

ca = task_ca(tsk);

10545

ca = task_ca(tsk);

10546

10547

do {

10547

do {

10548

percpu_counter_add(&ca->cpustat[idx], val);

10548

percpu_counter_add(&ca->cpustat[idx], val);

10549

ca = ca->parent;

10549

ca = ca->parent;

10550

} while (ca);

10550

} while (ca);

10551

rcu_read_unlock();

10551

rcu_read_unlock();

10552

}

10552

}

10553

10554

struct cgroup_subsys cpuacct_subsys = {

10554

struct cgroup_subsys cpuacct_subsys = {

10555

.name = "cpuacct",

10555

.name = "cpuacct",

10556

.create = cpuacct_create,

10556

.create = cpuacct_create,

10557

.destroy = cpuacct_destroy,

10557

.destroy = cpuacct_destroy,

10558

.populate = cpuacct_populate,

10558

.populate = cpuacct_populate,

10559

.subsys_id = cpuacct_subsys_id,

10559

.subsys_id = cpuacct_subsys_id,

10560

};

10560

};

10561

#endif /* CONFIG_CGROUP_CPUACCT */

10561

#endif /* CONFIG_CGROUP_CPUACCT */

10562

GITLAB

sched: Remove unneeded __ref tag

1	/*	1	/*
2	* kernel/sched_cpupri.c	2	* kernel/sched_cpupri.c
3	*	3	*
4	* CPU priority management	4	* CPU priority management
5	*	5	*
6	* Copyright (C) 2007-2008 Novell	6	* Copyright (C) 2007-2008 Novell
7	*	7	*
8	* Author: Gregory Haskins <ghaskins@novell.com>	8	* Author: Gregory Haskins <ghaskins@novell.com>
9	*	9	*
10	* This code tracks the priority of each CPU so that global migration	10	* This code tracks the priority of each CPU so that global migration
11	* decisions are easy to calculate. Each CPU can be in a state as follows:	11	* decisions are easy to calculate. Each CPU can be in a state as follows:
12	*	12	*
13	* (INVALID), IDLE, NORMAL, RT1, ... RT99	13	* (INVALID), IDLE, NORMAL, RT1, ... RT99
14	*	14	*
15	* going from the lowest priority to the highest. CPUs in the INVALID state	15	* going from the lowest priority to the highest. CPUs in the INVALID state
16	* are not eligible for routing. The system maintains this state with	16	* are not eligible for routing. The system maintains this state with
17	* a 2 dimensional bitmap (the first for priority class, the second for cpus	17	* a 2 dimensional bitmap (the first for priority class, the second for cpus
18	* in that class). Therefore a typical application without affinity	18	* in that class). Therefore a typical application without affinity
19	* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit	19	* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20	* searches). For tasks with affinity restrictions, the algorithm has a	20	* searches). For tasks with affinity restrictions, the algorithm has a
21	* worst case complexity of O(min(102, nr_domcpus)), though the scenario that	21	* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22	* yields the worst case search is fairly contrived.	22	* yields the worst case search is fairly contrived.
23	*	23	*
24	* This program is free software; you can redistribute it and/or	24	* This program is free software; you can redistribute it and/or
25	* modify it under the terms of the GNU General Public License	25	* modify it under the terms of the GNU General Public License
26	* as published by the Free Software Foundation; version 2	26	* as published by the Free Software Foundation; version 2
27	* of the License.	27	* of the License.
28	*/	28	*/
29		29
30	#include "sched_cpupri.h"	30	#include "sched_cpupri.h"
31		31
32	/* Convert between a 140 based task->prio, and our 102 based cpupri */	32	/* Convert between a 140 based task->prio, and our 102 based cpupri */
33	static int convert_prio(int prio)	33	static int convert_prio(int prio)
34	{	34	{
35	int cpupri;	35	int cpupri;
36		36
37	if (prio == CPUPRI_INVALID)	37	if (prio == CPUPRI_INVALID)
38	cpupri = CPUPRI_INVALID;	38	cpupri = CPUPRI_INVALID;
39	else if (prio == MAX_PRIO)	39	else if (prio == MAX_PRIO)
40	cpupri = CPUPRI_IDLE;	40	cpupri = CPUPRI_IDLE;
41	else if (prio >= MAX_RT_PRIO)	41	else if (prio >= MAX_RT_PRIO)
42	cpupri = CPUPRI_NORMAL;	42	cpupri = CPUPRI_NORMAL;
43	else	43	else
44	cpupri = MAX_RT_PRIO - prio + 1;	44	cpupri = MAX_RT_PRIO - prio + 1;
45		45
46	return cpupri;	46	return cpupri;
47	}	47	}
48		48
/*
 * Visit every set bit of the priority bitmap @array, storing the bit
 * number in @idx.  The loop ends once @idx reaches CPUPRI_NR_PRIORITIES.
 */
#define for_each_cpupri_active(array, idx)				\
	for ((idx) = find_first_bit((array), CPUPRI_NR_PRIORITIES);	\
	     (idx) < CPUPRI_NR_PRIORITIES;				\
	     (idx) = find_next_bit((array), CPUPRI_NR_PRIORITIES, (idx) + 1))
53		53
54	/**	54	/**
55	* cpupri_find - find the best (lowest-pri) CPU in the system	55	* cpupri_find - find the best (lowest-pri) CPU in the system
56	* @cp: The cpupri context	56	* @cp: The cpupri context
57	* @p: The task	57	* @p: The task
58	* @lowest_mask: A mask to fill in with selected CPUs (or NULL)	58	* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59	*	59	*
60	* Note: This function returns the recommended CPUs as calculated during the	60	* Note: This function returns the recommended CPUs as calculated during the
61	* current invokation. By the time the call returns, the CPUs may have in	61	* current invokation. By the time the call returns, the CPUs may have in
62	* fact changed priorities any number of times. While not ideal, it is not	62	* fact changed priorities any number of times. While not ideal, it is not
63	* an issue of correctness since the normal rebalancer logic will correct	63	* an issue of correctness since the normal rebalancer logic will correct
64	* any discrepancies created by racing against the uncertainty of the current	64	* any discrepancies created by racing against the uncertainty of the current
65	* priority configuration.	65	* priority configuration.
66	*	66	*
67	* Returns: (int)bool - CPUs were found	67	* Returns: (int)bool - CPUs were found
68	*/	68	*/
69	int cpupri_find(struct cpupri cp, struct task_struct p,	69	int cpupri_find(struct cpupri cp, struct task_struct p,
70	struct cpumask *lowest_mask)	70	struct cpumask *lowest_mask)
71	{	71	{
72	int idx = 0;	72	int idx = 0;
73	int task_pri = convert_prio(p->prio);	73	int task_pri = convert_prio(p->prio);
74		74
75	for_each_cpupri_active(cp->pri_active, idx) {	75	for_each_cpupri_active(cp->pri_active, idx) {
76	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];	76	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77		77
78	if (idx >= task_pri)	78	if (idx >= task_pri)
79	break;	79	break;
80		80
81	if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)	81	if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82	continue;	82	continue;
83		83
84	if (lowest_mask)	84	if (lowest_mask)
85	cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);	85	cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86	return 1;	86	return 1;
87	}	87	}
88		88
89	return 0;	89	return 0;
90	}	90	}
91		91
92	/**	92	/**
93	* cpupri_set - update the cpu priority setting	93	* cpupri_set - update the cpu priority setting
94	* @cp: The cpupri context	94	* @cp: The cpupri context
95	* @cpu: The target cpu	95	* @cpu: The target cpu
96	* @pri: The priority (INVALID-RT99) to assign to this CPU	96	* @pri: The priority (INVALID-RT99) to assign to this CPU
97	*	97	*
98	* Note: Assumes cpu_rq(cpu)->lock is locked	98	* Note: Assumes cpu_rq(cpu)->lock is locked
99	*	99	*
100	* Returns: (void)	100	* Returns: (void)
101	*/	101	*/
102	void cpupri_set(struct cpupri *cp, int cpu, int newpri)	102	void cpupri_set(struct cpupri *cp, int cpu, int newpri)
103	{	103	{
104	int *currpri = &cp->cpu_to_pri[cpu];	104	int *currpri = &cp->cpu_to_pri[cpu];
105	int oldpri = *currpri;	105	int oldpri = *currpri;
106	unsigned long flags;	106	unsigned long flags;
107		107
108	newpri = convert_prio(newpri);	108	newpri = convert_prio(newpri);
109		109
110	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);	110	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
111		111
112	if (newpri == oldpri)	112	if (newpri == oldpri)
113	return;	113	return;
114		114
115	/*	115	/*
116	* If the cpu was currently mapped to a different value, we	116	* If the cpu was currently mapped to a different value, we
117	* first need to unmap the old value	117	* first need to unmap the old value
118	*/	118	*/
119	if (likely(oldpri != CPUPRI_INVALID)) {	119	if (likely(oldpri != CPUPRI_INVALID)) {
120	struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];	120	struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
121		121
122	spin_lock_irqsave(&vec->lock, flags);	122	spin_lock_irqsave(&vec->lock, flags);
123		123
124	vec->count--;	124	vec->count--;
125	if (!vec->count)	125	if (!vec->count)
126	clear_bit(oldpri, cp->pri_active);	126	clear_bit(oldpri, cp->pri_active);
127	cpumask_clear_cpu(cpu, vec->mask);	127	cpumask_clear_cpu(cpu, vec->mask);
128		128
129	spin_unlock_irqrestore(&vec->lock, flags);	129	spin_unlock_irqrestore(&vec->lock, flags);
130	}	130	}
131		131
132	if (likely(newpri != CPUPRI_INVALID)) {	132	if (likely(newpri != CPUPRI_INVALID)) {
133	struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];	133	struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
134		134
135	spin_lock_irqsave(&vec->lock, flags);	135	spin_lock_irqsave(&vec->lock, flags);
136		136
137	cpumask_set_cpu(cpu, vec->mask);	137	cpumask_set_cpu(cpu, vec->mask);
138	vec->count++;	138	vec->count++;
139	if (vec->count == 1)	139	if (vec->count == 1)
140	set_bit(newpri, cp->pri_active);	140	set_bit(newpri, cp->pri_active);
141		141
142	spin_unlock_irqrestore(&vec->lock, flags);	142	spin_unlock_irqrestore(&vec->lock, flags);
143	}	143	}
144		144
145	*currpri = newpri;	145	*currpri = newpri;
146	}	146	}
147		147
148	/**	148	/**
149	* cpupri_init - initialize the cpupri structure	149	* cpupri_init - initialize the cpupri structure
150	* @cp: The cpupri context	150	* @cp: The cpupri context
151	* @bootmem: true if allocations need to use bootmem	151	* @bootmem: true if allocations need to use bootmem
152	*	152	*
153	* Returns: -ENOMEM if memory fails.	153	* Returns: -ENOMEM if memory fails.
154	*/	154	*/
155	int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)	155	int cpupri_init(struct cpupri *cp, bool bootmem)
156	{	156	{
157	gfp_t gfp = GFP_KERNEL;	157	gfp_t gfp = GFP_KERNEL;
158	int i;	158	int i;
159		159
160	if (bootmem)	160	if (bootmem)
161	gfp = GFP_NOWAIT;	161	gfp = GFP_NOWAIT;
162		162
163	memset(cp, 0, sizeof(*cp));	163	memset(cp, 0, sizeof(*cp));
164		164
165	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {	165	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
166	struct cpupri_vec *vec = &cp->pri_to_cpu[i];	166	struct cpupri_vec *vec = &cp->pri_to_cpu[i];
167		167
168	spin_lock_init(&vec->lock);	168	spin_lock_init(&vec->lock);
169	vec->count = 0;	169	vec->count = 0;
170	if (!zalloc_cpumask_var(&vec->mask, gfp))	170	if (!zalloc_cpumask_var(&vec->mask, gfp))
171	goto cleanup;	171	goto cleanup;
172	}	172	}
173		173
174	for_each_possible_cpu(i)	174	for_each_possible_cpu(i)
175	cp->cpu_to_pri[i] = CPUPRI_INVALID;	175	cp->cpu_to_pri[i] = CPUPRI_INVALID;
176	return 0;	176	return 0;
177		177
178	cleanup:	178	cleanup:
179	for (i--; i >= 0; i--)	179	for (i--; i >= 0; i--)
180	free_cpumask_var(cp->pri_to_cpu[i].mask);	180	free_cpumask_var(cp->pri_to_cpu[i].mask);
181	return -ENOMEM;	181	return -ENOMEM;
182	}	182	}
183		183
184	/**	184	/**
185	* cpupri_cleanup - clean up the cpupri structure	185	* cpupri_cleanup - clean up the cpupri structure
186	* @cp: The cpupri context	186	* @cp: The cpupri context
187	*/	187	*/
188	void cpupri_cleanup(struct cpupri *cp)	188	void cpupri_cleanup(struct cpupri *cp)
189	{	189	{
190	int i;	190	int i;
191		191
192	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)	192	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
193	free_cpumask_var(cp->pri_to_cpu[i].mask);	193	free_cpumask_var(cp->pri_to_cpu[i].mask);
194	}	194	}
195		195